diff --git a/examples/nemotron-h/120b-a12b-qlora.yaml b/examples/nemotron-h/120b-a12b-qlora.yaml
index 67dcdb96e..03e6d3b5e 100644
--- a/examples/nemotron-h/120b-a12b-qlora.yaml
+++ b/examples/nemotron-h/120b-a12b-qlora.yaml
@@ -1,5 +1,15 @@
 base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
 
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+
 # LoRA kernel patches are incompatible with this architecture — see README.
 lora_mlp_kernel: false
 lora_qkv_kernel: false
@@ -22,8 +32,6 @@ dataset_prepared_path: last_run_prepared
 sequence_len: 4096
 sample_packing: true
 
-use_cut_cross_entropy: true
-
 load_in_4bit: true
 quantize_moe_experts: true
 adapter: qlora
@@ -31,16 +39,16 @@ lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.0
 lora_target_modules:
-  # Attention projection layers (present in ~12 attention layers out of 88)
   - q_proj
   - k_proj
   - v_proj
   - o_proj
-  # To also train MoE expert weights, add them via lora_target_parameters
-  # (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
-  # lora_target_parameters:
-  #   - up_proj
-  #   - down_proj
+
+# To also train MoE expert weights, add them via lora_target_parameters
+# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+# lora_target_parameters:
+#   - up_proj
+#   - down_proj
 
 wandb_project:
 wandb_entity:
diff --git a/examples/nemotron-h/nano-30b-a3b-qlora.yaml b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
index 2d7307f99..3994ab08e 100644
--- a/examples/nemotron-h/nano-30b-a3b-qlora.yaml
+++ b/examples/nemotron-h/nano-30b-a3b-qlora.yaml
@@ -1,6 +1,16 @@
 # See examples/nemotron-h/README.md for architecture notes and requirements.
 base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
 
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_layer_norm: true
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_rms_norm_gated: true
+
 # LoRA kernel patches are incompatible with this architecture — see README.
 lora_mlp_kernel: false
 lora_qkv_kernel: false
@@ -23,8 +33,6 @@ dataset_prepared_path: last_run_prepared
 sequence_len: 4096
 sample_packing: true
 
-use_cut_cross_entropy: true
-
 load_in_4bit: true
 quantize_moe_experts: true
 adapter: qlora
@@ -36,11 +44,12 @@ lora_target_modules:
   - k_proj
   - v_proj
   - o_proj
-  # To also train MoE expert weights, add them via lora_target_parameters
-  # (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
-  # lora_target_parameters:
-  #   - up_proj
-  #   - down_proj
+
+# To also train MoE expert weights, add them via lora_target_parameters
+# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
+# lora_target_parameters:
+#   - up_proj
+#   - down_proj
 
 wandb_project:
 wandb_entity:
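For anyone who does want to adapt the MoE expert weights, below is a minimal sketch of how the commented-out option in these examples reads once enabled. It only rearranges keys that already appear in the configs above; no claim is made here about whether this combination is a recommended setting.

```yaml
# Sketch only: adapter section from the examples above with the
# commented-out expert-weight option enabled.
# lora_target_modules covers the attention projections (nn.Linear);
# lora_target_parameters addresses the 3D expert nn.Parameter tensors
# (up_proj/down_proj only; this architecture has no gate_proj).
adapter: qlora
load_in_4bit: true
quantize_moe_experts: true
lora_r: 16
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
lora_target_parameters:
  - up_proj
  - down_proj
```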