fix(config): add cce and liger to nemotron-h example (#3573) [skip ci]
@@ -1,5 +1,15 @@
base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.liger.LigerPlugin

liger_layer_norm: true
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_rms_norm_gated: true

# LoRA kernel patches are incompatible with this architecture; see the README.
lora_mlp_kernel: false
lora_qkv_kernel: false
@@ -22,8 +32,6 @@ dataset_prepared_path: last_run_prepared
sequence_len: 4096
sample_packing: true

use_cut_cross_entropy: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
@@ -31,16 +39,16 @@ lora_r: 16
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
  # Attention projection layers (attention is used in only ~12 of the 88 layers)
  - q_proj
  - k_proj
  - v_proj
  - o_proj
# To also train the MoE expert weights, add them via lora_target_parameters
# (they are 3D nn.Parameter tensors, not nn.Linear; there is no gate_proj):
# lora_target_parameters:
#   - up_proj
#   - down_proj

wandb_project:
wandb_entity:
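Read together, the hunks above leave the Super-120B example looking roughly like the condensed sketch below. This is only an assembly of the lines visible in this diff, not the full example file, which also carries dataset, optimizer, and other trainer settings that the hunks do not show.

# Condensed sketch assembled from the visible hunks only (Super-120B variant);
# the actual example file contains further settings not shown in this diff.
base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.liger.LigerPlugin

liger_layer_norm: true
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_rms_norm_gated: true

# LoRA kernel patches are incompatible with this architecture; see the README.
lora_mlp_kernel: false
lora_qkv_kernel: false

sequence_len: 4096
sample_packing: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.0
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj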
@@ -1,6 +1,16 @@
# See examples/nemotron-h/README.md for architecture notes and requirements.
base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.liger.LigerPlugin

liger_layer_norm: true
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_rms_norm_gated: true

# LoRA kernel patches are incompatible with this architecture; see the README.
lora_mlp_kernel: false
lora_qkv_kernel: false
@@ -23,8 +33,6 @@ dataset_prepared_path: last_run_prepared
sequence_len: 4096
sample_packing: true

use_cut_cross_entropy: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
@@ -36,11 +44,12 @@ lora_target_modules:
  - k_proj
  - v_proj
  - o_proj
# To also train the MoE expert weights, add them via lora_target_parameters
# (they are 3D nn.Parameter tensors, not nn.Linear; there is no gate_proj):
# lora_target_parameters:
#   - up_proj
#   - down_proj

wandb_project:
wandb_entity:
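Both examples carry the same commented-out hint about the MoE expert weights. Below is a minimal sketch of what enabling it might look like, assuming lora_target_parameters takes the parameter names exactly as the comments list them; treat it as illustrative and confirm the precise parameter paths against the axolotl documentation before relying on it.

# Sketch only: the commented-out block from the examples, uncommented.
# The MoE expert weights are 3D nn.Parameter tensors rather than nn.Linear
# modules (and there is no gate_proj), so they are listed under
# lora_target_parameters instead of lora_target_modules.
adapter: qlora
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
lora_target_parameters:
  - up_proj
  - down_proj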