fix(config): add cce and liger to nemotron-h example (#3573) [skip ci]
This commit is contained in:
@@ -1,5 +1,15 @@
|
|||||||
base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
|
base_model: nvidia/NVIDIA-Nemotron-3-Super-120B-A12B-BF16
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_layer_norm: true
|
||||||
|
liger_rope: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm_gated: true
|
||||||
|
|
||||||
# LoRA kernel patches are incompatible with this architecture — see README.
|
# LoRA kernel patches are incompatible with this architecture — see README.
|
||||||
lora_mlp_kernel: false
|
lora_mlp_kernel: false
|
||||||
lora_qkv_kernel: false
|
lora_qkv_kernel: false
|
||||||
@@ -22,8 +32,6 @@ dataset_prepared_path: last_run_prepared
|
|||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
|
||||||
use_cut_cross_entropy: true
|
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
quantize_moe_experts: true
|
quantize_moe_experts: true
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
@@ -31,16 +39,16 @@ lora_r: 16
|
|||||||
lora_alpha: 32
|
lora_alpha: 32
|
||||||
lora_dropout: 0.0
|
lora_dropout: 0.0
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
# Attention projection layers (present in ~12 attention layers out of 88)
|
|
||||||
- q_proj
|
- q_proj
|
||||||
- k_proj
|
- k_proj
|
||||||
- v_proj
|
- v_proj
|
||||||
- o_proj
|
- o_proj
|
||||||
# To also train MoE expert weights, add them via lora_target_parameters
|
|
||||||
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
# To also train MoE expert weights, add them via lora_target_parameters
|
||||||
# lora_target_parameters:
|
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
||||||
# - up_proj
|
# lora_target_parameters:
|
||||||
# - down_proj
|
# - up_proj
|
||||||
|
# - down_proj
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
|
|||||||
@@ -1,6 +1,16 @@
|
|||||||
# See examples/nemotron-h/README.md for architecture notes and requirements.
|
# See examples/nemotron-h/README.md for architecture notes and requirements.
|
||||||
base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
|
base_model: nvidia/NVIDIA-Nemotron-3-Nano-30B-A3B-BF16
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_layer_norm: true
|
||||||
|
liger_rope: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm_gated: true
|
||||||
|
|
||||||
# LoRA kernel patches are incompatible with this architecture — see README.
|
# LoRA kernel patches are incompatible with this architecture — see README.
|
||||||
lora_mlp_kernel: false
|
lora_mlp_kernel: false
|
||||||
lora_qkv_kernel: false
|
lora_qkv_kernel: false
|
||||||
@@ -23,8 +33,6 @@ dataset_prepared_path: last_run_prepared
|
|||||||
sequence_len: 4096
|
sequence_len: 4096
|
||||||
sample_packing: true
|
sample_packing: true
|
||||||
|
|
||||||
use_cut_cross_entropy: true
|
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
quantize_moe_experts: true
|
quantize_moe_experts: true
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
@@ -36,11 +44,12 @@ lora_target_modules:
|
|||||||
- k_proj
|
- k_proj
|
||||||
- v_proj
|
- v_proj
|
||||||
- o_proj
|
- o_proj
|
||||||
# To also train MoE expert weights, add them via lora_target_parameters
|
|
||||||
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
# To also train MoE expert weights, add them via lora_target_parameters
|
||||||
# lora_target_parameters:
|
# (they are 3D nn.Parameter tensors, not nn.Linear — no gate_proj):
|
||||||
# - up_proj
|
# lora_target_parameters:
|
||||||
# - down_proj
|
# - up_proj
|
||||||
|
# - down_proj
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
|
|||||||
Reference in New Issue
Block a user