# Example full fine-tuning config for a DeepSeek-V3 MoE model using Axolotl's
# vendored Triton contiguous grouped GEMM kernels.
#
# Replace the `base_model` value below with the name of the model you
# uploaded to HF.

base_model: axolotl-ai-co/deepseek-v3-8b
model_config_type: deepseek_v3
trust_remote_code: true
moe_kernels: true

# --- Data ------------------------------------------------------------------
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
# No validation split is held out (0.0).
# NOTE(review): with no eval set, `evals_per_epoch` in the logging section
# presumably has no effect - confirm, or set a non-zero val_set_size.
val_set_size: 0.0
output_dir: ./outputs/deepseek-v3/full-ft

sequence_len: 4096
sample_packing: true

# --- Optimisation ----------------------------------------------------------
num_epochs: 1
micro_batch_size: 1
gradient_accumulation_steps: 8
optimizer: adamw_torch_fused
learning_rate: 2e-5
lr_scheduler: cosine
warmup_ratio: 0.1
weight_decay: 0.01

# --- Precision & Performance -----------------------------------------------
bf16: auto
flash_attention: true

# Enable gradient checkpointing to keep activation memory manageable for the
# MoE blocks.
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

# Axolotl automatically applies the DeepSeek-V3 MoE monkeypatch when
# model_config_type is set to `deepseek_v3`, routing matmuls through the
# vendored Triton kernels.

# --- Logging & Saving ------------------------------------------------------
logging_steps: 1
evals_per_epoch: 2
saves_per_epoch: 1

# Uncomment the section below for multi-GPU training with FSDP.
# fsdp:
#   - full_shard
#   - auto_wrap
# fsdp_config:
#   fsdp_limit_all_gathers: true
#   fsdp_sync_module_states: true
#   fsdp_offload_params: true
#   fsdp_use_orig_params: false
#   fsdp_cpu_ram_efficient_loading: true
#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
#   fsdp_transformer_layer_cls_to_wrap: DeepseekV3MoE
#   fsdp_state_dict_type: FULL_STATE_DICT
#   fsdp_sharding_strategy: FULL_SHARD

# Optional Weights & Biases settings (left commented out; fill in to use).
# wandb_project:
# wandb_entity:
# wandb_name: