From 1640cd400658fc28980dfb6cefc065b83252f29a Mon Sep 17 00:00:00 2001
From: Dan Saunders
Date: Tue, 23 Sep 2025 15:34:07 +0000
Subject: [PATCH] delete config

---
 examples/deepseek-v3/full-ft.yaml     | 66 ---------------------------
 scripts/benchmarks/deepseek_v3_moe.py |  1 -
 2 files changed, 67 deletions(-)
 delete mode 100644 examples/deepseek-v3/full-ft.yaml

diff --git a/examples/deepseek-v3/full-ft.yaml b/examples/deepseek-v3/full-ft.yaml
deleted file mode 100644
index dec8ab7c2..000000000
--- a/examples/deepseek-v3/full-ft.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-# Example full fine-tuning config for a DeepSeek-V3 MoE model using Axolotl's
-# vendored Triton contiguous grouped GEMM kernels.
-# Replace `your-org/deepseek-v3-model` with the name of the model you uploaded to HF.
-
-base_model: axolotl-ai-co/deepseek-v3-8b
-model_config_type: deepseek_v3
-trust_remote_code: true
-moe_kernels: true
-
-# --- Data ------------------------------------------------------------------
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-
-val_set_size: 0.0
-output_dir: ./outputs/deepseek-v3/full-ft
-
-sequence_len: 4096
-sample_packing: true
-
-# --- Optimisation ----------------------------------------------------------
-num_epochs: 1
-micro_batch_size: 1
-gradient_accumulation_steps: 8
-optimizer: adamw_torch_fused
-learning_rate: 2e-5
-lr_scheduler: cosine
-warmup_ratio: 0.1
-weight_decay: 0.01
-
-# --- Precision & Performance -----------------------------------------------
-bf16: auto
-flash_attention: true
-
-# enable GC to keep activation memory manageable for the MoE blocks
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-
-# Axolotl automatically applies the DeepSeek-V3 MoE monkeypatch when
-# model_config_type is set to `deepseek_v3`, routing matmuls through the
-# vendored Triton kernels.
-
-# --- Logging & Saving ------------------------------------------------------
-logging_steps: 1
-evals_per_epoch: 2
-saves_per_epoch: 1
-
-# Uncomment the section below for multi-GPU training with FSDP
-# fsdp:
-#   - full_shard
-#   - auto_wrap
-# fsdp_config:
-#   fsdp_limit_all_gathers: true
-#   fsdp_sync_module_states: true
-#   fsdp_offload_params: true
-#   fsdp_use_orig_params: false
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_transformer_layer_cls_to_wrap: DeepseekV3MoE
-#   fsdp_state_dict_type: FULL_STATE_DICT
-#   fsdp_sharding_strategy: FULL_SHARD
-
-# wandb_project:
-# wandb_entity:
-# wandb_name:
diff --git a/scripts/benchmarks/deepseek_v3_moe.py b/scripts/benchmarks/deepseek_v3_moe.py
index 453d8002d..68492e39e 100644
--- a/scripts/benchmarks/deepseek_v3_moe.py
+++ b/scripts/benchmarks/deepseek_v3_moe.py
@@ -131,7 +131,6 @@ def benchmark_deepseek_v3(args: argparse.Namespace) -> dict:
     device = resolve_device(args.device)
     dtype = DTYPE_MAP[args.dtype]
 
-    print(f"device: {device}, dtype: {dtype}")
     if args.n_experts % args.groups != 0:
         raise SystemExit("n-experts must be divisible by groups")
 