From 1640cd400658fc28980dfb6cefc065b83252f29a Mon Sep 17 00:00:00 2001
From: Dan Saunders
Date: Tue, 23 Sep 2025 15:34:07 +0000
Subject: [PATCH] delete config

---
 examples/deepseek-v3/full-ft.yaml     | 66 ---------------------------
 scripts/benchmarks/deepseek_v3_moe.py |  1 -
 2 files changed, 67 deletions(-)
 delete mode 100644 examples/deepseek-v3/full-ft.yaml

diff --git a/examples/deepseek-v3/full-ft.yaml b/examples/deepseek-v3/full-ft.yaml
deleted file mode 100644
index dec8ab7c2..000000000
--- a/examples/deepseek-v3/full-ft.yaml
+++ /dev/null
@@ -1,66 +0,0 @@
-# Example full fine-tuning config for a DeepSeek-V3 MoE model using Axolotl's
-# vendored Triton contiguous grouped GEMM kernels.
-# Replace `your-org/deepseek-v3-model` with the name of the model you uploaded to HF.
-
-base_model: axolotl-ai-co/deepseek-v3-8b
-model_config_type: deepseek_v3
-trust_remote_code: true
-moe_kernels: true
-
-# --- Data ------------------------------------------------------------------
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-
-val_set_size: 0.0
-output_dir: ./outputs/deepseek-v3/full-ft
-
-sequence_len: 4096
-sample_packing: true
-
-# --- Optimisation ----------------------------------------------------------
-num_epochs: 1
-micro_batch_size: 1
-gradient_accumulation_steps: 8
-optimizer: adamw_torch_fused
-learning_rate: 2e-5
-lr_scheduler: cosine
-warmup_ratio: 0.1
-weight_decay: 0.01
-
-# --- Precision & Performance -----------------------------------------------
-bf16: auto
-flash_attention: true
-
-# enable GC to keep activation memory manageable for the MoE blocks
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-
-# Axolotl automatically applies the DeepSeek-V3 MoE monkeypatch when
-# model_config_type is set to `deepseek_v3`, routing matmuls through the
-# vendored Triton kernels.
-
-# --- Logging & Saving ------------------------------------------------------
-logging_steps: 1
-evals_per_epoch: 2
-saves_per_epoch: 1
-
-# Uncomment the section below for multi-GPU training with FSDP
-# fsdp:
-#   - full_shard
-#   - auto_wrap
-# fsdp_config:
-#   fsdp_limit_all_gathers: true
-#   fsdp_sync_module_states: true
-#   fsdp_offload_params: true
-#   fsdp_use_orig_params: false
-#   fsdp_cpu_ram_efficient_loading: true
-#   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-#   fsdp_transformer_layer_cls_to_wrap: DeepseekV3MoE
-#   fsdp_state_dict_type: FULL_STATE_DICT
-#   fsdp_sharding_strategy: FULL_SHARD
-
-# wandb_project:
-# wandb_entity:
-# wandb_name:
diff --git a/scripts/benchmarks/deepseek_v3_moe.py b/scripts/benchmarks/deepseek_v3_moe.py
index 453d8002d..68492e39e 100644
--- a/scripts/benchmarks/deepseek_v3_moe.py
+++ b/scripts/benchmarks/deepseek_v3_moe.py
@@ -131,7 +131,6 @@ def benchmark_deepseek_v3(args: argparse.Namespace) -> dict:
     device = resolve_device(args.device)
     dtype = DTYPE_MAP[args.dtype]
 
-    print(f"device: {device}, dtype: {dtype}")
     if args.n_experts % args.groups != 0:
         raise SystemExit("n-experts must be divisible by groups")
 