small deepseek script
This commit is contained in:
100
scripts/benchmarks/build_deepseek_v3_8b.py
Executable file
100
scripts/benchmarks/build_deepseek_v3_8b.py
Executable file
@@ -0,0 +1,100 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Instantiate a ~8.3B DeepSeek-V3 MoE model with random weights.
|
||||||
|
|
||||||
|
Run this on a GPU-equipped machine (e.g. 1× NVL H100) so the dense
|
||||||
|
initialization completes quickly:
|
||||||
|
|
||||||
|
python scripts/benchmarks/build_deepseek_v3_8b.py --output deepseek-v3-8b-moe
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import torch
|
||||||
|
from transformers import DeepseekV3Config, DeepseekV3ForCausalLM
|
||||||
|
|
||||||
|
DTYPE_MAP = {
|
||||||
|
"float32": torch.float32,
|
||||||
|
"bfloat16": torch.bfloat16,
|
||||||
|
"float16": torch.float16,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def build_config() -> DeepseekV3Config:
|
||||||
|
"""Return a DeepSeek V3 configuration totaling ~8.3B parameters."""
|
||||||
|
|
||||||
|
return DeepseekV3Config(
|
||||||
|
vocab_size=32_000,
|
||||||
|
hidden_size=3_072,
|
||||||
|
intermediate_size=8_192,
|
||||||
|
moe_intermediate_size=2_560,
|
||||||
|
num_hidden_layers=20,
|
||||||
|
num_attention_heads=24,
|
||||||
|
num_key_value_heads=24,
|
||||||
|
n_routed_experts=18,
|
||||||
|
num_experts_per_tok=4,
|
||||||
|
n_group=6,
|
||||||
|
topk_group=4,
|
||||||
|
kv_lora_rank=192,
|
||||||
|
q_lora_rank=384,
|
||||||
|
max_position_embeddings=2_048,
|
||||||
|
rope_theta=10_000.0,
|
||||||
|
rope_interleave=True,
|
||||||
|
hidden_act="silu",
|
||||||
|
initializer_range=0.02,
|
||||||
|
attention_dropout=0.0,
|
||||||
|
attention_bias=False,
|
||||||
|
n_shared_experts=1,
|
||||||
|
routed_scaling_factor=2.5,
|
||||||
|
norm_topk_prob=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args() -> argparse.Namespace:
|
||||||
|
parser = argparse.ArgumentParser(description=__doc__)
|
||||||
|
parser.add_argument(
|
||||||
|
"--output",
|
||||||
|
type=Path,
|
||||||
|
required=True,
|
||||||
|
help="Directory to save the generated model",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--dtype",
|
||||||
|
default="bfloat16",
|
||||||
|
choices=DTYPE_MAP.keys(),
|
||||||
|
help="Storage dtype for the checkpoint",
|
||||||
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
"--seed",
|
||||||
|
type=int,
|
||||||
|
default=0,
|
||||||
|
help="Torch RNG seed for reproducibility",
|
||||||
|
)
|
||||||
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
args = parse_args()
|
||||||
|
torch.manual_seed(args.seed)
|
||||||
|
|
||||||
|
output_dir = args.output
|
||||||
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
config = build_config()
|
||||||
|
model = DeepseekV3ForCausalLM(config)
|
||||||
|
|
||||||
|
dtype = DTYPE_MAP[args.dtype]
|
||||||
|
model.to(dtype=dtype)
|
||||||
|
|
||||||
|
param_count = sum(p.numel() for p in model.parameters())
|
||||||
|
print(f"Initialized DeepSeek-V3 MoE with {param_count / 1e9:.3f}B parameters")
|
||||||
|
|
||||||
|
model.save_pretrained(output_dir, safe_serialization=True)
|
||||||
|
config.save_pretrained(output_dir)
|
||||||
|
print(f"Saved model and config to {output_dir.resolve()}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user