From 50f2b94d50584fce89bba9ff33b7c3909e827991 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 8 Aug 2025 08:04:56 -0400 Subject: [PATCH] add 120b and deepspeed zero3 examples (#3035) [skip ci] * add 120b and deepspeed zero3 examples * add a bit of flavor and cleanup gpt oss readme * fix: remove expert vram usage * fix: remove redundant EOS token from eot_tokens * feat: add 120B to docs --------- Co-authored-by: NanoCode012 --- examples/gpt-oss/README.md | 18 +++-- .../gpt-oss-120b-fft-fsdp2-offload.yaml | 67 +++++++++++++++++++ .../gpt-oss-20b-fft-deepspeed-zero3.yaml | 58 ++++++++++++++++ .../gpt-oss-20b-fft-fsdp2-offload.yaml | 6 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml | 1 - .../gpt-oss-20b-sft-lora-singlegpu.yaml | 1 - 6 files changed, 141 insertions(+), 10 deletions(-) create mode 100644 examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml create mode 100644 examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index 8a19959e7..6dadb8230 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -16,11 +16,10 @@ pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' ``` -2. Choose one of the following configs below for training the 20B model. +2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b)) ```bash -# LoRA SFT linear layers & 2 experts (1x48GB @ ~47GiB) -# (only linear layers @ ~44GiB) +# LoRA SFT linear layers (1x48GB @ ~44GiB) axolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml # FFT SFT with offloading (2x24GB @ ~21GiB/GPU) @@ -30,9 +29,16 @@ axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml ``` -Notes: -- 120B coming soon! -- Memory usage taken from `device_mem_reserved(gib)` from logs. 
+Note: Memory usage is taken from `device_mem_reserved(gib)` in the logs. + +### Training 120B + +On 8xH100s + +```bash +# FFT SFT with offloading (8x80GB @ ~49GiB/GPU) +axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +``` ### Tool use diff --git a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml new file mode 100644 index 000000000..4a9d51fdf --- /dev/null +++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml @@ -0,0 +1,67 @@ +# the original mxfp4 quantized model is not supported with FSDP cpu_ram_efficient_loading +# FSDP cpu_ram_efficient_loading is used to reduce the initial CPU memory usage when loading the model +base_model: axolotl-ai-co/gpt-oss-120b-dequantized + +use_kernels: false + +dp_shard_size: 16 # requires 2x8xH100 nodes + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true +pad_to_sequence_len: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_fused # 8bit optimizers do not work with FSDP2 offload +lr_scheduler: constant_with_warmup +learning_rate: 2e-5 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.03 + +special_tokens: +eot_tokens: + - "<|end|>" + +fsdp_version: 2 +fsdp_config: + offload_params: true + state_dict_type: SHARDED_STATE_DICT + auto_wrap_policy: 
TRANSFORMER_BASED_WRAP + transformer_layer_cls_to_wrap: GptOssDecoderLayer + reshard_after_forward: true + cpu_ram_efficient_loading: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml new file mode 100644 index 000000000..440f0c509 --- /dev/null +++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml @@ -0,0 +1,58 @@ +base_model: openai/gpt-oss-20b +use_kernels: false +model_quantization_config: Mxfp4Config +model_quantization_config_kwargs: + dequantize: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by NOT putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-out/ + +sequence_len: 4096 +sample_packing: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_8bit +lr_scheduler: constant_with_warmup +learning_rate: 2e-5 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 + +warmup_ratio: 0.03 + +special_tokens: +eot_tokens: + - "<|end|>" + +# choose the zero3 configuration that best fits your system capabilities +deepspeed: deepspeed_configs/zero3_bf16.json diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml index b861876d1..a6ba83433 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml @@ -54,7 +54,6 @@ warmup_ratio: 0.03 special_tokens: eot_tokens: - "<|end|>" - - 
"<|return|>" fsdp_version: 2 fsdp_config: @@ -63,4 +62,7 @@ fsdp_config: auto_wrap_policy: TRANSFORMER_BASED_WRAP transformer_layer_cls_to_wrap: GptOssDecoderLayer reshard_after_forward: true -# cpu_ram_efficient_loading: true + # cpu_ram_efficient_loading: true + +# cpu_ram_efficient_loading cannot be used with MXFP4 model quantization. +# It can only be used with a dequantized model like `axolotl-ai-co/gpt-oss-120b-dequantized` diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml index 6ec99304a..aa658c863 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml @@ -53,7 +53,6 @@ warmup_ratio: 0.03 special_tokens: eot_tokens: - "<|end|>" - - "<|return|>" fsdp_version: 2 fsdp_config: diff --git a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml index 6016ce712..c4e1a982d 100644 --- a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml +++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml @@ -65,4 +65,3 @@ warmup_ratio: 0.1 special_tokens: eot_tokens: - "<|end|>" - - "<|return|>"