diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md
index fb6c67498..9ab02b122 100644
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -2,6 +2,8 @@
 
 [GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B.
 
+In October 2025, OpenAI released safeguard models built upon GPT-OSS called [GPT-OSS-Safeguard](https://huggingface.co/collections/openai/gpt-oss-safeguard). They use the same architecture, so the same examples below can be re-used.
+
 This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
 
 ## Getting started
@@ -64,6 +66,16 @@ axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offlo
 mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/
 ```
 
+### How to set reasoning_effort in template?
+
+The harmony template has a feature to set the `reasoning_effort` during prompt building. The default is `medium`. If you would like to adjust this, you can add the following to your config:
+
+```yaml
+chat_template_kwargs:
+  reasoning_effort: "high" # low | medium | high
+```
+
+Currently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss.
 
 ### Inferencing your fine-tuned model
 
diff --git a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
new file mode 100644
index 000000000..ab026337d
--- /dev/null
+++ b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
@@ -0,0 +1,67 @@
+base_model: openai/gpt-oss-safeguard-20b
+use_kernels: true
+model_quantization_config: Mxfp4Config
+model_quantization_config_kwargs:
+  dequantize: true
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding
+
+datasets:
+  - path: HuggingFaceH4/Multilingual-Thinking
+    type: chat_template
+    field_thinking: thinking
+    template_thinking_key: thinking
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0
+output_dir: ./outputs/gpt-oss-safeguard-out/
+
+sequence_len: 4096
+sample_packing: true
+
+adapter: lora
+lora_r: 8
+lora_alpha: 16
+lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters
+lora_target_linear: true
+
+# TODO: not supported for now, see peft#2710
+#lora_target_parameters: # target the experts in the last two layers
+#  - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
+#  - "22._checkpoint_wrapped_module.mlp.experts.down_proj"
+#  - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
+#  - "23._checkpoint_wrapped_module.mlp.experts.down_proj"
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 1
+
+optimizer: adamw_torch_8bit
+lr_scheduler: constant_with_warmup
+learning_rate: 2e-4
+
+bf16: true
+tf32: true
+
+flash_attention: true
+attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
+
+gradient_checkpointing: true
+activation_offloading: true
+
+logging_steps: 1
+saves_per_epoch: 1
+warmup_ratio: 0.1
+
+special_tokens:
+eot_tokens:
+  - "<|end|>"