Compare commits

1 commit: custom-mod...lora_kerne

| Author | SHA1 | Date |
|---|---|---|
|  | ede973b76c |  |
```diff
@@ -25,7 +25,6 @@
 
 ## 🎉 Latest Updates
 
-- 2025/07: Voxtral with mistral-common tokenizer support has been integrated in Axolotl. Read the [docs](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral)!
 - 2025/07: TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
 - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
```
```diff
@@ -66,7 +66,7 @@ flash_optimum:
 gptq_groupsize:
 gptq_model_v1:
 
-warmup_ratio: 0.1
+warmup_steps: 32
 evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:
```

```diff
@@ -43,7 +43,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -54,7 +54,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```

```diff
@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```

```diff
@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```
```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -47,7 +47,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -77,7 +77,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.000001
```

```diff
@@ -44,7 +44,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -40,7 +40,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -41,7 +41,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -42,7 +42,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
```
```diff
@@ -42,7 +42,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -50,7 +50,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -43,7 +43,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -45,7 +45,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -43,7 +43,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
```

```diff
@@ -41,7 +41,7 @@ logging_steps: 1
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0
```
```diff
@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -51,7 +51,7 @@ flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 eval_steps:
 saves_per_epoch: 4
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 0
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -75,7 +75,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -20,7 +20,7 @@ special_tokens:
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-warmup_ratio: 0.1
+warmup_steps: 10
 
 # Iterations
 num_epochs: 1
```
```diff
@@ -40,7 +40,7 @@
 "%%capture\n",
 "# This step can take ~5-10 minutes to install dependencies\n",
 "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@010c3ac3f1e725098961832830303eeb4142dd88\""
+"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@631d646\""
 ]
 },
 {
```
```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -37,7 +37,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -61,7 +61,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```
````diff
@@ -1,65 +1,19 @@
-# Finetune Gemma-3n with Axolotl
+# Gemma-3n
 
-Gemma-3n is a family of multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl.
+## Requirements
 
-## Getting started
+In addition to Axolotl's requirements, Gemma-3n requires
 
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Gemma3n is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
-
-Here is an example of how to install from main for pip:
-
-```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min recommended)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+```
+pip3 install timm
 ```
 
-2. In addition to Axolotl's requirements, Gemma-3n requires:
+If you will load audio datasets, please also install
 
-```bash
-pip3 install timm==1.0.17
-
-# for loading audio data
-pip3 install librosa==0.11.0
+```
+pip3 install librosa
 ```
 
-3. Run the finetuning example:
+## Usage
 
-```bash
-# text only
-axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml
-
-# text + vision
-axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
-
-# text + vision + audio
-axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
-```
-
-Let us know how it goes. Happy finetuning! 🚀
-
-WARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.
-
-### TIPS
-
-- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
-- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
-- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
-
-## Optimization Guides
-
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
-
-## Related Resources
-
-- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n)
-- [Axolotl Docs](https://docs.axolotl.ai)
-- [Axolotl Website](https://axolotl.ai)
-- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
-- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
+See example configs and the [multimodal doc](https://docs.axolotl.ai/docs/multimodal.html).
````
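For reference, a minimal sketch of the OpenAI multi-content Messages format that the removed README tips point to. The message/content structure follows the Magistral example shown later in this diff; the exact field names for the image part (`"path"` below) are an assumption — see the multimodal doc linked above for the authoritative schema:

```json
{
  "messages": [
    {"role": "user", "content": [
      {"type": "text", "text": "Describe this image."},
      {"type": "image", "path": "/data/example.jpg"}
    ]},
    {"role": "assistant", "content": [
      {"type": "text", "text": "A photo of an axolotl."}
    ]}
  ]
}
```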
```diff
@@ -34,6 +34,8 @@ eot_tokens:
 datasets:
   - path: Nanobit/text-vision-audio-2k-test
     type: chat_template
+    data_files:
+      - dataset.jsonl
 dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./outputs/out
```
```diff
@@ -55,7 +55,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```
```diff
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -48,7 +48,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -56,7 +56,7 @@ logging_steps: 1
 flash_attention:
 sdp_attention:
 flash_optimum:
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -52,7 +52,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -26,7 +26,7 @@ lora_dropout: 0.05
 lora_target_linear: true
 
 relora_steps: 150
-relora_warmup_ratio: 0.1
+relora_warmup_steps: 10
 relora_cpu_offload: false
 
 wandb_project:
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -58,7 +58,7 @@ logging_steps: 1
 evals_per_epoch: 1
 saves_per_epoch: 1
 
-warmup_ratio: 0.1
+warmup_steps: 10
 weight_decay: 0.0
 fsdp:
   - full_shard
```
```diff
@@ -9,7 +9,6 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
 
-
 chat_template: llama3
 datasets:
   - path: mlabonne/FineTome-100k
@@ -51,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -36,7 +36,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -67,7 +67,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -58,7 +58,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -79,7 +79,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -55,7 +55,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -15,7 +15,6 @@ lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 
-
 lora_r: 16
 lora_alpha: 32
 # Currently, we don't support dropout with our custom Triton kernels
@@ -59,7 +58,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -53,7 +53,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -54,7 +54,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -55,7 +55,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1
```
```diff
@@ -66,7 +66,7 @@ gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -69,7 +69,7 @@ tf32: true
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -88,7 +88,7 @@ fsdp_config:
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -76,12 +76,12 @@ gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -65,7 +65,7 @@ tf32: true
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -82,7 +82,7 @@ fsdp_config:
   fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -74,13 +74,13 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 
 logging_steps: 1
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 
 weight_decay: 0.0
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -85,7 +85,7 @@ fsdp_config:
   fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
````diff
@@ -1,6 +1,6 @@
 # Finetune Magistral Small with Axolotl
 
-Magistral Small is a 24B parameter opensource model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506) and [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
+Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
 
 MistralAI has also released a proprietary medium-sized version called Magistral Medium.
 
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 Here is an example of how to install from main for pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 
````
````diff
@@ -31,37 +31,12 @@ This config uses about 24GB VRAM.
 
 Let us know how it goes. Happy finetuning! 🚀
 
-### Thinking
-
-MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities. The model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages.
-
-Example format:
-
-```json
-{
-  "messages": [
-    {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
-    {"role": "user", "content": [{ "type": "text", "text": "..."}]},
-    {"role": "assistant", "content": [{ "type": "thinking", "thinking": "..."}, { "type": "text", "text": "..." }]},
-  ],
-}
-```
-
-Example config: `./magistral-small-think-qlora.yaml`.
-
-The `thinking` section also supports an optional arg `closed: bool` (`True` default) which controls adding the closing `[/THINK]` tag.
-
-Limitations:
-- You cannot mix `content: str` with `content: list[dict]` as the `dataset.load_dataset` may complain about different types for `content` key.
-- This mode does not work with custom `train_detail` and `training` at the moment.
-
 ### TIPS
 
-- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
 - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 
 ## Optimization Guides
 
````
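The inference tip kept above maps onto a request body along these lines — a minimal sketch assuming an OpenAI-compatible serving endpoint; only `temperature`, `top_p`, and `max_tokens` come from the README, the rest is illustrative:

```json
{
  "model": "mistralai/Magistral-Small-2506",
  "temperature": 0.7,
  "top_p": 0.95,
  "max_tokens": 40960,
  "messages": [
    {"role": "user", "content": "..."}
  ]
}
```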
```diff
@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 load_in_8bit: false
 load_in_4bit: true
 
```

```diff
@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 load_in_8bit: false
 load_in_4bit: true
 
```
```diff
@@ -1,68 +0,0 @@
-base_model: mistralai/Magistral-Small-2507
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_8bit: false
-load_in_4bit: true
-
-datasets:
-  - path: Nanobit/text-think-2k-test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -59,7 +59,7 @@ sdp_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -59,7 +59,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -73,7 +73,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```
```diff
@@ -64,7 +64,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -54,7 +54,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -74,7 +74,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -59,7 +59,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -43,7 +43,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 5
 saves_per_epoch: 5
 weight_decay: 0.05
```

```diff
@@ -59,7 +59,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 4
 weight_decay: 0.0
```

```diff
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -53,7 +53,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
Some files were not shown because too many files have changed in this diff.