Compare commits

1 commit

Author        SHA1        Message  Date
Dan Saunders  ede973b76c  nits     2025-07-28 01:47:40 +00:00
138 changed files with 805 additions and 1146 deletions

View File

@@ -25,7 +25,6 @@
 ## 🎉 Latest Updates
-- 2025/07: Voxtral with mistral-common tokenizer support has been integrated in Axolotl. Read the [docs](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral)!
 - 2025/07: TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
 - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!

View File

@@ -66,7 +66,7 @@ flash_optimum:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 32
 evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:

View File

@@ -43,7 +43,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -77,7 +77,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.000001

View File

@@ -44,7 +44,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -40,7 +40,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -41,7 +41,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -42,7 +42,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001

View File

@@ -42,7 +42,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -50,7 +50,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -43,7 +43,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -45,7 +45,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -43,7 +43,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001

View File

@@ -41,7 +41,7 @@ logging_steps: 1
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0

View File

@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -51,7 +51,7 @@ flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 eval_steps:
 saves_per_epoch: 4

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 0
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -75,7 +75,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -20,7 +20,7 @@ special_tokens:
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-warmup_ratio: 0.1
+warmup_steps: 10
 # Iterations
 num_epochs: 1

View File

@@ -40,7 +40,7 @@
 "%%capture\n",
 "# This step can take ~5-10 minutes to install dependencies\n",
 "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@010c3ac3f1e725098961832830303eeb4142dd88\""
+"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@631d646\""
 ]
 },
 {

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -37,7 +37,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -61,7 +61,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -1,65 +1,19 @@
-# Finetune Gemma-3n with Axolotl
-Gemma-3n is a family of multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl.
-## Getting started
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Gemma3n is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
-Here is an example of how to install from main for pip:
-```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min recommended)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
-```
-2. In addition to Axolotl's requirements, Gemma-3n requires:
-```bash
-pip3 install timm==1.0.17
-# for loading audio data
-pip3 install librosa==0.11.0
-```
-3. Run the finetuning example:
-```bash
-# text only
-axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml
-# text + vision
-axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
-# text + vision + audio
-axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
-```
-Let us know how it goes. Happy finetuning! 🚀
-WARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.
-### TIPS
-- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
-- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
-- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
-## Optimization Guides
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
-## Related Resources
-- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n)
-- [Axolotl Docs](https://docs.axolotl.ai)
-- [Axolotl Website](https://axolotl.ai)
-- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
-- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
+# Gemma-3n
+## Requirements
+In addition to Axolotl's requirements, Gemma-3n requires
+```
+pip3 install timm
+```
+If you will load audio datasets, please also install
+```
+pip3 install librosa
+```
+## Usage
+See example configs and the [multimodal doc](https://docs.axolotl.ai/docs/multimodal.html).

View File

@@ -34,6 +34,8 @@ eot_tokens:
 datasets:
   - path: Nanobit/text-vision-audio-2k-test
     type: chat_template
+    data_files:
+      - dataset.jsonl
 dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./outputs/out

View File

@@ -55,7 +55,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1

View File

@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -56,7 +56,7 @@ logging_steps: 1
 flash_attention:
 sdp_attention:
 flash_optimum:
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -52,7 +52,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -26,7 +26,7 @@ lora_dropout: 0.05
 lora_target_linear: true
 relora_steps: 150
-relora_warmup_ratio: 0.1
+relora_warmup_steps: 10
 relora_cpu_offload: false
 wandb_project:
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ logging_steps: 1
 evals_per_epoch: 1
 saves_per_epoch: 1
-warmup_ratio: 0.1
+warmup_steps: 10
 weight_decay: 0.0
 fsdp:
   - full_shard

View File

@@ -9,7 +9,6 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
 chat_template: llama3
-
 datasets:
   - path: mlabonne/FineTome-100k
@@ -51,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -36,7 +36,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -67,7 +67,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -58,7 +58,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -79,7 +79,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -15,7 +15,6 @@ lora_model_dir:
 sequence_len: 2048
 sample_packing: true
-
 lora_r: 16
 lora_alpha: 32
 # Currently, we don't support dropout with our custom Triton kernels
@@ -59,7 +58,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -53,7 +53,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -55,7 +55,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -47,7 +47,7 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1

View File

@@ -66,7 +66,7 @@ gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -69,7 +69,7 @@ tf32: true
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -88,7 +88,7 @@ fsdp_config:
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -76,12 +76,12 @@ gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -65,7 +65,7 @@ tf32: true
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -82,7 +82,7 @@ fsdp_config:
   fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -74,13 +74,13 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 logging_steps: 1
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -85,7 +85,7 @@ fsdp_config:
   fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -1,6 +1,6 @@
 # Finetune Magistral Small with Axolotl
-Magistral Small is a 24B parameter opensource model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506) and [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
+Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
 MistralAI has also released a proprietary medium-sized version called Magistral Medium.
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 Here is an example of how to install from main for pip:
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
@@ -31,37 +31,12 @@ This config uses about 24GB VRAM.
 Let us know how it goes. Happy finetuning! 🚀
-### Thinking
-MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities. The model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages.
-Example format:
-```json
-{
-  "messages": [
-    {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
-    {"role": "user", "content": [{ "type": "text", "text": "..."}]},
-    {"role": "assistant", "content": [{ "type": "thinking", "thinking": "..."}, { "type": "text", "text": "..." }]},
-  ],
-}
-```
-Example config: `./magistral-small-think-qlora.yaml`.
-The `thinking` section also supports an optional arg `closed: bool` (`True` default) which controls adding the closing `[/THINK]` tag.
-Limitations:
-- You cannot mix `content: str` with `content: list[dict]` as the `dataset.load_dataset` may complain about different types for `content` key.
-- This mode does not work with custom `train_detail` and `training` at the moment.
 ### TIPS
-- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
 - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 ## Optimization Guides

View File

@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 load_in_8bit: false
 load_in_4bit: true

View File

@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 load_in_8bit: false
 load_in_4bit: true

View File

@@ -1,68 +0,0 @@
-base_model: mistralai/Magistral-Small-2507
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-load_in_8bit: false
-load_in_4bit: true
-datasets:
-  - path: Nanobit/text-think-2k-test
-    type: chat_template
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./outputs/lora-out
-adapter: qlora
-lora_model_dir:
-sequence_len: 2048
-sample_packing: true
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-bf16: auto
-tf32: false
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config

View File

@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -59,7 +59,7 @@ sdp_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -59,7 +59,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -73,7 +73,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -64,7 +64,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -54,7 +54,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -74,7 +74,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1

View File

@@ -59,7 +59,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0

View File

@@ -43,7 +43,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 5
 saves_per_epoch: 5
 weight_decay: 0.05

View File

@@ -59,7 +59,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 4
 weight_decay: 0.0

View File

@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

View File

@@ -53,7 +53,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1

Some files were not shown because too many files have changed in this diff.