Compare commits

1 commit: custom-mod...lora_kerne

| Author | SHA1 | Date |
|---|---|---|
|  | ede973b76c |  |
```diff
@@ -25,7 +25,6 @@
 
 ## 🎉 Latest Updates
 
-- 2025/07: Voxtral with mistral-common tokenizer support has been integrated in Axolotl. Read the [docs](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/voxtral)!
 - 2025/07: TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
 - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
```
```diff
@@ -66,7 +66,7 @@ flash_optimum:
 gptq_groupsize:
 gptq_model_v1:
 
-warmup_ratio: 0.1
+warmup_steps: 32
 evals_per_epoch: 4
 saves_per_epoch: 1
 save_total_limit:
```

```diff
@@ -43,7 +43,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -54,7 +54,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```

```diff
@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```

```diff
@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```
```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -47,7 +47,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -77,7 +77,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.000001
```

```diff
@@ -44,7 +44,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 40
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -40,7 +40,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -41,7 +41,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -42,7 +42,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
```
```diff
@@ -42,7 +42,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -50,7 +50,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -43,7 +43,7 @@ logging_steps: 1
 flash_attention: true
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -45,7 +45,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -43,7 +43,7 @@ logging_steps: 5
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0001
```

```diff
@@ -41,7 +41,7 @@ logging_steps: 1
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0
```
```diff
@@ -50,7 +50,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -51,7 +51,7 @@ flash_attention: true
 flash_attn_cross_entropy: false
 flash_attn_rms_norm: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 eval_steps:
 saves_per_epoch: 4
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 0
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -75,7 +75,7 @@ xformers_attention: true
 flash_attention:
 gptq_groupsize:
 gptq_model_v1:
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -20,7 +20,7 @@ special_tokens:
 datasets:
   - path: mhenrichsen/alpaca_2k_test
     type: alpaca
-warmup_ratio: 0.1
+warmup_steps: 10
 
 # Iterations
 num_epochs: 1
```
```diff
@@ -40,7 +40,7 @@
 "%%capture\n",
 "# This step can take ~5-10 minutes to install dependencies\n",
 "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@010c3ac3f1e725098961832830303eeb4142dd88\""
+"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@631d646\""
 ]
 },
 {
```
```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -37,7 +37,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -61,7 +61,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```
````diff
@@ -1,65 +1,19 @@
-# Finetune Gemma-3n with Axolotl
+# Gemma-3n
 
-Gemma-3n is a family of multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl.
+## Requirements
 
-## Getting started
+In addition to Axolotl's requirements, Gemma-3n requires
 
-1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Gemma3n is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
-
-Here is an example of how to install from main for pip:
-
-```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min recommended)
-git clone https://github.com/axolotl-ai-cloud/axolotl.git
-cd axolotl
-
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+```
+pip3 install timm
 ```
 
-2. In addition to Axolotl's requirements, Gemma-3n requires:
+If you will load audio datasets, please also install
 
-```bash
-pip3 install timm==1.0.17
-
-# for loading audio data
-pip3 install librosa==0.11.0
+```
+pip3 install librosa
 ```
 
-3. Run the finetuning example:
+## Usage
 
-```bash
-# text only
-axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml
-
-# text + vision
-axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
-
-# text + vision + audio
-axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
-```
-
-Let us know how it goes. Happy finetuning! 🚀
-
-WARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.
-
-### TIPS
-
-- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
-- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
-- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
-
-## Optimization Guides
-
-- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
-- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
-- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
-
-## Related Resources
-
-- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n)
-- [Axolotl Docs](https://docs.axolotl.ai)
-- [Axolotl Website](https://axolotl.ai)
-- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
-- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
+See example configs and the [multimodal doc](https://docs.axolotl.ai/docs/multimodal.html).
````
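For reference, a minimal sketch of the OpenAI multi-content Messages format that the removed README tips point to. The message/content structure follows the Magistral example shown later in this diff; the exact field names for the image part (`"path"` below) are an assumption — see the multimodal doc linked above for the authoritative schema:

```json
{
  "messages": [
    {"role": "user", "content": [
      {"type": "text", "text": "Describe this image."},
      {"type": "image", "path": "/data/example.jpg"}
    ]},
    {"role": "assistant", "content": [
      {"type": "text", "text": "A photo of an axolotl."}
    ]}
  ]
}
```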
```diff
@@ -34,6 +34,8 @@ eot_tokens:
 datasets:
   - path: Nanobit/text-vision-audio-2k-test
     type: chat_template
+    data_files:
+      - dataset.jsonl
 dataset_prepared_path:
 val_set_size: 0.01
 output_dir: ./outputs/out
```
```diff
@@ -55,7 +55,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -49,7 +49,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch:
 saves_per_epoch: 1
 
```
```diff
@@ -47,7 +47,7 @@ gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -48,7 +48,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -56,7 +56,7 @@ logging_steps: 1
 flash_attention:
 sdp_attention:
 flash_optimum:
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -52,7 +52,7 @@ flash_attn_rms_norm: true
 flash_attn_fuse_qkv: false
 flash_attn_fuse_mlp: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -26,7 +26,7 @@ lora_dropout: 0.05
 lora_target_linear: true
 
 relora_steps: 150
-relora_warmup_ratio: 0.1
+relora_warmup_steps: 10
 relora_cpu_offload: false
 
 wandb_project:
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -58,7 +58,7 @@ logging_steps: 1
 evals_per_epoch: 1
 saves_per_epoch: 1
 
-warmup_ratio: 0.1
+warmup_steps: 10
 weight_decay: 0.0
 fsdp:
   - full_shard
```
```diff
@@ -9,7 +9,6 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
 
-
 chat_template: llama3
 datasets:
   - path: mlabonne/FineTome-100k
@@ -51,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -36,7 +36,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -67,7 +67,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -58,7 +58,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -79,7 +79,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -55,7 +55,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -15,7 +15,6 @@ lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 
-
 lora_r: 16
 lora_alpha: 32
 # Currently, we don't support dropout with our custom Triton kernels
@@ -59,7 +58,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -53,7 +53,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -57,7 +57,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -54,7 +54,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -51,7 +51,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -55,7 +55,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -41,7 +41,7 @@ gradient_checkpointing_kwargs:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -48,7 +48,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -47,7 +47,7 @@ logging_steps: 1
 xformers_attention:
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1
```
```diff
@@ -66,7 +66,7 @@ gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
   fsdp_state_dict_type: FULL_STATE_DICT
   fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -69,7 +69,7 @@ tf32: true
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -88,7 +88,7 @@ fsdp_config:
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -76,12 +76,12 @@ gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
   use_reentrant: false
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -65,7 +65,7 @@ tf32: true
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -84,7 +84,7 @@ fsdp_config:
   fsdp_sharding_strategy: FULL_SHARD
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -64,7 +64,7 @@ flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -82,7 +82,7 @@ fsdp_config:
   fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -74,13 +74,13 @@ gradient_checkpointing_kwargs:
   use_reentrant: false
 
 logging_steps: 1
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 1
 saves_per_epoch: 1
 
 weight_decay: 0.0
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -67,7 +67,7 @@ flex_attn_compile_kwargs:
   dynamic: false
   mode: max-autotune-no-cudagraphs
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
@@ -85,7 +85,7 @@ fsdp_config:
   fsdp_reshard_after_forward: true
   fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad|>
+  pad_token: <|finetune_right_pad_id|>
   eos_token: <|eot|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
````diff
@@ -1,6 +1,6 @@
 # Finetune Magistral Small with Axolotl
 
-Magistral Small is a 24B parameter opensource model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506) and [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
+Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
 
 MistralAI has also released a proprietary medium-sized version called Magistral Medium.
 
@@ -13,7 +13,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 Here is an example of how to install from main for pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 
````
````diff
@@ -31,37 +31,12 @@ This config uses about 24GB VRAM.
 
 Let us know how it goes. Happy finetuning! 🚀
 
-### Thinking
-
-MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities. The model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages.
-
-Example format:
-
-```json
-{
-  "messages": [
-    {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
-    {"role": "user", "content": [{ "type": "text", "text": "..."}]},
-    {"role": "assistant", "content": [{ "type": "thinking", "thinking": "..."}, { "type": "text", "text": "..." }]},
-  ],
-}
-```
-
-Example config: `./magistral-small-think-qlora.yaml`.
-
-The `thinking` section also supports an optional arg `closed: bool` (`True` default) which controls adding the closing `[/THINK]` tag.
-
-Limitations:
-- You cannot mix `content: str` with `content: list[dict]` as the `dataset.load_dataset` may complain about different types for `content` key.
-- This mode does not work with custom `train_detail` and `training` at the moment.
-
 ### TIPS
 
-- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
 - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 
 ## Optimization Guides
 
````
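The inference tip kept above maps onto a request body along these lines — a minimal sketch assuming an OpenAI-compatible serving endpoint; only `temperature`, `top_p`, and `max_tokens` come from the README, the rest is illustrative:

```json
{
  "model": "mistralai/Magistral-Small-2506",
  "temperature": 0.7,
  "top_p": 0.95,
  "max_tokens": 40960,
  "messages": [
    {"role": "user", "content": "..."}
  ]
}
```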
```diff
@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 load_in_8bit: false
 load_in_4bit: true
 
```

```diff
@@ -6,9 +6,6 @@ tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 load_in_8bit: false
 load_in_4bit: true
 
```
```diff
@@ -1,68 +0,0 @@
-base_model: mistralai/Magistral-Small-2507
-
-# Enable to use mistral-common tokenizer
-tokenizer_use_mistral_common: true
-
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_8bit: false
-load_in_4bit: true
-
-datasets:
-  - path: Nanobit/text-think-2k-test
-    type: chat_template
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./outputs/lora-out
-
-adapter: qlora
-lora_model_dir:
-
-sequence_len: 2048
-sample_packing: true
-
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-
-# save_first_step: true # uncomment this to validate checkpoint saving works with your config
```
```diff
@@ -41,7 +41,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention:
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -38,7 +38,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -59,7 +59,7 @@ sdp_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -59,7 +59,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -73,7 +73,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: false
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```
```diff
@@ -64,7 +64,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```

```diff
@@ -54,7 +54,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -56,7 +56,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -74,7 +74,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 
```

```diff
@@ -59,7 +59,7 @@ flash_attention: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
```
```diff
@@ -43,7 +43,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 20
 evals_per_epoch: 5
 saves_per_epoch: 5
 weight_decay: 0.05
```

```diff
@@ -59,7 +59,7 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 
-warmup_ratio: 0.1
+warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 4
 weight_decay: 0.0
```

```diff
@@ -50,7 +50,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```

```diff
@@ -53,7 +53,7 @@ resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 
-warmup_ratio: 0.1
+warmup_steps: 100
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.1
```
Some files were not shown because too many files have changed in this diff.