Files
axolotl/examples/voxtral/voxtral-mini-qlora.yml
NanoCode012 90e5598930 Feat: Add voxtral, magistral small 1.1, and misc gemma3n fixes (#2979)
* fix: lock version in gemma3n docs

* feat: add sample configs and docs

* chore: move mistraltokenizer into mistral folder

* feat: update instructions

* feat: add dynamic load voxtral

* fix: remove incorrect vision config, add audio

* fix: support voxtral processing strategy and address none in data

* feat: patch mistraltokenizer subclass upstream and add missing

* feat: update cce commit to include voxtral

* fix: remove old comment

* fix: gemma3 patch not needed anymore

* fix: voxtral modeling code

* fix: remove incorrect ds path

* fix: adjust apply chat template parsing

* feat: enable voxtral patch

* fix: patch

* feat: update example datasets

* fix: target layer

* feat: update gemma3n docs

* feat: update voxtral docs

* feat: revert assistant parsing to rely on new upstream changes

* chore: skip test till next PR fix

* fix: override upstream decode due to missing handling

* feat: update readme

* fix: update

* feat: add magistral small think support

* feat: update mistral-common dep

* fix: lint

* fix: remove optional dep

* chore: typing

* chore: simply import

* feat(doc): update differences for 2507

* fix: coderrabbit comments

* feat: update clarify docs on new transformers
2025-07-30 15:57:05 +07:00

74 lines
1.4 KiB
YAML

base_model: mistralai/Voxtral-Mini-3B-2507
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
load_in_8bit: false
load_in_4bit: true
# for use with fft to only train on language model layers
# unfrozen_parameters:
# - language_model.model.*
# - lm_head
# - embed_tokens
eot_tokens:
- <end_of_turn>
datasets:
- path: cgato/SlimOrcaDedupCleaned
type: chat_template
split: train[:1%]
field_messages: conversations
message_property_mappings:
role: from
content: value
val_set_size: 0.0
output_dir: ./outputs/out
adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens: