From 30981328fcacc2f08ce2d8c4d2bbbcb35241b35b Mon Sep 17 00:00:00 2001
From: Dan Saunders
Date: Fri, 23 May 2025 20:04:21 +0000
Subject: [PATCH] draft config for devstral

---
 ...al-ds-zero3.yaml => bigstral-ds-zero3.yml} |  0
 examples/mistral/devstral-small-2505.yml      | 48 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 rename examples/mistral/{bigstral-ds-zero3.yaml => bigstral-ds-zero3.yml} (100%)
 create mode 100644 examples/mistral/devstral-small-2505.yml

diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral-ds-zero3.yml
similarity index 100%
rename from examples/mistral/bigstral-ds-zero3.yaml
rename to examples/mistral/bigstral-ds-zero3.yml
diff --git a/examples/mistral/devstral-small-2505.yml b/examples/mistral/devstral-small-2505.yml
new file mode 100644
index 000000000..70ce70cf2
--- /dev/null
+++ b/examples/mistral/devstral-small-2505.yml
@@ -0,0 +1,48 @@
+base_model: mistralai/Devstral-Small-2505
+processor_type: AutoProcessor
+
+# these 3 lines are needed for now to handle vision chat templates w images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+chat_template: mistral_v7_tekken
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+    field_messages: messages
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./outputs/out
+
+sequence_len: 2048
+pad_to_sequence_len: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+logging_steps: 1
+flash_attention: false
+eager_attention:
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens: