From 30981328fcacc2f08ce2d8c4d2bbbcb35241b35b Mon Sep 17 00:00:00 2001
From: Dan Saunders
Date: Fri, 23 May 2025 20:04:21 +0000
Subject: [PATCH] draft config for devstral

---
 ...al-ds-zero3.yaml => bigstral-ds-zero3.yml} |  0
 examples/mistral/devstral-small-2505.yml      | 48 +++++++++++++++++++
 2 files changed, 48 insertions(+)
 rename examples/mistral/{bigstral-ds-zero3.yaml => bigstral-ds-zero3.yml} (100%)
 create mode 100644 examples/mistral/devstral-small-2505.yml

diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral-ds-zero3.yml
similarity index 100%
rename from examples/mistral/bigstral-ds-zero3.yaml
rename to examples/mistral/bigstral-ds-zero3.yml
diff --git a/examples/mistral/devstral-small-2505.yml b/examples/mistral/devstral-small-2505.yml
new file mode 100644
index 000000000..70ce70cf2
--- /dev/null
+++ b/examples/mistral/devstral-small-2505.yml
@@ -0,0 +1,48 @@
+base_model: mistralai/Devstral-Small-2505
+processor_type: AutoProcessor
+
+# these 3 lines are needed for now to handle vision chat templates w images
+skip_prepare_dataset: true
+remove_unused_columns: false
+sample_packing: false
+
+chat_template: mistral_v7_tekken
+datasets:
+  - path: HuggingFaceH4/llava-instruct-mix-vsft
+    type: chat_template
+    split: train[:1%]
+    field_messages: messages
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.01
+output_dir: ./outputs/out
+
+sequence_len: 2048
+pad_to_sequence_len: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 1
+micro_batch_size: 1
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+logging_steps: 1
+flash_attention: false
+eager_attention:
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+weight_decay: 0.0
+special_tokens: