diff --git a/examples/gemma3n/gemma-3n-e2b-qlora.yml b/examples/gemma3n/gemma-3n-e2b-qlora.yml
index 7868af59e..ad7ab5726 100644
--- a/examples/gemma3n/gemma-3n-e2b-qlora.yml
+++ b/examples/gemma3n/gemma-3n-e2b-qlora.yml
@@ -53,7 +53,7 @@ wandb_log_model:
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 4
-optimizer: muon
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
diff --git a/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml b/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
index 6cdf5573e..15afb6f2e 100644
--- a/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
+++ b/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
@@ -60,7 +60,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
-optimizer: muon
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
diff --git a/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml b/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
index 519edecc7..c87eca663 100644
--- a/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
+++ b/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
@@ -55,7 +55,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
-optimizer: muon
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
diff --git a/examples/llama-3-vision/lora-11b.yaml b/examples/llama-3-vision/lora-11b.yaml
index da28ace3b..adbb61643 100644
--- a/examples/llama-3-vision/lora-11b.yaml
+++ b/examples/llama-3-vision/lora-11b.yaml
@@ -39,7 +39,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 1
-optimizer: muon
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
diff --git a/examples/llava/lora-7b.yaml b/examples/llava/lora-7b.yaml
index 53ae97542..77ef7474d 100644
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -35,7 +35,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 1
-optimizer: muon
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 
diff --git a/examples/pixtral/lora-12b.yml b/examples/pixtral/lora-12b.yml
index fc4c0667c..fea2a60ff 100644
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -35,7 +35,7 @@ wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 1
-optimizer: muon
+optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 