diff --git a/examples/gemma3n/gemma-3n-e2b-qlora.yml b/examples/gemma3n/gemma-3n-e2b-qlora.yml index 09504e14c..7868af59e 100644 --- a/examples/gemma3n/gemma-3n-e2b-qlora.yml +++ b/examples/gemma3n/gemma-3n-e2b-qlora.yml @@ -37,11 +37,7 @@ lora_r: 32 lora_alpha: 16 lora_dropout: 0.05 # lora_target_linear: # Does not work with gemma3n currently -lora_target_modules: - - q_proj - - k_proj - - v_proj - - o_proj +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' sequence_len: 2048 sample_packing: true diff --git a/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml b/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml new file mode 100644 index 000000000..111d9d05e --- /dev/null +++ b/examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml @@ -0,0 +1,77 @@ +base_model: google/gemma-3n-E2B-it +processor_type: AutoProcessor + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +cut_cross_entropy: true + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - model.language_model.* + # - lm_head + # - embed_tokens + +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +# gemma3 doesn't seem to play nice with ddp +ddp_find_unused_parameters: true + +chat_template: gemma3n +eot_tokens: + - + +datasets: + - path: Nanobit/text-vision-audio-2k-test # requires downloading audio/image in advance in README.md + type: chat_template + data_files: + - dataset.jsonl +dataset_prepared_path: +val_set_size: 0.01 +output_dir: ./outputs/out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: muon +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +logging_steps: 1 +# flash_attention: true # Any attention impl does not work with gemma3n now + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 diff --git a/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml b/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml index d915a60b6..519edecc7 100644 --- a/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml +++ b/examples/gemma3n/gemma-3n-e2b-vision-qlora.yml @@ -31,8 +31,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages -dataset_prepared_path: last_run_prepared +dataset_prepared_path: val_set_size: 0.01 output_dir: ./outputs/out @@ -56,7 +55,7 @@ wandb_log_model: gradient_accumulation_steps: 4 micro_batch_size: 2 num_epochs: 1 -optimizer: adamw_bnb_8bit +optimizer: muon lr_scheduler: cosine learning_rate: 0.0002