diff --git a/examples/gemma3n/gemma-3n-e2b-qlora.yml b/examples/gemma3n/gemma-3n-e2b-qlora.yml new file mode 100644 index 000000000..ffc1da736 --- /dev/null +++ b/examples/gemma3n/gemma-3n-e2b-qlora.yml @@ -0,0 +1,80 @@ +base_model: google/gemma-3n-E2B-it + +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin +cut_cross_entropy: true + +load_in_8bit: false +load_in_4bit: true + +# for use with fft to only train on language model layers +# unfrozen_parameters: + # - model.language_model.* + # - lm_head + # - embed_tokens + +# huggingface repo +# chat_template: gemma3 +eot_tokens: + - +datasets: + - path: cgato/SlimOrcaDedupCleaned + type: chat_template + split: train[:1%] + field_messages: conversations + message_property_mappings: + role: from + content: value + +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: qlora +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +# lora_target_linear: # Does not work with gemma3n currently +lora_target_modules: + - q_proj + - k_proj + - v_proj + - o_proj + +sequence_len: 2048 +sample_packing: true +eval_sample_packing: true +pad_to_sequence_len: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 4 +optimizer: muon +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +# flash_attention: true # Any attention impl does not work with gemma3n now + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: