gemma3 packing fixes (#2449)

* make gemma3 work with packing
* multi-gpu e2e for ci
* update gemma3 model namespace to use mirror
* add gradient checkpointing to multigpu e2e ci
* update gemma3 examples for use_reentrant and fix ddp find unused params
* fix tests for gemma3
* fix import for test utils
* set correct train loss for gemma3 e2e
2025-03-31 17:15:23 -04:00
parent 4d36ecc724
commit 328d598114
8 changed files with 130 additions and 2 deletions
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -5,6 +5,9 @@ tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

+# gemma3 does not work with DDP by default; enable unused-parameter detection
+ddp_find_unused_parameters: true
+
 load_in_8bit: false
 load_in_4bit: true
 strict: false
@@ -54,6 +57,8 @@ fp16:
 tf32: true

 gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/examples/gemma3/gemma-3-4b-lora.yml
+++ b/examples/gemma3/gemma-3-4b-lora.yml
@@ -7,6 +7,9 @@ skip_prepare_dataset: true
 remove_unused_columns: false
 sample_packing: false

+# gemma3 does not work with DDP by default; enable unused-parameter detection
+ddp_find_unused_parameters: true
+
 chat_template: gemma3
 datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
@@ -48,6 +51,8 @@ fp16:
 tf32: true

 gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
 local_rank:
 logging_steps: 1
 flash_attention: true