gemma3 packing fixes (#2449)

* make gemma3 work with packing
* multi-gpu e2e for ci
* update gemma3 model namespace to use mirror
* add gradient checkpointing to multigpu e2e ci
* update gemma3 examples for use_reentrant and fix ddp find unused params
* fix tests for gemma3
* fix import for test utils
* set correct train loss for gemma3 e2e
2025-03-31 17:15:23 -04:00
parent 4d36ecc724
commit 328d598114
8 changed files with 130 additions and 2 deletions
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -58,6 +58,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 4,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
@@ -121,6 +122,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
@@ -193,6 +195,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 4,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "warmup_steps": 0,
                "learning_rate": 0.00001,
@@ -270,6 +273,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 4,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "warmup_steps": 0,
                "learning_rate": 0.00001,
@@ -330,6 +334,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": gradient_accumulation_steps,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
@@ -400,6 +405,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
@@ -479,6 +485,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
@@ -781,6 +788,7 @@ class TestMultiGPULlama:
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
+                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",