gemma3 packing fixes (#2449)

* make gemma3 work with packing

* multi-gpu e2e for ci

* update gemma3 model namespace to use mirror

* add gradient checkpointing to multigpu e2e ci

* update gemma3 examples for use_reentrant and fix ddp find unused params

* fix tests for gemma3

* fix import for test utils

* set correct train loss for gemma3 e2e
This commit is contained in:
Wing Lian
2025-03-31 17:15:23 -04:00
committed by GitHub
parent 4d36ecc724
commit 328d598114
8 changed files with 130 additions and 2 deletions

View File

@@ -58,6 +58,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
@@ -121,6 +122,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": gradient_accumulation_steps,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
@@ -193,6 +195,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 4,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"warmup_steps": 0,
"learning_rate": 0.00001,
@@ -270,6 +273,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": 4,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"warmup_steps": 0,
"learning_rate": 0.00001,
@@ -330,6 +334,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 2,
"gradient_accumulation_steps": gradient_accumulation_steps,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
@@ -400,6 +405,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
@@ -479,6 +485,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 4,
"gradient_accumulation_steps": 2,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",
@@ -781,6 +788,7 @@ class TestMultiGPULlama:
"max_steps": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
"gradient_checkpointing": True,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch_fused",