checkpoint model on first step callback (#2906)

* checkpoint model on first step callback
* remove debug
* add test cases; update existing tests not to save on first step
* move test out of solo
* delete
* default to False
* typo
2025-07-15 15:00:48 -04:00
parent d320ef6199
commit 10ba1622f7
146 changed files with 419 additions and 9 deletions
--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -69,6 +69,7 @@ class TestSequenceParallelism:
                "use_tensorboard": True,
                "sequence_parallel_degree": 2,
                "ring_attn_func": ring_attn_func,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -61,6 +61,7 @@ class TestPackedFlex:
                "max_steps": 2,
                "use_tensorboard": True,
                "save_strategy": "no",
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -223,6 +223,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -317,6 +318,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -409,6 +411,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -67,6 +67,7 @@ class TestMultiGPUEval:
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -138,6 +139,7 @@ class TestMultiGPUEval:
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -71,6 +71,7 @@ class TestMultiGPUGemma3:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -69,6 +69,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -135,6 +136,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -210,6 +212,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -289,6 +292,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -365,6 +369,7 @@ class TestMultiGPULlama:
                },
                "use_tensorboard": True,
                "seed": 42,
+                "save_first_step": False,
            }
        )

@@ -442,6 +447,7 @@ class TestMultiGPULlama:
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -520,6 +526,7 @@ class TestMultiGPULlama:
                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                },
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        if attention_backend == "flash":
@@ -605,6 +612,7 @@ class TestMultiGPULlama:
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -689,6 +697,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
                "use_tensorboard": True,
+                "save_first_step": False,
                **adapter,
            }
        )
@@ -765,6 +774,7 @@ class TestMultiGPULlama:
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
                "seed": 42,
+                "save_first_step": False,
                **adapter,
            }
        )
@@ -840,6 +850,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
+                "save_first_step": False,
                **adapter,
            }
        )
@@ -908,6 +919,7 @@ class TestMultiGPULlama:
                "save_safetensors": True,
                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -56,6 +56,7 @@ class TestMultiGPURay:
                "use_tensorboard": True,
                "use_ray": True,
                "ray_num_workers": 2,
+                "save_first_step": False,
            }
        )

@@ -115,6 +116,7 @@ class TestMultiGPURay:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )