checkpoint model on first step callback (#2906)

* checkpoint model on first step callback
* remove debug
* add test cases; update existing tests not to save on first step
* move test out of solo
* delete
* default to False
* typo
2025-07-15 15:00:48 -04:00
parent d320ef6199
commit 10ba1622f7
146 changed files with 419 additions and 9 deletions
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -44,6 +44,7 @@ def min_cfg(temp_dir):
        "save_safetensors": True,
        "max_steps": 10,
        "bf16": "auto",
+        "save_first_step": False,
    }


@@ -98,6 +99,7 @@ class TestCutCrossEntropyIntegration:
                "save_safetensors": True,
                "max_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -153,6 +153,7 @@ class TestPluginHooks:
                "max_steps": 5,
                "flash_attention": True,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -67,6 +67,7 @@ def min_cfg(temp_dir):
        "output_dir": temp_dir,
        "save_safetensors": True,
        "use_tensorboard": True,
+        "save_first_step": False,
    }


--- a/tests/e2e/integrations/test_liger.py
+++ b/tests/e2e/integrations/test_liger.py
@@ -50,6 +50,7 @@ class LigerIntegrationTestCase:
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
+                "save_first_step": False,
            }
        )
        # pylint: disable=duplicate-code
@@ -96,6 +97,7 @@ class LigerIntegrationTestCase:
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
+                "save_first_step": False,
            }
        )
        # pylint: disable=duplicate-code
--- a/tests/e2e/integrations/test_llm_compressor.py
+++ b/tests/e2e/integrations/test_llm_compressor.py
@@ -81,6 +81,7 @@ class TestLLMCompressorIntegration:
                    },
                    "save_compressed": save_compressed,
                },
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -69,6 +69,7 @@ class TestSequenceParallelism:
                "use_tensorboard": True,
                "sequence_parallel_degree": 2,
                "ring_attn_func": ring_attn_func,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -61,6 +61,7 @@ class TestPackedFlex:
                "max_steps": 2,
                "use_tensorboard": True,
                "save_strategy": "no",
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -223,6 +223,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -317,6 +318,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -409,6 +411,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -67,6 +67,7 @@ class TestMultiGPUEval:
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -138,6 +139,7 @@ class TestMultiGPUEval:
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -71,6 +71,7 @@ class TestMultiGPUGemma3:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -69,6 +69,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -135,6 +136,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -210,6 +212,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -289,6 +292,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -365,6 +369,7 @@ class TestMultiGPULlama:
                },
                "use_tensorboard": True,
                "seed": 42,
+                "save_first_step": False,
            }
        )

@@ -442,6 +447,7 @@ class TestMultiGPULlama:
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -520,6 +526,7 @@ class TestMultiGPULlama:
                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                },
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        if attention_backend == "flash":
@@ -605,6 +612,7 @@ class TestMultiGPULlama:
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -689,6 +697,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
                "use_tensorboard": True,
+                "save_first_step": False,
                **adapter,
            }
        )
@@ -765,6 +774,7 @@ class TestMultiGPULlama:
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
                "seed": 42,
+                "save_first_step": False,
                **adapter,
            }
        )
@@ -840,6 +850,7 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
+                "save_first_step": False,
                **adapter,
            }
        )
@@ -908,6 +919,7 @@ class TestMultiGPULlama:
                "save_safetensors": True,
                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -56,6 +56,7 @@ class TestMultiGPURay:
                "use_tensorboard": True,
                "use_ray": True,
                "ray_num_workers": 2,
+                "save_first_step": False,
            }
        )

@@ -115,6 +116,7 @@ class TestMultiGPURay:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -55,6 +55,7 @@ class Test4dMultipackLlama(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "fp16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -102,6 +103,7 @@ class Test4dMultipackLlama(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "fp16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_activation_checkpointing.py
+++ b/tests/e2e/patched/test_activation_checkpointing.py
@@ -69,6 +69,7 @@ class TestActivationCheckpointing:
                "bf16": True,
                "save_safetensors": True,
                "gradient_checkpointing": gradient_checkpointing,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -62,6 +62,7 @@ class TestFAXentropyLlama:
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_falcon_samplepack.py
+++ b/tests/e2e/patched/test_falcon_samplepack.py
@@ -58,6 +58,7 @@ class TestFalconPatched(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -99,6 +100,7 @@ class TestFalconPatched(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_flattening.py
+++ b/tests/e2e/patched/test_flattening.py
@@ -61,6 +61,7 @@ class TestFAFlattening:
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_fused_llama.py
+++ b/tests/e2e/patched/test_fused_llama.py
@@ -53,6 +53,7 @@ class TestFusedLlama(unittest.TestCase):
                "max_steps": 10,
                "save_steps": 5,
                "eval_steps": 5,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_llama_s2_attention.py
+++ b/tests/e2e/patched/test_llama_s2_attention.py
@@ -58,6 +58,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

@@ -100,6 +101,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_lora_llama_multipack.py
+++ b/tests/e2e/patched/test_lora_llama_multipack.py
@@ -55,6 +55,7 @@ class TestLoraLlama(unittest.TestCase):
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
@@ -108,6 +109,7 @@ class TestLoraLlama(unittest.TestCase):
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -56,6 +56,7 @@ class TestMistral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -97,6 +98,7 @@ class TestMistral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -52,6 +52,7 @@ class TestMixtral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -90,6 +91,7 @@ class TestMixtral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_model_patches.py
+++ b/tests/e2e/patched/test_model_patches.py
@@ -45,6 +45,7 @@ class TestModelPatches(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -78,6 +79,7 @@ class TestModelPatches(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_peft_embeddings.py
+++ b/tests/e2e/patched/test_peft_embeddings.py
@@ -49,6 +49,7 @@ class TestLlamaPeftEmbeddings:
                "bf16": "auto",
                "save_safetensors": True,
                "embeddings_skip_upcast": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -54,6 +54,7 @@ class TestPhiMultipack(unittest.TestCase):
                "eval_steps": 3,
                "save_steps": 4,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

@@ -105,6 +106,7 @@ class TestPhiMultipack(unittest.TestCase):
                "eval_steps": 3,
                "save_steps": 4,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -58,6 +58,7 @@ class TestResumeLlama:
                "max_steps": 15,
                "use_tensorboard": True,
                "save_safetensors": True,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_sp.py
+++ b/tests/e2e/patched/test_sp.py
@@ -47,6 +47,7 @@ def fixture_cfg():
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
+            "save_first_step": False,
        }
    )

--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -62,6 +62,7 @@ class TestUnslothQLoRA:
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

@@ -112,6 +113,7 @@ class TestUnslothQLoRA:
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

@@ -167,6 +169,7 @@ class TestUnslothQLoRA:
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "fp16": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -49,6 +49,7 @@ class TestPackedFlex(unittest.TestCase):
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/solo/test_relora_llama.py
+++ b/tests/e2e/solo/test_relora_llama.py
@@ -65,6 +65,7 @@ class TestReLoraLlama(unittest.TestCase):
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_deepseekv3.py
+++ b/tests/e2e/test_deepseekv3.py
@@ -67,6 +67,7 @@ class TestDeepseekV3:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -116,6 +117,7 @@ class TestDeepseekV3:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_dpo.py
+++ b/tests/e2e/test_dpo.py
@@ -56,6 +56,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

@@ -105,6 +106,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

@@ -154,6 +156,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

@@ -203,6 +206,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

@@ -251,6 +255,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

@@ -302,6 +307,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

@@ -370,6 +376,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -48,6 +48,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

@@ -93,6 +94,7 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_evaluate.py
+++ b/tests/e2e/test_evaluate.py
@@ -36,6 +36,7 @@ class TestE2eEvaluate:
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_falcon.py
+++ b/tests/e2e/test_falcon.py
@@ -60,6 +60,7 @@ class TestFalcon(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

@@ -115,6 +116,7 @@ class TestFalcon(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

@@ -156,6 +158,7 @@ class TestFalcon(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_gemma3_text.py
+++ b/tests/e2e/test_gemma3_text.py
@@ -63,6 +63,7 @@ class TestGemma3Text:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -113,6 +114,7 @@ class TestGemma3Text:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -45,6 +45,7 @@ class TestLlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
+                "save_first_step": False,
            }
        )

@@ -92,6 +93,7 @@ class TestLlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
+                "save_first_step": False,
            }
        )

@@ -136,6 +138,7 @@ class TestLlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
+                "save_first_step": False,
            }
        )

@@ -176,6 +179,7 @@ class TestLlama:
                "batch_flattening": True,
                "bf16": True,
                "save_safetensors": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -53,6 +53,7 @@ class TestPretrainLlama:
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_llama_vision.py
+++ b/tests/e2e/test_llama_vision.py
@@ -54,6 +54,7 @@ class TestLlamaVision(unittest.TestCase):
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )

@@ -100,6 +101,7 @@ class TestLlamaVision(unittest.TestCase):
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_lora_llama.py
+++ b/tests/e2e/test_lora_llama.py
@@ -49,6 +49,7 @@ class TestLoraLlama(unittest.TestCase):
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_mamba.py
+++ b/tests/e2e/test_mamba.py
@@ -51,6 +51,7 @@ class TestMamba(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": None,
                "save_safetensors": False,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_mistral.py
+++ b/tests/e2e/test_mistral.py
@@ -55,6 +55,7 @@ class TestMistral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )

@@ -95,6 +96,7 @@ class TestMistral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/test_mixtral.py
+++ b/tests/e2e/test_mixtral.py
@@ -61,6 +61,7 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )

@@ -116,6 +117,7 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )

@@ -170,6 +172,7 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
@@ -228,6 +231,7 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )

@@ -273,6 +277,7 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -55,6 +55,7 @@ class TestCustomOptimizers(unittest.TestCase):
                "optimizer": "optimi_adamw",
                "max_steps": 5,
                "lr_scheduler": "cosine",
+                "save_first_step": False,
            }
        )

@@ -100,6 +101,7 @@ class TestCustomOptimizers(unittest.TestCase):
                "learning_rate": 0.00001,
                "optimizer": "adopt_adamw",
                "lr_scheduler": "cosine",
+                "save_first_step": False,
            }
        )

@@ -146,6 +148,7 @@ class TestCustomOptimizers(unittest.TestCase):
                "optimizer": "muon",
                "lr_scheduler": "cosine",
                "weight_decay": 0.01,
+                "save_first_step": False,
            }
        )

@@ -184,6 +187,7 @@ class TestCustomOptimizers(unittest.TestCase):
                "lr_scheduler": "constant",
                "save_safetensors": True,
                "max_steps": 10,
+                "save_first_step": False,
            }
        )
        # pylint: disable=duplicate-code
@@ -232,6 +236,7 @@ class TestCustomOptimizers(unittest.TestCase):
                "adam_epsilon2": 1e-16,
                "max_steps": 5,
                "lr_scheduler": "cosine",
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -48,6 +48,7 @@ class TestPackedLlama(unittest.TestCase):
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -53,6 +53,7 @@ class TestPhi(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -102,6 +103,7 @@ class TestPhi(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -49,6 +49,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
                "use_tensorboard": True,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "seed": 42,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -57,6 +57,7 @@ class TestQATLlama:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -115,6 +116,7 @@ class TestQATLlama:
                    "weight_dtype": "int8",
                    "group_size": 8,
                },
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_qwen.py
+++ b/tests/e2e/test_qwen.py
@@ -59,6 +59,7 @@ class TestE2eQwen:
                "bf16": "auto",
                "tf32": True,
                "gradient_checkpointing": True,
+                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -58,6 +58,7 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
                "gradient_checkpointing": True,
                "warmup_ratio": 0.1,
                "use_tensorboard": True,
+                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_save_first_step.py
+++ b/tests/e2e/test_save_first_step.py
@@ -0,0 +1,102 @@
+"""
+E2E tests for the save_first_step checkpoint callback
+"""
+
+import unittest
+from pathlib import Path
+
+import pytest
+
+from axolotl.common.datasets import load_datasets
+from axolotl.train import train
+from axolotl.utils.config import normalize_config, validate_config
+from axolotl.utils.dict import DictDefault
+
+from .utils import check_model_output_exists, with_temp_dir
+
+
+class TestSaveFirstStepCallback(unittest.TestCase):
+    """Test cases for save_first_step callback config."""
+
+    @with_temp_dir
+    def test_save_first_step(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "tokenizer_type": "AutoTokenizer",
+                "sequence_len": 512,
+                "val_set_size": 0.02,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "mhenrichsen/alpaca_2k_test",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 3,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_bnb_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "sample_packing": True,
+                "bf16": True,
+                "save_safetensors": True,
+                "save_first_step": True,
+            }
+        )
+
+        cfg = validate_config(cfg)
+        normalize_config(cfg)
+        dataset_meta = load_datasets(cfg=cfg)
+
+        train(cfg=cfg, dataset_meta=dataset_meta)
+        check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg)
+
+    @with_temp_dir
+    def test_no_save_first_step(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "tokenizer_type": "AutoTokenizer",
+                "sequence_len": 512,
+                "val_set_size": 0.02,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "mhenrichsen/alpaca_2k_test",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 3,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_bnb_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "sample_packing": True,
+                "bf16": True,
+                "save_safetensors": True,
+                "save_first_step": False,
+            }
+        )
+
+        cfg = validate_config(cfg)
+        normalize_config(cfg)
+        dataset_meta = load_datasets(cfg=cfg)
+
+        train(cfg=cfg, dataset_meta=dataset_meta)
+        with pytest.raises(AssertionError):
+            check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg)
--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -51,6 +51,7 @@ class TestCustomSchedulers(unittest.TestCase):
                "lr_scheduler": "rex",
                "warmup_steps": 5,
                "cosine_min_lr_ratio": 0.05,
+                "save_first_step": False,
            }
        )