Revert "checkpoint model on first step callback (#2906)"

This reverts commit 10ba1622f7.
2025-07-15 15:01:12 -04:00
parent 10ba1622f7
commit 6f6d917a99
146 changed files with 9 additions and 419 deletions
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -44,7 +44,6 @@ def min_cfg(temp_dir):
        "save_safetensors": True,
        "max_steps": 10,
        "bf16": "auto",
-        "save_first_step": False,
    }


@@ -99,7 +98,6 @@ class TestCutCrossEntropyIntegration:
                "save_safetensors": True,
                "max_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -153,7 +153,6 @@ class TestPluginHooks:
                "max_steps": 5,
                "flash_attention": True,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -67,7 +67,6 @@ def min_cfg(temp_dir):
        "output_dir": temp_dir,
        "save_safetensors": True,
        "use_tensorboard": True,
-        "save_first_step": False,
    }


--- a/tests/e2e/integrations/test_liger.py
+++ b/tests/e2e/integrations/test_liger.py
@@ -50,7 +50,6 @@ class LigerIntegrationTestCase:
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
-                "save_first_step": False,
            }
        )
        # pylint: disable=duplicate-code
@@ -97,7 +96,6 @@ class LigerIntegrationTestCase:
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
-                "save_first_step": False,
            }
        )
        # pylint: disable=duplicate-code
--- a/tests/e2e/integrations/test_llm_compressor.py
+++ b/tests/e2e/integrations/test_llm_compressor.py
@@ -81,7 +81,6 @@ class TestLLMCompressorIntegration:
                    },
                    "save_compressed": save_compressed,
                },
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/patched/test_sp.py
+++ b/tests/e2e/multigpu/patched/test_sp.py
@@ -69,7 +69,6 @@ class TestSequenceParallelism:
                "use_tensorboard": True,
                "sequence_parallel_degree": 2,
                "ring_attn_func": ring_attn_func,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/solo/test_flex.py
+++ b/tests/e2e/multigpu/solo/test_flex.py
@@ -61,7 +61,6 @@ class TestPackedFlex:
                "max_steps": 2,
                "use_tensorboard": True,
                "save_strategy": "no",
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -223,7 +223,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

@@ -318,7 +317,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

@@ -411,7 +409,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_eval.py
+++ b/tests/e2e/multigpu/test_eval.py
@@ -67,7 +67,6 @@ class TestMultiGPUEval:
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

@@ -139,7 +138,6 @@ class TestMultiGPUEval:
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -71,7 +71,6 @@ class TestMultiGPUGemma3:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -69,7 +69,6 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )

@@ -136,7 +135,6 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )

@@ -212,7 +210,6 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )

@@ -292,7 +289,6 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )

@@ -369,7 +365,6 @@ class TestMultiGPULlama:
                },
                "use_tensorboard": True,
                "seed": 42,
-                "save_first_step": False,
            }
        )

@@ -447,7 +442,6 @@ class TestMultiGPULlama:
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

@@ -526,7 +520,6 @@ class TestMultiGPULlama:
                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                },
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        if attention_backend == "flash":
@@ -612,7 +605,6 @@ class TestMultiGPULlama:
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

@@ -697,7 +689,6 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
                "use_tensorboard": True,
-                "save_first_step": False,
                **adapter,
            }
        )
@@ -774,7 +765,6 @@ class TestMultiGPULlama:
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
                "seed": 42,
-                "save_first_step": False,
                **adapter,
            }
        )
@@ -850,7 +840,6 @@ class TestMultiGPULlama:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
-                "save_first_step": False,
                **adapter,
            }
        )
@@ -919,7 +908,6 @@ class TestMultiGPULlama:
                "save_safetensors": True,
                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -56,7 +56,6 @@ class TestMultiGPURay:
                "use_tensorboard": True,
                "use_ray": True,
                "ray_num_workers": 2,
-                "save_first_step": False,
            }
        )

@@ -116,7 +115,6 @@ class TestMultiGPURay:
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -55,7 +55,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "fp16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -103,7 +102,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "fp16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_activation_checkpointing.py
+++ b/tests/e2e/patched/test_activation_checkpointing.py
@@ -69,7 +69,6 @@ class TestActivationCheckpointing:
                "bf16": True,
                "save_safetensors": True,
                "gradient_checkpointing": gradient_checkpointing,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -62,7 +62,6 @@ class TestFAXentropyLlama:
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_falcon_samplepack.py
+++ b/tests/e2e/patched/test_falcon_samplepack.py
@@ -58,7 +58,6 @@ class TestFalconPatched(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -100,7 +99,6 @@ class TestFalconPatched(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_flattening.py
+++ b/tests/e2e/patched/test_flattening.py
@@ -61,7 +61,6 @@ class TestFAFlattening:
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_fused_llama.py
+++ b/tests/e2e/patched/test_fused_llama.py
@@ -53,7 +53,6 @@ class TestFusedLlama(unittest.TestCase):
                "max_steps": 10,
                "save_steps": 5,
                "eval_steps": 5,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_llama_s2_attention.py
+++ b/tests/e2e/patched/test_llama_s2_attention.py
@@ -58,7 +58,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

@@ -101,7 +100,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_lora_llama_multipack.py
+++ b/tests/e2e/patched/test_lora_llama_multipack.py
@@ -55,7 +55,6 @@ class TestLoraLlama(unittest.TestCase):
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
@@ -109,7 +108,6 @@ class TestLoraLlama(unittest.TestCase):
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -56,7 +56,6 @@ class TestMistral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -98,7 +97,6 @@ class TestMistral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -52,7 +52,6 @@ class TestMixtral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -91,7 +90,6 @@ class TestMixtral(unittest.TestCase):
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_model_patches.py
+++ b/tests/e2e/patched/test_model_patches.py
@@ -45,7 +45,6 @@ class TestModelPatches(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -79,7 +78,6 @@ class TestModelPatches(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/patched/test_peft_embeddings.py
+++ b/tests/e2e/patched/test_peft_embeddings.py
@@ -49,7 +49,6 @@ class TestLlamaPeftEmbeddings:
                "bf16": "auto",
                "save_safetensors": True,
                "embeddings_skip_upcast": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -54,7 +54,6 @@ class TestPhiMultipack(unittest.TestCase):
                "eval_steps": 3,
                "save_steps": 4,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

@@ -106,7 +105,6 @@ class TestPhiMultipack(unittest.TestCase):
                "eval_steps": 3,
                "save_steps": 4,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -58,7 +58,6 @@ class TestResumeLlama:
                "max_steps": 15,
                "use_tensorboard": True,
                "save_safetensors": True,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/patched/test_sp.py
+++ b/tests/e2e/patched/test_sp.py
@@ -47,7 +47,6 @@ def fixture_cfg():
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
-            "save_first_step": False,
        }
    )

--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -62,7 +62,6 @@ class TestUnslothQLoRA:
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

@@ -113,7 +112,6 @@ class TestUnslothQLoRA:
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

@@ -169,7 +167,6 @@ class TestUnslothQLoRA:
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "fp16": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/solo/test_flex.py
+++ b/tests/e2e/solo/test_flex.py
@@ -49,7 +49,6 @@ class TestPackedFlex(unittest.TestCase):
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/solo/test_relora_llama.py
+++ b/tests/e2e/solo/test_relora_llama.py
@@ -65,7 +65,6 @@ class TestReLoraLlama(unittest.TestCase):
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_deepseekv3.py
+++ b/tests/e2e/test_deepseekv3.py
@@ -67,7 +67,6 @@ class TestDeepseekV3:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -117,7 +116,6 @@ class TestDeepseekV3:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_dpo.py
+++ b/tests/e2e/test_dpo.py
@@ -56,7 +56,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

@@ -106,7 +105,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

@@ -156,7 +154,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

@@ -206,7 +203,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

@@ -255,7 +251,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

@@ -307,7 +302,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

@@ -376,7 +370,6 @@ class TestDPOLlamaLora(unittest.TestCase):
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -48,7 +48,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

@@ -94,7 +93,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_evaluate.py
+++ b/tests/e2e/test_evaluate.py
@@ -36,7 +36,6 @@ class TestE2eEvaluate:
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_falcon.py
+++ b/tests/e2e/test_falcon.py
@@ -60,7 +60,6 @@ class TestFalcon(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

@@ -116,7 +115,6 @@ class TestFalcon(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

@@ -158,7 +156,6 @@ class TestFalcon(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_gemma3_text.py
+++ b/tests/e2e/test_gemma3_text.py
@@ -63,7 +63,6 @@ class TestGemma3Text:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -114,7 +113,6 @@ class TestGemma3Text:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -45,7 +45,6 @@ class TestLlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
-                "save_first_step": False,
            }
        )

@@ -93,7 +92,6 @@ class TestLlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
-                "save_first_step": False,
            }
        )

@@ -138,7 +136,6 @@ class TestLlama:
                "sample_packing": True,
                "bf16": True,
                "save_safetensors": True,
-                "save_first_step": False,
            }
        )

@@ -179,7 +176,6 @@ class TestLlama:
                "batch_flattening": True,
                "bf16": True,
                "save_safetensors": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -53,7 +53,6 @@ class TestPretrainLlama:
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_llama_vision.py
+++ b/tests/e2e/test_llama_vision.py
@@ -54,7 +54,6 @@ class TestLlamaVision(unittest.TestCase):
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )

@@ -101,7 +100,6 @@ class TestLlamaVision(unittest.TestCase):
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_lora_llama.py
+++ b/tests/e2e/test_lora_llama.py
@@ -49,7 +49,6 @@ class TestLoraLlama(unittest.TestCase):
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_mamba.py
+++ b/tests/e2e/test_mamba.py
@@ -51,7 +51,6 @@ class TestMamba(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": None,
                "save_safetensors": False,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_mistral.py
+++ b/tests/e2e/test_mistral.py
@@ -55,7 +55,6 @@ class TestMistral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )

@@ -96,7 +95,6 @@ class TestMistral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/test_mixtral.py
+++ b/tests/e2e/test_mixtral.py
@@ -61,7 +61,6 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )

@@ -117,7 +116,6 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )

@@ -172,7 +170,6 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
@@ -231,7 +228,6 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )

@@ -277,7 +273,6 @@ class TestMixtral(unittest.TestCase):
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -55,7 +55,6 @@ class TestCustomOptimizers(unittest.TestCase):
                "optimizer": "optimi_adamw",
                "max_steps": 5,
                "lr_scheduler": "cosine",
-                "save_first_step": False,
            }
        )

@@ -101,7 +100,6 @@ class TestCustomOptimizers(unittest.TestCase):
                "learning_rate": 0.00001,
                "optimizer": "adopt_adamw",
                "lr_scheduler": "cosine",
-                "save_first_step": False,
            }
        )

@@ -148,7 +146,6 @@ class TestCustomOptimizers(unittest.TestCase):
                "optimizer": "muon",
                "lr_scheduler": "cosine",
                "weight_decay": 0.01,
-                "save_first_step": False,
            }
        )

@@ -187,7 +184,6 @@ class TestCustomOptimizers(unittest.TestCase):
                "lr_scheduler": "constant",
                "save_safetensors": True,
                "max_steps": 10,
-                "save_first_step": False,
            }
        )
        # pylint: disable=duplicate-code
@@ -236,7 +232,6 @@ class TestCustomOptimizers(unittest.TestCase):
                "adam_epsilon2": 1e-16,
                "max_steps": 5,
                "lr_scheduler": "cosine",
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -48,7 +48,6 @@ class TestPackedLlama(unittest.TestCase):
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -53,7 +53,6 @@ class TestPhi(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -103,7 +102,6 @@ class TestPhi(unittest.TestCase):
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -49,7 +49,6 @@ class TestProcessRewardSmolLM2(unittest.TestCase):
                "use_tensorboard": True,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "seed": 42,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -57,7 +57,6 @@ class TestQATLlama:
                "max_steps": 5,
                "save_safetensors": True,
                "bf16": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
@@ -116,7 +115,6 @@ class TestQATLlama:
                    "weight_dtype": "int8",
                    "group_size": 8,
                },
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_qwen.py
+++ b/tests/e2e/test_qwen.py
@@ -59,7 +59,6 @@ class TestE2eQwen:
                "bf16": "auto",
                "tf32": True,
                "gradient_checkpointing": True,
-                "save_first_step": False,
            }
        )

--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -58,7 +58,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):
                "gradient_checkpointing": True,
                "warmup_ratio": 0.1,
                "use_tensorboard": True,
-                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
--- a/tests/e2e/test_save_first_step.py
+++ b/tests/e2e/test_save_first_step.py
@@ -1,102 +0,0 @@
-"""
-E2E tests for relora llama
-"""
-
-import unittest
-from pathlib import Path
-
-import pytest
-
-from axolotl.common.datasets import load_datasets
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
-from axolotl.utils.dict import DictDefault
-
-from .utils import check_model_output_exists, with_temp_dir
-
-
-class TestSaveFirstStepCallback(unittest.TestCase):
-    """Test cases for save_first_step callback config."""
-
-    @with_temp_dir
-    def test_save_first_step(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "tokenizer_type": "AutoTokenizer",
-                "sequence_len": 512,
-                "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 3,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "sample_packing": True,
-                "bf16": True,
-                "save_safetensors": True,
-                "save_first_step": True,
-            }
-        )
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        dataset_meta = load_datasets(cfg=cfg)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg)
-
-    @with_temp_dir
-    def test_no_save_first_step(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "tokenizer_type": "AutoTokenizer",
-                "sequence_len": 512,
-                "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 3,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "sample_packing": True,
-                "bf16": True,
-                "save_safetensors": True,
-                "save_first_step": False,
-            }
-        )
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        dataset_meta = load_datasets(cfg=cfg)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        with pytest.raises(AssertionError):
-            check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg)
--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -51,7 +51,6 @@ class TestCustomSchedulers(unittest.TestCase):
                "lr_scheduler": "rex",
                "warmup_steps": 5,
                "cosine_min_lr_ratio": 0.05,
-                "save_first_step": False,
            }
        )