checkpoint model on first step callback (#2906)

* checkpoint model on first step callback

* remove debug

* add test cases; update existing tests not to save on first step

* move test out of solo

* delete

* default to False

* typo
This commit is contained in:
Dan Saunders
2025-07-15 15:00:48 -04:00
committed by GitHub
parent d320ef6199
commit 10ba1622f7
146 changed files with 419 additions and 9 deletions

View File

@@ -69,6 +69,7 @@ class TestSequenceParallelism:
"use_tensorboard": True,
"sequence_parallel_degree": 2,
"ring_attn_func": ring_attn_func,
"save_first_step": False,
}
)

View File

@@ -61,6 +61,7 @@ class TestPackedFlex:
"max_steps": 2,
"use_tensorboard": True,
"save_strategy": "no",
"save_first_step": False,
}
)
if is_torch_bf16_gpu_available():

View File

@@ -223,6 +223,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
"save_safetensors": True,
"bf16": "auto",
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -317,6 +318,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
"save_safetensors": True,
"bf16": "auto",
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -409,6 +411,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
"save_safetensors": True,
"bf16": "auto",
"use_tensorboard": True,
"save_first_step": False,
}
)

View File

@@ -67,6 +67,7 @@ class TestMultiGPUEval:
"logging_steps": 1,
"weight_decay": 0.0,
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -138,6 +139,7 @@ class TestMultiGPUEval:
"logging_steps": 1,
"weight_decay": 0.0,
"use_tensorboard": True,
"save_first_step": False,
}
)

View File

@@ -71,6 +71,7 @@ class TestMultiGPUGemma3:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)

View File

@@ -69,6 +69,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -135,6 +136,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -210,6 +212,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -289,6 +292,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -365,6 +369,7 @@ class TestMultiGPULlama:
},
"use_tensorboard": True,
"seed": 42,
"save_first_step": False,
}
)
@@ -442,6 +447,7 @@ class TestMultiGPULlama:
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -520,6 +526,7 @@ class TestMultiGPULlama:
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
},
"use_tensorboard": True,
"save_first_step": False,
}
)
if attention_backend == "flash":
@@ -605,6 +612,7 @@ class TestMultiGPULlama:
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -689,6 +697,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
"use_tensorboard": True,
"save_first_step": False,
**adapter,
}
)
@@ -765,6 +774,7 @@ class TestMultiGPULlama:
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
"seed": 42,
"save_first_step": False,
**adapter,
}
)
@@ -840,6 +850,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
"save_first_step": False,
**adapter,
}
)
@@ -908,6 +919,7 @@ class TestMultiGPULlama:
"save_safetensors": True,
# "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
"save_first_step": False,
}
)

View File

@@ -56,6 +56,7 @@ class TestMultiGPURay:
"use_tensorboard": True,
"use_ray": True,
"ray_num_workers": 2,
"save_first_step": False,
}
)
@@ -115,6 +116,7 @@ class TestMultiGPURay:
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
"save_first_step": False,
}
)