Add callback to checkpoint model on first step (#2906)

* checkpoint model on first step callback

* remove debug

* add test cases; update existing tests so they do not save on the first step

* move test out of solo

* delete

* default to False

* typo
This commit is contained in:
Dan Saunders
2025-07-15 15:00:48 -04:00
committed by GitHub
parent d320ef6199
commit 10ba1622f7
146 changed files with 419 additions and 9 deletions

View File

@@ -69,6 +69,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -135,6 +136,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -210,6 +212,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -289,6 +292,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"use_tensorboard": True,
"bf16": True,
"save_first_step": False,
}
)
@@ -365,6 +369,7 @@ class TestMultiGPULlama:
},
"use_tensorboard": True,
"seed": 42,
"save_first_step": False,
}
)
@@ -442,6 +447,7 @@ class TestMultiGPULlama:
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -520,6 +526,7 @@ class TestMultiGPULlama:
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
},
"use_tensorboard": True,
"save_first_step": False,
}
)
if attention_backend == "flash":
@@ -605,6 +612,7 @@ class TestMultiGPULlama:
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
},
"use_tensorboard": True,
"save_first_step": False,
}
)
@@ -689,6 +697,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / deepspeed),
"use_tensorboard": True,
"save_first_step": False,
**adapter,
}
)
@@ -765,6 +774,7 @@ class TestMultiGPULlama:
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
"use_tensorboard": True,
"seed": 42,
"save_first_step": False,
**adapter,
}
)
@@ -840,6 +850,7 @@ class TestMultiGPULlama:
"flash_attention": True,
"deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
"save_first_step": False,
**adapter,
}
)
@@ -908,6 +919,7 @@ class TestMultiGPULlama:
"save_safetensors": True,
# "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
"use_tensorboard": True,
"save_first_step": False,
}
)