checkpoint model on first step callback (#2906)
* checkpoint model on first step callback
* remove debug
* add test cases; update existing tests not to save on first step
* move test out of solo
* delete
* default to False
* typo
This commit is contained in:
@@ -44,6 +44,7 @@ def min_cfg(temp_dir):
|
||||
"save_safetensors": True,
|
||||
"max_steps": 10,
|
||||
"bf16": "auto",
|
||||
"save_first_step": False,
|
||||
}
|
||||
|
||||
|
||||
@@ -98,6 +99,7 @@ class TestCutCrossEntropyIntegration:
|
||||
"save_safetensors": True,
|
||||
"max_steps": 10,
|
||||
"bf16": "auto",
|
||||
"save_first_step": False,
|
||||
}
|
||||
)
|
||||
cfg = validate_config(cfg)
|
||||
|
||||
@@ -153,6 +153,7 @@ class TestPluginHooks:
|
||||
"max_steps": 5,
|
||||
"flash_attention": True,
|
||||
"bf16": "auto",
|
||||
"save_first_step": False,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@@ -67,6 +67,7 @@ def min_cfg(temp_dir):
|
||||
"output_dir": temp_dir,
|
||||
"save_safetensors": True,
|
||||
"use_tensorboard": True,
|
||||
"save_first_step": False,
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -50,6 +50,7 @@ class LigerIntegrationTestCase:
|
||||
"save_safetensors": True,
|
||||
"bf16": "auto",
|
||||
"max_steps": 5,
|
||||
"save_first_step": False,
|
||||
}
|
||||
)
|
||||
# pylint: disable=duplicate-code
|
||||
@@ -96,6 +97,7 @@ class LigerIntegrationTestCase:
|
||||
"save_safetensors": True,
|
||||
"bf16": "auto",
|
||||
"max_steps": 5,
|
||||
"save_first_step": False,
|
||||
}
|
||||
)
|
||||
# pylint: disable=duplicate-code
|
||||
|
||||
@@ -81,6 +81,7 @@ class TestLLMCompressorIntegration:
|
||||
},
|
||||
"save_compressed": save_compressed,
|
||||
},
|
||||
"save_first_step": False,
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user