checkpoint model on first step callback (#2906)

* checkpoint model on first step callback

* remove debug

* add test cases; update existing tests not to save on first step

* move test out of solo

* delete

* default to False

* typo
This commit is contained in:
Dan Saunders
2025-07-15 15:00:48 -04:00
committed by GitHub
parent d320ef6199
commit 10ba1622f7
146 changed files with 419 additions and 9 deletions

View File

@@ -70,3 +70,5 @@ fsdp_config:
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
fsdp_activation_checkpointing: true
# save_first_step: true # uncomment to save a checkpoint on the first step and validate that checkpoint saving works with your config

View File

@@ -61,3 +61,5 @@ flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
# save_first_step: true # uncomment to save a checkpoint on the first step and validate that checkpoint saving works with your config