checkpoint model on first step callback (#2906)
* checkpoint model on first step callback * remove debug * add test cases; update existing tests not to save on first step * move test out of solo * delete * default to False * typo
This commit is contained in:
@@ -26,3 +26,5 @@ timeout: 86400
|
|||||||
# Preprocess specific configurations
|
# Preprocess specific configurations
|
||||||
memory_preprocess: 32
|
memory_preprocess: 32
|
||||||
timeout_preprocess: 14400
|
timeout_preprocess: 14400
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -35,7 +35,6 @@ wandb_watch:
|
|||||||
wandb_name:
|
wandb_name:
|
||||||
wandb_log_model:
|
wandb_log_model:
|
||||||
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
gradient_accumulation_steps: 4
|
||||||
micro_batch_size: 1
|
micro_batch_size: 1
|
||||||
num_epochs: 4
|
num_epochs: 4
|
||||||
@@ -56,3 +55,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -56,3 +56,5 @@ evals_per_epoch: 1
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -56,3 +56,5 @@ evals_per_epoch: 1
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -55,3 +55,5 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -79,3 +79,5 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -62,3 +62,5 @@ saves_per_epoch: 1
|
|||||||
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -46,7 +46,6 @@ wandb_watch:
|
|||||||
wandb_name:
|
wandb_name:
|
||||||
wandb_log_model:
|
wandb_log_model:
|
||||||
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
gradient_accumulation_steps: 4
|
||||||
micro_batch_size: 1
|
micro_batch_size: 1
|
||||||
num_epochs: 4
|
num_epochs: 4
|
||||||
@@ -69,3 +68,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ evals_per_epoch: 1
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ evals_per_epoch: 1
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -50,3 +50,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -66,3 +66,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ warmup_ratio: 0.1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -62,3 +62,5 @@ warmup_ratio: 0.1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ evals_per_epoch: 1
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -54,3 +54,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -55,3 +55,5 @@ saves_per_epoch: 1
|
|||||||
deepspeed: deepspeed_configs/zero2.json
|
deepspeed: deepspeed_configs/zero2.json
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer
|
||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -46,3 +46,5 @@ evals_per_epoch: 2
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -55,3 +55,5 @@ saves_per_epoch: 1
|
|||||||
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
|
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ special_tokens:
|
|||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
unk_token: "<unk>"
|
unk_token: "<unk>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ special_tokens:
|
|||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
unk_token: "<unk>"
|
unk_token: "<unk>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -52,3 +52,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -52,3 +52,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -67,3 +67,5 @@ fsdp_config:
|
|||||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -53,3 +53,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -58,3 +58,5 @@ special_tokens:
|
|||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
unk_token: "<unk>"
|
unk_token: "<unk>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -57,3 +57,5 @@ warmup_ratio: 0.1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -77,3 +77,5 @@ fsdp_config:
|
|||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -72,3 +72,5 @@ fsdp_config:
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot_id|>
|
eos_token: <|eot_id|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -42,3 +42,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -71,3 +71,5 @@ warmup_steps: 10
|
|||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -83,3 +83,5 @@ warmup_steps: 10
|
|||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -61,3 +61,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -65,3 +65,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ special_tokens:
|
|||||||
|
|
||||||
use_ray: true
|
use_ray: true
|
||||||
ray_num_workers: 4
|
ray_num_workers: 4
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -63,3 +63,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -57,3 +57,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -61,3 +61,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -62,3 +62,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ fsdp_config:
|
|||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ fsdp_config:
|
|||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|end_of_text|>
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -54,3 +54,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|end_of_text|>"
|
pad_token: "<|end_of_text|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -75,3 +75,5 @@ llmcompressor:
|
|||||||
]
|
]
|
||||||
start: 0
|
start: 0
|
||||||
save_compressed: true
|
save_compressed: true
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -86,3 +86,5 @@ fsdp_config:
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -90,3 +90,5 @@ fsdp_config:
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -83,3 +83,5 @@ weight_decay: 0.0
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -86,3 +86,5 @@ fsdp_config:
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -84,3 +84,5 @@ fsdp_config:
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -82,3 +82,5 @@ weight_decay: 0.0
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -87,3 +87,5 @@ fsdp_config:
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <|finetune_right_pad_id|>
|
pad_token: <|finetune_right_pad_id|>
|
||||||
eos_token: <|eot|>
|
eos_token: <|eot|>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -53,3 +53,5 @@ warmup_ratio: 0.1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -70,3 +70,5 @@ fsdp_config:
|
|||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
|
fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
|
||||||
fsdp_activation_checkpointing: true
|
fsdp_activation_checkpointing: true
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -61,3 +61,5 @@ flash_attention: true
|
|||||||
warmup_ratio: 0.1
|
warmup_ratio: 0.1
|
||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -48,3 +48,5 @@ weight_decay: 0.0
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
tokens:
|
tokens:
|
||||||
save_safetensors: False
|
save_safetensors: False
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -53,3 +53,5 @@ special_tokens:
|
|||||||
eos_token: "<|im_end|>"
|
eos_token: "<|im_end|>"
|
||||||
tokens:
|
tokens:
|
||||||
- "<|im_start|>"
|
- "<|im_start|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -43,3 +43,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -80,3 +80,5 @@ weight_decay: 0.0
|
|||||||
special_tokens:
|
special_tokens:
|
||||||
bos_token: "<|im_start|>"
|
bos_token: "<|im_start|>"
|
||||||
eos_token: "<|im_end|>"
|
eos_token: "<|im_end|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -74,3 +74,5 @@ fsdp_config:
|
|||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -69,3 +69,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -56,3 +56,5 @@ evals_per_epoch: 1
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -72,3 +72,5 @@ fsdp_config:
|
|||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -77,3 +77,5 @@ fsdp_config:
|
|||||||
fsdp_forward_prefetch: false
|
fsdp_forward_prefetch: false
|
||||||
fsdp_backward_prefetch: BACKWARD_PRE
|
fsdp_backward_prefetch: BACKWARD_PRE
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -81,3 +81,5 @@ saves_per_epoch: 1
|
|||||||
deepspeed: deepspeed_configs/zero2.json
|
deepspeed: deepspeed_configs/zero2.json
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -51,3 +51,5 @@ special_tokens:
|
|||||||
eos_token: "<|im_end|>"
|
eos_token: "<|im_end|>"
|
||||||
tokens:
|
tokens:
|
||||||
- "<|im_start|>"
|
- "<|im_start|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -64,3 +64,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -50,3 +50,5 @@ weight_decay: 0.05
|
|||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <custom_token_7>
|
pad_token: <custom_token_7>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -63,3 +63,5 @@ warmup_steps: 10
|
|||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 4
|
saves_per_epoch: 4
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -57,3 +57,5 @@ weight_decay: 0.1
|
|||||||
resize_token_embeddings_to_32x: true
|
resize_token_embeddings_to_32x: true
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -60,3 +60,5 @@ weight_decay: 0.1
|
|||||||
resize_token_embeddings_to_32x: true
|
resize_token_embeddings_to_32x: true
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -57,3 +57,5 @@ weight_decay: 0.1
|
|||||||
resize_token_embeddings_to_32x: true
|
resize_token_embeddings_to_32x: true
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -71,3 +71,5 @@ fsdp_config:
|
|||||||
resize_token_embeddings_to_32x: true
|
resize_token_embeddings_to_32x: true
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: "<|endoftext|>"
|
pad_token: "<|endoftext|>"
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -59,3 +59,5 @@ warmup_ratio: 0.2
|
|||||||
debug: true
|
debug: true
|
||||||
weight_decay: 0.1
|
weight_decay: 0.1
|
||||||
resize_token_embeddings_to_32x: true
|
resize_token_embeddings_to_32x: true
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -55,3 +55,5 @@ saves_per_epoch: 1
|
|||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
pad_token: <pad>
|
pad_token: <pad>
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -53,3 +53,5 @@ warmup_ratio: 0.1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -54,3 +54,5 @@ warmup_steps: 10
|
|||||||
evals_per_epoch: 4
|
evals_per_epoch: 4
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -55,3 +55,5 @@ eval_steps: 100
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -67,3 +67,5 @@ fsdp_config:
|
|||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -26,7 +26,6 @@ wandb_watch:
|
|||||||
wandb_name:
|
wandb_name:
|
||||||
wandb_log_model:
|
wandb_log_model:
|
||||||
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
gradient_accumulation_steps: 4
|
||||||
micro_batch_size: 2
|
micro_batch_size: 2
|
||||||
num_epochs: 4
|
num_epochs: 4
|
||||||
@@ -50,3 +49,5 @@ evals_per_epoch:
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -53,3 +53,5 @@ warmup_ratio: 0.1
|
|||||||
evals_per_epoch: 1
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -67,3 +67,5 @@ evals_per_epoch: 4
|
|||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -76,3 +76,5 @@ fsdp_config:
|
|||||||
fsdp_activation_checkpointing: true
|
fsdp_activation_checkpointing: true
|
||||||
|
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -66,3 +66,5 @@ fsdp_config:
|
|||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
fsdp_sharding_strategy: FULL_SHARD
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
special_tokens:
|
special_tokens:
|
||||||
|
|
||||||
|
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ from axolotl.utils.callbacks import (
|
|||||||
GCCallback,
|
GCCallback,
|
||||||
GPUStatsCallback,
|
GPUStatsCallback,
|
||||||
SaveAxolotlConfigtoWandBCallback,
|
SaveAxolotlConfigtoWandBCallback,
|
||||||
|
SaveModelOnFirstStepCallback,
|
||||||
)
|
)
|
||||||
from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
|
from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
|
||||||
from axolotl.utils.schemas.enums import CustomSupportedOptimizers
|
from axolotl.utils.schemas.enums import CustomSupportedOptimizers
|
||||||
@@ -135,6 +136,8 @@ class TrainerBuilderBase(abc.ABC):
|
|||||||
callbacks.append(
|
callbacks.append(
|
||||||
SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
|
SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
|
||||||
)
|
)
|
||||||
|
if self.cfg.save_first_step:
|
||||||
|
callbacks.append(SaveModelOnFirstStepCallback())
|
||||||
|
|
||||||
callbacks.append(GPUStatsCallback(cfg=self.cfg))
|
callbacks.append(GPUStatsCallback(cfg=self.cfg))
|
||||||
|
|
||||||
|
|||||||
@@ -64,7 +64,7 @@ class SaveBetterTransformerModelCallback(
|
|||||||
state: TrainerState,
|
state: TrainerState,
|
||||||
control: TrainerControl,
|
control: TrainerControl,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
) -> TrainerControl:
|
||||||
# Save
|
# Save
|
||||||
if (
|
if (
|
||||||
args.save_strategy == IntervalStrategy.STEPS
|
args.save_strategy == IntervalStrategy.STEPS
|
||||||
@@ -100,11 +100,11 @@ class GPUStatsCallback(
|
|||||||
|
|
||||||
def on_step_end(
|
def on_step_end(
|
||||||
self,
|
self,
|
||||||
args: TrainingArguments,
|
args: TrainingArguments, # pylint: disable=unused-argument
|
||||||
state: TrainerState,
|
state: TrainerState,
|
||||||
control: TrainerControl,
|
control: TrainerControl,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
) -> TrainerControl:
|
||||||
if not self.logged and state.global_step > 1:
|
if not self.logged and state.global_step > 1:
|
||||||
log_gpu_memory_usage(LOG, "while training", self.cfg.device)
|
log_gpu_memory_usage(LOG, "while training", self.cfg.device)
|
||||||
self.logged = True
|
self.logged = True
|
||||||
@@ -116,18 +116,17 @@ class LossWatchDogCallback(TrainerCallback):
|
|||||||
|
|
||||||
def __init__(self, cfg):
|
def __init__(self, cfg):
|
||||||
self.cfg = cfg
|
self.cfg = cfg
|
||||||
self.logged = False
|
|
||||||
self.violations = 0
|
self.violations = 0
|
||||||
self.threshold = cfg.loss_watchdog_threshold
|
self.threshold = cfg.loss_watchdog_threshold
|
||||||
self.patience = cfg.loss_watchdog_patience or 3
|
self.patience = cfg.loss_watchdog_patience or 3
|
||||||
|
|
||||||
def on_step_end(
|
def on_step_end(
|
||||||
self,
|
self,
|
||||||
_args: TrainingArguments,
|
args: TrainingArguments, # pylint: disable=unused-argument
|
||||||
state: TrainerState,
|
state: TrainerState,
|
||||||
control: TrainerControl,
|
control: TrainerControl,
|
||||||
**_kwargs,
|
**_kwargs,
|
||||||
):
|
) -> TrainerControl:
|
||||||
if len(state.log_history) > 0 and "loss" in state.log_history[-1]:
|
if len(state.log_history) > 0 and "loss" in state.log_history[-1]:
|
||||||
if state.log_history[-1]["loss"] > self.threshold:
|
if state.log_history[-1]["loss"] > self.threshold:
|
||||||
self.violations += 1
|
self.violations += 1
|
||||||
@@ -141,6 +140,21 @@ class LossWatchDogCallback(TrainerCallback):
|
|||||||
return control
|
return control
|
||||||
|
|
||||||
|
|
||||||
|
class SaveModelOnFirstStepCallback(TrainerCallback):
|
||||||
|
"""Callback to save the model on the first step of training if enabled"""
|
||||||
|
|
||||||
|
def on_step_end(
|
||||||
|
self,
|
||||||
|
args: TrainingArguments, # pylint: disable=unused-argument
|
||||||
|
state: TrainerState,
|
||||||
|
control: TrainerControl,
|
||||||
|
**_kwargs,
|
||||||
|
) -> TrainerControl:
|
||||||
|
if state.global_step == 1:
|
||||||
|
control.should_save = True
|
||||||
|
return control
|
||||||
|
|
||||||
|
|
||||||
def bench_eval_callback_factory(trainer, tokenizer):
|
def bench_eval_callback_factory(trainer, tokenizer):
|
||||||
accuracy = evaluate.load("accuracy")
|
accuracy = evaluate.load("accuracy")
|
||||||
abcd_idx = [
|
abcd_idx = [
|
||||||
|
|||||||
@@ -706,6 +706,7 @@ class AxolotlInputConfig(
|
|||||||
"description": "Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer from `eval_steps`"
|
"description": "Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer from `eval_steps`"
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
save_steps: int | float | None = Field(
|
save_steps: int | float | None = Field(
|
||||||
default=None,
|
default=None,
|
||||||
json_schema_extra={
|
json_schema_extra={
|
||||||
@@ -727,6 +728,13 @@ class AxolotlInputConfig(
|
|||||||
save_total_limit: int | None = Field(
|
save_total_limit: int | None = Field(
|
||||||
default=None, json_schema_extra={"description": "Checkpoints saved at a time"}
|
default=None, json_schema_extra={"description": "Checkpoints saved at a time"}
|
||||||
)
|
)
|
||||||
|
save_first_step: bool | None = Field(
|
||||||
|
default=None,
|
||||||
|
json_schema_extra={
|
||||||
|
"description": "Whether to checkpoint a model after the first step of training. Defaults to False."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
logging_steps: int | None = Field(
|
logging_steps: int | None = Field(
|
||||||
default=None, json_schema_extra={"description": "Logging frequency"}
|
default=None, json_schema_extra={"description": "Logging frequency"}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -44,6 +44,7 @@ def min_cfg(temp_dir):
|
|||||||
"save_safetensors": True,
|
"save_safetensors": True,
|
||||||
"max_steps": 10,
|
"max_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -98,6 +99,7 @@ class TestCutCrossEntropyIntegration:
|
|||||||
"save_safetensors": True,
|
"save_safetensors": True,
|
||||||
"max_steps": 10,
|
"max_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
cfg = validate_config(cfg)
|
cfg = validate_config(cfg)
|
||||||
|
|||||||
@@ -153,6 +153,7 @@ class TestPluginHooks:
|
|||||||
"max_steps": 5,
|
"max_steps": 5,
|
||||||
"flash_attention": True,
|
"flash_attention": True,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ def min_cfg(temp_dir):
|
|||||||
"output_dir": temp_dir,
|
"output_dir": temp_dir,
|
||||||
"save_safetensors": True,
|
"save_safetensors": True,
|
||||||
"use_tensorboard": True,
|
"use_tensorboard": True,
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -50,6 +50,7 @@ class LigerIntegrationTestCase:
|
|||||||
"save_safetensors": True,
|
"save_safetensors": True,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
"max_steps": 5,
|
"max_steps": 5,
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
@@ -96,6 +97,7 @@ class LigerIntegrationTestCase:
|
|||||||
"save_safetensors": True,
|
"save_safetensors": True,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
"max_steps": 5,
|
"max_steps": 5,
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
|
|||||||
@@ -81,6 +81,7 @@ class TestLLMCompressorIntegration:
|
|||||||
},
|
},
|
||||||
"save_compressed": save_compressed,
|
"save_compressed": save_compressed,
|
||||||
},
|
},
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -69,6 +69,7 @@ class TestSequenceParallelism:
|
|||||||
"use_tensorboard": True,
|
"use_tensorboard": True,
|
||||||
"sequence_parallel_degree": 2,
|
"sequence_parallel_degree": 2,
|
||||||
"ring_attn_func": ring_attn_func,
|
"ring_attn_func": ring_attn_func,
|
||||||
|
"save_first_step": False,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user