gradient accumulation tests, embeddings w pad_token fix, smaller models (#2059)
* add more test cases for gradient accumulation and fix zero3
* swap out for smaller model
* fix missing return
* fix missing pad_token in config
* support concurrency for multi-GPU testing
* cast empty deepspeed to empty string for zero3 check
* fix temp_dir as fixture so parametrize works properly
* fix test file for multi-GPU evals
* don't use default
* don't use default for fsdp_state_dict_type
* don't use llama tokenizer with SmolLM
* also automatically cancel multi-GPU runs for concurrency
This commit is contained in:
@@ -1291,6 +1291,25 @@ class AxolotlInputConfig(
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
@classmethod
def warn_qlora_zero3_w_use_reentrant(cls, data):
    """Warn when qlora + DeepSpeed ZeRO-3 is combined with
    ``gradient_checkpointing_kwargs: {use_reentrant: false}``.

    This combination may result in:
        torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
        Recomputed values for the following tensors have different metadata
        than during the forward pass.

    Returns the (unmodified) config data so later validators keep running.
    """
    # `or {}` / `or ""` guard against keys that are *present* but set to
    # None (e.g. `deepspeed: null` in YAML) — dict.get's default only
    # applies to missing keys, and `"zero3" in None` raises TypeError.
    gc_kwargs = data.get("gradient_checkpointing_kwargs") or {}
    deepspeed = data.get("deepspeed") or ""
    if (
        data.get("adapter") == "qlora"
        # only `use_reentrant: false` (explicit False, not merely absent)
        # triggers the known CheckpointError with zero3
        and gc_kwargs.get("use_reentrant") is False
        and "zero3" in deepspeed
    ):
        LOG.warning(
            "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
        )
    return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_val_w_test_datasets(cls, data):
|
||||
|
||||
@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
|
||||
x in cfg.lora_modules_to_save for x in lora_modules_to_save
|
||||
)
|
||||
)
|
||||
and k != "pad_token"
|
||||
):
|
||||
lora_modules_to_save = ", ".join(
|
||||
[f"`{x}`" for x in lora_modules_to_save]
|
||||
|
||||
Reference in New Issue
Block a user