gradient accumulation tests, embeddings with pad_token fix, smaller models (#2059)

* add more test cases for gradient accumulation and fix zero3

* swap out for smaller model

* fix missing return

* fix missing pad_token in config

* support concurrency for multigpu testing

* cast empty deepspeed to empty string for zero3 check

* fix temp_dir as fixture so parametrize works properly

* fix test file for multigpu evals

* don't use default

* don't use default for fsdp_state_dict_type

* don't use the llama tokenizer with smollm

* also automatically cancel multigpu for concurrency
This commit is contained in:
Wing Lian
2024-11-14 12:59:00 -05:00
committed by GitHub
parent f3a5d119af
commit 71d4030b79
8 changed files with 118 additions and 71 deletions

View File

@@ -1291,6 +1291,25 @@ class AxolotlInputConfig(
)
return data
@model_validator(mode="before")
@classmethod
def warn_qlora_zero3_w_use_reentrant(cls, data):
    """Warn when qlora + DeepSpeed ZeRO-3 is combined with use_reentrant: false.

    This combination may trigger a torch.utils.checkpoint.CheckpointError
    ("Recomputed values for the following tensors have different metadata
    than during the forward pass"), so surface a warning up front instead
    of failing mid-training.

    Returns the (unmodified) raw config ``data`` dict, per pydantic's
    before-mode validator contract.
    """
    # May be absent, None, or a dict; normalize so .get() below is safe.
    gc_kwargs = data.get("gradient_checkpointing_kwargs") or {}
    if (
        data.get("adapter") == "qlora"
        and gc_kwargs.get("use_reentrant") is False
        # `deepspeed` may be present with an explicit None value, in which
        # case dict.get's default is NOT used — coerce via `or ""` so the
        # substring check never raises TypeError.
        and "zero3" in (data.get("deepspeed") or "")
    ):
        LOG.warning(
            "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
        )
    return data
@model_validator(mode="before")
@classmethod
def check_val_w_test_datasets(cls, data):

View File

@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
x in cfg.lora_modules_to_save for x in lora_modules_to_save
)
)
and k != "pad_token"
):
lora_modules_to_save = ", ".join(
[f"`{x}`" for x in lora_modules_to_save]