gradient accumulation tests, embeddings with pad_token fix, smaller models (#2059)

* add more test cases for gradient accumulation and fix zero3

* swap out for smaller model

* fix missing return

* fix missing pad_token in config

* support concurrency for multigpu testing

* cast empty deepspeed to empty string for zero3 check

* fix temp_dir as fixture so parametrize works properly

* fix test file for multigpu evals

* don't use default

* don't use default for fsdp_state_dict_type

* don't use the llama tokenizer with smollm

* also automatically cancel multigpu for concurrency
This commit is contained in:
Wing Lian
2024-11-14 12:59:00 -05:00
committed by GitHub
parent f3a5d119af
commit 71d4030b79
8 changed files with 118 additions and 71 deletions

View File

@@ -1291,6 +1291,25 @@ class AxolotlInputConfig(
)
return data
@model_validator(mode="before")
@classmethod
def warn_qlora_zero3_w_use_reentrant(cls, data):
    """Warn when qlora + DeepSpeed ZeRO-3 is combined with use_reentrant: false.

    This combination may trigger a torch.utils.checkpoint.CheckpointError
    ("Recomputed values for the following tensors have different metadata
    than during the forward pass"), so surface a warning up front instead
    of failing mid-training.

    Returns the (unmodified) raw config ``data`` dict, per pydantic's
    before-mode validator contract.
    """
    # May be absent, None, or a dict; normalize so .get() below is safe.
    gc_kwargs = data.get("gradient_checkpointing_kwargs") or {}
    if (
        data.get("adapter") == "qlora"
        and gc_kwargs.get("use_reentrant") is False
        # `deepspeed` may be present with an explicit None value, in which
        # case dict.get's default is NOT used — coerce via `or ""` so the
        # substring check never raises TypeError.
        and "zero3" in (data.get("deepspeed") or "")
    ):
        LOG.warning(
            "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
        )
    return data
@model_validator(mode="before")
@classmethod
def check_val_w_test_datasets(cls, data):

View File

@@ -238,6 +238,7 @@ def load_tokenizer(cfg):
x in cfg.lora_modules_to_save for x in lora_modules_to_save
)
)
and k != "pad_token"
):
lora_modules_to_save = ", ".join(
[f"`{x}`" for x in lora_modules_to_save]