Fix: Gradient Accumulation issue (#1980)

* feat: support new arg num_items_in_batch * use kwargs to manage extra unknown kwargs for now * upgrade against upstream transformers main * make sure trl is on latest too * fix for upgraded trl * fix: handle trl and transformer signature change * feat: update trl to handle transformer signature * RewardDataCollatorWithPadding no longer has max_length * handle updated signature for tokenizer vs processor class * invert logic for tokenizer vs processor class * processing_class, not processor class * also handle processing class in dpo * handle model name w model card creation * upgrade transformers and add a loss check test * fix install of tbparse requirements * make sure to add tbparse to req * feat: revert kwarg to positional kwarg to be explicit --------- Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-10-25 22:28:23 +07:00
parent 1d6a5e2bd6
commit 2501c1a6a3
11 changed files with 170 additions and 98 deletions
--- a/tests/e2e/patched/test_unsloth_integration.py
+++ b/tests/e2e/patched/test_unsloth_integration.py
@@ -1,22 +1,12 @@
 """Test module for checking whether the integration of Unsloth with Hugging Face Transformers is working as expected."""
 import unittest

-from axolotl.monkeypatch.unsloth_ import (
-    check_cel_is_patchable,
-    check_self_attn_is_patchable,
-)
+from axolotl.monkeypatch.unsloth_ import check_self_attn_is_patchable


 class TestUnslothIntegration(unittest.TestCase):
    """Unsloth monkeypatch integration tests."""

-    def test_is_cel_patchable(self):
-        # ensures the current version of transformers has loss code that matches our patching code
-        self.assertTrue(
-            check_cel_is_patchable(),
-            "HF transformers loss code has changed and isn't patchable",
-        )
-
    def test_is_self_attn_patchable(self):
        # ensures the current version of transformers has loss code that matches our patching code
        self.assertTrue(
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -0,0 +1,74 @@
+"""
+E2E tests for packed training
+"""
+
+import logging
+import os
+import unittest
+
+from tbparse import SummaryReader
+from transformers.utils import is_torch_bf16_gpu_available
+
+from axolotl.cli import load_datasets
+from axolotl.common.cli import TrainerCliArgs
+from axolotl.train import train
+from axolotl.utils.config import normalize_config
+from axolotl.utils.dict import DictDefault
+
+from .utils import most_recent_subdir, with_temp_dir
+
+LOG = logging.getLogger("axolotl.tests.e2e")
+os.environ["WANDB_DISABLED"] = "true"
+
+
+class TestPackedLlama(unittest.TestCase):
+    """
+    Test case for Packed training of llama models
+    """
+
+    @with_temp_dir
+    def test_loss_packed(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM-135M",
+                "sequence_len": 1024,
+                "sample_packing": True,
+                "flash_attention": True,
+                "val_set_size": 0.0,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                },
+                "datasets": [
+                    {
+                        "path": "vicgalle/alpaca-gpt4",
+                        "type": "alpaca",
+                    },
+                ],
+                "num_epochs": 1,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 4,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch",
+                "lr_scheduler": "cosine",
+                "max_steps": 5,
+                "use_tensorboard": True,
+            }
+        )
+        if is_torch_bf16_gpu_available():
+            cfg.bf16 = True
+        else:
+            cfg.fp16 = True
+        normalize_config(cfg)
+        cli_args = TrainerCliArgs()
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+
+        tb_log_path = most_recent_subdir(temp_dir + "/runs")
+        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
+        reader = SummaryReader(event_file)
+        df = reader.scalars  # pylint: disable=invalid-name
+        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
+        assert df.value.values[-1] < 2.0, "Loss is too high"