Attention mask and position id fixes for packing (#285)

* fix attention mask with packing * set position ids and use block diagonal attn mask * fix expand mask for multiple batch items, make sure we pad position_ids * don't move masks to cpu * use multi pack dataloader w random sampler * add position_ids back * more fixes for dataloader integration * est total tokens, fix field loop * more fixes, position_ids seems broken * more fixes for sample packing * use distributed sampler, avoid accelerate prepare * use accelerator prepare for dataloader * fix for position_ids w packing * Update src/axolotl/utils/dataloader.py * validation for sample packing and doc * more fixes for 4k and optimizations * optimized expand mask fn * better handling of variance in multipack dataloader length and trainer hanging when it runs out of data * fix rounding of len of batches to int * better handling so that all devices have the same dataloader len * fix step calc for packing * pass sample packing efficiency to training args * add a test for the mask expansion for sequence packing * only process eval dataset for packing if not None * don't split batches when packing * weighted CE losses * weighted CEL fixes * limit packing to sequences of max seq len * seq_len_multiple for packing * make sure the chunk size is an int * sample_packing_seq_len_multiplier config * use cumulative seq len with var len flash attn v2 w packing * properly calculate max len * fix flash-attn, xformers, packing, support chatml * fix chatml system prompt for openorca, legacy tokenizer opts * add chatml * add unit tests for cum seq lens, add ability to build cu_seq_lens from positional ids, fix prompt test * fix test and pylint checks * more packing and dataset optimizations and fixes * filter w multiple cpus * more fixes and optimizations * fixes and go back to distributed sampler since batch sampler won't work * fix counts by accounting for num devices * fix steps calculation * previous accelerate is still most performant * add numba to requirements. 
* use custom distributed checks * fix sampler to prevent overfit w new epochs * let's not cleanup the cached datasets * calculate cum seq lens with pos_ids instead of mask, simplify packing params, fix distributed barrier * speed optimizations and set accelerate fsdp env vars * optimize dataset concatenation? * more optimizations for dataset handling * fix import for annotation * manual pre-commit fixes * another sum optimization and bug fix for calc steps * fix packing estimations * fix formatting * pylint problems * add back flash attention branch for handling unpacked sequences separately * Address PR feedback * add optional sample packing config params to readme
2023-08-12 15:14:56 -04:00
parent a276c9c88d
commit 2bb0b78975
23 changed files with 1218 additions and 70 deletions
--- a/tests/monkeypatch/test_llama_attn_hijack_flash.py
+++ b/tests/monkeypatch/test_llama_attn_hijack_flash.py
@@ -0,0 +1,30 @@
+"""
+Unit tests for the monkeypatch utils
+"""
+import unittest
+
+import torch
+
+from axolotl.monkeypatch.utils import get_cu_seqlens, get_cu_seqlens_from_pos_ids
+
+
+class TestMonkeyPatchUtils(unittest.TestCase):
+    """
+    Unit test class for monkeypatch utils
+    """
+
+    def test_get_cu_seqlens_1d(self):
+        attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
+        target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32)
+        self.assertTrue(torch.allclose(get_cu_seqlens(attn_mask)[0], target_res))
+
+    def test_get_cu_seqlens_from_pos_ids_1d(self):
+        position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 0, 0]])
+        target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32)
+        self.assertTrue(
+            torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res)
+        )
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_expand_mask.py
+++ b/tests/test_expand_mask.py
@@ -0,0 +1,44 @@
+"""
+Unit tests for the monkey patch for expand mask to handle packed sequences
+"""
+import unittest
+
+import torch
+
+from axolotl.monkeypatch.llama_expand_mask import _expand_mask
+
+
+class TestExpandMask(unittest.TestCase):
+    """
+    Test class for attention mask expansion for packed sequences
+    """
+
+    def test_output(self):
+        mask = torch.tensor([[1, 1, 1, 2], [2, 3, 3, 0]])
+        dtype = torch.float32
+        expected_output = torch.tensor(
+            [
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, -3.4028e38, 0.0000e00],
+                    ]
+                ],
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [-3.4028e38, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [-3.4028e38, 0.0000e00, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, -3.4028e38, -3.4028e38],
+                    ]
+                ],
+            ]
+        )
+        # Check that the output matches the expected output
+        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_packed_dataset.py
+++ b/tests/test_packed_dataset.py
@@ -27,7 +27,7 @@ class TestPacking(unittest.TestCase):
            }
        )

-    def test_resets_attention(self):
+    def test_increments_attention(self):
        prompter = AlpacaPrompter("chat")
        strat = AlpacaPromptTokenizingStrategy(
            prompter,
@@ -55,10 +55,14 @@ class TestPacking(unittest.TestCase):
        # first example doesn't have mask reset
        assert example["input_ids"][0] == self.tokenizer.bos_token_id
        assert example["attention_mask"][0] == 1
+        assert example["position_ids"][0] == 0
+        assert example["position_ids"][1] == 1

        # but subsequent one does
        assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id
-        assert example["attention_mask"][next_bos_index] == 0
+        assert example["attention_mask"][next_bos_index] == 2
+        assert example["position_ids"][next_bos_index] == 0
+        assert example["position_ids"][next_bos_index + 1] == 1


 if __name__ == "__main__":
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -134,9 +134,15 @@ class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
            "output": "Hi! How can I help?",
        }
        example = strat.tokenize_prompt(sample)
-        assert example["input_ids"][0:4] == [1, 835, 2184, 29901]  # "<s>### System:"
-        assert example["input_ids"][5:7] == [1509, 20118]  # "use cot"
-        assert example["input_ids"][9] == 11889  # USER
+        assert example["input_ids"][0:5] == [
+            1,
+            28962,
+            1254,
+            12665,
+            29901,
+        ]  # "<s>SYSTEM:"
+        assert example["input_ids"][5:7] == [671, 20118]  # " use cot"
+        assert example["input_ids"][8] == 11889  # USER


 class Llama2ChatTokenizationTest(unittest.TestCase):
--- a/tests/test_prompters.py
+++ b/tests/test_prompters.py
@@ -70,7 +70,7 @@ class AlpacaPrompterTest(unittest.TestCase):
            )
        )
        assert "use cot" in res
-        assert res.startswith("### System:")
+        assert res.startswith("SYSTEM:")
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "alpacas" in res
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -313,3 +313,27 @@ class ValidationTest(unittest.TestCase):
        )

        validate_config(cfg)
+
+    def test_packing(self):
+        cfg = DictDefault(
+            {
+                "max_packed_sequence_len": 2048,
+            }
+        )
+        with self._caplog.at_level(logging.WARNING):
+            validate_config(cfg)
+            assert any(
+                "max_packed_sequence_len will be deprecated in favor of sample_packing"
+                in record.message
+                for record in self._caplog.records
+            )
+
+        cfg = DictDefault(
+            {
+                "max_packed_sequence_len": 2048,
+                "sample_packing": True,
+            }
+        )
+        regex_exp = r".*set only one of max_packed_sequence_len \(deprecated soon\) or sample_packing.*"
+        with pytest.raises(ValueError, match=regex_exp):
+            validate_config(cfg)