remove skipped test (#2002)

* remove skipped test * use mean_resizing_embeddings with qlora and added tokens * use </s> as pad_token to prevent resize of embeddings * make sure local hub test saves to a tmp dir * use Path so concatenation works * make sure to use tmp_ds_path for data files
2024-10-30 12:27:04 -04:00
parent 8c3a727f9d
commit 32c60765ef
2 changed files with 37 additions and 36 deletions
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -273,7 +273,6 @@ class TestMultiGPULlama(unittest.TestCase):
            ]
        )
    @pytest.mark.skip("disabled due to upstream issue")
    @with_temp_dir
    def test_fsdp_qlora_prequant_packed(self, temp_dir):
        # pylint: disable=duplicate-code
@@ -282,6 +281,7 @@ class TestMultiGPULlama(unittest.TestCase):
                "base_model": "axolotl-ai-co/TinyLlama_v1.1-bnb-nf4-bf16",
                "tokenizer_type": "AutoTokenizer",
                "adapter": "qlora",
                "mean_resizing_embeddings": True,
                "load_in_4bit": True,
                "lora_r": 8,
                "lora_alpha": 16,
@@ -297,7 +297,7 @@ class TestMultiGPULlama(unittest.TestCase):
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {
-                    "pad_token": "<|end_of_text|>",
+                    "pad_token": "</s>",
                },
                "datasets": [
                    {
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -367,43 +367,44 @@ class TestDatasetPreparation(unittest.TestCase):
    def test_load_local_hub_with_revision(self):
        """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
+            with tempfile.TemporaryDirectory() as tmp_dir2:
-            tmp_ds_path.mkdir(parents=True, exist_ok=True)
+                tmp_ds_path = Path(tmp_dir2) / "mhenrichsen/alpaca_2k_test"
-            snapshot_download(
+                tmp_ds_path.mkdir(parents=True, exist_ok=True)
-                repo_id="mhenrichsen/alpaca_2k_test",
+                snapshot_download(
-                repo_type="dataset",
+                    repo_id="mhenrichsen/alpaca_2k_test",
-                local_dir=tmp_ds_path,
+                    repo_type="dataset",
-                revision="d05c1cb",
+                    local_dir=tmp_ds_path,
-            )
+                    revision="d05c1cb",
                )
-            prepared_path = Path(tmp_dir) / "prepared"
+                prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
+                cfg = DictDefault(
-                {
+                    {
-                    "tokenizer_config": "huggyllama/llama-7b",
+                        "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
+                        "sequence_len": 1024,
-                    "datasets": [
+                        "datasets": [
-                        {
+                            {
-                            "path": "mhenrichsen/alpaca_2k_test",
+                                "path": "mhenrichsen/alpaca_2k_test",
-                            "ds_type": "parquet",
+                                "ds_type": "parquet",
-                            "type": "alpaca",
+                                "type": "alpaca",
-                            "data_files": [
+                                "data_files": [
-                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
+                                    f"{tmp_ds_path}/alpaca_2000.parquet",
-                            ],
+                                ],
-                            "revision": "d05c1cb",
+                                "revision": "d05c1cb",
-                        },
+                            },
-                    ],
+                        ],
-                }
+                    }
-            )
+                )
-            dataset, _ = load_tokenized_prepared_datasets(
+                dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
+                    self.tokenizer, cfg, prepared_path
-            )
+                )
-            assert len(dataset) == 2000
+                assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
+                assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
+                assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
+                assert "labels" in dataset.features
-            shutil.rmtree(tmp_ds_path)
+                shutil.rmtree(tmp_ds_path)
 if __name__ == "__main__":