From 32c60765ef6649716e1906c350e396b50890b847 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 30 Oct 2024 12:27:04 -0400
Subject: [PATCH] remove skipped test (#2002)

* remove skipped test

* use mean_resizing_embeddings with qlora and added tokens

* use </s> as pad_token to prevent resize of embeddings

* make sure local hub test saves to a tmp dir

* use Path so concatenation works

* make sure to use tmp_ds_path for data files
---
 tests/e2e/multigpu/test_llama.py |  4 +-
 tests/test_datasets.py           | 69 ++++++++++++++++----------------
 2 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 957a6a9e3..14e3f733e 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -273,7 +273,6 @@ class TestMultiGPULlama(unittest.TestCase):
             ]
         )
 
-    @pytest.mark.skip("disabled due to upstream issue")
     @with_temp_dir
     def test_fsdp_qlora_prequant_packed(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -282,6 +281,7 @@ class TestMultiGPULlama(unittest.TestCase):
                 "base_model": "axolotl-ai-co/TinyLlama_v1.1-bnb-nf4-bf16",
                 "tokenizer_type": "AutoTokenizer",
                 "adapter": "qlora",
+                "mean_resizing_embeddings": True,
                 "load_in_4bit": True,
                 "lora_r": 8,
                 "lora_alpha": 16,
@@ -297,7 +297,7 @@ class TestMultiGPULlama(unittest.TestCase):
                 "sequence_len": 2048,
                 "val_set_size": 0.05,
                 "special_tokens": {
-                    "pad_token": "<|end_of_text|>",
+                    "pad_token": "</s>",
                 },
                 "datasets": [
                     {
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index f8b463a03..8e2955414 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -367,43 +367,44 @@ class TestDatasetPreparation(unittest.TestCase):
     def test_load_local_hub_with_revision(self):
         """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
         with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
-            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
-                repo_id="mhenrichsen/alpaca_2k_test",
-                repo_type="dataset",
-                local_dir=tmp_ds_path,
-                revision="d05c1cb",
-            )
+            with tempfile.TemporaryDirectory() as tmp_dir2:
+                tmp_ds_path = Path(tmp_dir2) / "mhenrichsen/alpaca_2k_test"
+                tmp_ds_path.mkdir(parents=True, exist_ok=True)
+                snapshot_download(
+                    repo_id="mhenrichsen/alpaca_2k_test",
+                    repo_type="dataset",
+                    local_dir=tmp_ds_path,
+                    revision="d05c1cb",
+                )
 
-            prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
-                    "datasets": [
-                        {
-                            "path": "mhenrichsen/alpaca_2k_test",
-                            "ds_type": "parquet",
-                            "type": "alpaca",
-                            "data_files": [
-                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
-                            ],
-                            "revision": "d05c1cb",
-                        },
-                    ],
-                }
-            )
+                prepared_path = Path(tmp_dir) / "prepared"
+                cfg = DictDefault(
+                    {
+                        "tokenizer_config": "huggyllama/llama-7b",
+                        "sequence_len": 1024,
+                        "datasets": [
+                            {
+                                "path": "mhenrichsen/alpaca_2k_test",
+                                "ds_type": "parquet",
+                                "type": "alpaca",
+                                "data_files": [
+                                    f"{tmp_ds_path}/alpaca_2000.parquet",
+                                ],
+                                "revision": "d05c1cb",
+                            },
+                        ],
+                    }
+                )
 
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
+                dataset, _ = load_tokenized_prepared_datasets(
+                    self.tokenizer, cfg, prepared_path
+                )
 
-            assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-            shutil.rmtree(tmp_ds_path)
+                assert len(dataset) == 2000
+                assert "input_ids" in dataset.features
+                assert "attention_mask" in dataset.features
+                assert "labels" in dataset.features
+                shutil.rmtree(tmp_ds_path)
 
 
 if __name__ == "__main__":