From 32c60765ef6649716e1906c350e396b50890b847 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 30 Oct 2024 12:27:04 -0400
Subject: [PATCH] remove skipped test (#2002)

* remove skipped test

* use mean_resizing_embeddings with qlora and added tokens

* use </s> as pad_token to prevent resize of embeddings

* make sure local hub test saves to a tmp dir

* use Path so concatenation works

* make sure to use tmp_ds_path for data files
---
 tests/e2e/multigpu/test_llama.py |  4 +-
 tests/test_datasets.py           | 69 ++++++++++++++++----------------
 2 files changed, 37 insertions(+), 36 deletions(-)

diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 957a6a9e3..14e3f733e 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -273,7 +273,6 @@ class TestMultiGPULlama(unittest.TestCase):
             ]
         )
 
-    @pytest.mark.skip("disabled due to upstream issue")
     @with_temp_dir
     def test_fsdp_qlora_prequant_packed(self, temp_dir):
         # pylint: disable=duplicate-code
@@ -282,6 +281,7 @@ class TestMultiGPULlama(unittest.TestCase):
                 "base_model": "axolotl-ai-co/TinyLlama_v1.1-bnb-nf4-bf16",
                 "tokenizer_type": "AutoTokenizer",
                 "adapter": "qlora",
+                "mean_resizing_embeddings": True,
                 "load_in_4bit": True,
                 "lora_r": 8,
                 "lora_alpha": 16,
@@ -297,7 +297,7 @@ class TestMultiGPULlama(unittest.TestCase):
                 "sequence_len": 2048,
                 "val_set_size": 0.05,
                 "special_tokens": {
-                    "pad_token": "<|end_of_text|>",
+                    "pad_token": "</s>",
                 },
                 "datasets": [
                     {
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index f8b463a03..8e2955414 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -367,43 +367,44 @@ class TestDatasetPreparation(unittest.TestCase):
     def test_load_local_hub_with_revision(self):
         """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
         with tempfile.TemporaryDirectory() as tmp_dir:
-            tmp_ds_path = Path("mhenrichsen/alpaca_2k_test")
-            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_download(
-                repo_id="mhenrichsen/alpaca_2k_test",
-                repo_type="dataset",
-                local_dir=tmp_ds_path,
-                revision="d05c1cb",
-            )
+            with tempfile.TemporaryDirectory() as tmp_dir2:
+                tmp_ds_path = Path(tmp_dir2) / "mhenrichsen/alpaca_2k_test"
+                tmp_ds_path.mkdir(parents=True, exist_ok=True)
+                snapshot_download(
+                    repo_id="mhenrichsen/alpaca_2k_test",
+                    repo_type="dataset",
+                    local_dir=tmp_ds_path,
+                    revision="d05c1cb",
+                )
 
-            prepared_path = Path(tmp_dir) / "prepared"
-            cfg = DictDefault(
-                {
-                    "tokenizer_config": "huggyllama/llama-7b",
-                    "sequence_len": 1024,
-                    "datasets": [
-                        {
-                            "path": "mhenrichsen/alpaca_2k_test",
-                            "ds_type": "parquet",
-                            "type": "alpaca",
-                            "data_files": [
-                                "mhenrichsen/alpaca_2k_test/alpaca_2000.parquet",
-                            ],
-                            "revision": "d05c1cb",
-                        },
-                    ],
-                }
-            )
+                prepared_path = Path(tmp_dir) / "prepared"
+                cfg = DictDefault(
+                    {
+                        "tokenizer_config": "huggyllama/llama-7b",
+                        "sequence_len": 1024,
+                        "datasets": [
+                            {
+                                "path": "mhenrichsen/alpaca_2k_test",
+                                "ds_type": "parquet",
+                                "type": "alpaca",
+                                "data_files": [
+                                    f"{tmp_ds_path}/alpaca_2000.parquet",
+                                ],
+                                "revision": "d05c1cb",
+                            },
+                        ],
+                    }
+                )
 
-            dataset, _ = load_tokenized_prepared_datasets(
-                self.tokenizer, cfg, prepared_path
-            )
+                dataset, _ = load_tokenized_prepared_datasets(
+                    self.tokenizer, cfg, prepared_path
+                )
 
-            assert len(dataset) == 2000
-            assert "input_ids" in dataset.features
-            assert "attention_mask" in dataset.features
-            assert "labels" in dataset.features
-            shutil.rmtree(tmp_ds_path)
+                assert len(dataset) == 2000
+                assert "input_ids" in dataset.features
+                assert "attention_mask" in dataset.features
+                assert "labels" in dataset.features
+                shutil.rmtree(tmp_ds_path)
 
 
 if __name__ == "__main__":