fix: loading locally downloaded dataset (#2056) [skip ci]

2024-11-16 08:35:26 +07:00
parent d42f202046
commit fd70eec577
2 changed files with 79 additions and 36 deletions
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -350,7 +350,15 @@ def load_tokenized_prepared_datasets(
                            split=None,
                        )
                    else:
-                        ds = load_from_disk(config_dataset.path)
+                        try:
                            ds = load_from_disk(config_dataset.path)
                        except FileNotFoundError:
                            ds = load_dataset(
                                config_dataset.path,
                                name=config_dataset.name,
                                streaming=False,
                                split=None,
                            )
                elif local_path.is_file():
                    ds_type = get_ds_type(config_dataset)
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -371,44 +371,79 @@ class TestDatasetPreparation(unittest.TestCase):
    def test_load_local_hub_with_revision(self):
        """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
-            with tempfile.TemporaryDirectory() as tmp_dir2:
+            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
-                tmp_ds_path = Path(tmp_dir2) / "mhenrichsen/alpaca_2k_test"
+            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-                tmp_ds_path.mkdir(parents=True, exist_ok=True)
+            snapshot_download(
-                snapshot_download(
+                repo_id="mhenrichsen/alpaca_2k_test",
-                    repo_id="mhenrichsen/alpaca_2k_test",
+                repo_type="dataset",
-                    repo_type="dataset",
+                local_dir=tmp_ds_path,
-                    local_dir=tmp_ds_path,
+                revision="d05c1cb",
-                    revision="d05c1cb",
+            )
                )
-                prepared_path = Path(tmp_dir) / "prepared"
+            prepared_path = Path(tmp_dir) / "prepared"
-                cfg = DictDefault(
+            cfg = DictDefault(
-                    {
+                {
-                        "tokenizer_config": "huggyllama/llama-7b",
+                    "tokenizer_config": "huggyllama/llama-7b",
-                        "sequence_len": 1024,
+                    "sequence_len": 1024,
-                        "datasets": [
+                    "datasets": [
-                            {
+                        {
-                                "path": "mhenrichsen/alpaca_2k_test",
+                            "path": "mhenrichsen/alpaca_2k_test",
-                                "ds_type": "parquet",
+                            "ds_type": "parquet",
-                                "type": "alpaca",
+                            "type": "alpaca",
-                                "data_files": [
+                            "data_files": [
-                                    f"{tmp_ds_path}/alpaca_2000.parquet",
+                                f"{tmp_ds_path}/alpaca_2000.parquet",
-                                ],
+                            ],
-                                "revision": "d05c1cb",
+                            "revision": "d05c1cb",
-                            },
+                        },
-                        ],
+                    ],
-                    }
+                }
-                )
+            )
-                dataset, _ = load_tokenized_prepared_datasets(
+            dataset, _ = load_tokenized_prepared_datasets(
-                    self.tokenizer, cfg, prepared_path
+                self.tokenizer, cfg, prepared_path
-                )
+            )
-                assert len(dataset) == 2000
+            assert len(dataset) == 2000
-                assert "input_ids" in dataset.features
+            assert "input_ids" in dataset.features
-                assert "attention_mask" in dataset.features
+            assert "attention_mask" in dataset.features
-                assert "labels" in dataset.features
+            assert "labels" in dataset.features
-                shutil.rmtree(tmp_ds_path)
+            shutil.rmtree(tmp_ds_path)
    def test_loading_local_dataset_folder(self):
        """Load a hub dataset snapshot from a plain local folder.

        Downloads ``mhenrichsen/alpaca_2k_test`` into a temporary directory,
        points the dataset config's ``path`` at that folder, and checks that
        the prepared dataset has 2000 rows and the tokenized columns.
        """
        repo_id = "mhenrichsen/alpaca_2k_test"
        with tempfile.TemporaryDirectory() as work_dir:
            work_root = Path(work_dir)

            # Mirror the hub repo layout (<owner>/<name>) under the temp root.
            local_copy = work_root / repo_id
            local_copy.mkdir(parents=True, exist_ok=True)
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=local_copy,
            )

            # Only the folder path is given: no ds_type or data_files hints.
            dataset_entry = {
                "path": str(local_copy),
                "type": "alpaca",
            }
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 1024,
                    "datasets": [dataset_entry],
                }
            )

            dataset, _ = load_tokenized_prepared_datasets(
                self.tokenizer, cfg, work_root / "prepared"
            )

            assert len(dataset) == 2000
            for column in ("input_ids", "attention_mask", "labels"):
                assert column in dataset.features
            shutil.rmtree(local_copy)
 if __name__ == "__main__":