Merge pull request #277 from cg123/dataset-name

Allow non-default dataset configurations
This commit is contained in:
Wing Lian
2023-07-16 16:08:15 -04:00
committed by GitHub
2 changed files with 20 additions and 14 deletions

View File

@@ -262,6 +262,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
- path: vicgalle/alpaca-gpt4
type: alpaca # format from earlier
# huggingface repo with specific configuration/subset
datasets:
- path: EleutherAI/pile
name: enron_emails
type: completion # format from earlier
# local
datasets:
- path: json
@@ -344,6 +350,7 @@ datasets:
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
data_files: # path to source data files
shards: # number of shards to split data into
name: # name of dataset configuration to load
# axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path

View File

@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
try:
load_dataset(
d.path,
name=d.name,
streaming=True,
use_auth_token=use_auth_token,
)
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
if local_path.is_dir():
ds = load_dataset(
d.path,
name=d.name,
data_files=d.data_files,
streaming=False,
split=None,
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
elif local_path.is_file():
ds = load_dataset(
"json",
name=d.name,
data_files=d.path,
streaming=False,
split=None,
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
"unhandled dataset load: local path exists, but is neither a directory nor a file"
)
elif ds_from_hub:
if d.data_files:
ds = load_dataset(
d.path,
streaming=False,
data_files=d.data_files,
use_auth_token=use_auth_token,
)
else:
ds = load_dataset(
d.path,
streaming=False,
use_auth_token=use_auth_token,
)
ds = load_dataset(
d.path,
name=d.name,
streaming=False,
data_files=d.data_files,
use_auth_token=use_auth_token,
)
else:
fp = hf_hub_download(
repo_id=d.path,
repo_type="dataset",
filename=d.data_files,
)
ds = load_dataset("json", data_files=fp, streaming=False, split=None)
ds = load_dataset(
"json", name=d.name, data_files=fp, streaming=False, split=None
)
if not ds:
raise ValueError("unhandled dataset load")
# support for using a subset of the data