From 5ac3392075bd5e858002db9c1c1a3968495033ea Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Tue, 29 Aug 2023 06:18:17 -0700
Subject: [PATCH] support for datasets with multiple names (#480)

* support for datasets with multiple names

* update docs
---
 README.md                 |  9 +++++++++
 src/axolotl/utils/data.py | 11 ++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0da9dc506..204e2141a 100644
--- a/README.md
+++ b/README.md
@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
       name: enron_emails
       type: completion # format from earlier
 
+  # huggingface repo with multiple named configurations/subsets
+  datasets:
+    - path: bigcode/commitpackft
+      name:
+        - ruby
+        - python
+        - typescript
+      type: ... # unimplemented custom format
+
   # local
   datasets:
     - path: data.jsonl # or json
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index b801e6a57..20d0fcfb8 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
             seed = 42
 
         datasets = []
+
+        def for_d_in_datasets(dataset_configs):  # generator: one config per name
+            for dataset in dataset_configs:
+                if dataset.name and isinstance(dataset.name, list):  # non-empty list
+                    for name in dataset.name:  # fan out: copy config for each name
+                        yield DictDefault({**dataset, "name": name})
+                else:  # name is not a non-empty list: yield config unchanged
+                    yield dataset
+
         # pylint: disable=invalid-name
-        for d in cfg.datasets:
+        for d in for_d_in_datasets(cfg.datasets):
             ds: Union[Dataset, DatasetDict] = None
             ds_from_hub = False
             try: