From 5ac3392075bd5e858002db9c1c1a3968495033ea Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 29 Aug 2023 06:18:17 -0700
Subject: [PATCH] support for datasets with multiple names (#480)

* support for datasets with multiple names

* update docs
---
 README.md                 |  9 +++++++++
 src/axolotl/utils/data.py | 11 ++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 0da9dc506..204e2141a 100644
--- a/README.md
+++ b/README.md
@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
       name: enron_emails
       type: completion # format from earlier
 
+  # huggingface repo with multiple named configurations/subsets
+  datasets:
+    - path: bigcode/commitpackft
+      name:
+        - ruby
+        - python
+        - typescript
+      type: ... # unimplemented custom format
+
   # local
   datasets:
     - path: data.jsonl # or json
diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index b801e6a57..20d0fcfb8 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
             seed = 42
 
         datasets = []
+
+        def for_d_in_datasets(dataset_configs):
+            for dataset in dataset_configs:
+                if dataset.name and isinstance(dataset.name, list):
+                    for name in dataset.name:
+                        yield DictDefault({**dataset, "name": name})
+                else:
+                    yield dataset
+
         # pylint: disable=invalid-name
-        for d in cfg.datasets:
+        for d in for_d_in_datasets(cfg.datasets):
             ds: Union[Dataset, DatasetDict] = None
             ds_from_hub = False
             try: