support for datasets with multiple names (#480)

* support for datasets with multiple names

* update docs
This commit is contained in:
Wing Lian
2023-08-29 06:18:17 -07:00
committed by GitHub
parent e356b297cb
commit 5ac3392075
2 changed files with 19 additions and 1 deletions

View File

@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
name: enron_emails
type: completion # format from earlier
# Hugging Face repo with multiple named configurations/subsets (each listed name is loaded as a separate dataset)
datasets:
- path: bigcode/commitpackft
name:
- ruby
- python
- typescript
type: ... # unimplemented custom format
# local
datasets:
- path: data.jsonl # or json

View File

@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
seed = 42
datasets = []
def for_d_in_datasets(dataset_configs):
    """Yield dataset configs, fanning out entries that list several names.

    An entry whose ``name`` is a non-empty list is replaced by one
    ``DictDefault`` copy per list element; each copy carries the same keys
    as the entry with ``name`` set to that single element. Every other
    entry (scalar, missing, or empty ``name``) is yielded unchanged.
    """
    for entry in dataset_configs:
        subset_names = entry.name
        # Guard clause: only a non-empty list triggers the fan-out.
        if not (isinstance(subset_names, list) and subset_names):
            yield entry
            continue
        for subset in subset_names:
            yield DictDefault({**entry, "name": subset})
# pylint: disable=invalid-name
for d in cfg.datasets:
for d in for_d_in_datasets(cfg.datasets):
ds: Union[Dataset, DatasetDict] = None
ds_from_hub = False
try: