support for datasets with multiple names (#480)
* support for datasets with multiple names
* update docs
This commit is contained in:
@@ -328,6 +328,15 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|||||||
name: enron_emails
|
name: enron_emails
|
||||||
type: completion # format from earlier
|
type: completion # format from earlier
|
||||||
|
|
||||||
|
# huggingface repo with multiple named configurations/subsets
|
||||||
|
datasets:
|
||||||
|
- path: bigcode/commitpackft
|
||||||
|
name:
|
||||||
|
- ruby
|
||||||
|
- python
|
||||||
|
- typescript
|
||||||
|
type: ... # unimplemented custom format
|
||||||
|
|
||||||
# local
|
# local
|
||||||
datasets:
|
datasets:
|
||||||
- path: data.jsonl # or json
|
- path: data.jsonl # or json
|
||||||
|
|||||||
@@ -134,8 +134,17 @@ def load_tokenized_prepared_datasets(
|
|||||||
seed = 42
|
seed = 42
|
||||||
|
|
||||||
datasets = []
|
datasets = []
|
||||||
|
|
||||||
|
def for_d_in_datasets(dataset_configs):
    """Yield dataset configs, expanding entries that list several names.

    An entry whose ``name`` is a non-empty list is replaced by one
    ``DictDefault`` copy per listed name, each copy carrying a single
    ``name`` value. Every other entry (no name, a falsy name, or a
    non-list name) is yielded unchanged.
    """
    for entry in dataset_configs:
        names = entry.name
        # Anything other than a non-empty list is passed through untouched.
        if not (names and isinstance(names, list)):
            yield entry
            continue
        for single_name in names:
            # Copy all keys of the entry and override only "name".
            yield DictDefault({**entry, "name": single_name})
|
||||||
|
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
for d in cfg.datasets:
|
for d in for_d_in_datasets(cfg.datasets):
|
||||||
ds: Union[Dataset, DatasetDict] = None
|
ds: Union[Dataset, DatasetDict] = None
|
||||||
ds_from_hub = False
|
ds_from_hub = False
|
||||||
try:
|
try:
|
||||||
|
|||||||
Reference in New Issue
Block a user