Fix bug in dataset loading (#284)

* Fix bug in dataset loading

This fixes a bug when loading datasets: `d.data_files` can be a list, which cannot be passed directly to `hf_hub_download` because its `filename` argument takes a single file path.

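* Check the type of `data_files` and load accordingly: a string is downloaded as a single file, a list is downloaded file by file, and any other type raises a `ValueError`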
This commit is contained in:
Ethan Smith
2023-09-27 10:41:31 -07:00
committed by GitHub
parent d1236f2c41
commit 8fe0e633d2

View File

@@ -205,11 +205,26 @@ def load_tokenized_prepared_datasets(
use_auth_token=use_auth_token,
)
else:
fp = hf_hub_download(
repo_id=d.path,
repo_type="dataset",
filename=d.data_files,
)
if isinstance(d.data_files, str):
fp = hf_hub_download(
repo_id=d.path,
repo_type="dataset",
filename=d.data_files,
)
elif isinstance(d.data_files, list):
fp = []
for file in d.data_files:
fp.append(
hf_hub_download(
repo_id=d.path,
repo_type="dataset",
filename=file,
)
)
else:
raise ValueError(
"data_files must be either a string or list of strings"
)
ds = load_dataset(
"json", name=d.name, data_files=fp, streaming=False, split=None
)