Fix bug in dataset loading (#284)
* Fix bug in dataset loading

  This fixes a bug when loading datasets: `d.data_files` can be a list, in which case it cannot be passed directly to `hf_hub_download`.

* Check the type of `data_files` and load accordingly
This commit is contained in:
@@ -205,11 +205,26 @@ def load_tokenized_prepared_datasets(
|
|||||||
use_auth_token=use_auth_token,
|
use_auth_token=use_auth_token,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
fp = hf_hub_download(
|
if isinstance(d.data_files, str):
|
||||||
repo_id=d.path,
|
fp = hf_hub_download(
|
||||||
repo_type="dataset",
|
repo_id=d.path,
|
||||||
filename=d.data_files,
|
repo_type="dataset",
|
||||||
)
|
filename=d.data_files,
|
||||||
|
)
|
||||||
|
elif isinstance(d.data_files, list):
|
||||||
|
fp = []
|
||||||
|
for file in d.data_files:
|
||||||
|
fp.append(
|
||||||
|
hf_hub_download(
|
||||||
|
repo_id=d.path,
|
||||||
|
repo_type="dataset",
|
||||||
|
filename=file,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"data_files must be either a string or list of strings"
|
||||||
|
)
|
||||||
ds = load_dataset(
|
ds = load_dataset(
|
||||||
"json", name=d.name, data_files=fp, streaming=False, split=None
|
"json", name=d.name, data_files=fp, streaming=False, split=None
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user