Support loading data files from a local directory

Ref: https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path
This commit is contained in:
Utensil
2023-06-21 08:00:58 +00:00
parent 9492d4ebb7
commit 9bdd30cdfd

View File

@@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets(
pass
# prefer local dataset, even if hub exists
if Path(d.path).exists():
ds = load_dataset(
"json",
data_files=d.path,
streaming=False,
split=None,
)
local_path = Path(d.path)
if local_path.exists():
if local_path.is_dir():
ds = load_dataset(
d.path,
data_files=d.data_files,
streaming=False,
split=None,
)
elif local_path.is_file():
ds = load_dataset(
"json",
data_files=d.path,
streaming=False,
split=None,
)
else:
raise ValueError(
"unhandled dataset load: local path exists, but is neither a directory or a file"
)
elif ds_from_hub:
if d.data_files:
ds = load_dataset(