Add ability to pass the 'name' (dataset configuration name) argument to load_dataset

This commit is contained in:
Charles Goddard
2023-07-14 16:46:39 -07:00
parent 168a7a09cc
commit 88089e8b32

View File

@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
try: try:
load_dataset( load_dataset(
d.path, d.path,
name=d.name,
streaming=True, streaming=True,
use_auth_token=use_auth_token, use_auth_token=use_auth_token,
) )
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
if local_path.is_dir(): if local_path.is_dir():
ds = load_dataset( ds = load_dataset(
d.path, d.path,
name=d.name,
data_files=d.data_files, data_files=d.data_files,
streaming=False, streaming=False,
split=None, split=None,
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
elif local_path.is_file(): elif local_path.is_file():
ds = load_dataset( ds = load_dataset(
"json", "json",
name=d.name,
data_files=d.path, data_files=d.path,
streaming=False, streaming=False,
split=None, split=None,
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
"unhandled dataset load: local path exists, but is neither a directory or a file" "unhandled dataset load: local path exists, but is neither a directory or a file"
) )
elif ds_from_hub: elif ds_from_hub:
if d.data_files: ds = load_dataset(
ds = load_dataset( d.path,
d.path, name=d.name,
streaming=False, streaming=False,
data_files=d.data_files, data_files=d.data_files,
use_auth_token=use_auth_token, use_auth_token=use_auth_token,
) )
else:
ds = load_dataset(
d.path,
streaming=False,
use_auth_token=use_auth_token,
)
else: else:
fp = hf_hub_download( fp = hf_hub_download(
repo_id=d.path, repo_id=d.path,
repo_type="dataset", repo_type="dataset",
filename=d.data_files, filename=d.data_files,
) )
ds = load_dataset("json", data_files=fp, streaming=False, split=None) ds = load_dataset(
"json", name=d.name, data_files=fp, streaming=False, split=None
)
if not ds: if not ds:
raise ValueError("unhandled dataset load") raise ValueError("unhandled dataset load")
# support for using a subset of the data # support for using a subset of the data