allow remote data paths (#1278)

* allow remote data paths

* add docs about public url

* only allow https

* better docs

* better docs
This commit is contained in:
Hamel Husain
2024-02-08 15:02:35 -08:00
committed by GitHub
parent 1daecd161e
commit 91cf4ee72c
2 changed files with 18 additions and 0 deletions

View File

@@ -336,6 +336,16 @@ def load_tokenized_prepared_datasets(
split=None,
storage_options=storage_options,
)
elif config_dataset.path.startswith("https://"):
ds_type = get_ds_type(config_dataset)
ds = load_dataset(
ds_type,
name=config_dataset.name,
data_files=config_dataset.path,
streaming=False,
split=None,
storage_options=storage_options,
)
else:
if isinstance(config_dataset.data_files, str):
fp = hf_hub_download(