allow remote data paths (#1278)
* allow remote data paths * add docs about public url * only allow https * better docs * better docs
This commit is contained in:
@@ -468,6 +468,14 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|||||||
dataset:
|
dataset:
|
||||||
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
|
- path: s3://path_to_ds # Accepts folder with arrow/parquet or file path like above. Supports s3, gcs.
|
||||||
...
|
...
|
||||||
|
|
||||||
|
# Loading Data From a Public URL
|
||||||
|
# - URLs must use the HTTPS protocol (plain HTTP is not accepted) for security reasons.
|
||||||
|
# - The URL should be a direct link to the file you wish to load.
|
||||||
|
# - The file format is `json` (which includes `jsonl`) by default. For different formats, adjust the `ds_type` option accordingly.
|
||||||
|
dataset:
|
||||||
|
- path: https://some.url.com/yourdata.jsonl # Must be a direct HTTPS link to the data file, not a folder or an s3/gcs path.
|
||||||
|
ds_type: json # this is the default, see other options below.
|
||||||
```
|
```
|
||||||
|
|
||||||
- loading
|
- loading
|
||||||
|
|||||||
@@ -336,6 +336,16 @@ def load_tokenized_prepared_datasets(
|
|||||||
split=None,
|
split=None,
|
||||||
storage_options=storage_options,
|
storage_options=storage_options,
|
||||||
)
|
)
|
||||||
|
elif config_dataset.path.startswith("https://"):
|
||||||
|
ds_type = get_ds_type(config_dataset)
|
||||||
|
ds = load_dataset(
|
||||||
|
ds_type,
|
||||||
|
name=config_dataset.name,
|
||||||
|
data_files=config_dataset.path,
|
||||||
|
streaming=False,
|
||||||
|
split=None,
|
||||||
|
storage_options=storage_options,
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
if isinstance(config_dataset.data_files, str):
|
if isinstance(config_dataset.data_files, str):
|
||||||
fp = hf_hub_download(
|
fp = hf_hub_download(
|
||||||
|
|||||||
Reference in New Issue
Block a user