Merge pull request #277 from cg123/dataset-name
Allow non-default dataset configurations
This commit is contained in:
@@ -262,6 +262,12 @@ See sample configs in [configs](configs) folder or [examples](examples) for quic
|
|||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
type: alpaca # format from earlier
|
type: alpaca # format from earlier
|
||||||
|
|
||||||
|
# huggingface repo with specific configuration/subset
|
||||||
|
datasets:
|
||||||
|
- path: EleutherAI/pile
|
||||||
|
name: enron_emails
|
||||||
|
type: completion # format from earlier
|
||||||
|
|
||||||
# local
|
# local
|
||||||
datasets:
|
datasets:
|
||||||
- path: json
|
- path: json
|
||||||
@@ -344,6 +350,7 @@ datasets:
|
|||||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||||
data_files: # path to source data files
|
data_files: # path to source data files
|
||||||
shards: # number of shards to split data into
|
shards: # number of shards to split data into
|
||||||
|
name: # name of dataset configuration to load
|
||||||
|
|
||||||
# axolotl attempts to save the dataset in arrow format after packing the data together so
|
# axolotl attempts to save the dataset in arrow format after packing the data together so
|
||||||
# that subsequent training attempts load faster; the value is a relative path
|
# that subsequent training attempts load faster; the value is a relative path
|
|
||||||
|
|||||||
@@ -94,6 +94,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
try:
|
try:
|
||||||
load_dataset(
|
load_dataset(
|
||||||
d.path,
|
d.path,
|
||||||
|
name=d.name,
|
||||||
streaming=True,
|
streaming=True,
|
||||||
use_auth_token=use_auth_token,
|
use_auth_token=use_auth_token,
|
||||||
)
|
)
|
||||||
@@ -107,6 +108,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
if local_path.is_dir():
|
if local_path.is_dir():
|
||||||
ds = load_dataset(
|
ds = load_dataset(
|
||||||
d.path,
|
d.path,
|
||||||
|
name=d.name,
|
||||||
data_files=d.data_files,
|
data_files=d.data_files,
|
||||||
streaming=False,
|
streaming=False,
|
||||||
split=None,
|
split=None,
|
||||||
@@ -114,6 +116,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
elif local_path.is_file():
|
elif local_path.is_file():
|
||||||
ds = load_dataset(
|
ds = load_dataset(
|
||||||
"json",
|
"json",
|
||||||
|
name=d.name,
|
||||||
data_files=d.path,
|
data_files=d.path,
|
||||||
streaming=False,
|
streaming=False,
|
||||||
split=None,
|
split=None,
|
||||||
@@ -123,26 +126,22 @@ def load_tokenized_prepared_datasets(
|
|||||||
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
||||||
)
|
)
|
||||||
elif ds_from_hub:
|
elif ds_from_hub:
|
||||||
if d.data_files:
|
ds = load_dataset(
|
||||||
ds = load_dataset(
|
d.path,
|
||||||
d.path,
|
name=d.name,
|
||||||
streaming=False,
|
streaming=False,
|
||||||
data_files=d.data_files,
|
data_files=d.data_files,
|
||||||
use_auth_token=use_auth_token,
|
use_auth_token=use_auth_token,
|
||||||
)
|
)
|
||||||
else:
|
|
||||||
ds = load_dataset(
|
|
||||||
d.path,
|
|
||||||
streaming=False,
|
|
||||||
use_auth_token=use_auth_token,
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
fp = hf_hub_download(
|
fp = hf_hub_download(
|
||||||
repo_id=d.path,
|
repo_id=d.path,
|
||||||
repo_type="dataset",
|
repo_type="dataset",
|
||||||
filename=d.data_files,
|
filename=d.data_files,
|
||||||
)
|
)
|
||||||
ds = load_dataset("json", data_files=fp, streaming=False, split=None)
|
ds = load_dataset(
|
||||||
|
"json", name=d.name, data_files=fp, streaming=False, split=None
|
||||||
|
)
|
||||||
if not ds:
|
if not ds:
|
||||||
raise ValueError("unhandled dataset load")
|
raise ValueError("unhandled dataset load")
|
||||||
# support for using a subset of the data
|
# support for using a subset of the data
|
||||||
|
|||||||
Reference in New Issue
Block a user