Feat(data): Allow loading local csv and text (#594)
* Feat(data): Allow loading local csv and text
* chore: update readme for loading data
This commit is contained in:
@@ -434,10 +434,10 @@ datasets:
|
|||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
|
||||||
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
|
||||||
ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
|
ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
|
||||||
data_files: # path to source data files
|
data_files: # Optional[str] path to source data files
|
||||||
shards: # number of shards to split data into
|
shards: # Optional[int] number of shards to split data into
|
||||||
name: # name of dataset configuration to load
|
name: # Optional[str] name of dataset configuration to load
|
||||||
|
|
||||||
# custom user prompt
|
# custom user prompt
|
||||||
- path: repo
|
- path: repo
|
||||||
|
|||||||
@@ -183,6 +183,10 @@ def load_tokenized_prepared_datasets(
|
|||||||
ds_type = "parquet"
|
ds_type = "parquet"
|
||||||
elif ".arrow" in d.path:
|
elif ".arrow" in d.path:
|
||||||
ds_type = "arrow"
|
ds_type = "arrow"
|
||||||
|
elif ".csv" in d.path:
|
||||||
|
ds_type = "csv"
|
||||||
|
elif ".txt" in d.path:
|
||||||
|
ds_type = "text"
|
||||||
ds = load_dataset(
|
ds = load_dataset(
|
||||||
ds_type,
|
ds_type,
|
||||||
name=d.name,
|
name=d.name,
|
||||||
|
|||||||
Reference in New Issue
Block a user