Feat(data): Allow loading local csv and text (#594)

* Feat(data): Allow loading local csv and text

* chore: update readme for loading data
This commit is contained in:
NanoCode012
2023-09-18 00:32:27 +09:00
committed by GitHub
parent b15b19eb8d
commit 00dce35fb2
2 changed files with 8 additions and 4 deletions

View File

@@ -434,10 +434,10 @@ datasets:
- path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-ds_type: # Optional[str] (json|arrow|parquet) defines the datatype when path is a file
-data_files: # path to source data files
-shards: # number of shards to split data into
-name: # name of dataset configuration to load
+ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+data_files: # Optional[str] path to source data files
+shards: # Optional[int] number of shards to split data into
+name: # Optional[str] name of dataset configuration to load
# custom user prompt
- path: repo

View File

@@ -183,6 +183,10 @@ def load_tokenized_prepared_datasets(
ds_type = "parquet"
elif ".arrow" in d.path:
ds_type = "arrow"
+elif ".csv" in d.path:
+ds_type = "csv"
+elif ".txt" in d.path:
+ds_type = "text"
ds = load_dataset(
ds_type,
name=d.name,