Fix bug when using pretokenized datasets (#652)
* fix pretokenized datasets readme
* check if dataset type is not set to handle pretokenized datasets
This commit is contained in:
@@ -317,7 +317,7 @@ Using file:
|
||||
#### How to use your custom pretokenized dataset
|
||||
|
||||
- Do not pass a `type:`
|
||||
- Dataset must contain `input_ids`, `attention_mask`, `labels` in columns
|
||||
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
|
||||
|
||||
|
||||
### Config
|
||||
|
||||
@@ -293,6 +293,8 @@ def validate_config(cfg):
|
||||
|
||||
if cfg.datasets:
|
||||
for idx, ds_cfg in enumerate(cfg.datasets):
|
||||
if not ds_cfg.type:
|
||||
continue
|
||||
if ds_cfg.type == "sharegpt:chat":
|
||||
LOG.warning(
|
||||
PendingDeprecationWarning(
|
||||
|
||||
Reference in New Issue
Block a user