Fix bug when using pretokenized datasets (#652)
* fix pretokenized datasets readme
* check if dataset type is not set to handle pretokenized datasets
This commit is contained in:
@@ -317,7 +317,7 @@ Using file:
|
|||||||
#### How to use your custom pretokenized dataset
|
#### How to use your custom pretokenized dataset
|
||||||
|
|
||||||
- Do not pass a `type:`
|
- Do not pass a `type:`
|
||||||
- Dataset must contain `input_ids`, `attention_mask`, `labels` in columns
|
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
|
||||||
|
|
||||||
|
|
||||||
### Config
|
### Config
|
||||||
|
|||||||
@@ -293,6 +293,8 @@ def validate_config(cfg):
|
|||||||
|
|
||||||
if cfg.datasets:
|
if cfg.datasets:
|
||||||
for idx, ds_cfg in enumerate(cfg.datasets):
|
for idx, ds_cfg in enumerate(cfg.datasets):
|
||||||
|
if not ds_cfg.type:
|
||||||
|
continue
|
||||||
if ds_cfg.type == "sharegpt:chat":
|
if ds_cfg.type == "sharegpt:chat":
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
PendingDeprecationWarning(
|
PendingDeprecationWarning(
|
||||||
|
|||||||
Reference in New Issue
Block a user