From 590d6032fdcbe915a1f7c7d8456c28db9f2d3861 Mon Sep 17 00:00:00 2001 From: ich <25612731+ein-ich@users.noreply.github.com> Date: Fri, 29 Sep 2023 04:54:10 +0200 Subject: [PATCH] Fix bug when using pretokenized datasets (#652) * fix pretokenized datasets readme * check if dataset type is not set to handle pretokenized datasets --- README.md | 2 +- src/axolotl/utils/config.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 3f1767ea4..593eff6c3 100644 --- a/README.md +++ b/README.md @@ -317,7 +317,7 @@ Using file: #### How to use your custom pretokenized dataset - Do not pass a `type:` -- Dataset must contain `input_ids`, `attention_mask`, `labels` in columns +- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels` ### Config diff --git a/src/axolotl/utils/config.py b/src/axolotl/utils/config.py index 67f9490c4..5a034ea0f 100644 --- a/src/axolotl/utils/config.py +++ b/src/axolotl/utils/config.py @@ -293,6 +293,8 @@ def validate_config(cfg): if cfg.datasets: for idx, ds_cfg in enumerate(cfg.datasets): + if not ds_cfg.type: + continue if ds_cfg.type == "sharegpt:chat": LOG.warning( PendingDeprecationWarning(