diff --git a/docs/faq.qmd b/docs/faq.qmd
index b84aa75bd..59b06becd 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -51,6 +51,10 @@ description: Frequently asked questions
 > pad_token: "..."
 > ```
 
+**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using the `preprocess` CLI**
+
+> A: This is because you may be using the `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true`, respectively. Please use the `axolotl train` CLI directly instead, as these datasets are prepared on demand.
+
 ### Chat templates
 
 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py
index b8258383e..d0c2ad165 100644
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -35,6 +35,13 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
     check_accelerate_default_config()
     check_user_token()
 
+    # These datasets are prepared on demand by `axolotl train`, so fail fast here.
+    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
+        if cfg.get(key):
+            raise ValueError(
+                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
+            )
+
     if not cfg.dataset_prepared_path:
         msg = (
             Fore.RED