Fix: do not call preprocess in multimodal or pretraining case (#2861)
* fix: let users know to not call preprocess for vision mode * fix: improve ux for pretraining dataset and skip prepare ds * feat: add info to doc * Update src/axolotl/cli/preprocess.py following comment Co-authored-by: salman <salman.mohammadi@outlook.com> --------- Co-authored-by: salman <salman.mohammadi@outlook.com>
This commit is contained in:
@@ -51,6 +51,10 @@ description: Frequently asked questions
|
|||||||
> pad_token: "..."
|
> pad_token: "..."
|
||||||
> ```
|
> ```
|
||||||
|
|
||||||
|
**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
|
||||||
|
|
||||||
|
> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand.
|
||||||
|
|
||||||
### Chat templates
|
### Chat templates
|
||||||
|
|
||||||
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
|
**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
|
||||||
|
|||||||
@@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
|
|||||||
check_accelerate_default_config()
|
check_accelerate_default_config()
|
||||||
check_user_token()
|
check_user_token()
|
||||||
|
|
||||||
|
for key in ["skip_prepare_dataset", "pretraining_dataset"]:
|
||||||
|
if cfg.get("key"):
|
||||||
|
raise ValueError(
|
||||||
|
f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
|
||||||
|
)
|
||||||
|
|
||||||
if not cfg.dataset_prepared_path:
|
if not cfg.dataset_prepared_path:
|
||||||
msg = (
|
msg = (
|
||||||
Fore.RED
|
Fore.RED
|
||||||
|
|||||||
Reference in New Issue
Block a user