diff --git a/src/axolotl/cli/args.py b/src/axolotl/cli/args.py index c3c2db740..2e90883eb 100644 --- a/src/axolotl/cli/args.py +++ b/src/axolotl/cli/args.py @@ -13,6 +13,16 @@ class PreprocessCliArgs: debug_num_examples: int = field(default=1) prompter: Optional[str] = field(default=None) download: Optional[bool] = field(default=True) + iterable: Optional[bool] = field( + default=False, + metadata={ + "help": ( + "[DEPRECATED] No longer supported. For streaming datasets, use " + "'axolotl train' and set 'streaming: true' in your YAML config, or " + "pass --streaming instead in the CLI." + ) + }, + ) @dataclass diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py index 4120062d8..b2f2399a8 100644 --- a/src/axolotl/cli/preprocess.py +++ b/src/axolotl/cli/preprocess.py @@ -35,10 +35,20 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None: check_accelerate_default_config() check_user_token() + if cli_args.iterable: + LOG.error( + "The --iterable CLI argument for 'axolotl preprocess' is no longer " + "supported. For training, set 'streaming: true' in your YAML config or " + "pass '--streaming' in your 'axolotl train' command for on-the-fly " + "preprocessing." + ) + return + for key in ["skip_prepare_dataset", "pretraining_dataset"]: if cfg.get(key): LOG.error( - f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead." + f"You have set `{key}:`. `preprocess` is not needed. Run the 'axolotl " + "train' CLI directly instead." ) return diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index a94270c92..49077df80 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -1441,7 +1441,8 @@ class StreamingValidationMixin: val_set_size = getattr(self, "val_set_size", 0.0) if val_set_size and val_set_size > 0: raise ValueError( - "Validation splits not supported for streaming datasets, skipping" + "Validation splits not supported for streaming datasets, please " + "use test_datasets: ... instead" ) return self