diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
index 966bd2f5b..1b9862d85 100644
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -5,11 +5,13 @@ on:
     branches:
       - "main"
     paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
       - '.github/workflows/base.yml'
   pull_request:
     paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
       - '.github/workflows/base.yml'
   workflow_dispatch:
 
diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
index 7cf264b03..f64b9c072 100644
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -39,3 +39,7 @@ RUN git lfs install --skip-repo && \
     pip3 install awscli && \
     # The base image ships with `pydantic==1.8.2` which is not working
     pip3 install -U --no-cache-dir pydantic==1.10.10
+
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+    FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+    fi
diff --git a/docs/faq.qmd b/docs/faq.qmd
index b84aa75bd..59b06becd 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -51,6 +51,10 @@ description: Frequently asked questions
 > pad_token: "..."
 > ```
 
+**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**
+
+> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand.
+
 ### Chat templates
 
 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py
index b8258383e..d0c2ad165 100644
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
     check_accelerate_default_config()
     check_user_token()
 
+    # These dataset modes are prepared on demand at train time; running the
+    # `preprocess` CLI against them fails, so reject them up front.
+    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
+        if cfg.get(key):
+            raise ValueError(
+                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
+            )
+
     if not cfg.dataset_prepared_path:
         msg = (
             Fore.RED
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index a537c5b65..c3c70545c 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -526,8 +526,9 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
     if len(datasets) == 1:
         ds = datasets[0]
 
-        # Do not shuffle if curriculum sampling is enabled
-        if cfg.curriculum_sampling:
+        # Do not shuffle if curriculum sampling is enabled or
+        # shuffle_merged_datasets is disabled
+        if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
             return ds
 
         return ds.shuffle(seed=cfg.seed)
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 278fbed5b..06853451c 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -609,6 +609,9 @@ def prepare_opinionated_env(cfg):
     if cfg.qlora_sharded_model_loading:
         # model loading is forked after the tokenizer
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    if cfg.sample_packing:
+        # multipack parallel packing sampler defaults to using fork
+        os.environ["TOKENIZERS_PARALLELISM"] = "false"
 
 
 def setup_trainer(