From a5946ff1f07ab57405c0ec050ec2a16ea2c1b6c5 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sat, 5 Jul 2025 09:21:18 -0400 Subject: [PATCH 1/4] build fa2 from source for base image with torch2.6 and cu124 (#2867) --- .github/workflows/base.yml | 6 ++++-- docker/Dockerfile-base | 4 ++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 966bd2f5b..1b9862d85 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -5,11 +5,13 @@ on: branches: - "main" paths: - - 'Dockerfile-base' + - 'docker/Dockerfile-base' + - 'docker/Dockerfile-uv-base' - '.github/workflows/base.yml' pull_request: paths: - - 'Dockerfile-base' + - 'docker/Dockerfile-base' + - 'docker/Dockerfile-uv-base' - '.github/workflows/base.yml' workflow_dispatch: diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 52201f276..df4240325 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working pip3 install -U --no-cache-dir pydantic==1.10.10 + +RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \ + FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \ + fi From bf38e507fb124c8081bff71f70b42de474aa50ff Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 6 Jul 2025 21:20:41 -0400 Subject: [PATCH 2/4] respect shuffle_merged_datasets for single dataset too (#2866) [skip ci] * respect shuffle_merged_datasets for single dataset too * update inline comment for behavior Co-authored-by: NanoCode012 --------- Co-authored-by: NanoCode012 --- src/axolotl/utils/data/shared.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py index a537c5b65..c3c70545c 100644 --- a/src/axolotl/utils/data/shared.py +++ 
b/src/axolotl/utils/data/shared.py @@ -526,8 +526,9 @@ def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset: if len(datasets) == 1: ds = datasets[0] - # Do not shuffle if curriculum sampling is enabled - if cfg.curriculum_sampling: + # Do not shuffle if curriculum sampling is enabled or + # shuffle_merged_datasets is disabled + if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets: return ds return ds.shuffle(seed=cfg.seed) From b37ddf97783da826454aec2662329ef0ea38dc4c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 6 Jul 2025 21:55:09 -0400 Subject: [PATCH 3/4] don't use tokenizer parallelism when using packing (#2862) [skip ci] --- src/axolotl/utils/trainer.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 278fbed5b..06853451c 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -609,6 +609,9 @@ def prepare_opinionated_env(cfg): if cfg.qlora_sharded_model_loading: # model loading is forked after the tokenizer os.environ["TOKENIZERS_PARALLELISM"] = "false" + if cfg.sample_packing: + # multipack parallel packing sampler defaults to using fork + os.environ["TOKENIZERS_PARALLELISM"] = "false" def setup_trainer( From 5a961ecadf617ad2af2543892c1a6548ee74d8fa Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 7 Jul 2025 09:55:33 +0800 Subject: [PATCH 4/4] Fix: do not call preprocess in multimodal or pretraining case (#2861) * fix: let users know to not call preprocess for vision mode * fix: improve ux for pretraining dataset and skip prepare ds * feat: add info to doc * Update src/axolotl/cli/preprocess.py following comment Co-authored-by: salman --------- Co-authored-by: salman --- docs/faq.qmd | 4 ++++ src/axolotl/cli/preprocess.py | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/docs/faq.qmd b/docs/faq.qmd index b84aa75bd..59b06becd 100644 --- a/docs/faq.qmd +++ b/docs/faq.qmd @@ -51,6 +51,10 @@ description: Frequently asked 
questions > pad_token: "..." > ``` +**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI** + +> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand. + ### Chat templates **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`** diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py index b8258383e..d0c2ad165 100644 --- a/src/axolotl/cli/preprocess.py +++ b/src/axolotl/cli/preprocess.py @@ -35,6 +35,12 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None: check_accelerate_default_config() check_user_token() + for key in ["skip_prepare_dataset", "pretraining_dataset"]: + if cfg.get(key): + raise ValueError( + f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead." + ) + if not cfg.dataset_prepared_path: msg = ( Fore.RED