diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md
index 6bb6a6b8f..6494ec5f0 100644
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -31,10 +31,11 @@ PRs are **greatly welcome**!
 
 Please run below to setup env
 ```bash
-# Install axolotl + dev and test dependencies from lockfile
+# Install axolotl + dev and test dependencies
 export UV_TORCH_BACKEND=cu128  # or cu130
-uv sync --extra flash-attn --extra deepspeed --group dev --group test
+uv venv --no-project --relocatable
 source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 pre-commit install
 
 # test
diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
index 521d26201..8ed89c5db 100644
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -30,14 +30,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -168,14 +160,6 @@ jobs:
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
             platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 1fb6290d9..b5c575d33 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,12 +18,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -180,12 +174,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
diff --git a/VERSION b/VERSION
index cb27aa17b..b08b47558 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-0.16.0.dev0
+0.16.2.dev0
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 2bdb45b5c..3fdd0e7aa 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -24,9 +24,9 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN pip uninstall -y causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="optimizers,ray"; \
     else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,optimizers,ray"; \
     fi && \
     if [ "$AXOLOTL_EXTRAS" != "" ]; then \
         pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base
index 70a62ee3a..08a5ddccd 100644
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -58,19 +58,3 @@ RUN git lfs install --skip-repo && \
     # The base image ships with `pydantic==1.8.2` which is not working
     pip3 install -U --no-cache-dir pydantic==1.10.10 && \
     pip3 cache purge
-
-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="x86_64" ;; \
-        arm64) ARCH_TAG="aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    pip3 install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
diff --git a/docker/Dockerfile-tests b/docker/Dockerfile-tests
index 8d9734359..263abbc84 100644
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -24,9 +24,9 @@ RUN git fetch origin +$GITHUB_REF && \
 
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
+        pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
     fi
 
 # So we can test the Docker image
diff --git a/docker/Dockerfile-uv b/docker/Dockerfile-uv
index df058baa3..582a2a0cf 100644
--- a/docker/Dockerfile-uv
+++ b/docker/Dockerfile-uv
@@ -24,9 +24,9 @@ WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
 RUN uv pip uninstall causal_conv1d
 RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="optimizers,ray"; \
     else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
+        BASE_EXTRAS="deepspeed,optimizers,ray"; \
     fi && \
     if [ "$AXOLOTL_EXTRAS" != "" ]; then \
         uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base
index f16777378..c5a2ceb8c 100644
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -38,20 +38,3 @@ RUN uv pip install packaging setuptools wheel psutil \
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
         MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
     fi
-
-# Map Python version (e.g., 3.12 -> cp312)
-RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
-    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
-    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
-    LINUX_TAG="manylinux_" && \
-    # Map architecture
-    case "$TARGETARCH" in \
-        amd64) ARCH_TAG="2_24_x86_64.manylinux_2_28_x86_64" ;; \
-        arm64) ARCH_TAG="2_34_aarch64" ;; \
-        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
-    esac && \
-    WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-${LINUX_TAG}${ARCH_TAG}.whl" && \
-    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
-    uv pip install --no-cache-dir "${WHL_FILE}" && \
-    rm "${WHL_FILE}"
diff --git a/docs/debugging.qmd b/docs/debugging.qmd
index f3ca6ad9a..6b76bde94 100644
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -77,8 +77,9 @@ Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/us
 
 ```bash
 export UV_TORCH_BACKEND=cu128  # or cu130
-uv sync --extra flash-attn --extra deepspeed --group dev --group test
+uv venv --no-project --relocatable
 source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 ```
 
 #### Remote Hosts
@@ -218,8 +219,9 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
 You will now be in the container.  Next, install Axolotl with dev dependencies:
 
 ```bash
-uv sync --extra flash-attn --extra deepspeed --group dev --group test
+uv venv --no-project --relocatable
 source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
 ```
 
 ### Attach To Container
diff --git a/docs/docker.qmd b/docs/docker.qmd
index 001cf19a7..7ce041821 100644
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -13,10 +13,17 @@ This section describes the different Docker images that are released by AxolotlA
 For Blackwell GPUs, please use the tags with PyTorch 2.9.1 and CUDA 12.8.
 :::
 
-::: {.callout-tip}
-Each image below is available in a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with
-a relocatable venv (`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
-(e.g. `axolotlai/axolotl-base-uv`). Tags follow the same format. We recommend the uv images for new deployments.
+::: {.callout-important}
+### Switch to the `-uv` images
+
+Each image below ships a **uv variant** that uses [uv](https://docs.astral.sh/uv/) with a relocatable venv
+(`/workspace/axolotl-venv`) instead of Miniconda + pip. Append `-uv` to the image name
+(e.g. `axolotlai/axolotl-uv`, `axolotlai/axolotl-base-uv`, `axolotlai/axolotl-cloud-uv`). Tags follow the
+same format as their non-uv counterparts.
+
+**We recommend switching to the `-uv` images early.** In the near future we will publish the uv-based
+build to the non-uv tags as well. The non-uv names will continue to work, but they will start serving
+the uv image.
 :::
 
 ## Base
@@ -85,7 +92,7 @@ Tags examples:
 - `main-py3.12-cu130-2.10.0`
 - `main-latest`
 - `main-20260315-py3.11-cu128-2.9.1`
-- `0.12.0`
+- `0.16.1`
 
 ## Cloud
 
diff --git a/docs/faq.qmd b/docs/faq.qmd
index 92b432f2d..9c1b81c3f 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -57,7 +57,7 @@ description: Frequently asked questions
 
 **Q: vLLM is not working with Axolotl**
 
-> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.
+> A: We currently recommend torch 2.10 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.12-cu128-2.10.0` tag (note: torch 2.10 images are built with Python 3.12).
 
 **Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**
 
diff --git a/docs/installation.qmd b/docs/installation.qmd
index 9d1d0d4a1..f7c780740 100644
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir
 
 - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
 - Python ≥3.11
-- PyTorch ≥2.9.0
+- PyTorch ≥2.9.1
 
 ## Installation {#sec-installation}
 
@@ -36,9 +36,9 @@ source $HOME/.local/bin/env
 Choose your CUDA version (e.g. `cu128`, `cu130`), create a venv, and install:
 ```{.bash}
 export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv --no-project --relocatable
+uv venv
 source .venv/bin/activate
-uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
+uv pip install --no-build-isolation 'axolotl[deepspeed]'
 ```
 
 ### Edge/Development Build {#sec-edge-build}
@@ -49,12 +49,11 @@ For the latest features between releases:
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 export UV_TORCH_BACKEND=cu128  # or cu130
-uv sync --extra flash-attn --extra deepspeed
+uv venv
 source .venv/bin/activate
+uv pip install --no-build-isolation -e '.[deepspeed]'
 ```
 
-`uv sync` creates a `.venv`, installs exact pinned versions from `uv.lock`, and sets up an editable install automatically.
-
 ### Docker {#sec-docker}
 
 ```{.bash}
@@ -132,11 +131,11 @@ source $HOME/.local/bin/env
 
 # Create a fresh venv (recommended for a clean start)
 export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv --no-project --relocatable
+uv venv
 source .venv/bin/activate
 
 # Reinstall axolotl
-uv pip install --no-build-isolation axolotl[flash-attn,deepspeed]
+uv pip install --no-build-isolation 'axolotl[deepspeed]'
 ```
 
 ## Using pip (Alternative) {#sec-pip}
@@ -151,13 +150,13 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p
 
 ```{.bash}
 pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+pip3 install --no-build-isolation 'axolotl[deepspeed]'
 ```
 
 For editable/development installs:
 ```{.bash}
 pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+pip3 install --no-build-isolation -e '.[deepspeed]'
 ```
 
 ## Troubleshooting {#sec-troubleshooting}
diff --git a/examples/LiquidAI/README.md b/examples/LiquidAI/README.md
index 0a08692d7..9ac637e33 100644
--- a/examples/LiquidAI/README.md
+++ b/examples/LiquidAI/README.md
@@ -15,7 +15,7 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
     Here is an example of how to install from pip:
     ```bash
     # Ensure you have a compatible version of Pytorch installed
-    uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    uv pip install --no-build-isolation 'axolotl>=0.16.1'
     ```
 
 2.  Run one of the finetuning examples below.
diff --git a/examples/apertus/README.md b/examples/apertus/README.md
index 1280e430a..9ff2d8992 100644
--- a/examples/apertus/README.md
+++ b/examples/apertus/README.md
@@ -11,11 +11,11 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
     Here is an example of how to install from main for pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 
-uv pip install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'
 
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
diff --git a/examples/arcee/README.md b/examples/arcee/README.md
index deaea676a..5296e022e 100644
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -13,11 +13,11 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
     Here is an example of how to install from main for pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 
-uv pip install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'
 
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb
index c7b2b8e5b..bbc75104c 100644
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -36,12 +36,7 @@
     "id": "msOCO4NRmRLa"
    },
    "outputs": [],
-   "source": [
-    "%%capture\n",
-    "# This step can take ~5-10 minutes to install dependencies\n",
-    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
-   ]
+   "source": "%%capture\n# This step can take ~5-10 minutes to install dependencies\n!pip install --no-build-isolation \"axolotl>=0.16.1\"\n!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@fec1a88\""
   },
   {
    "cell_type": "markdown",
diff --git a/examples/devstral/README.md b/examples/devstral/README.md
index 2be8f6292..9fbe1d4fc 100644
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -15,8 +15,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
     Here is an example of how to install from pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```
 
 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
diff --git a/examples/gemma3n/README.md b/examples/gemma3n/README.md
index 1ecc96cbc..db265d758 100644
--- a/examples/gemma3n/README.md
+++ b/examples/gemma3n/README.md
@@ -9,8 +9,8 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt
     Here is an example of how to install from pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```
 
 2. In addition to Axolotl's requirements, Gemma-3n requires:
diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md
index 0e5eac500..ae71eec1e 100644
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -13,8 +13,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
     Here is an example of how to install from pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```
 
 2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
diff --git a/examples/granite4/README.md b/examples/granite4/README.md
index ceb599c1c..85b6621a0 100644
--- a/examples/granite4/README.md
+++ b/examples/granite4/README.md
@@ -11,11 +11,11 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
     Here is an example of how to install from main for pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.7.1 min)
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 
-uv pip install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'
 
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
diff --git a/examples/hunyuan/README.md b/examples/hunyuan/README.md
index 3071a0a61..d17752cc4 100644
--- a/examples/hunyuan/README.md
+++ b/examples/hunyuan/README.md
@@ -9,11 +9,11 @@ Tencent released a family of opensource models called HunYuan with varying param
     Here is an example of how to install from main for pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 
-uv pip install --no-build-isolation -e '.[flash-attn]'
+uv pip install --no-build-isolation -e '.'
 
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
diff --git a/examples/magistral/README.md b/examples/magistral/README.md
index 172a40b2c..5cce33ea7 100644
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -13,8 +13,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
     Here is an example of how to install from pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
-uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```
 
 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
diff --git a/examples/seed-oss/README.md b/examples/seed-oss/README.md
index 796ef118d..d5d4baa89 100644
--- a/examples/seed-oss/README.md
+++ b/examples/seed-oss/README.md
@@ -11,7 +11,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
     Here is an example of how to install from pip:
     ```bash
     # Ensure you have a compatible version of Pytorch installed
-    uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    uv pip install --no-build-isolation 'axolotl>=0.16.1'
 
     # Install Cut Cross Entropy
     python scripts/cutcrossentropy_install.py | sh
diff --git a/examples/smolvlm2/README.md b/examples/smolvlm2/README.md
index da83e612c..01ee7fa62 100644
--- a/examples/smolvlm2/README.md
+++ b/examples/smolvlm2/README.md
@@ -13,7 +13,7 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
     Here is an example of how to install from pip:
     ```bash
     # Ensure you have a compatible version of Pytorch installed
-    uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    uv pip install --no-build-isolation 'axolotl>=0.16.1'
     ```
 
 2. Install an extra dependency:
diff --git a/examples/voxtral/README.md b/examples/voxtral/README.md
index ed5cc6422..f8e7b51be 100644
--- a/examples/voxtral/README.md
+++ b/examples/voxtral/README.md
@@ -11,8 +11,8 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
     Here is an example of how to install from pip:
 
 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-uv pip install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have PyTorch installed (PyTorch 2.9.1 min)
+uv pip install --no-build-isolation 'axolotl>=0.16.1'
 ```
 
 2. Please install the below.
diff --git a/pyproject.toml b/pyproject.toml
index d028b394d..65e832581 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -12,7 +12,7 @@ requires-python = ">=3.10"
 
 dependencies = [
     # Core ML stack
-    "torch>=2.6.0",
+    "torch>=2.9.1",
     "packaging==26.0",
     "huggingface_hub>=1.1.7",
     "peft>=0.19.1,<0.20.0",
@@ -79,7 +79,7 @@ dependencies = [
     # Platform-specific (Linux only)
     "bitsandbytes==0.49.1 ; sys_platform != 'darwin'",
     "triton>=3.4.0 ; sys_platform != 'darwin'",
-    "xformers>=0.0.23.post1 ; sys_platform != 'darwin'",
+    "xformers>=0.0.33.post2 ; sys_platform != 'darwin' and platform_machine != 'aarch64'",
     "liger-kernel==0.7.0 ; sys_platform != 'darwin'",
     "torchao==0.17.0 ; sys_platform != 'darwin' and platform_machine != 'aarch64'",
 
diff --git a/src/axolotl/integrations/kd/README.md b/src/axolotl/integrations/kd/README.md
index 5e35cf3d7..1f24fc8a6 100644
--- a/src/axolotl/integrations/kd/README.md
+++ b/src/axolotl/integrations/kd/README.md
@@ -11,7 +11,7 @@ kd_ce_alpha: 0.1
 kd_alpha: 0.9
 kd_temperature: 1.0
 
-torch_compile: True  # torch>=2.6.0, recommended to reduce vram
+torch_compile: True  # recommended to reduce VRAM
 
 datasets:
   - path: ...
diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py
index 6ee672c8c..c52ddce1a 100644
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -1016,7 +1016,7 @@ class AxolotlInputConfig(
     torch_compile: Literal["auto"] | bool | None = Field(
         default=None,
         json_schema_extra={
-            "description": "Whether to use torch.compile and which backend to use. setting to `auto` will enable torch compile when torch>=2.6.0"
+            "description": "Whether to use torch.compile and which backend to use."
         },
     )
     torch_compile_backend: str | None = Field(