diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml
index 0fe0d2b25..521d26201 100644
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -30,14 +30,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.8.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
@@ -160,14 +152,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.8.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64"
           - cuda: "128"
             cuda_version: 12.8.1
             cudnn_version: ""
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index a3a24537c..1fb6290d9 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,12 +18,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
-            platforms: "linux/amd64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -186,12 +180,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
-            platforms: "linux/amd64"
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index 2bb499ded..2c5d76e4c 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -33,12 +33,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras: fbgemm-gpu
-            num_gpus: 2
 #          - cuda: 129
 #            cuda_version: 12.9.1
 #            python_version: "3.12"
diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml
index 0372f5c7a..19643bea5 100644
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -15,11 +15,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -67,11 +62,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml
index 663b0476e..235aebcfa 100644
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -44,7 +44,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0"]
     timeout-minutes: 20
 
     steps:
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 5099e447c..d753afe01 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -69,10 +69,8 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0"]
         exclude:
-          - python_version: "3.14"
-            pytorch_version: "2.8.0"
           - python_version: "3.14"
             pytorch_version: "2.9.1"
     timeout-minutes: 20
@@ -165,10 +163,8 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
+        pytorch_version: ["2.9.1", "2.10.0"]
         exclude:
-          - python_version: "3.14"
-            pytorch_version: "2.8.0"
           - python_version: "3.14"
             pytorch_version: "2.9.1"
     timeout-minutes: 30
@@ -329,13 +325,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.8.0
-            num_gpus: 1
-            gpu_type: "B200"
-            axolotl_extras: fbgemm-gpu
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
diff --git a/README.md b/README.md
index a425e45b8..e353d20ad 100644
--- a/README.md
+++ b/README.md
@@ -87,7 +87,7 @@ Features:
 
 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
-- PyTorch ≥2.8.0
+- PyTorch ≥2.9.1
 
 ### Google Colab
 
diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base
index 0e7acbe29..f16777378 100644
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -36,22 +36,22 @@ RUN uv pip install packaging setuptools wheel psutil \
     && uv pip install awscli pydantic
 
 RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \
-        uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
+        MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
     fi
 
 # Map Python version (e.g., 3.12 -> cp312)
 RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
     # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
     TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
+    # Wheel filenames end in a full manylinux platform tag, which differs per architecture
     # Map architecture
     case "$TARGETARCH" in \
-        amd64) ARCH_TAG="x86_64" ;; \
-        arm64) ARCH_TAG="aarch64" ;; \
+        amd64) PLATFORM_TAG="manylinux_2_24_x86_64.manylinux_2_28_x86_64" ;; \
+        arm64) PLATFORM_TAG="manylinux_2_34_aarch64" ;; \
         *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
     esac && \
     WHL_VERSION="v0.7.16" && \
-    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
+    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-${PLATFORM_TAG}.whl" && \
     wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
     uv pip install --no-cache-dir "${WHL_FILE}" && \
     rm "${WHL_FILE}"