From afe18ace3590808766ea5f95790b228dc933c50c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 1 Jan 2026 06:52:45 -0500 Subject: [PATCH] deprecate torch 2.7.1 (#3339) --- .github/workflows/base.yml | 49 ++++---------- .github/workflows/main.yml | 64 +++++-------------- .github/workflows/multi-gpu-e2e.yml | 9 +-- .github/workflows/nightlies.yml | 20 +++--- .github/workflows/tests-nightly.yml | 16 ++--- .github/workflows/tests.yml | 24 ++----- README.md | 2 +- docs/docker.qmd | 16 ++--- docs/installation.qmd | 4 +- .../cli/cloud/baseten/template/train_sft.py | 3 +- src/axolotl/cli/cloud/modal_.py | 2 +- 11 files changed, 65 insertions(+), 144 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index eddce1438..ea721bff4 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -25,27 +25,6 @@ jobs: fail-fast: false matrix: include: - - cuda: "126" - cuda_version: 12.6.3 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.7.0 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-base" - - cuda: "126" - cuda_version: 12.6.3 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.7.1 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-base" - - cuda: "128" - cuda_version: 12.8.1 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.7.1 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-base" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" @@ -53,6 +32,13 @@ jobs: pytorch: 2.8.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.9.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" @@ -121,20 +107,6 @@ jobs: fail-fast: false matrix: include: - - cuda: "126" - cuda_version: 12.6.3 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.7.1 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-uv-base" - - cuda: "128" - cuda_version: 12.8.1 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.7.1 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-uv-base" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" @@ -149,6 +121,13 @@ jobs: pytorch: 2.9.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.9.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" - cuda: "130" cuda_version: 13.0.0 cudnn_version: "" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index f34a0cf2f..052f9aa72 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,21 +15,6 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.0 - axolotl_extras: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: vllm - - cuda: 128 - cuda_version: 12.8.1 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" @@ -46,6 +31,11 @@ jobs: python_version: "3.11" pytorch: 2.9.1 axolotl_extras: + - cuda: 130 + cuda_version: 13.0.0 + python_version: "3.11" + pytorch: 2.9.1 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -92,27 +82,6 @@ jobs: strategy: matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.0 - axolotl_extras: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: - is_latest: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: vllm - - cuda: 128 - cuda_version: 12.8.1 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" @@ -129,6 +98,11 @@ jobs: python_version: "3.11" pytorch: 2.9.1 axolotl_extras: + - cuda: 130 + cuda_version: 13.0.0 + python_version: "3.11" + pytorch: 2.9.1 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -170,24 +144,18 @@ jobs: strategy: matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: - is_latest: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: vllm - is_latest: true - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: is_latest: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.9.1 + axolotl_extras: + is_latest: runs-on: axolotl-gpu-runner steps: - name: Checkout diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml index 13162f8b1..1dd019dc7 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -29,13 +29,6 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: vllm - num_gpus: 2 - nightly_build: "true" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" @@ -46,7 +39,7 @@ jobs: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.9.0 + pytorch: 2.9.1 axolotl_extras: fbgemm-gpu num_gpus: 2 nightly_build: "true" diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml index a24946ae9..d2c587cc7 100644 --- a/.github/workflows/nightlies.yml +++ b/.github/workflows/nightlies.yml @@ -12,16 +12,16 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.9.1 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -64,16 +64,16 @@ jobs: strategy: matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.9.1 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index 53139fac1..67b68a7e6 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -26,7 +26,7 @@ jobs: max-parallel: 2 matrix: python_version: ["3.11"] - pytorch_version: ["2.7.1", "2.8.0"] + pytorch_version: ["2.8.0", "2.9.0", "2.9.1"] timeout-minutes: 20 steps: @@ -99,17 +99,17 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 + - cuda: 128 + cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.1 + pytorch: 2.8.0 num_gpus: 1 axolotl_extras: nightly_build: "true" - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.8.0 + pytorch: 2.9.1 num_gpus: 1 axolotl_extras: nightly_build: "true" @@ -148,10 +148,10 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 + - cuda: 128 + cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.1 + pytorch: 2.9.1 num_gpus: 2 axolotl_extras: nightly_build: "true" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9cf231575..ae5ba1740 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -55,7 +55,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.7.1", "2.8.0", "2.9.0"] + pytorch_version: ["2.8.0", "2.9.0", "2.9.1"] timeout-minutes: 20 steps: @@ -145,7 +145,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.7.1", "2.8.0", "2.9.0"] + pytorch_version: ["2.8.0", "2.9.0", "2.9.1"] timeout-minutes: 20 steps: @@ -303,18 +303,6 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 - num_gpus: 1 - axolotl_extras: -# - cuda: 128 -# cuda_version: 12.8.1 -# python_version: "3.11" -# pytorch: 2.7.1 -# num_gpus: 1 -# axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" @@ -325,7 +313,7 @@ jobs: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.9.0 + pytorch: 2.9.1 num_gpus: 1 axolotl_extras: steps: @@ -365,10 +353,10 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 + - cuda: 128 + cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.1 + pytorch: 2.9.1 num_gpus: 1 axolotl_extras: steps: diff --git a/README.md b/README.md index 01e0c44d9..0521f7bed 100644 --- a/README.md +++ b/README.md @@ -77,7 +77,7 @@ Features: - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU - Python 3.11 -- PyTorch ≥2.7.1 +- PyTorch ≥2.8.0 ### Google Colab diff --git a/docs/docker.qmd b/docs/docker.qmd index da6184394..5d146eac2 100644 --- a/docs/docker.qmd +++ b/docs/docker.qmd @@ -32,11 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version} Tags examples: -- `main-base-py3.11-cu128-2.7.1` -- `main-base-py3.11-cu126-2.7.1` -- `main-base-py3.11-cu126-2.7.0` -- `main-base-py3.11-cu126-2.6.0` -- `main-base-py3.11-cu124-2.6.0` +- `main-base-py3.11-cu128-2.8.0` +- `main-base-py3.11-cu128-2.9.1` ## Main @@ -74,15 +71,12 @@ There may be some extra tags appended to the image, like `-vllm` which installs Tags examples: -- `main-py3.11-cu128-2.7.1` -- `main-py3.11-cu126-2.7.1` -- `main-py3.11-cu126-2.7.0` -- `main-py3.11-cu126-2.6.0` -- `main-py3.11-cu124-2.6.0` +- `main-py3.11-cu128-2.8.0` +- `main-py3.11-cu128-2.9.1` - `main-latest` - `main-20250303-py3.11-cu124-2.6.0` - `main-20250303-py3.11-cu126-2.6.0` -- `0.10.1` +- `0.12.0` ## Cloud diff --git a/docs/installation.qmd b/docs/installation.qmd index 265ff238c..b8d427eb0 100644 --- a/docs/installation.qmd +++ b/docs/installation.qmd @@ -26,7 +26,7 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p ::: ::: {.callout-important} -For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8. +For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8. ::: ### PyPI Installation (Recommended) {#sec-pypi} @@ -111,7 +111,7 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \ ::: ::: {.callout-important} -For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.7.0` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.7.0`. +For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`. ::: Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available. diff --git a/src/axolotl/cli/cloud/baseten/template/train_sft.py b/src/axolotl/cli/cloud/baseten/template/train_sft.py index 137fb9171..6dcf477c7 100644 --- a/src/axolotl/cli/cloud/baseten/template/train_sft.py +++ b/src/axolotl/cli/cloud/baseten/template/train_sft.py @@ -24,8 +24,7 @@ if launcher_args: launcher_args_str = "-- " + " ".join(launcher_args) # 1. Define a base image for your training job -# must use torch 2.7.0 for vllm -BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu126-2.7.1" +BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu128-2.9.1" # 2. Define the Runtime Environment for the Training Job # This includes start commands and environment variables.a diff --git a/src/axolotl/cli/cloud/modal_.py b/src/axolotl/cli/cloud/modal_.py index 7f953372d..3e703a494 100644 --- a/src/axolotl/cli/cloud/modal_.py +++ b/src/axolotl/cli/cloud/modal_.py @@ -82,7 +82,7 @@ class ModalCloud(Cloud): return res def get_image(self): - docker_tag = "main-py3.11-cu126-2.7.1" + docker_tag = "main-py3.11-cu128-2.9.1" if self.config.docker_tag: docker_tag = self.config.docker_tag docker_image = f"axolotlai/axolotl:{docker_tag}"