From a4d5112ae1b2a89c778ee1df1ee638812b885946 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 24 Apr 2025 00:39:31 -0400 Subject: [PATCH] builds for torch 2.7.0 (#2552) * builds for torch==2.7.0 * use xformers==0.0.29.post3 * no vllm support with torch 2.7 * update default, fix conditional * no xformers for 270 * no vllm on 2.7.0 for multigpu test too * remove deprecated verbose arg from scheduler * 2.7.0 tests on cpu --- .github/workflows/main.yml | 12 +++++++++++- .github/workflows/multi-gpu-e2e.yml | 7 +++++++ .github/workflows/tests.yml | 8 +++++++- setup.py | 12 +++++++++--- src/axolotl/monkeypatch/relora.py | 2 +- 5 files changed, 35 insertions(+), 6 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index df12b3c89..14e30f20f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -31,6 +31,11 @@ jobs: pytorch: 2.6.0 axolotl_extras: vllm is_latest: true + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.0 + axolotl_extras: vllm runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -93,6 +98,11 @@ jobs: pytorch: 2.6.0 axolotl_extras: is_latest: true + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.0 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -138,7 +148,7 @@ jobs: - cuda: 124 cuda_version: 12.4.1 python_version: "3.11" - pytorch: 2.4.1 + pytorch: 2.6.0 axolotl_extras: runs-on: axolotl-gpu-runner steps: diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml index fcc6ee021..aee4ddba6 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -45,6 +45,13 @@ jobs: axolotl_extras: vllm num_gpus: 2 nightly_build: "true" + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.0 + axolotl_extras: + num_gpus: 2 + nightly_build: "true" runs-on: [self-hosted, modal] timeout-minutes: 120 steps: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ba1c837cb..825277ce0 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -49,7 +49,7 @@ jobs: max-parallel: 2 matrix: python_version: ["3.11"] - pytorch_version: ["2.4.1", "2.5.1", "2.6.0"] + pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"] timeout-minutes: 20 steps: @@ -270,6 +270,12 @@ jobs: pytorch: 2.5.1 num_gpus: 1 axolotl_extras: vllm + - cuda: 126 + cuda_version: 12.6.3 + python_version: "3.11" + pytorch: 2.7.0 + num_gpus: 1 + axolotl_extras: steps: - name: Checkout uses: actions/checkout@v4 diff --git a/setup.py b/setup.py index 6c911d8f7..0f3892c3b 100644 --- a/setup.py +++ b/setup.py @@ -51,7 +51,7 @@ def parse_requirements(extras_require_map): try: torch_version = version("torch") except PackageNotFoundError: - torch_version = "2.5.1" + torch_version = "2.6.0" # default to torch 2.6 _install_requires.append(f"torch=={torch_version}") version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version) @@ -64,9 +64,15 @@ def parse_requirements(extras_require_map): else: raise ValueError("Invalid version format") - if (major, minor) >= (2, 6): + if (major, minor) >= (2, 7): _install_requires.pop(_install_requires.index(xformers_version)) - _install_requires.append("xformers==0.0.29.post2") + # _install_requires.append("xformers==0.0.29.post3") # xformers seems to be hard pinned to 2.6.0 + extras_require_map["vllm"] = ["vllm==0.8.3"] + elif (major, minor) >= (2, 6): + _install_requires.pop(_install_requires.index(xformers_version)) + _install_requires.append( + "xformers==0.0.29.post2" + ) # vllm needs post2 w torch 2.6 extras_require_map["vllm"] = ["vllm==0.8.3"] elif (major, minor) >= (2, 5): _install_requires.pop(_install_requires.index(xformers_version)) diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py index 822fd4465..4a27dde81 100644 --- a/src/axolotl/monkeypatch/relora.py +++ b/src/axolotl/monkeypatch/relora.py @@ -272,7 +272,7 @@ class ReLoRAScheduler(LRScheduler): self.warmup_steps = warmup_steps self.anneal_steps = anneal_steps self.min_lr_scale = min_lr_scale - super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose) + super().__init__(optimizer, inner_schedule.last_epoch) def get_lr(self) -> float: self.inner_schedule.last_epoch = self.last_epoch