first pass at pytorch 2.5.0 support (#1982)

* first pass at pytorch 2.5.0 support * attempt to install causal_conv1d with mamba * gracefully handle missing xformers * fix import * fix incorrect version, add 2.5.0 * increase tests timeout
2024-10-21 11:00:45 -04:00
parent 67f744dc8c
commit e12a2130e9
12 changed files with 120 additions and 55 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -29,6 +29,11 @@ jobs:
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -86,6 +91,11 @@ jobs:
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -21,10 +21,17 @@ jobs:
            pytorch: 2.3.1
            axolotl_extras:
            num_gpus: 2
-          - cuda: 121
+          - cuda: 124
-            cuda_version: 12.1.1
+            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.3.1
+            pytorch: 2.4.1
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -28,6 +28,11 @@ jobs:
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -85,6 +90,11 @@ jobs:
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -25,7 +25,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
    timeout-minutes: 20
    steps:
@@ -95,6 +95,13 @@ jobs:
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -36,7 +36,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.1"]
+        pytorch_version: ["2.3.1", "2.4.1", "2.5.0"]
    timeout-minutes: 20
    steps:
@@ -72,7 +72,7 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 60
+    timeout-minutes: 90
    needs: [pre-commit, pytest]
    strategy:
@@ -97,6 +97,12 @@ jobs:
            pytorch: 2.4.1
            num_gpus: 1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.0
            num_gpus: 1
            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -23,7 +23,6 @@ RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN pip install causal_conv1d
 RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -64,7 +64,7 @@ def run_cmd(cmd: str, run_folder: str):
@stub.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=45 * 60,
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072 * N_GPUS,
 )
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -65,7 +65,7 @@ def run_cmd(cmd: str, run_folder: str):
@stub.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
-    timeout=45 * 60,
+    timeout=60 * 60,
    cpu=8.0,
    memory=131072,
 )
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -20,7 +20,6 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
 WORKDIR /workspace/axolotl
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN pip install causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,optimizers,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/setup.py
+++ b/setup.py
@@ -50,7 +50,9 @@ def parse_requirements():
            else:
                raise ValueError("Invalid version format")
-            if (major, minor) >= (2, 4):
+            if (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
            elif (major, minor) >= (2, 4):
                if patch == 0:
                    _install_requires.pop(_install_requires.index(xformers_version))
                    _install_requires.append("xformers>=0.0.27")
@@ -102,6 +104,7 @@ setup(
        ],
        "mamba-ssm": [
            "mamba-ssm==1.2.0.post1",
            "causal_conv1d",
        ],
        "auto-gptq": [
            "auto-gptq==0.5.1",
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -22,7 +22,6 @@ from transformers.models.llama.modeling_llama import (
    apply_rotary_pos_emb,
    repeat_kv,
 )
 from xformers.ops import SwiGLU
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids, set_module_name
@@ -44,7 +43,19 @@ except ImportError:
 LOG = logging.getLogger("axolotl")
 def is_xformers_available() -> bool:
    try:
        import xformers  # pylint: disable=unused-import  # noqa: F401
        return True
    except ImportError:
        return False
 def is_xformers_swiglu_available() -> bool:
    if not is_xformers_available():
        return False
    from xformers.ops.common import get_xformers_operator
    try:
@@ -57,6 +68,11 @@ def is_xformers_swiglu_available() -> bool:
 def replace_llama_mlp_with_swiglu(model):
    if is_xformers_swiglu_available():
        from axolotl.monkeypatch.xformers_ import FusedMLP
    else:
        raise RuntimeError("xformers SwiGLU not available for this environment")
    for name, module in model.named_modules():
        if isinstance(module, LlamaMLP):
            mlp = FusedMLP(
@@ -181,49 +197,6 @@ class FusedAttention(LlamaAttention):
        set_module_name(model, name, new_attn)
 class FusedMLP(torch.nn.Module):
    """
    Fused MLP layer for incrementally improved training efficiency
    """
    def __init__(
        self,
        config,
        gate_proj: torch.nn.Linear,
        up_proj: torch.nn.Linear,
        down_proj: torch.nn.Linear,
    ):
        super().__init__()
        self.config = config
        self.swiglu = SwiGLU(
            in_features=config.hidden_size,
            hidden_features=config.intermediate_size,
            bias=False,
            _pack_weights=True,
        )
        # overwrite initialized weights with pretrained weights
        self.swiglu.w12.weight.data = torch.cat(
            (gate_proj.weight.data, up_proj.weight.data), dim=0
        )
        self.swiglu.w3.weight.data = down_proj.weight.data
    def _post_training(self, model, name):
        w1, w2 = torch.split(  # pylint: disable=invalid-name
            self.swiglu.w12.weight.data, self.config.intermediate_size, dim=0
        )
        # Assign the split weights back to the original layers
        new_mlp = LlamaMLP(self.config)
        new_mlp.gate_proj.weight.data = w1
        new_mlp.up_proj.weight.data = w2
        new_mlp.down_proj.weight.data = self.swiglu.w3.weight.data
        set_module_name(model, name, new_mlp)
    def forward(self, x: torch.Tensor) -> torch.Tensor:  # pylint: disable=invalid-name
        return self.swiglu(x)
 # Disable the transformation of the attention mask in LlamaModel as the flash attention
 # requires the attention mask to be the same as the key_padding_mask
 def _prepare_decoder_attention_mask(
--- a/src/axolotl/monkeypatch/xformers_/init.py
+++ b/src/axolotl/monkeypatch/xformers_/init.py
@@ -0,0 +1,51 @@
 """
 Fused MLP layer for incrementally improved training efficiency
 """
 import torch
 from transformers.models.llama.modeling_llama import LlamaMLP
 from xformers.ops import SwiGLU
 from axolotl.monkeypatch.utils import set_module_name
 class FusedMLP(torch.nn.Module):
    """
    Fused MLP layer for incrementally improved training efficiency
    """
    def __init__(
        self,
        config,
        gate_proj: torch.nn.Linear,
        up_proj: torch.nn.Linear,
        down_proj: torch.nn.Linear,
    ):
        super().__init__()
        self.config = config
        self.swiglu = SwiGLU(
            in_features=config.hidden_size,
            hidden_features=config.intermediate_size,
            bias=False,
            _pack_weights=True,
        )
        # overwrite initialized weights with pretrained weights
        self.swiglu.w12.weight.data = torch.cat(
            (gate_proj.weight.data, up_proj.weight.data), dim=0
        )
        self.swiglu.w3.weight.data = down_proj.weight.data
    def _post_training(self, model, name):
        w1, w2 = torch.split(  # pylint: disable=invalid-name
            self.swiglu.w12.weight.data, self.config.intermediate_size, dim=0
        )
        # Assign the split weights back to the original layers
        new_mlp = LlamaMLP(self.config)
        new_mlp.gate_proj.weight.data = w1
        new_mlp.up_proj.weight.data = w2
        new_mlp.down_proj.weight.data = self.swiglu.w3.weight.data
        set_module_name(model, name, new_mlp)
    def forward(self, x: torch.Tensor) -> torch.Tensor:  # pylint: disable=invalid-name
        return self.swiglu(x)