From 8da163312494c2aa72610d1fef7e35f1c62f16f5 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing.lian@gmail.com>
Date: Fri, 26 Jan 2024 16:50:44 -0500
Subject: [PATCH] Revert "run PR e2e docker CI tests in Modal" (#1220) [skip
 ci]

---
 .github/workflows/tests.yml                   | 51 ++++++++------
 cicd/Dockerfile.jinja                         | 38 ----------
 cicd/tests.py                                 | 69 -------------------
 docker/{Dockerfile-modal => Dockerfile-tests} | 17 +++--
 requirements.txt                              |  1 -
 src/axolotl/utils/models.py                   |  7 +-
 6 files changed, 42 insertions(+), 141 deletions(-)
 delete mode 100644 cicd/Dockerfile.jinja
 delete mode 100644 cicd/tests.py
 rename docker/{Dockerfile-modal => Dockerfile-tests} (77%)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index ae285d8b3..2d9969524 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -58,15 +58,10 @@ jobs:
   docker-e2e-tests:
     if: github.repository_owner == 'OpenAccess-AI-Collective'
     # this job needs to be run on self-hosted GPU runners...
-    runs-on: ubuntu-latest
+    runs-on: [self-hosted, gpu, docker]
     timeout-minutes: 30
     needs: [pre-commit, pytest]
 
-    env:
-      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
-      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
-      MODAL_ENVIRONMENT: axolotl-ci-cd
-
     strategy:
       fail-fast: false
       matrix:
@@ -75,29 +70,43 @@ jobs:
             cuda_version: 11.8.0
             python_version: "3.10"
             pytorch: 2.0.1
-            num_gpus: 1
           - cuda: 121
             cuda_version: 12.1.0
             python_version: "3.10"
             pytorch: 2.1.2
-            num_gpus: 1
     steps:
       - name: Checkout
         uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
+      - name: Docker metadata
+        id: metadata
+        uses: docker/metadata-action@v5
         with:
-          python-version: "3.10"
-      - name: Install Modal
+          images: winglian/axolotl-tests
+      - name: Build Docker image
         run: |
-          python -m pip install --upgrade pip
-          pip install modal jinja2
-      - name: Update env vars
+          # Set up build arguments
+          BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
+          CUDA="${{ matrix.cuda }}"
+          PYTORCH_VERSION="${{ matrix.pytorch }}"
+          # Build the Docker image
+          docker build . \
+            --file ./docker/Dockerfile-tests \
+            --build-arg BASE_TAG=$BASE_TAG \
+            --build-arg CUDA=$CUDA \
+            --build-arg GITHUB_REF=$GITHUB_REF \
+            --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
+            --tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
+            --no-cache
+      - name: Unit Tests w docker image
         run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run training job on Modal
+          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+      - name: GPU Unit Tests w docker image
         run: |
-          modal run cicd.tests
+          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
+      - name: GPU Unit Tests monkeypatched w docker image
+        run: |
+          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
+      - name: Prune image from docker
+        if: github.ref != 'refs/heads/main'
+        run: |
+          docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja
deleted file mode 100644
index e8fb5d440..000000000
--- a/cicd/Dockerfile.jinja
+++ /dev/null
@@ -1,38 +0,0 @@
-FROM winglian/axolotl-base:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV CUDA="{{ CUDA }}"
-ENV BNB_CUDA_VERSION="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
-    else \
-        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
-    fi
-
-# So we can test the Docker image
-RUN pip install pytest
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
diff --git a/cicd/tests.py b/cicd/tests.py
deleted file mode 100644
index 2ba0f1a56..000000000
--- a/cicd/tests.py
+++ /dev/null
@@ -1,69 +0,0 @@
-"""
-modal application to run axolotl gpu tests in Modal
-"""
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import Image, Stub
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
-    "CUDA": os.environ.get("CUDA", "118"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-stub = Stub("Axolotl CI/CD", secrets=[])
-
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
-
-
-@stub.function(
-    image=cicd_image,
-    gpu=GPU_CONFIG,
-    timeout=60 * 30,
-)
-def cicd_pytest():
-    cmd = "pytest /workspace/axolotl/tests/e2e/patched/"
-    run_cmd(cmd, "/workspace/axolotl")
-
-
-@stub.local_entrypoint()
-def main():
-    cicd_pytest.remote()
diff --git a/docker/Dockerfile-modal b/docker/Dockerfile-tests
similarity index 77%
rename from docker/Dockerfile-modal
rename to docker/Dockerfile-tests
index 8b794b49e..2ec94f868 100644
--- a/docker/Dockerfile-modal
+++ b/docker/Dockerfile-tests
@@ -1,11 +1,14 @@
-FROM winglian/axolotl-base:main-base
+ARG BASE_TAG=main-base
+FROM winglian/axolotl-base:$BASE_TAG
 
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ENV AXOLOTL_EXTRAS=""
-ENV CUDA="118"
-ENV BNB_CUDA_VERSION="118"
-ENV PYTORCH_VERSION="2.0.1"
-ENV GITHUB_REF="main"
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ARG AXOLOTL_EXTRAS=""
+ARG CUDA="118"
+ENV BNB_CUDA_VERSION=$CUDA
+ARG PYTORCH_VERSION="2.0.1"
+ARG GITHUB_REF="main"
+
+ENV PYTORCH_VERSION=$PYTORCH_VERSION
 
 RUN apt-get update && \
     apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
diff --git a/requirements.txt b/requirements.txt
index c522fdd58..b23c2509b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,4 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-jinja2
 packaging==23.2
 peft==0.7.1
 transformers==4.37.0
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index ff18e3c6f..72427f645 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -645,10 +645,7 @@ def load_model(
     if not cfg.fsdp:
         # FSDP doesn't like mixed Float and BFloat16
         for name, module in model.named_modules():
-            if (
-                any(m in name for m in ["norm", "gate"])
-                or "LayerNorm" in module.__class__.__name__
-            ):
+            if any(m in name for m in ["norm", "gate"]):
                 module.to(torch.float32)
             if model_config.model_type == "btlm":
                 # don't upcast lm_head for btlm
@@ -687,7 +684,7 @@ def load_model(
     if needs_fa2_dtype or cfg.flash_attention:
         LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
         for name, module in model.named_modules():
-            if "norm" in name or "LayerNorm" in module.__class__.__name__:
+            if "norm" in name:
                 module.to(cfg.torch_dtype)
             if any(m in name for m in embedding_modules):
                 if hasattr(module, "weight"):