run PR e2e docker CI tests in Modal (#1217) [skip ci]

* wip modal for ci * handle falcon layernorms better * update * rebuild the template each time with the pseudo-ARGS * fix ref * update tests to use modal * cleanup ci script * make sure to install jinja2 also * kickoff the gh action on gh hosted runners and specify num gpus
2024-01-26 16:13:27 -05:00
parent af29d81f80
commit 36d053f6f0
6 changed files with 141 additions and 42 deletions
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -58,10 +58,15 @@ jobs:
  docker-e2e-tests:
    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job only dispatches the GPU tests to Modal, so a GitHub-hosted runner is sufficient
-    runs-on: [self-hosted, gpu, docker]
+    runs-on: ubuntu-latest
    timeout-minutes: 30
    needs: [pre-commit, pytest]
    env:
      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
      MODAL_ENVIRONMENT: axolotl-ci-cd
    strategy:
      fail-fast: false
      matrix:
@@ -70,43 +75,29 @@ jobs:
            cuda_version: 11.8.0
            python_version: "3.10"
            pytorch: 2.0.1
            num_gpus: 1
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
            pytorch: 2.1.2
            num_gpus: 1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
-      - name: Docker metadata
+      - name: Install Python
-        id: metadata
+        uses: actions/setup-python@v5
        uses: docker/metadata-action@v5
        with:
-          images: winglian/axolotl-tests
+          python-version: "3.10"
-      - name: Build Docker image
+      - name: Install Modal
        run: |
-          # Set up build arguments
+          python -m pip install --upgrade pip
-          BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}"
+          pip install modal jinja2
-          CUDA="${{ matrix.cuda }}"
+      - name: Update env vars
          PYTORCH_VERSION="${{ matrix.pytorch }}"
          # Build the Docker image
          docker build . \
            --file ./docker/Dockerfile-tests \
            --build-arg BASE_TAG=$BASE_TAG \
            --build-arg CUDA=$CUDA \
            --build-arg GITHUB_REF=$GITHUB_REF \
            --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \
            --tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \
            --no-cache
      - name: Unit Tests w docker image
        run: |
-          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-      - name: GPU Unit Tests w docker image
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
      - name: Run training job on Modal
        run: |
-          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/
+          modal run cicd.tests
      - name: GPU Unit Tests monkeypatched w docker image
        run: |
          docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
      - name: Prune image from docker
        if: github.ref != 'refs/heads/main'
        run: |
          docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -0,0 +1,38 @@
 # Jinja template for the CI test image; the double-brace placeholders below
 # are filled in when the template is rendered.
 FROM winglian/axolotl-base:{{ BASE_TAG }}
 # Expose the rendered build parameters as environment variables in the image.
 ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
 ENV CUDA="{{ CUDA }}"
 ENV BNB_CUDA_VERSION="{{ CUDA }}"
 ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
 ENV GITHUB_REF="{{ GITHUB_REF }}"
 # NOTE(review): GITHUB_SHA is exported but not referenced by any later
 # instruction in this template.
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
 WORKDIR /workspace
 # Shallow-clone the repository, then fetch and check out the ref under test.
 RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 WORKDIR /workspace/axolotl
 RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \
    else \
        pip install -e .[deepspeed,flash-attn,mamba-ssm]; \
    fi
 # So we can test the Docker image
 RUN pip install pytest
 # fix so that git fetch/pull from remote works
 # (the second command prints the configured refspec into the build log)
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch
 # helper for huggingface-login cli
 RUN git config --global credential.helper store
--- a/cicd/tests.py
+++ b/cicd/tests.py
@@ -0,0 +1,69 @@
 """
 modal application to run axolotl gpu tests in Modal
 """
 import os
 import pathlib
 import tempfile
 import jinja2
 import modal
 from jinja2 import select_autoescape
 from modal import Image, Stub
 # Directory containing this script; Dockerfile.jinja is loaded from here.
 cicd_path = pathlib.Path(__file__).parent.resolve()
 template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
 df_template = template_env.get_template("Dockerfile.jinja")
 # Values substituted into the template. Each one is read from the environment
 # and falls back to the literal default when the variable is unset.
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"),
    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"),
    "CUDA": os.environ.get("CUDA", "118"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
 }
 # Render the Dockerfile and write it into a fresh temporary directory.
 dockerfile_contents = df_template.render(**df_args)
 # NOTE(review): the directory created by mkdtemp() is never removed here.
 temp_dir = tempfile.mkdtemp()
 with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
    f.write(dockerfile_contents)
 # Build the Modal image from the rendered Dockerfile; the same values are also
 # passed to .env() on the resulting image.
 cicd_image = Image.from_dockerfile(
    pathlib.Path(temp_dir) / "Dockerfile",
    # NOTE(review): presumably bypasses Modal's image cache so every run
    # rebuilds from the freshly rendered template — confirm against Modal docs.
    force_build=True,
    gpu="A10G",
 ).env(df_args)
 stub = Stub("Axolotl CI/CD", secrets=[])
 # GPU count for the test function; 1 when N_GPUS is unset.
 N_GPUS = int(os.environ.get("N_GPUS", 1))
 GPU_CONFIG = modal.gpu.A10G(count=N_GPUS)
 def run_cmd(cmd: str, run_folder: str):
    """
    Run a command in ``run_folder`` and exit with its status if it fails.

    Args:
        cmd: Command line to execute. It is tokenised with ``shlex.split``,
            so quoted arguments stay together; no shell is involved.
        run_folder: Working directory for the child process.

    Raises:
        SystemExit: carrying the child's non-zero exit code. Nothing is
            raised (and ``None`` is returned) when the child exits with 0.
    """
    import shlex
    import subprocess  # nosec
    import sys

    # Propagate errors from subprocess.
    # shlex.split (rather than str.split) keeps quoted arguments intact, and
    # sys.exit is used because the bare exit() builtin is injected by the
    # site module and is not guaranteed to exist.
    exit_code = subprocess.call(shlex.split(cmd), cwd=run_folder)  # nosec
    if exit_code:
        sys.exit(exit_code)
@stub.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
    # 60 * 30 == 1800; presumably seconds (i.e. 30 minutes) — confirm against Modal docs.
    timeout=60 * 30,
 )
 def cicd_pytest():
    """
    Run pytest on the tests/e2e/patched/ suite from the /workspace/axolotl checkout.

    A failing pytest run terminates the process with pytest's exit code (see run_cmd).
    """
    # NOTE(review): only the patched e2e suite is run here — confirm the rest
    # of tests/e2e/ is covered elsewhere.
    cmd = "pytest /workspace/axolotl/tests/e2e/patched/"
    run_cmd(cmd, "/workspace/axolotl")
@stub.local_entrypoint()
 def main():
    """Local entrypoint: invoke cicd_pytest through its .remote() handle."""
    # presumably executes the function inside Modal rather than in this process — confirm
    cicd_pytest.remote()
--- a/docker/Dockerfile-modal
+++ b/docker/Dockerfile-modal
@@ -1,14 +1,11 @@
-ARG BASE_TAG=main-base
+FROM winglian/axolotl-base:main-base
 FROM winglian/axolotl-base:$BASE_TAG
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ARG AXOLOTL_EXTRAS=""
+ENV AXOLOTL_EXTRAS=""
-ARG CUDA="118"
+ENV CUDA="118"
-ENV BNB_CUDA_VERSION=$CUDA
+ENV BNB_CUDA_VERSION="118"
-ARG PYTORCH_VERSION="2.0.1"
+ENV PYTORCH_VERSION="2.0.1"
-ARG GITHUB_REF="main"
+ENV GITHUB_REF="main"
 ENV PYTORCH_VERSION=$PYTORCH_VERSION
 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 jinja2
 packaging==23.2
 peft==0.7.1
 transformers==4.37.0
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -645,7 +645,10 @@ def load_model(
    if not cfg.fsdp:
        # FSDP doesn't like mixed Float and BFloat16
        for name, module in model.named_modules():
-            if any(m in name for m in ["norm", "gate"]):
+            if (
                any(m in name for m in ["norm", "gate"])
                or "LayerNorm" in module.__class__.__name__
            ):
                module.to(torch.float32)
            if model_config.model_type == "btlm":
                # don't upcast lm_head for btlm
@@ -684,7 +687,7 @@ def load_model(
    if needs_fa2_dtype or cfg.flash_attention:
        LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
        for name, module in model.named_modules():
-            if "norm" in name:
+            if "norm" in name or "LayerNorm" in module.__class__.__name__:
                module.to(cfg.torch_dtype)
            if any(m in name for m in embedding_modules):
                if hasattr(module, "weight"):