From 8da163312494c2aa72610d1fef7e35f1c62f16f5 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 26 Jan 2024 16:50:44 -0500 Subject: [PATCH] Revert "run PR e2e docker CI tests in Modal" (#1220) [skip ci] --- .github/workflows/tests.yml | 51 ++++++++------ cicd/Dockerfile.jinja | 38 ---------- cicd/tests.py | 69 ------------------- docker/{Dockerfile-modal => Dockerfile-tests} | 17 +++-- requirements.txt | 1 - src/axolotl/utils/models.py | 7 +- 6 files changed, 42 insertions(+), 141 deletions(-) delete mode 100644 cicd/Dockerfile.jinja delete mode 100644 cicd/tests.py rename docker/{Dockerfile-modal => Dockerfile-tests} (77%) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index ae285d8b3..2d9969524 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -58,15 +58,10 @@ jobs: docker-e2e-tests: if: github.repository_owner == 'OpenAccess-AI-Collective' # this job needs to be run on self-hosted GPU runners... - runs-on: ubuntu-latest + runs-on: [self-hosted, gpu, docker] timeout-minutes: 30 needs: [pre-commit, pytest] - env: - MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }} - MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }} - MODAL_ENVIRONMENT: axolotl-ci-cd - strategy: fail-fast: false matrix: @@ -75,29 +70,43 @@ jobs: cuda_version: 11.8.0 python_version: "3.10" pytorch: 2.0.1 - num_gpus: 1 - cuda: 121 cuda_version: 12.1.0 python_version: "3.10" pytorch: 2.1.2 - num_gpus: 1 steps: - name: Checkout uses: actions/checkout@v4 - - name: Install Python - uses: actions/setup-python@v5 + - name: Docker metadata + id: metadata + uses: docker/metadata-action@v5 with: - python-version: "3.10" - - name: Install Modal + images: winglian/axolotl-tests + - name: Build Docker image run: | - python -m pip install --upgrade pip - pip install modal jinja2 - - name: Update env vars + # Set up build arguments + BASE_TAG="main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" + CUDA="${{ matrix.cuda }}" + PYTORCH_VERSION="${{ matrix.pytorch }}" + # Build the Docker image + docker build . \ + --file ./docker/Dockerfile-tests \ + --build-arg BASE_TAG=$BASE_TAG \ + --build-arg CUDA=$CUDA \ + --build-arg GITHUB_REF=$GITHUB_REF \ + --build-arg PYTORCH_VERSION=$PYTORCH_VERSION \ + --tag ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} \ + --no-cache + - name: Unit Tests w docker image run: | - echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV - echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV - echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV - echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV - - name: Run training job on Modal + docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/ + - name: GPU Unit Tests w docker image run: | - modal run cicd.tests + docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest --ignore=tests/e2e/patched/ /workspace/axolotl/tests/e2e/ + - name: GPU Unit Tests monkeypatched w docker image + run: | + docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/ + - name: Prune image from docker + if: github.ref != 'refs/heads/main' + run: | + docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja deleted file mode 100644 index e8fb5d440..000000000 --- a/cicd/Dockerfile.jinja +++ /dev/null @@ -1,38 +0,0 @@ -FROM winglian/axolotl-base:{{ BASE_TAG }} - -ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" -ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}" -ENV CUDA="{{ CUDA }}" -ENV BNB_CUDA_VERSION="{{ CUDA }}" -ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}" -ENV GITHUB_REF="{{ GITHUB_REF }}" -ENV GITHUB_SHA="{{ GITHUB_SHA }}" - -RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev - -WORKDIR /workspace - -RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git - -WORKDIR /workspace/axolotl - -RUN git fetch origin +$GITHUB_REF && \ - git checkout FETCH_HEAD - -# If AXOLOTL_EXTRAS is set, append it in brackets -RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ - pip install -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS]; \ - else \ - pip install -e .[deepspeed,flash-attn,mamba-ssm]; \ - fi - -# So we can test the Docker image -RUN pip install pytest - -# fix so that git fetch/pull from remote works -RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \ - git config --get remote.origin.fetch - -# helper for huggingface-login cli -RUN git config --global credential.helper store diff --git a/cicd/tests.py b/cicd/tests.py deleted file mode 100644 index 2ba0f1a56..000000000 --- a/cicd/tests.py +++ /dev/null @@ -1,69 +0,0 @@ -""" -modal application to run axolotl gpu tests in Modal -""" -import os -import pathlib -import tempfile - -import jinja2 -import modal -from jinja2 import select_autoescape -from modal import Image, Stub - -cicd_path = pathlib.Path(__file__).parent.resolve() - -template_loader = jinja2.FileSystemLoader(searchpath=cicd_path) -template_env = jinja2.Environment( - loader=template_loader, autoescape=select_autoescape() -) -df_template = template_env.get_template("Dockerfile.jinja") - -df_args = { - "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""), - "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.0.1"), - "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.10-cu118-2.0.1"), - "CUDA": os.environ.get("CUDA", "118"), - "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), - "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), -} - -dockerfile_contents = df_template.render(**df_args) - -temp_dir = tempfile.mkdtemp() -with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f: - f.write(dockerfile_contents) - -cicd_image = Image.from_dockerfile( - pathlib.Path(temp_dir) / "Dockerfile", - force_build=True, - gpu="A10G", -).env(df_args) - -stub = Stub("Axolotl CI/CD", secrets=[]) - - -N_GPUS = int(os.environ.get("N_GPUS", 1)) -GPU_CONFIG = modal.gpu.A10G(count=N_GPUS) - - -def run_cmd(cmd: str, run_folder: str): - import subprocess # nosec - - # Propagate errors from subprocess. - if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec - exit(exit_code) # pylint: disable=consider-using-sys-exit - - -@stub.function( - image=cicd_image, - gpu=GPU_CONFIG, - timeout=60 * 30, -) -def cicd_pytest(): - cmd = "pytest /workspace/axolotl/tests/e2e/patched/" - run_cmd(cmd, "/workspace/axolotl") - - -@stub.local_entrypoint() -def main(): - cicd_pytest.remote() diff --git a/docker/Dockerfile-modal b/docker/Dockerfile-tests similarity index 77% rename from docker/Dockerfile-modal rename to docker/Dockerfile-tests index 8b794b49e..2ec94f868 100644 --- a/docker/Dockerfile-modal +++ b/docker/Dockerfile-tests @@ -1,11 +1,14 @@ -FROM winglian/axolotl-base:main-base +ARG BASE_TAG=main-base +FROM winglian/axolotl-base:$BASE_TAG -ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" -ENV AXOLOTL_EXTRAS="" -ENV CUDA="118" -ENV BNB_CUDA_VERSION="118" -ENV PYTORCH_VERSION="2.0.1" -ENV GITHUB_REF="main" +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ARG AXOLOTL_EXTRAS="" +ARG CUDA="118" +ENV BNB_CUDA_VERSION=$CUDA +ARG PYTORCH_VERSION="2.0.1" +ARG GITHUB_REF="main" + +ENV PYTORCH_VERSION=$PYTORCH_VERSION RUN apt-get update && \ apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev diff --git a/requirements.txt b/requirements.txt index c522fdd58..b23c2509b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,4 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ -jinja2 packaging==23.2 peft==0.7.1 transformers==4.37.0 diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py index ff18e3c6f..72427f645 100644 --- a/src/axolotl/utils/models.py +++ b/src/axolotl/utils/models.py @@ -645,10 +645,7 @@ def load_model( if not cfg.fsdp: # FSDP doesn't like mixed Float and BFloat16 for name, module in model.named_modules(): - if ( - any(m in name for m in ["norm", "gate"]) - or "LayerNorm" in module.__class__.__name__ - ): + if any(m in name for m in ["norm", "gate"]): module.to(torch.float32) if model_config.model_type == "btlm": # don't upcast lm_head for btlm @@ -687,7 +684,7 @@ def load_model( if needs_fa2_dtype or cfg.flash_attention: LOG.info("converting modules to %s for flash attention", cfg.torch_dtype) for name, module in model.named_modules(): - if "norm" in name or "LayerNorm" in module.__class__.__name__: + if "norm" in name: module.to(cfg.torch_dtype) if any(m in name for m in embedding_modules): if hasattr(module, "weight"):