Add a uv-based e2e image to CI and drop the disabled HF-cache preload job.

Summary of the hunks below:
  * .github/workflows/tests.yml: delete the commented-out preload-cache job,
    the commented-out "Restore HF cache" steps and the commented-out needs
    line; add an e2e matrix entry that selects Dockerfile-uv.jinja; export
    E2E_DOCKERFILE (falls back to Dockerfile.jinja when the matrix entry
    sets no dockerfile).
  * cicd/Dockerfile-uv.jinja: new image template that installs with uv pip.
  * cicd/single_gpu.py: render the template named by E2E_DOCKERFILE, build it
    with modal.experimental.raw_dockerfile_image, GPU config as a string.
  * cicd/multigpu.py: GPU config as a string.
  * scripts/cutcrossentropy_install.py, scripts/unsloth_install.py: accept
    --uv and emit uv-prefixed pip commands.

Review edits applied to this patch:
  * cicd/single_gpu.py, first hunk: new-file start corrected from 9 to 8 so it
    agrees with the offsets of the following hunks.
  * scripts/cutcrossentropy_install.py: with --uv the uninstall command is
    emitted through uv as well (second hunk grows from 9 to 12 new lines).
  * cicd/Dockerfile-uv.jinja: the comment above the NIGHTLY_BUILD step now
    describes that step.
  * The index lines of the two hand-edited files still carry the pre-edit
    blob ids.

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 29c5bef38..ddbd25291 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -44,98 +44,6 @@ jobs:
         env:
           SKIP: no-commit-to-branch
 
-#  preload-cache:
-#    name: Preload HF cache
-#    runs-on: ubuntu-latest
-#    strategy:
-#      fail-fast: false
-#      matrix:
-#        python_version: ["3.11"]
-#        pytorch_version: ["2.6.0"]
-#    timeout-minutes: 20
-#
-#    env:
-#      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-#
-#    steps:
-#      - name: Check out repository code
-#        uses: actions/checkout@v4
-#
-#      - name: Restore HF cache
-#        id: hf-cache-restore
-#        uses: actions/cache/restore@v4
-#        with:
-#          path: |
-#            /home/runner/.cache/huggingface/hub/datasets--*
-#            /home/runner/.cache/huggingface/hub/models--*
-#          key: ${{ runner.os }}-hf-hub-cache-v2
-#
-#      - name: Restore Cache from S3
-#        id: hf-cache-restore-s3
-#        run: |
-#          mkdir -p /home/runner/.cache/huggingface/hub
-#          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
-#
-#      - name: Setup Python
-#        uses: actions/setup-python@v5
-#        with:
-#          python-version: ${{ matrix.python_version }}
-#          cache: 'pip' # caching pip dependencies
-#
-#      - name: upgrade pip
-#        run: |
-#          pip3 install --upgrade pip
-#          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-#
-#      - name: Install PyTorch
-#        run: |
-#          pip3 install torch==${{ matrix.pytorch_version }}
-#
-#      - name: Install dependencies
-#        run: |
-#          pip3 show torch
-#          pip3 install --no-build-isolation -U -e .
-#          python scripts/unsloth_install.py | sh
-#          python scripts/cutcrossentropy_install.py | sh
-#          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-#
-#      - name: Make sure PyTorch version wasn't clobbered
-#        run: |
-#          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-#
-#      - name: Ensure axolotl CLI was installed
-#        run: |
-#          axolotl --help
-#
-#      - name: Pre-Download dataset fixture
-#        run: |
-#          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-#
-#      - name: Run tests
-#        run: |
-#          pytest -v tests/conftest.py
-#
-#      - name: Upload coverage to Codecov
-#        uses: codecov/codecov-action@v5
-#        with:
-#          token: ${{ secrets.CODECOV_TOKEN }}
-#          files: ./coverage.xml
-#          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-#          fail_ci_if_error: false
-#
-#      - name: cleanup pip cache
-#        run: |
-#          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-#
-#      - name: Save HF cache
-#        id: hf-cache
-#        uses: actions/cache/save@v4
-#        with:
-#          path: |
-#            /home/runner/.cache/huggingface/hub/datasets--*
-#            /home/runner/.cache/huggingface/hub/models--*
-#          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
@@ -151,15 +59,6 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-#      - name: Restore HF cache
-#        id: hf-cache-restore
-#        uses: actions/cache/restore@v4
-#        with:
-#          path: |
-#            /home/runner/.cache/huggingface/hub/datasets--*
-#            /home/runner/.cache/huggingface/hub/models--*
-#          key: ${{ runner.os }}-hf-hub-cache-v2
-
       - name: Restore Cache from S3
         id: hf-cache-restore-s3
         run: |
@@ -222,7 +121,6 @@ jobs:
   pytest-sdist:
     name: PyTest from Source Dist
     runs-on: ubuntu-latest
-#    needs: [preload-cache]
     strategy:
       fail-fast: false
       matrix:
@@ -234,15 +132,6 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4
 
-#      - name: Restore HF cache
-#        id: hf-cache-restore
-#        uses: actions/cache/restore@v4
-#        with:
-#          path: |
-#            /home/runner/.cache/huggingface/hub/datasets--*
-#            /home/runner/.cache/huggingface/hub/models--*
-#          key: ${{ runner.os }}-hf-hub-cache-v2
-
       - name: Restore Cache from S3
         id: hf-cache-restore-s3
         run: |
@@ -312,6 +201,13 @@ jobs:
             pytorch: 2.6.0
             num_gpus: 1
             axolotl_extras: vllm
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
+            dockerfile: "Dockerfile-uv.jinja"
     steps:
       - name: Checkout
         uses: actions/checkout@v4
@@ -333,6 +229,7 @@ jobs:
           echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
           echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
           modal run cicd.e2e_tests
@@ -395,6 +292,7 @@ jobs:
           echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
           echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
           echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
       - name: Run tests job on Modal
         run: |
           modal run cicd.e2e_tests
diff --git a/cicd/Dockerfile-uv.jinja b/cicd/Dockerfile-uv.jinja
new file mode 100644
index 000000000..84527274d
--- /dev/null
+++ b/cicd/Dockerfile-uv.jinja
@@ -0,0 +1,52 @@
+FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
+ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
+ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
+ENV CUDA="{{ CUDA }}"
+ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
+ENV GITHUB_REF="{{ GITHUB_REF }}"
+ENV GITHUB_SHA="{{ GITHUB_SHA }}"
+ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
+ENV HF_HOME="{{ HF_HOME }}"
+
+RUN apt-get update && \
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
+
+WORKDIR /workspace
+
+RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+
+WORKDIR /workspace/axolotl
+
+RUN git fetch origin +$GITHUB_REF && \
+    git checkout FETCH_HEAD
+
+# For nightly builds, repoint the core HF requirements at their main branches
+RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
+        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
+        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
+        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
+        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
+        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+    fi
+
+RUN uv pip install packaging==23.2 setuptools==75.8.0
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+    else \
+        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+    fi
+
+RUN python scripts/unsloth_install.py --uv | sh
+RUN python scripts/cutcrossentropy_install.py --uv | sh
+
+# So we can test the Docker image
+RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
+
+# fix so that git fetch/pull from remote works
+RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
+    git config --get remote.origin.fetch
+
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
diff --git a/cicd/multigpu.py b/cicd/multigpu.py
index 9819d3760..a2dd8d0b3 100644
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -55,7 +55,7 @@ VOLUME_CONFIG = {
 }
 
 N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = modal.gpu.H100(count=N_GPUS)
+GPU_CONFIG = f"H100:{N_GPUS}"
 
 
 def run_cmd(cmd: str, run_folder: str):
diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py
index 35dd0de59..2ce3b0662 100644
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -8,8 +8,9 @@ import tempfile
 
 import jinja2
 import modal
+import modal.experimental
 from jinja2 import select_autoescape
-from modal import App, Image
+from modal import App
 
 cicd_path = pathlib.Path(__file__).parent.resolve()
 
@@ -17,7 +18,8 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
     loader=template_loader, autoescape=select_autoescape()
 )
-df_template = template_env.get_template("Dockerfile.jinja")
+dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+df_template = template_env.get_template(dockerfile)
 
 df_args = {
     "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
@@ -38,11 +40,11 @@ temp_dir = tempfile.mkdtemp()
 with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
     f.write(dockerfile_contents)
 
-cicd_image = Image.from_dockerfile(
+cicd_image = modal.experimental.raw_dockerfile_image(
     pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
+    # context_mount=None,
     force_build=True,
-    gpu="A10G",
+    # gpu="A10G",
 ).env(df_args)
 
 app = App("Axolotl CI/CD", secrets=[])
@@ -55,7 +57,7 @@ VOLUME_CONFIG = {
 }
 
 N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+GPU_CONFIG = f"L40S:{N_GPUS}"
 
 
 def run_cmd(cmd: str, run_folder: str):
diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py
index 3ff6dfa8f..4a92746c1 100644
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -9,6 +9,8 @@ except ImportError as exc:
     raise ImportError("Install torch via `pip install torch`") from exc
 from packaging.version import Version as V
 
+USE_UV = "--uv" in sys.argv[1:]
+
 v = V(torch.__version__)
 
 # no cut-cross-entropy support for torch < 2.4.0
@@ -23,7 +25,12 @@ if cce_spec:
     if not importlib.util.find_spec("cut_cross_entropy.transformers"):
         UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "
 
+UV_PREFIX = "uv " if USE_UV else ""
+if USE_UV and UNINSTALL_PREFIX:
+    # uninstall through uv too, so both commands act on the same environment
+    UNINSTALL_PREFIX = "uv pip uninstall cut-cross-entropy && "
+
 print(
     UNINSTALL_PREFIX
-    + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@a1174ca"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@a1174ca"'
 )
diff --git a/scripts/unsloth_install.py b/scripts/unsloth_install.py
index bffab4670..acbd05e90 100644
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -1,11 +1,15 @@
 # noqa
 # pylint: skip-file
+import sys
+
 try:
     import torch
 except ImportError:
     raise ImportError("Install torch via `pip install torch`")
 from packaging.version import Version as V
 
+use_uv = "--uv" in sys.argv[1:]
+
 v = V(torch.__version__)
 cuda = str(torch.version.cuda)
 try:
@@ -31,6 +35,7 @@ elif v < V("2.6.0"):
 else:
     raise RuntimeError(f"Torch = {v} too new!")
 x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
+uv_prefix = "uv " if use_uv else ""
 print(
-    f'pip install unsloth-zoo==2024.12.1 && pip install --no-deps "unsloth[{x}]==2024.12.4"'
+    f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
 )