diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 9e19114d7..6b750fc5a 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -17,7 +17,7 @@ jobs: build-base: if: github.repository_owner == 'axolotl-ai-cloud' # this job needs to be run on self-hosted GPU runners... - runs-on: axolotl-gpu-runner + runs-on: ubuntu-latest-m strategy: fail-fast: false matrix: @@ -28,42 +28,50 @@ jobs: python_version: "3.11" pytorch: 2.5.1 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "124" cuda_version: 12.4.1 cudnn_version: "" python_version: "3.11" pytorch: 2.6.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" python_version: "3.11" pytorch: 2.6.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" python_version: "3.11" pytorch: 2.7.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "128" cuda_version: 12.6.3 cudnn_version: "" python_version: "3.11" pytorch: 2.7.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" - cuda: "128" cuda_version: 12.8.1 cudnn_version: "" python_version: "3.11" pytorch: nightly torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - - cuda: "128" - cuda_version: 12.8.1 - cudnn_version: "" - python_version: "3.11" - pytorch: next - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base-nightly" +# # "next" is for release candidates of pytorch +# - cuda: "128" +# cuda_version: 12.8.1 +# cudnn_version: "" +# python_version: "3.11" +# pytorch: next +# torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" +# dockerfile: "Dockerfile-base-next" steps: - name: Checkout uses: actions/checkout@v4 @@ -85,7 +93,59 @@ jobs: uses: docker/build-push-action@v4 with: context: . 
- file: ${{ matrix.pytorch == 'nightly' && './docker/Dockerfile-base-nightly' || matrix.pytorch == 'next' && './docker/Dockerfile-base-next' || './docker/Dockerfile-base' }} + file: ./docker/${{ matrix.dockerfile }} + push: ${{ github.event_name != 'pull_request' }} + tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} + labels: ${{ steps.metadata.outputs.labels }} + build-args: | + CUDA_VERSION=${{ matrix.cuda_version }} + CUDNN_VERSION=${{ matrix.cudnn_version }} + CUDA=${{ matrix.cuda }} + PYTHON_VERSION=${{ matrix.python_version }} + PYTORCH_VERSION=${{ matrix.pytorch }} + TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }} + build-base-uv: + if: github.repository_owner == 'axolotl-ai-cloud' + runs-on: ubuntu-latest-m + strategy: + fail-fast: false + matrix: + include: + - cuda: "126" + cuda_version: 12.6.3 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.6.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.7.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" + steps: + - name: Checkout + uses: actions/checkout@v4 + - name: Docker metadata + id: metadata + uses: docker/metadata-action@v5 + with: + images: | + axolotlai/axolotl-base-uv + - name: Login to Docker Hub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USERNAME }} + password: ${{ secrets.DOCKERHUB_TOKEN }} + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + - name: Build + uses: docker/build-push-action@v4 + with: + context: . 
+ file: ./docker/${{ matrix.dockerfile }} push: ${{ github.event_name != 'pull_request' }} tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} labels: ${{ steps.metadata.outputs.labels }} diff --git a/README.md b/README.md index 56e45e3fe..d5e8f08a1 100644 --- a/README.md +++ b/README.md @@ -51,7 +51,7 @@ Features: - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU - Python 3.11 -- PyTorch ≥2.4.1 +- PyTorch ≥2.5.1 ### Installation diff --git a/cicd/multigpu.py b/cicd/multigpu.py index 7de4ae0a7..9819d3760 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -24,9 +24,9 @@ df_template = template_env.get_template("Dockerfile.jinja") df_args = { "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""), "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""), - "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"), - "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"), - "CUDA": os.environ.get("CUDA", "121"), + "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.5.1"), + "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"), + "CUDA": os.environ.get("CUDA", "124"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""), diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index d46d970cf..35dd0de59 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -22,9 +22,9 @@ df_template = template_env.get_template("Dockerfile.jinja") df_args = { "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""), "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""), - "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"), - "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"), - "CUDA": os.environ.get("CUDA", "121"), + "PYTORCH_VERSION": 
os.environ.get("PYTORCH_VERSION", "2.5.1"), + "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu124-2.5.1"), + "CUDA": os.environ.get("CUDA", "124"), "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"), "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""), "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""), diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base new file mode 100644 index 000000000..5ac8d86c7 --- /dev/null +++ b/docker/Dockerfile-uv-base @@ -0,0 +1,36 @@ +ARG CUDA_VERSION="12.6.3" +ARG CUDNN_VERSION="" +ARG UBUNTU_VERSION="22.04" +ARG MAX_JOBS=4 + +FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder + +ARG PYTHON_VERSION="3.11" +ARG PYTORCH_VERSION="2.6.0" +ARG CUDA="126" +ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX" + +ENV PYTHON_VERSION=$PYTHON_VERSION +ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST +ENV UV_TORCH_BACKEND="cu${CUDA}" + +RUN apt-get update \ + && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \ + && git lfs install --skip-repo \ + && curl -LsSf https://astral.sh/uv/install.sh | sh + +ENV PATH="/root/.local/bin:${PATH}" + +RUN uv python install ${PYTHON_VERSION} + +WORKDIR /workspace + +RUN uv venv --no-project --relocatable axolotl-venv + +ENV PATH="/workspace/axolotl-venv/bin:${PATH}" + +RUN uv pip install packaging setuptools wheel \ + && uv pip install torch==${PYTORCH_VERSION} \ + && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \ + && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \ + && uv pip install awscli pydantic diff --git a/docs/docker.qmd b/docs/docker.qmd index d665eaf5b..7b236b960 100644 --- a/docs/docker.qmd +++ b/docs/docker.qmd @@ -36,7 +36,6 @@ Tags examples: - `main-base-py3.11-cu126-2.7.0` - `main-base-py3.11-cu124-2.6.0` - `main-base-py3.11-cu124-2.5.1` -- 
`main-base-py3.11-cu124-2.4.1` ## Main @@ -77,12 +76,10 @@ Tags examples: - `main-py3.11-cu126-2.7.0` - `main-py3.11-cu124-2.6.0` - `main-py3.11-cu124-2.5.1` -- `main-py3.11-cu124-2.4.1` - `main-latest` - `main-20250303-py3.11-cu124-2.6.0` - `main-20250303-py3.11-cu124-2.5.1` -- `main-20250303-py3.11-cu124-2.4.1` -- `0.7.1` +- `0.9.2` ## Cloud diff --git a/docs/installation.qmd b/docs/installation.qmd index b429992b6..15f2db57b 100644 --- a/docs/installation.qmd +++ b/docs/installation.qmd @@ -15,7 +15,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir - NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU - Python ≥3.10 -- PyTorch ≥2.4.1 +- PyTorch ≥2.5.1 ## Installation Methods {#sec-installation-methods} @@ -41,6 +41,40 @@ installed) in order not to clobber it, and so that we set the correct version of dependencies that are specific to the PyTorch version or other installed co-dependencies. +### uv Installation {#sec-uv} + +uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments. + +Install uv if not already installed +```{.bash} +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env +``` + +Choose your CUDA version to use with PyTorch; e.g. 
`cu124`, `cu126`, `cu128`, +then create the venv and activate +```{.bash} +export UV_TORCH_BACKEND=cu126 +uv venv --no-project --relocatable +source .venv/bin/activate +``` + +Install PyTorch +- PyTorch 2.6.0 recommended +```{.bash} +uv pip install packaging setuptools wheel +uv pip install torch==2.6.0 +uv pip install awscli pydantic +``` + +Install axolotl from PyPI +```{.bash} +uv pip install --no-build-isolation axolotl[deepspeed,flash-attn] + +# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO +uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm] +``` + ### Edge/Development Build {#sec-edge-build} For the latest features between releases: diff --git a/scripts/motd b/scripts/motd index bc123c312..f842bd076 100644 --- a/scripts/motd +++ b/scripts/motd @@ -11,7 +11,7 @@ =@# @# #@= #@ =#@@@@#= +#@@= +#@@@@#= .##@@+ @@ @@@@ @@@@@@@@@@@@@@@@ -Welcome to the axolotl cloud image! If the you've mounted a disk to /workspace and the axolotl directory ie empty, run the following commands: +Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands: ``` cd /workspace