update tags

contrib fix
contrib
2025-10-04 12:10:43 -04:00 · 2025-10-04 11:53:48 -04:00 · 2025-10-04 11:47:56 -04:00 · 2025-10-04 11:26:10 -04:00 · 2025-10-04 09:48:19 -04:00 · 2025-10-04 09:07:22 -04:00
126 changed files with 9319 additions and 2254 deletions
--- a/.coveragerc
+++ b/.coveragerc
@@ -2,7 +2,6 @@
 source = axolotl
 omit =
    */tests/*
-    setup.py

 [report]
 exclude_lines =
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -29,13 +29,18 @@ PRs are **greatly welcome**!
 2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
 3. Explore the codebase, run tests, and verify that everything works as expected.

-Please run below to setup env
-```bash
-pip3 install -r requirements-dev.txt -r requirements-tests.txt
-pre-commit install
+Please run the following to set up your development environment:

-# test
-pytest tests/
+```bash
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+uv sync --dev && uv pip install flash-attn --no-build-isolation
+source .venv/bin/activate
+
+pre-commit install  # install pre-commit hooks
+
+pytest tests/  # optional; run test suite
 ```

 ## How to Contribute
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -25,11 +25,18 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.6.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
          - cuda: "126"
@@ -53,13 +60,6 @@ jobs:
            pytorch: 2.8.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
 #          - cuda: "128"
 #            cuda_version: 12.8.1
 #            cudnn_version: ""
@@ -98,7 +98,9 @@ jobs:
          context: .
          file: ./docker/${{ matrix.dockerfile }}
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: |
+            ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            ${{ steps.metadata.outputs.tags }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
@@ -115,6 +117,13 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: "126"
+            cuda_version: 12.6.3
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.6.0
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+            dockerfile: "Dockerfile-uv-base"
          - cuda: "126"
            cuda_version: 12.6.3
            cudnn_version: ""
@@ -136,13 +145,6 @@ jobs:
            pytorch: 2.8.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -20,10 +20,14 @@ jobs:
          uses: actions/setup-python@v5
          with:
            python-version: '3.11'
+        - name: Install uv
+          uses: astral-sh/setup-uv@v4
+          with:
+            version: "latest"
        - name: Install dependencies
          run: |
-            python3 -m pip install jupyter quartodoc
-            python3 -m pip install -e .
+            uv pip install --system jupyter quartodoc
+            uv pip install --system -e .
        - name: Build autodoc
          run: quartodoc build
        - name: Publish to GitHub Pages (and render)
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -6,7 +6,7 @@ on:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
-       - 'requirements.txt'
+       - 'pyproject.toml'
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
@@ -23,5 +23,4 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,7 +18,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.6.0
            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
@@ -68,6 +68,8 @@ jobs:
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -86,7 +88,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.0
+            pytorch: 2.6.0
            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
@@ -138,6 +140,8 @@ jobs:
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -152,6 +156,11 @@ jobs:
    strategy:
      matrix:
        include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -198,6 +207,8 @@ jobs:
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
          file: ./docker/Dockerfile-cloud-no-tmux
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -4,8 +4,6 @@ on:
  pull_request:
    paths:
      - 'tests/e2e/multigpu/**.py'
-      - 'requirements.txt'
-      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
@@ -26,6 +24,13 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
+            num_gpus: 2
+            nightly_build: "true"
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
@@ -49,13 +54,17 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
@@ -65,4 +74,4 @@ jobs:
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.multigpu
+          modal run -m cicd.multigpu
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -15,12 +15,12 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.7.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -52,6 +52,8 @@ jobs:
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -68,12 +70,12 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.7.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -102,6 +104,8 @@ jobs:
          build-args: |
            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
+            GIT_REF=${{ github.ref }}
+            GIT_SHA=${{ github.sha }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -18,10 +18,15 @@ jobs:
        with:
          python-version: '3.11'

+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+
      - name: Update pre-commit hooks
        id: update
        run: |
-          pip install pre-commit
+          uv pip install --system pre-commit
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -40,10 +40,15 @@ jobs:
        with:
          python-version: '3.11'

+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+
      - name: Install dependencies
        run: |
-          python3 -m pip install jupyter quartodoc
-          python3 -m pip install -e .
+          uv pip install --system jupyter quartodoc
+          uv pip install --system -e .

      - name: Build autodoc
        run: quartodoc build
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -38,23 +38,24 @@ jobs:
        with:
          python-version: "3.11"

+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
+
      - name: Install dependencies
        run: |
-          pip3 install wheel packaging==23.2
-          pip3 install --no-build-isolation -e .
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          uv pip install --system wheel packaging==23.2
+          uv pip install --system --no-build-isolation -e ".[dev]"

      - name: Extract tag name
        id: tag
-        run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)
+        run: echo "TAG_NAME=$(echo "$GITHUB_REF" | cut -d / -f 3)" >> "$GITHUB_OUTPUT"

-      - name: Update version in setup.py
+      - name: Build package
        run: |
-          sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
-
-      - name: Build a source dist
-        run: |
-          python setup.py sdist
+          uv pip install --system build
+          python -m build

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -13,7 +13,6 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -26,7 +25,7 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0"]
+        pytorch_version: ["2.6.0", "2.7.0"]
    timeout-minutes: 20

    steps:
@@ -43,32 +42,30 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies

-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"

      - name: Install PyTorch
        run: |
-          pip3 install torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install --system torch==${{ matrix.pytorch_version }} torchvision

-      - name: Update requirements.txt
+      - name: Update pyproject.toml for nightly builds
        run: |
-          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
-          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
-          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
-          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
-          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt
+          sed -i 's#"transformers==.*"#"transformers @ git+https://github.com/huggingface/transformers.git@main"#' pyproject.toml
+          sed -i 's#"peft==.*"#"peft @ git+https://github.com/huggingface/peft.git@main"#' pyproject.toml
+          sed -i 's#"accelerate==.*"#"accelerate @ git+https://github.com/huggingface/accelerate.git@main"#' pyproject.toml
+          sed -i 's#"trl==.*"#"trl @ git+https://github.com/huggingface/trl.git@main"#' pyproject.toml
+          sed -i 's#"datasets==.*"#"datasets @ git+https://github.com/huggingface/datasets.git@main"#' pyproject.toml

      - name: Install dependencies
        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
+          uv pip show --system torch
+          uv pip install --system --no-build-isolation -e ".[dev]"
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Make sure PyTorch version wasn't clobbered
        run: |
@@ -84,9 +81,6 @@ jobs:
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
@@ -102,14 +96,14 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.7.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
@@ -120,13 +114,16 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
      - name: Install Modal
        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          uv pip install --system modal==1.0.2 jinja2
      - name: Update env vars
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=main-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
@@ -136,7 +133,7 @@ jobs:
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run -m cicd.e2e_tests
  docker-e2e-multigpu-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
@@ -162,13 +159,16 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
      - name: Install Modal
        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          uv pip install --system modal==1.0.2 jinja2
      - name: Update env vars
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=main-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -7,18 +7,16 @@ on:
      - "main"
    paths:
      - '**.py'
-      - 'requirements.txt'
+      - 'pyproject.toml'
      - '.github/workflows/*.yml'
-      - 'requirements-tests.txt'
      - 'cicd/cicd.sh'
      - 'cicd/Dockerfile.jinja'
  pull_request:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
-       - 'requirements.txt'
+       - 'pyproject.toml'
       - '.github/workflows/*.yml'
-       - 'requirements-tests.txt'
       - 'cicd/cicd.sh'
       - 'cicd/Dockerfile.jinja'
  workflow_dispatch:
@@ -41,7 +39,6 @@ jobs:
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch
@@ -55,7 +52,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0"]
+        pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
    timeout-minutes: 20

    steps:
@@ -72,24 +69,25 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies

-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"

      - name: Install PyTorch
        run: |
-          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install --system torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
-          pip3 show torch
-          pip3 install --no-cache-dir --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
+          uv pip show --system torch
+          uv pip install --system wheel
+          printf "torch==${{ matrix.pytorch_version }}\n" > torch-constraints.txt
+          uv pip install --system --no-cache-dir --no-build-isolation -e ".[dev]" --constraints torch-constraints.txt
+          set -o pipefail
+          python scripts/unsloth_install.py | bash
+          python scripts/cutcrossentropy_install.py | bash

      - name: Make sure PyTorch version wasn't clobbered
        run: |
@@ -105,10 +103,10 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+          python -m pytest -v --durations=10 -n 8 --dist loadfile --cov=axolotl --cov-report=xml --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/
+          python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/monkeypatch/
+          python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/patched/
+          python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/cli/

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
@@ -118,9 +116,6 @@ jobs:
          flags: unittests,pytorch-${{ matrix.pytorch_version }}
          fail_ci_if_error: false

-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  pytest-sdist:
    name: PyTest from Source Dist
@@ -130,7 +125,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0"]
+        pytorch_version: ["2.6.0", "2.7.1", "2.8.0"]
    timeout-minutes: 20

    steps:
@@ -147,25 +142,26 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies

-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"

      - name: Install PyTorch
        run: |
-          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision
+          uv pip install --system torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
-          pip3 show torch
-          python -m build --no-isolation --sdist
-          pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
+          uv pip show --system torch
+          uv pip install --system wheel build setuptools_scm
+          python -m build --sdist
+          printf "torch==${{ matrix.pytorch_version }}\n" > torch-constraints.txt
+          tarball_path=$(echo dist/axolotl*.tar.gz)
+          uv pip install --no-cache-dir --no-build-isolation --system "${tarball_path}[dev]" --constraints torch-constraints.txt
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Make sure PyTorch version wasn't clobbered
        run: |
@@ -180,13 +176,9 @@ jobs:

      - name: Run tests
        run: |
-          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v --durations=10 tests/cli/
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
+          python -m pytest -v --durations=10 -n 8 --dist loadfile --cov=axolotl --cov-report=xml --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/
+          python -m pytest -v --durations=10 -n 8 --cov=axolotl --cov-append --cov-report=xml tests/monkeypatch/
+          python -m pytest -v --durations=10 -n 8 tests/cli/

  gate-skip-e2e:
    needs: [pre-commit, pytest, pytest-sdist]
@@ -243,7 +235,7 @@ jobs:
            pytorch: 2.7.1
            num_gpus: 1
            axolotl_extras:
-            dockerfile: "Dockerfile-uv.jinja"
+            dockerfile: "Dockerfile.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -251,13 +243,17 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=${{ github.ref_name }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
@@ -286,6 +282,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
@@ -306,13 +308,17 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=${{ github.ref_name }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
@@ -349,13 +355,17 @@ jobs:
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
+      - name: Install uv
+        uses: astral-sh/setup-uv@v4
+        with:
+          version: "latest"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
-          pip install modal==1.0.2 jinja2
+          pip install modal==1.0.2 jinja2 protobuf
      - name: Update env vars
        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "BASE_TAG=${{ github.ref_name }}-base-uv-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
--- a/.gitignore
+++ b/.gitignore
@@ -191,5 +191,5 @@ out/
 # vim
 *.swp

-# scm auto-versioning
+# setuptools-scm generated version file
 src/axolotl/_version.py
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,13 +11,13 @@ repos:
    -   id: no-commit-to-branch
        args: ['--branch', 'main']
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.0
+    rev: v0.12.12
    hooks:
    -   id: ruff
        args: [--fix]
    -   id: ruff-format
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.18.2
+    rev: v1.17.1
    hooks:
    - id: mypy
      additional_dependencies:
--- a/.runpod/Dockerfile
+++ b/.runpod/Dockerfile
@@ -1,9 +1,8 @@
 FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0

 COPY .runpod/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
+RUN curl -LsSf https://astral.sh/uv/install.sh | sh && \
+    /root/.local/bin/uv pip install --system -r /requirements.txt

 # Environment settings
 ARG BASE_VOLUME="/runpod-volume"
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,6 +1,5 @@
-include requirements.txt
+include pyproject.toml
 include README.md
 include LICENSE
-include src/setuptools_axolotl_dynamic_dependencies.py
 include src/axolotl/utils/chat_templates/templates/*.jinja
-recursive-include axolotl *.py
+recursive-include src/axolotl *.py
--- a/README.md
+++ b/README.md
@@ -65,15 +65,9 @@ Features:
 - **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
 - **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.

-
-
 ## 🚀 Quick Start - LLM Fine-tuning in Minutes

-**Requirements**:
-
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.11
- PyTorch ≥2.7.1
+**Requirements**: NVIDIA GPU (Ampere+) or AMD GPU, Python 3.11+

 ### Google Colab

@@ -81,15 +75,35 @@ Features:

 ### Installation

-#### Using pip
+#### Project setup (uv add)

 ```bash
-pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+# Install uv
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Initialize or enter your project
+uv init my-project && cd my-project
+uv add axolotl
+uv pip install flash-attn --no-build-isolation
+source .venv/bin/activate

 # Download example axolotl configs, deepspeed configs
 axolotl fetch examples
-axolotl fetch deepspeed_configs  # OPTIONAL
+axolotl fetch deepspeed_configs  # optional
+```
+
+#### Quick try (uv pip)
+
+```bash
+# Install uv if needed
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+uv pip install axolotl
+uv pip install flash-attn --no-build-isolation
+
+# Download example axolotl configs, deepspeed configs
+axolotl fetch examples
+axolotl fetch deepspeed_configs  # optional
 ```

 #### Using Docker
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -1,53 +0,0 @@
-FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
-    fi
-
-RUN uv pip install packaging==23.2 setuptools==75.8.0
-RUN uv pip install torchvision
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-RUN python scripts/unsloth_install.py --uv | sh
-RUN python scripts/cutcrossentropy_install.py --uv | sh
-
-# So we can test the Docker image
-RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -1,6 +1,10 @@
-FROM axolotlai/axolotl-base:{{ BASE_TAG }}
+FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}

-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+SHELL ["/bin/bash", "-euxo", "pipefail", "-c"]
+
+ARG VENV_PYTHON="/workspace/axolotl-venv/bin/python"
+
+ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
 ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
 ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
 ENV CUDA="{{ CUDA }}"
@@ -9,7 +13,7 @@ ENV GITHUB_REF="{{ GITHUB_REF }}"
 ENV GITHUB_SHA="{{ GITHUB_SHA }}"
 ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
 ENV HF_HOME="{{ HF_HOME }}"
-ENV AXOLOTL_DATASET_NUM_PROC="8"
+ENV VENV_PYTHON=$VENV_PYTHON

 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
@@ -25,25 +29,27 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
-        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
-        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
-        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
-        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
+        sed -i 's#"transformers[^"]*"#"transformers @ git+https://github.com/huggingface/transformers.git@main"#' pyproject.toml; \
+        sed -i 's#"peft[^"]*"#"peft @ git+https://github.com/huggingface/peft.git@main"#' pyproject.toml; \
+        sed -i 's#"accelerate[^"]*"#"accelerate @ git+https://github.com/huggingface/accelerate.git@main"#' pyproject.toml; \
+        sed -i 's#"trl[^"]*"#"trl @ git+https://github.com/huggingface/trl.git@main"#' pyproject.toml; \
+        sed -i 's#"datasets[^"]*"#"datasets @ git+https://github.com/huggingface/datasets.git@main"#' pyproject.toml; \
    fi

-RUN pip install packaging==23.2 setuptools==75.8.0
+RUN uv pip install --python "$VENV_PYTHON" packaging==23.2 setuptools==75.8.0 pip
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray,${AXOLOTL_EXTRAS}] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+        uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

-RUN python scripts/unsloth_install.py | sh
-RUN python scripts/cutcrossentropy_install.py | sh
+RUN uv pip install --python "$VENV_PYTHON" --no-build-isolation flash-attn $AXOLOTL_ARGS
+
+RUN "$VENV_PYTHON" scripts/unsloth_install.py | sh
+RUN "$VENV_PYTHON" scripts/cutcrossentropy_install.py | sh

 # So we can test the Docker image
-RUN pip install -r requirements-dev.txt -r requirements-tests.txt
+RUN uv pip install --python "$VENV_PYTHON" -e ".[dev]"

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -4,7 +4,7 @@ set -e
 python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

 # Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
+uv run pytest -v --durations=10 -n8 \
  --ignore=tests/e2e/ \
  --ignore=tests/patched/ \
  --ignore=tests/cli \
@@ -12,36 +12,36 @@ pytest -v --durations=10 -n8 \
  --cov=axolotl

 # Run lora kernels tests with coverage append
-pytest -v --durations=10 \
+uv run pytest -v --durations=10 \
  /workspace/axolotl/tests/e2e/patched/lora_kernels \
  --cov=axolotl \
  --cov-append

 # Run patched tests excluding lora kernels with coverage append
-pytest --full-trace -vvv --durations=10 \
+uv run pytest --full-trace -vvv --durations=10 \
  --ignore=tests/e2e/patched/lora_kernels \
  /workspace/axolotl/tests/e2e/patched \
  --cov=axolotl \
  --cov-append

 # Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
+uv run pytest -v --durations=10 -n1 \
  /workspace/axolotl/tests/e2e/solo/ \
  --cov=axolotl \
  --cov-append

 # Run integration tests with coverage append
-pytest -v --durations=10 \
+uv run pytest -v --durations=10 \
  /workspace/axolotl/tests/e2e/integrations/ \
  --cov=axolotl \
  --cov-append

-pytest -v --durations=10 /workspace/axolotl/tests/cli \
+uv run pytest -v --durations=10 /workspace/axolotl/tests/cli \
  --cov=axolotl \
  --cov-append

 # Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
+uv run pytest -v --durations=10 \
  --ignore=tests/e2e/solo/ \
  --ignore=tests/e2e/patched/ \
  --ignore=tests/e2e/multigpu/ \
@@ -52,4 +52,4 @@ pytest -v --durations=10 \
  --cov-append \
  --cov-report=xml:e2e-coverage.xml

-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
+uv run codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -23,7 +23,7 @@ df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-uv-py3.11-cu126-2.6.0"),
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -23,7 +23,7 @@ df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-uv-py3.11-cu126-2.6.0"),
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
@@ -65,13 +65,8 @@ def run_cmd(cmd: str, run_folder: str):
    import subprocess  # nosec

    sp_env = os.environ.copy()
-    sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
+    sp_env["AXOLOTL_DATASET_PROCESSES"] = "8"

    # Propagate errors from subprocess.
-    try:
-        exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env)  # nosec
-        if exit_code:
-            print(f"Command '{cmd}' failed with exit code {exit_code}")
-            return exit_code
-    except Exception as e:  # pylint: disable=broad-except
-        print(f"Command '{cmd}' failed with exception {e}")
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env):  # nosec
+        exit(exit_code)
--- a/devtools/dev_chat_template.yml
+++ b/devtools/dev_chat_template.yml
@@ -13,7 +13,7 @@ datasets:
 val_set_size: 0
 output_dir: temp_debug/axolotl_outputs/model
 dataset_prepared_path: temp_debug/axolotl_outputs/data
-dataset_num_proc: 1
+dataset_processes: 1

 sequence_len: 4096
 sample_packing: false
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,13 +1,19 @@
-ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
+ARG BASE_TAG=main-base-uv
+FROM axolotlai/axolotl-base-uv:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ARG PYTORCH_VERSION="2.1.2"
+ARG GIT_REF="refs/heads/main"
+ARG GIT_SHA="HEAD"
+ARG VENV_PYTHON="/workspace/axolotl-venv/bin/python"

 ENV PYTORCH_VERSION=$PYTORCH_VERSION
+ENV GIT_REF=$GIT_REF
+ENV GIT_SHA=$GIT_SHA
+ENV VENV_PYTHON=$VENV_PYTHON

 RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
@@ -20,16 +26,19 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

 WORKDIR /workspace/axolotl

+# Ensure we are on the expected commit and break Docker cache between revisions
+RUN git fetch origin "$GIT_REF" && git checkout "$GIT_SHA"
+
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
+        uv pip install --python "$VENV_PYTHON" --no-build-isolation -e .[ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi && \
-    python scripts/unsloth_install.py | sh && \
-    python scripts/cutcrossentropy_install.py | sh && \
-    pip install pytest && \
-    pip cache purge
+    uv pip install --python "$VENV_PYTHON" --no-build-isolation flash-attn $AXOLOTL_ARGS && \
+    "$VENV_PYTHON" scripts/unsloth_install.py | sh && \
+    "$VENV_PYTHON" scripts/cutcrossentropy_install.py | sh && \
+    uv pip install --python "$VENV_PYTHON" pytest

 # fix so that git fetch/pull from remote works with shallow clone
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -47,8 +47,6 @@ RUN git lfs install --skip-repo && \
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge

-RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
-        wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+        FLASH_ATTENTION_FORCE_BUILD="TRUE" uv pip install --no-build-isolation flash-attn==2.8.0.post2; \
    fi
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -12,8 +12,8 @@ EXPOSE 22
 COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
 COPY scripts/motd /etc/motd

-RUN pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
+RUN uv pip install --python "$VENV_PYTHON" jupyterlab notebook ipywidgets && \
+    "$VENV_PYTHON" -m jupyter lab clean
 RUN apt update && \
    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
    rm -rf /var/cache/apt/archives && \
--- a/docker/Dockerfile-cloud-no-tmux
+++ b/docker/Dockerfile-cloud-no-tmux
@@ -12,8 +12,8 @@ EXPOSE 22
 COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
 COPY scripts/motd /etc/motd

-RUN pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
+RUN uv pip install --python "$VENV_PYTHON" jupyterlab notebook ipywidgets && \
+    "$VENV_PYTHON" -m jupyter lab clean
 RUN apt update && \
    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
    rm -rf /var/cache/apt/archives && \
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -24,13 +24,14 @@ RUN git fetch origin +$GITHUB_REF && \

 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
+        uv pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
-    fi
+        uv pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
+    fi && \
+    uv pip install --no-build-isolation flash-attn $AXOLOTL_ARGS

 # So we can test the Docker image
-RUN pip install pytest
+RUN uv pip install pytest

 # fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -13,6 +13,7 @@ ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
 ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
 ENV UV_TORCH_BACKEND="cu${CUDA}"
+ENV VENV_PYTHON=/workspace/axolotl-venv/bin/python

 RUN apt-get update \
    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
@@ -29,14 +30,8 @@ RUN uv venv --no-project --relocatable axolotl-venv

 ENV PATH="/workspace/axolotl-venv/bin:${PATH}"

-RUN uv pip install packaging setuptools wheel psutil \
-    && uv pip install torch==${PYTORCH_VERSION} torchvision \
-    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
-    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
-    && uv pip install awscli pydantic
-
-RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
-        wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-    fi
+RUN uv pip install --python "$VENV_PYTHON" packaging setuptools wheel psutil protobuf grpclib \
+    && uv pip install --python "$VENV_PYTHON" torch==${PYTORCH_VERSION} \
+    && uv pip install --python "$VENV_PYTHON" --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
+    && uv pip install --python "$VENV_PYTHON" "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
+    && uv pip install --python "$VENV_PYTHON" awscli pydantic
--- a/docs/debugging.qmd
+++ b/docs/debugging.qmd
@@ -29,7 +29,7 @@ While debugging it's helpful to simplify your test scenario as much as possible.
 1. **Make sure you are using the latest version of axolotl**:  This project changes often and bugs get fixed fast.  Check your git branch and make sure you have pulled the latest changes from `main`.
 1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing:
    - Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`.
-    - Set `dataset_num_proc: 1` in your axolotl config or run the training command with `--dataset_num_proc=1`.
+    - Set `dataset_processes: 1` in your axolotl config or run the training command with `--dataset_processes=1`.
 2. **Use a small dataset**: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure `sample_packing: False` and `eval_sample_packing: False` to avoid errors.  If you are in a pinch and don't have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training.  For example, to shard the dataset into 20 pieces, add the following to your axolotl config):

    ```yaml
@@ -72,8 +72,8 @@ datasets:
 Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime.  Run the following commands from the root of this project:

 ```bash
-pip3 install packaging
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+uv sync --extra deepspeed
+uv pip install flash-attn --no-build-isolation
 ```

 #### Remote Hosts
@@ -101,7 +101,7 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler
                "-m", "axolotl.cli.train", "dev_chat_template.yml",
                // The flags below simplify debugging by overriding the axolotl config
                // with the debugging tips above.  Modify as needed.
-                "--dataset_num_proc=1",      // limits data preprocessing to one process
+                "--dataset_processes=1",      // limits data preprocessing to one process
                "--max_steps=1",              // limits training to just one step
                "--batch_size=1",             // minimizes batch size
                "--micro_batch_size=1",       // minimizes batch size
@@ -213,8 +213,8 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --
 You will now be in the container.  Next, perform an editable install of Axolotl:

 ```bash
-pip3 install packaging
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+uv sync --extra deepspeed
+uv pip install flash-attn --no-build-isolation
 ```

 ### Attach To Container
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -63,14 +63,6 @@ description: Frequently asked questions

 > A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.

-**Q: Can we mix text and text+image datasets for VLM training?**
-
-> A: Yes, you can for newer VLM arch. The ones that would not work are LLaVA / Pixtral arch. If you notice one not working, please let us know!
-
-**Q: Why is `memory/max_*` different from `nvidia-smi`?**
-
-> A: We use `torch` APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information.
-
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -29,19 +29,40 @@ Follow the instructions at: [https://pytorch.org/get-started/locally/](https://p
 For Blackwell GPUs, please use Pytorch 2.7.0 and CUDA 12.8.
 :::

-### PyPI Installation (Recommended) {#sec-pypi}
+### uv Installation (Recommended) {#sec-uv-quick}

 ```{.bash}
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
+# Install uv if not already installed
+curl -LsSf https://astral.sh/uv/install.sh | sh
+
+# Add Axolotl to a project (recommended)
+uv init my-project && cd my-project
+uv add axolotl
+uv pip install flash-attn --no-build-isolation
+source .venv/bin/activate
+```
+
+For a quick one-off install without creating a project:
+
+```{.bash}
+uv pip install axolotl
+uv pip install flash-attn --no-build-isolation
+```
+
+### pip Installation {#sec-pypi}
+
+```{.bash}
+pip install --no-build-isolation axolotl[deepspeed]
+pip install --no-build-isolation flash-attn
 ```

 We use `--no-build-isolation` in order to detect the installed PyTorch version (if
 installed) in order not to clobber it, and so that we set the correct version of
 dependencies that are specific to the PyTorch version or other installed
-co-dependencies.
+co-dependencies. Flash Attention is resolved separately so it can be built against
+the environment configured by the previous step.

-### uv Installation {#sec-uv}
+### Advanced uv Installation {#sec-uv}

 uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.

@@ -62,28 +83,38 @@ source .venv/bin/activate
 Install PyTorch
 - PyTorch 2.6.0 recommended
 ```{.bash}
-uv pip install packaging setuptools wheel
 uv pip install torch==2.6.0
 uv pip install awscli pydantic
 ```

 Install axolotl from PyPi
 ```{.bash}
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]
-
+uv pip install --no-build-isolation axolotl[deepspeed]
 # optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
-uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
+# uv pip install --no-build-isolation axolotl[deepspeed,vllm]
+
+uv pip install flash-attn --no-build-isolation
 ```

 ### Edge/Development Build {#sec-edge-build}

 For the latest features between releases:

+#### Using uv (recommended)
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install -U packaging setuptools wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
+curl -LsSf https://astral.sh/uv/install.sh | sh  # If not already installed
+uv sync
+uv pip install flash-attn --no-build-isolation
+```
+
+#### Using pip
+```{.bash}
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+pip install --no-build-isolation -e '.[deepspeed]'
+pip install --no-build-isolation flash-attn
 ```

 ### Docker {#sec-docker}
@@ -141,7 +172,7 @@ For providers supporting Docker:
 ### macOS {#sec-macos}

 ```{.bash}
-pip3 install --no-build-isolation -e '.'
+uv pip install --no-build-isolation -e '.'
 ```

 See @sec-troubleshooting for Mac-specific issues.
@@ -159,10 +190,15 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 1. Install Python ≥3.11
 2. Install PyTorch: https://pytorch.org/get-started/locally/
 3. Install Axolotl:
-   ```{.bash}
-   pip3 install -U packaging setuptools wheel ninja
-   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
-   ```
+   ```{.bash}
+   # Option A: add Axolotl to the environment
+   uv add axolotl
+   uv pip install flash-attn --no-build-isolation
+
+   # Option B: quick install
+   uv pip install axolotl
+   uv pip install flash-attn --no-build-isolation
+   ```
 4. (Optional) Login to Hugging Face:
   ```{.bash}
   huggingface-cli login
--- a/docs/lr_groups.qmd
+++ b/docs/lr_groups.qmd
@@ -27,9 +27,3 @@ learning_rate: 2e-5
 In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate
 of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning are of 1e-5 to the 3rd layer's
 self attention `q_proj` module.
-
-::: {.callout-note}
-
-We currently only support varying `lr` for now. If you're interested in adding support for others (`weight_decay`), we welcome PRs. See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17
-
-:::
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -88,7 +88,6 @@ fsdp_sync_module_states | **REMOVED**
 fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
 fsdp_state_dict_type | state_dict_type
 fsdp_use_orig_params | **REMOVED**
-fsdp_activation_checkpointing | activation_checkpointing

 For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
 if you were using the following FSDP1 config:
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -56,14 +56,10 @@ image_resize_algorithm: bilinear

 Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.

-::: {.callout-tip}
+::: {.callout-warning}
 Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
 :::

-::: {.callout-note}
-As of now, we do not truncate nor drop samples based on `sequence_len` as each arch has different ways to process non-text tokens. We are looking for help on this.
-:::
-
 ### Mllama {#sec-mllama}

 ```yaml
@@ -99,7 +95,7 @@ chat_template: llava
 ### Mistral-Small-3.1 {#sec-mistral-small-31}

 ::: {.callout-tip}
-Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
+Please make sure to install vision lib via `uv pip install 'mistral-common[opencv]==1.8.5'`
 :::

 ```yaml
@@ -109,7 +105,7 @@ base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
 ### Magistral-Small-2509 {#sec-magistral-small-2509}

 ::: {.callout-tip}
-Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
+Please make sure to install vision lib via `uv pip install 'mistral-common[opencv]==1.8.5'`
 :::

 ```yaml
@@ -119,7 +115,7 @@ base_model: mistralai/Magistral-Small-2509
 ### Voxtral {#sec-voxtral}

 ::: {.callout-tip}
-Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'`
+Please make sure to install audio lib via `uv pip install librosa==0.11.0 'mistral_common[audio]==1.8.3'`
 :::

 ```yaml
@@ -147,7 +143,7 @@ The model's initial loss and grad norm will be very high. We suspect this to be
 :::

 ::: {.callout-tip}
-Please make sure to install `timm` via `pip3 install timm==1.0.17`
+Please make sure to install `timm` via `uv pip install timm==1.0.17`
 :::

 ```yaml
@@ -172,18 +168,10 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
 chat_template: qwen2_vl  # same as qwen2-vl
 ```

-### Qwen3-VL {#sec-qwen3-vl}
-
-```yaml
-base_model: Qwen/Qwen3-VL-4B-Instruct
-
-chat_template: qwen2_vl  # same as qwen2-vl
-```
-
 ### SmolVLM2 {#sec-smolvlm2}

 ::: {.callout-tip}
-Please make sure to install `num2words` via `pip3 install num2words==0.5.14`
+Please make sure to install `num2words` via `uv pip install num2words==0.5.14`
 :::

 ```yaml
@@ -193,7 +181,7 @@ base_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct
 ### LFM2-VL {#sec-lfm2-vl}

 ::: {.callout-warning}
-Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
+Please uninstall `causal-conv1d` via `uv pip uninstall causal-conv1d`
 :::

 ```yaml
@@ -234,7 +222,7 @@ For audio loading, you can use the following keys within `content` alongside `"t

 ::: {.callout-tip}

-You may need to install `librosa` via `pip3 install librosa==0.11.0`.
+You may need to install `librosa` via `uv pip install librosa==0.11.0`.

 :::

--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -219,21 +219,6 @@ DPO supports the following types with the following dataset format:
 }
 ```

-#### chat_template.argilla_chat
-
-```json
-{
-    "chosen": [
-        {"role": "user", "content": "..."},
-        {"role": "assistant", "content": "..."}
-    ],
-    "rejected": [
-        {"role": "user", "content": "..."},
-        {"role": "assistant", "content": "..."}
-    ]
-}
-```
-
 #### chat_template.default

 ```yaml
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -49,9 +49,9 @@ When sequence parallelism is enabled:
 To use sequence parallelism, you need:

 - Multiple GPUs (at least 2)
- The `ring-flash-attn` package. Install with:
-  - `pip install axolotl[ring-flash-attn]` (preferred)
-  - `pip install ring-flash-attn>=0.1.4`
+- The `ring-flash-attn` package. Install with either `uv sync --extra ring-flash-attn`
+  (from a cloned repository) or `uv pip install 'ring-flash-attn>=0.1.4'`.
+- Flash Attention installed separately with `uv pip install flash-attn --no-build-isolation`.

 ## Limitations

--- a/examples/LiquidAI/README.md
+++ b/examples/LiquidAI/README.md
@@ -6,17 +6,20 @@ LFM2 features a new hybrid Liquid architecture with multiplicative gates, short-

 This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl.

-Thanks to the team at LiquidAI for giving us early access to prepare for these releases.
-
 ## Getting Started

 1.  Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:
    ```bash
-    # Ensure you have a compatible version of Pytorch installed
-    pip3 install packaging setuptools wheel ninja
-    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    # Ensure you have a compatible version of PyTorch installed
+    # Option A: manage dependencies in your project
+    uv add 'axolotl>=0.12.0'
+    uv pip install flash-attn --no-build-isolation
+
+    # Option B: quick install
+    uv pip install 'axolotl>=0.12.0'
+    uv pip install flash-attn --no-build-isolation
    ```

 2.  Run one of the finetuning examples below.
@@ -33,19 +36,11 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r
    axolotl train examples/LiquidAI/lfm2-vl-lora.yaml
    ```

-    **LFM2-MoE**
-    ```bash
-    pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6
-
-    # LoRA SFT (1x48GB @ 16.2GiB)
-    axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
-    ```
-
 ### TIPS

 - **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
  ```bash
-  pip uninstall -y causal-conv1d
+  uv pip uninstall causal-conv1d
  ```

 - **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
@@ -55,13 +50,14 @@ Thanks to the team at LiquidAI for giving us early access to prepare for these r

 ## Optimization Guides

- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html)
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)

 ## Related Resources

 - [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)
 - [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models)
- [LFM2-MoE Blog](https://www.liquid.ai/blog/lfm2-8b-a1b-an-efficient-on-device-mixture-of-experts)
 - [Axolotl Docs](https://docs.axolotl.ai)
 - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
 - [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/LiquidAI/lfm2-350m-fft.yaml
+++ b/examples/LiquidAI/lfm2-350m-fft.yaml
@@ -1,7 +1,6 @@
 base_model: LiquidAI/LFM2-350M

-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+chunked_cross_entropy: true

 eot_tokens:
  - "<|im_end|>"
--- a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
+++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml
@@ -1,59 +0,0 @@
-base_model: LiquidAI/LFM2-8B-A1B
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-load_in_8bit: true
-
-eot_tokens:
-  - "<|im_end|>"
-datasets:
-  - path: mlabonne/FineTome-100k
-    type: chat_template
-    split: train[:20%]
-    field_messages: conversations
-    message_field_role: from
-    message_field_content: value
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-
-adapter: lora
-lora_model_dir:
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 4
-num_epochs: 1
-optimizer: adamw_torch_fused
-lr_scheduler: cosine
-learning_rate: 5e-5
-
-bf16: true
-tf32: true
-
-gradient_checkpointing: true
-resume_from_checkpoint:
-logging_steps: 1
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 2
-saves_per_epoch: 1
-
-weight_decay: 0.0
-
-# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/LiquidAI/lfm2-vl-lora.yaml
+++ b/examples/LiquidAI/lfm2-vl-lora.yaml
@@ -3,9 +3,6 @@ trust_remote_code: true
 model_type: AutoModelForImageTextToText
 processor_type: AutoProcessor

-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true
 remove_unused_columns: false
--- a/examples/apertus/README.md
+++ b/examples/apertus/README.md
@@ -15,8 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv sync
+uv pip install flash-attn --no-build-isolation

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -31,7 +31,7 @@ python scripts/cutcrossentropy_install.py | sh
 # For those using our Docker image, use the below path.
 export CUDA_HOME=/usr/local/cuda

-pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
 ```

 For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)
@@ -67,7 +67,7 @@ If those didn't help, please try the below solutions:
 1. Pass env for CMAKE and try install again:

    ```bash
-    Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+    Python_EXECUTABLE=$(which python) uv pip install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
    ```

 2. Git clone the repo and manually hardcode python path:
@@ -92,7 +92,7 @@ If those didn't help, please try the below solutions:
    ```

    ```bash
-    pip3 install . --no-build-isolation --no-deps
+    uv pip install . --no-build-isolation --no-deps
    ```

 ## Optimization Guides
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -17,8 +17,8 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv sync
+uv pip install flash-attn --no-build-isolation

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -12,10 +12,10 @@
    "\n",
    "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n",
    "\n",
-    "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
-    "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n",
-    "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
-    "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
+    "- \u2b50 us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
+    "- \ud83d\udcdc Read the [Docs](http://docs.axolotl.ai/)\n",
+    "- \ud83d\udcac Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
+    "- \ud83d\udcf0 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
   ]
  },
  {
@@ -39,8 +39,8 @@
   "source": [
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
-    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\""
+    "!uv pip install --no-build-isolation \"axolotl>=0.9.1\"\n!uv pip install flash-attn --no-build-isolation\n",
+    "!uv pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\""
   ]
  },
  {
@@ -1371,7 +1371,7 @@
       "version_minor": 0
      },
      "text/plain": [
-       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
+       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv\u2026"
      ]
     },
     "metadata": {},
@@ -1729,9 +1729,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639",
-      "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%"
+      "value": "Drop\u2007Samples\u2007with\u2007Zero\u2007Trainable\u2007Tokens\u2007(num_proc=2):\u2007100%"
     }
    },
    "083f9cda8d754c168beee10d2f8955a2": {
@@ -1774,9 +1774,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd",
-      "value": " 11.4M/11.4M [00:00&lt;00:00, 21.8MB/s]"
+      "value": "\u200711.4M/11.4M\u2007[00:00&lt;00:00,\u200721.8MB/s]"
     }
    },
    "0a46ad75c198463d843fb35e813642cb": {
@@ -1917,7 +1917,7 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00",
      "value": "\n<b>Pro Tip:</b> If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. </center>"
     }
@@ -1938,9 +1938,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_bfcdbba993b74972a9e3e575f86908ff",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_6ebb2ec171414e47a14765505f64bb3c",
-      "value": " 3.84G/3.84G [00:09&lt;00:00, 664MB/s]"
+      "value": "\u20073.84G/3.84G\u2007[00:09&lt;00:00,\u2007664MB/s]"
     }
    },
    "0e936d9dbf9c4fdd86bbfe9730dedc47": {
@@ -2296,9 +2296,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_349eee9f56d64f0cba6fc24ff2c50c9b",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_7e5d3774060e4589aa65982da5ea4ef4",
-      "value": " 9985/9985 [00:04&lt;00:00, 2604.11 examples/s]"
+      "value": "\u20079985/9985\u2007[00:04&lt;00:00,\u20072604.11\u2007examples/s]"
     }
    },
    "16d1283741404b7bb319094c992fce01": {
@@ -2317,9 +2317,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_a4e5789584564049b83df7c6c54a3e08",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_ff3a94b146a948b6907f5d80c7157f99",
-      "value": " 9985/0 [00:00&lt;00:00, 50763.46 examples/s]"
+      "value": "\u20079985/0\u2007[00:00&lt;00:00,\u200750763.46\u2007examples/s]"
     }
    },
    "1811cda0644e4190a9469d1774435d82": {
@@ -2390,9 +2390,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e366ae3fceec4566b9ed303d6c5f90af",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_5dd7d150dbe04f08b165ce7f2c27cd11",
-      "value": "model-00008-of-00008.safetensors: 100%"
+      "value": "model-00008-of-00008.safetensors:\u2007100%"
     }
    },
    "19127c7bb1554ccbac877059f9a82db0": {
@@ -2561,9 +2561,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0dea5caa27384f5689e3cab51f558727",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_a6f48410b9964fefba0c3009a77dc838",
-      "value": " 9.68k/9.68k [00:00&lt;00:00, 812kB/s]"
+      "value": "\u20079.68k/9.68k\u2007[00:00&lt;00:00,\u2007812kB/s]"
     }
    },
    "1f7d30f71bbd4547a9150d21da071055": {
@@ -2634,9 +2634,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_f4a1795dc7514a718f478245f521f0ba",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_5e746eb25bbe416fb585fa24e79f5177",
-      "value": "model-00002-of-00008.safetensors: 100%"
+      "value": "model-00002-of-00008.safetensors:\u2007100%"
     }
    },
    "20352e5f58d24bb8b1f3940efd14fe4a": {
@@ -2707,9 +2707,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_1c6f1f10667545aaab958016ba7e2c94",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_e6e969610738449887259063967f82b0",
-      "value": " 2.78M/2.78M [00:00&lt;00:00, 17.8MB/s]"
+      "value": "\u20072.78M/2.78M\u2007[00:00&lt;00:00,\u200717.8MB/s]"
     }
    },
    "258b7c635c1045329d4669e48c46ccd5": {
@@ -3056,9 +3056,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_be724f04b03942b2a033a7e8898bb4fd",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_fcbab4d8dced41a18dfccce81e3a45a0",
-      "value": "model-00005-of-00008.safetensors: 100%"
+      "value": "model-00005-of-00008.safetensors:\u2007100%"
     }
    },
    "3036608c71904ce9ae4bb2a9fa8802d9": {
@@ -3077,9 +3077,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_5ca6be24acb548cea130bd58e9954c7c",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_5cfb02ee044b4011a378efa8b54a370f",
-      "value": " 3.96G/3.96G [00:10&lt;00:00, 531MB/s]"
+      "value": "\u20073.96G/3.96G\u2007[00:10&lt;00:00,\u2007531MB/s]"
     }
    },
    "30a81da86f8043eca301e86a8651201a": {
@@ -3629,9 +3629,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8f5bd719974e41c3a8dd9a5b0d3d71e6",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_b87c84de30e84b3abf4871461fb9cbd3",
-      "value": "Loading checkpoint shards: 100%"
+      "value": "Loading\u2007checkpoint\u2007shards:\u2007100%"
     }
    },
    "41f3b32c2f6b4034ae7a3b9124e28bc7": {
@@ -3791,7 +3791,7 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_39789237703c4a418134243055c9cbf5",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_a3a945817f684328b34651fe052393ec",
      "value": "Connecting..."
     }
@@ -4077,9 +4077,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_4d468f96ec924681ad65eb671674b93e",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_ad7599de524549c48bf2d3124ad4b299",
-      "value": "Dropping Long Sequences (num_proc=2): 100%"
+      "value": "Dropping\u2007Long\u2007Sequences\u2007(num_proc=2):\u2007100%"
     }
    },
    "5ca240f31e6b44e3882c5eb37cd5a309": {
@@ -4471,9 +4471,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_5e18768f7ad6434ba8b8b8a2e853e204",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_bb33aec33a6447078c31bfd728942994",
-      "value": " 728/728 [00:00&lt;00:00, 20.3kB/s]"
+      "value": "\u2007728/728\u2007[00:00&lt;00:00,\u200720.3kB/s]"
     }
    },
    "62e302ebdad64aada0ffe64ae1c873f3": {
@@ -4636,9 +4636,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_81c3db71ac704280ad030072655f1537",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_042e091f75694c47aee761e760e76773",
-      "value": " 9985/9985 [00:02&lt;00:00, 3977.47 examples/s]"
+      "value": "\u20079985/9985\u2007[00:02&lt;00:00,\u20073977.47\u2007examples/s]"
     }
    },
    "67da6c4260574869aa24c3cbc1bc1654": {
@@ -4778,7 +4778,7 @@
      "description_tooltip": null,
      "disabled": false,
      "layout": "IPY_MODEL_2e257c8be2da40b4bb67a9e4ab6811f3",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_56e3768bef5a4b9db4168c5c17f509c2",
      "value": ""
     }
@@ -4823,9 +4823,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_41f3b32c2f6b4034ae7a3b9124e28bc7",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_a10d0a76010f4e508c65a9b69ebc5156",
-      "value": "Tokenizing Prompts (num_proc=2): 100%"
+      "value": "Tokenizing\u2007Prompts\u2007(num_proc=2):\u2007100%"
     }
    },
    "704f2f5a9b1c49d5a75a0025a5dda11b": {
@@ -5071,9 +5071,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_93a44a11aa4846fa8efc6c1413ef1627",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_a55060adc3564407ac81ad7297d34aaa",
-      "value": "train.jsonl: 100%"
+      "value": "train.jsonl:\u2007100%"
     }
    },
    "7be6f04c284e4326bb4ff3d301e7b3c6": {
@@ -5138,9 +5138,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_7fd44cf9ca6e4726bfd7ac21846d6a14",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_366a343b62fa47d8985a3bd464d99f9e",
-      "value": "config.json: 100%"
+      "value": "config.json:\u2007100%"
     }
    },
    "7cd0b85ebd204b7aba908417811ce4e0": {
@@ -5339,9 +5339,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_67da6c4260574869aa24c3cbc1bc1654",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_94b9088614464f60a203de39dbcae853",
-      "value": " 8/8 [01:47&lt;00:00, 11.64s/it]"
+      "value": "\u20078/8\u2007[01:47&lt;00:00,\u200711.64s/it]"
     }
    },
    "823f1c78f15043e38bbd4dca3932a86a": {
@@ -5488,7 +5488,7 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8640ac440fbc4644b9a3af7ba3ae7183",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_5cea7996f02040b187ece0bb2d6a8d1f",
      "value": "<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svg\nalt='Hugging Face'> <br> Copy a token from <a\nhref=\"https://huggingface.co/settings/tokens\" target=\"_blank\">your Hugging Face\ntokens page</a> and paste it below. <br> Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. </center>"
     }
@@ -5509,9 +5509,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ef223e8504b64e3592589880326aaf41",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_598da69727bd4fb8b1caf465ac736d7a",
-      "value": " 1.67M/1.67M [00:00&lt;00:00, 19.0MB/s]"
+      "value": "\u20071.67M/1.67M\u2007[00:00&lt;00:00,\u200719.0MB/s]"
     }
    },
    "897b77a56c09479bb11d7f2a30997e55": {
@@ -5717,9 +5717,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_37de928300e34184881039378bd75e7f",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_0e936d9dbf9c4fdd86bbfe9730dedc47",
-      "value": " 3.96G/3.96G [00:13&lt;00:00, 273MB/s]"
+      "value": "\u20073.96G/3.96G\u2007[00:13&lt;00:00,\u2007273MB/s]"
     }
    },
    "936d04b5fe1b4c63bf0b080e423d051b": {
@@ -6050,9 +6050,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d955dcaa0e944e719f3a06139dd54a03",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_d3de2662c7964f1ba96e58da382af720",
-      "value": "merges.txt: 100%"
+      "value": "merges.txt:\u2007100%"
     }
    },
    "9cd5211b5d8b457aa0002f1d17b80028": {
@@ -6071,9 +6071,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_6932489232ec4ab18a160b1e7fbcdfe1",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_4540927d98f54466b434ba4c0edf045d",
-      "value": "model-00007-of-00008.safetensors: 100%"
+      "value": "model-00007-of-00008.safetensors:\u2007100%"
     }
    },
    "9d4897eefb5f48259ffb2d23e332f752": {
@@ -6303,9 +6303,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_3aaecbf540f54a2db9ab0931e3b1fe57",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_9e333ed3b5014069ac1dd969255dd591",
-      "value": " 239/239 [00:00&lt;00:00, 30.9kB/s]"
+      "value": "\u2007239/239\u2007[00:00&lt;00:00,\u200730.9kB/s]"
     }
    },
    "a20927bf5f2c41f58c1e31ac858ab36c": {
@@ -6324,9 +6324,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_1811cda0644e4190a9469d1774435d82",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_35c811d2ae8e43f3b5cecbdd3cfa857f",
-      "value": "tokenizer.json: 100%"
+      "value": "tokenizer.json:\u2007100%"
     }
    },
    "a3a945817f684328b34651fe052393ec": {
@@ -6360,9 +6360,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ed5ca967ad5342929e578ac6aa4dc4c0",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_af401d117d5047629d3a6e2361757b62",
-      "value": "model-00001-of-00008.safetensors: 100%"
+      "value": "model-00001-of-00008.safetensors:\u2007100%"
     }
    },
    "a4e5789584564049b83df7c6c54a3e08": {
@@ -6494,9 +6494,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_fa1282ccc7544e4f818e2f03ccffe4a5",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_bbbf575d2a4b4c6ea8389be79b2a6039",
-      "value": "model.safetensors.index.json: 100%"
+      "value": "model.safetensors.index.json:\u2007100%"
     }
    },
    "ab93eabd7cea4b94b4b7a387f101e8a1": {
@@ -6582,9 +6582,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_62e302ebdad64aada0ffe64ae1c873f3",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_bd1b0dfed6d34d16af33a4a58330f5ec",
-      "value": "Saving the dataset (1/1 shards): 100%"
+      "value": "Saving\u2007the\u2007dataset\u2007(1/1\u2007shards):\u2007100%"
     }
    },
    "ad7599de524549c48bf2d3124ad4b299": {
@@ -6967,9 +6967,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_2b3a2659b12244bd8548320320016dbf",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_0cd7efffbb3c4c4b972e63749f61ab97",
-      "value": "Generating train split: "
+      "value": "Generating\u2007train\u2007split:\u2007"
     }
    },
    "b87c84de30e84b3abf4871461fb9cbd3": {
@@ -7085,9 +7085,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0f480e3a0b0a45d2a2d2dec3cad923f3",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_fcb30372e7404c5d8a1ad4df91e6c7b2",
-      "value": " 1.91G/1.91G [00:05&lt;00:00, 444MB/s]"
+      "value": "\u20071.91G/1.91G\u2007[00:05&lt;00:00,\u2007444MB/s]"
     }
    },
    "bd1b0dfed6d34d16af33a4a58330f5ec": {
@@ -7325,9 +7325,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_158c8b85dbf34de6a94b4e35e2fc7d5a",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_0b4c9753a7cb4354b8e5f187e6e1ad7c",
-      "value": " 3.96G/3.96G [00:15&lt;00:00, 564MB/s]"
+      "value": "\u20073.96G/3.96G\u2007[00:15&lt;00:00,\u2007564MB/s]"
     }
    },
    "c0991cf63ee6458b96e9a75e7a88b61a": {
@@ -7346,9 +7346,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ed28e2e0410d4e0b855467e798e53d66",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_d93f134f802b4b69b575bdaf07dbd27c",
-      "value": "tokenizer_config.json: 100%"
+      "value": "tokenizer_config.json:\u2007100%"
     }
    },
    "c12ea43372ac4d57bb9605f1a429b397": {
@@ -7581,9 +7581,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8bc9d8ba866c442b9118d9630009939c",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_9f56a2d9979c4bd8928c644c22c3ecdf",
-      "value": "model-00003-of-00008.safetensors: 100%"
+      "value": "model-00003-of-00008.safetensors:\u2007100%"
     }
    },
    "c6164e05a1914ae48083db9ad7f4ef7c": {
@@ -7694,9 +7694,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e40d1c1ac9494b3bade9858324e7ffdf",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_d65b6b060d9845779299491ac5599c31",
-      "value": " 9985/9985 [01:04&lt;00:00, 189.08 examples/s]"
+      "value": "\u20079985/9985\u2007[01:04&lt;00:00,\u2007189.08\u2007examples/s]"
     }
    },
    "c7433acd3c4841e6958ae8f7e87b1808": {
@@ -7737,9 +7737,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0077aedc3d174560bce924ee89e9c006",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_00321cce58884f6f9b3855a21fcd9187",
-      "value": "Add position_id column (Sample Packing) (num_proc=2): 100%"
+      "value": "Add\u2007position_id\u2007column\u2007(Sample\u2007Packing)\u2007(num_proc=2):\u2007100%"
     }
    },
    "ca65e32eb52f48c09a84b33cb18f22cd": {
@@ -8162,9 +8162,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_63580b6fb30642479fe3000915bf551a",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_8f726dbfb45d4528afa33e36a6313267",
-      "value": " 27.3M/27.3M [00:00&lt;00:00, 31.0MB/s]"
+      "value": "\u200727.3M/27.3M\u2007[00:00&lt;00:00,\u200731.0MB/s]"
     }
    },
    "d43c6df07ddb466587807d6dbe1ff614": {
@@ -8183,9 +8183,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8c4d4fc5a30f4e7cb3be53fe2adda33d",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_e90658f4bcb642baa78426012f863152",
-      "value": "model-00004-of-00008.safetensors: 100%"
+      "value": "model-00004-of-00008.safetensors:\u2007100%"
     }
    },
    "d65b6b060d9845779299491ac5599c31": {
@@ -8474,9 +8474,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_34cf3df51fbc41cabfdbba153c007f0e",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_ac764024cf1c4e08ba7749afd2cd20ac",
-      "value": "vocab.json: 100%"
+      "value": "vocab.json:\u2007100%"
     }
    },
    "dfd2a2649b8341ef913207526708aff1": {
@@ -8669,9 +8669,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_c6164e05a1914ae48083db9ad7f4ef7c",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_813621384dc748b0ad06775e22761c0b",
-      "value": " 9985/9985 [00:03&lt;00:00, 3622.89 examples/s]"
+      "value": "\u20079985/9985\u2007[00:03&lt;00:00,\u20073622.89\u2007examples/s]"
     }
    },
    "e400cbf14bcc446a9d33b210cd93550b": {
@@ -9065,9 +9065,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_fba7aa824b38467ab3061b226114cdec",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_f3075dccbd2747b4a7913b66f44f2596",
-      "value": " 3.96G/3.96G [00:13&lt;00:00, 398MB/s]"
+      "value": "\u20073.96G/3.96G\u2007[00:13&lt;00:00,\u2007398MB/s]"
     }
    },
    "ec030fc3c346426f9abc3a89892258d3": {
@@ -9110,9 +9110,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_936d04b5fe1b4c63bf0b080e423d051b",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_f1cef8e8dc2646fb9fd09f3b09081074",
-      "value": " 36.5k/36.5k [00:00&lt;00:00, 4.32MB/s]"
+      "value": "\u200736.5k/36.5k\u2007[00:00&lt;00:00,\u20074.32MB/s]"
     }
    },
    "ed28e2e0410d4e0b855467e798e53d66": {
@@ -9422,9 +9422,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_735d4f225b24414294fc1b213c61223c",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_5e5e15b0569b474c9620083b3ec6af55",
-      "value": "generation_config.json: 100%"
+      "value": "generation_config.json:\u2007100%"
     }
    },
    "f4667818b9d34a09891cd727a429a610": {
@@ -9443,9 +9443,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_4b27c267393640f28f6eae0875bd2ed9",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_9858cb74a09748a39e8149baac96702c",
-      "value": " 3.96G/3.96G [00:11&lt;00:00, 457MB/s]"
+      "value": "\u20073.96G/3.96G\u2007[00:11&lt;00:00,\u2007457MB/s]"
     }
    },
    "f4a1795dc7514a718f478245f521f0ba": {
@@ -9830,9 +9830,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d1f9b10c130542f094c8fd3d1e23b5e9",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_e575d87a7efe4ec7b1efde489839d4a6",
-      "value": "model-00006-of-00008.safetensors: 100%"
+      "value": "model-00006-of-00008.safetensors:\u2007100%"
     }
    },
    "fe18bba7f3fb4c31bf840541f36b3425": {
@@ -9873,9 +9873,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e5a82df528bb4e408797a3b6c2758f4a",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_f113ebd8c1c34806bea4dd7ed3035173",
-      "value": " 9985/9985 [00:00&lt;00:00, 44264.88 examples/s]"
+      "value": "\u20079985/9985\u2007[00:00&lt;00:00,\u200744264.88\u2007examples/s]"
     }
    },
    "fea1b70fb46745feb5111b3929175b5d": {
@@ -9931,9 +9931,9 @@
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ab93eabd7cea4b94b4b7a387f101e8a1",
-      "placeholder": "",
+      "placeholder": "\u200b",
      "style": "IPY_MODEL_704f2f5a9b1c49d5a75a0025a5dda11b",
-      "value": " 3.96G/3.96G [00:12&lt;00:00, 656MB/s]"
+      "value": "\u20073.96G/3.96G\u2007[00:12&lt;00:00,\u2007656MB/s]"
     }
    }
   }
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -16,8 +16,13 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
--- a/examples/gemma3n/README.md
+++ b/examples/gemma3n/README.md
@@ -10,17 +10,22 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. In addition to Axolotl's requirements, Gemma-3n requires:

 ```bash
-pip3 install timm==1.0.17
+uv pip install timm==1.0.17

 # for loading audio data
-pip3 install librosa==0.11.0
+uv pip install librosa==0.11.0
 ```

 3. Download sample dataset files
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -12,8 +12,13 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))
@@ -75,7 +80,7 @@ for more information about using a special vllm-openai docker image for inferenc
 Optionally, vLLM can be installed from nightly:

 ```bash
-pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
+uv pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
 ```
 and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
 ```bash
--- a/examples/hunyuan/README.md
+++ b/examples/hunyuan/README.md
@@ -13,8 +13,8 @@ Tencent released a family of opensource models called HunYuan with varying param
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv sync
+uv pip install flash-attn --no-build-isolation

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/llama-3/3b-fp8-fsdp2.yaml
+++ b/examples/llama-3/3b-fp8-fsdp2.yaml
@@ -29,7 +29,7 @@ flex_attention: true
 flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs
-save_strategy: no
+
 torch_compile: true

 wandb_project:
--- a/examples/llama-3/opentelemetry-qlora.yml
+++ b/examples/llama-3/opentelemetry-qlora.yml
@@ -1,50 +0,0 @@
-base_model: NousResearch/Llama-3.2-1B
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-load_in_4bit: true
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-
-output_dir: ./outputs/opentelemetry-example
-
-adapter: qlora
-sequence_len: 512
-sample_packing: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-# OpenTelemetry Configuration
-use_otel_metrics: true
-otel_metrics_host: "localhost"
-otel_metrics_port: 8000
-
-# Disable WandB
-use_wandb: false
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: paged_adamw_32bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-logging_steps: 1
-flash_attention: false
-
-warmup_ratio: 0.1
-evals_per_epoch: 2
-saves_per_epoch: 1
-weight_decay: 0.0
-
-special_tokens:
-  pad_token: "<|end_of_text|>"
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -13,9 +13,14 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
    Here is an example of how to install from pip:

 ```bash
-# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Ensure you have PyTorch installed (PyTorch 2.6.0 min)
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
--- a/examples/magistral/think/README.md
+++ b/examples/magistral/think/README.md
@@ -12,7 +12,7 @@ Before starting, ensure you have:
 Run the thinking model fine-tuning:

 ```bash
-axolotl train examples/magistral/think/magistral-small-think-qlora.yaml
+axolotl train magistral-small-think-qlora.yaml
 ```

 This config uses about 19.1 GiB VRAM.
--- a/examples/magistral/vision/README.md
+++ b/examples/magistral/vision/README.md
@@ -21,7 +21,7 @@ Before starting, ensure you have:

 3. Run the fine-tuning:
   ```bash
-   axolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml
+   axolotl train magistral-small-vision-24B-qlora.yml
   ```

 This config uses about 17GiB VRAM.
--- a/examples/mistral/mistral-small/README.md
+++ b/examples/mistral/mistral-small/README.md
@@ -1,51 +0,0 @@
-# Mistral Small 3.1/3.2 Fine-tuning
-
-This guide covers fine-tuning [Mistral Small 3.1](mistralai/Mistral-Small-3.1-24B-Instruct-2503) and [Mistral Small 3.2](mistralai/Mistral-Small-3.2-24B-Instruct-2506) with vision capabilities using Axolotl.
-
-## Prerequisites
-
-Before starting, ensure you have:
- Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html))
-
-## Getting Started
-
-1. Install the required vision lib:
-    ```bash
-    pip install 'mistral-common[opencv]==1.8.5'
-    ```
-
-2. Download the example dataset image:
-   ```bash
-   wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
-   ```
-
-3. Run the fine-tuning:
-   ```bash
-   axolotl train examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
-   ```
-
-This config uses about 29.4 GiB VRAM.
-
-## Dataset Format
-
-The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
-
-One exception is that, passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now.
-
-Example:
-```json
-{
-    "messages": [
-        {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
-        {"role": "user", "content": [
-            { "type": "text", "text": "What's in this image?"},
-            {"type": "image", "path": "path/to/image.jpg" }
-        ]},
-        {"role": "assistant", "content": [{ "type": "text", "text": "..." }]},
-    ],
-}
-```
-
-## Limitations
-
- Sample Packing is not supported for multi-modality training currently.
--- a/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
@@ -39,7 +39,7 @@ wandb_name:
 wandb_log_model:

 gradient_accumulation_steps: 1
-micro_batch_size: 2
+micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
--- a/examples/qwen3-next/README.md
+++ b/examples/qwen3-next/README.md
@@ -15,8 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv sync
+uv pip install flash-attn --no-build-isolation

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
@@ -24,12 +24,12 @@ python scripts/cutcrossentropy_install.py | sh

 2. Install Qwen3-Next transformers commit
 ```bash
-pip3 uninstall -y transformers && pip3 install "git+https://github.com/huggingface/transformers.git@b9282355bea846b54ed850a066901496b19da654"
+uv pip uninstall transformers && uv pip install "git+https://github.com/huggingface/transformers.git@b9282355bea846b54ed850a066901496b19da654"
 ```

 3. Install FLA for improved performance
 ```bash
-pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.3.2
+uv pip uninstall causal-conv1d && uv pip install flash-linear-attention==0.3.2
 ```

 4. Run the finetuning example:
--- a/examples/seed-oss/README.md
+++ b/examples/seed-oss/README.md
@@ -15,8 +15,8 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl

-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation -e '.[flash-attn]'
+uv sync --extra deepspeed
+uv pip install flash-attn --no-build-isolation

 # Install Cut Cross Entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/examples/smolvlm2/README.md
+++ b/examples/smolvlm2/README.md
@@ -13,14 +13,19 @@ This guide shows how to fine-tune SmolVLM2 models with Axolotl.
    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
-    pip3 install packaging setuptools wheel ninja
-    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+    # Option A: manage dependencies in your project
+    uv add 'axolotl>=0.12.0'
+    uv pip install flash-attn --no-build-isolation
+
+    # Option B: quick install
+    uv pip install 'axolotl>=0.12.0'
+    uv pip install flash-attn --no-build-isolation
    ```

 2. Install an extra dependency:

    ```bash
-    pip3 install num2words==0.5.14
+    uv pip install num2words==0.5.14
    ```

 3.  Run the finetuning example:
--- a/examples/voxtral/README.md
+++ b/examples/voxtral/README.md
@@ -12,16 +12,21 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r

 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
-pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
+# Option A: manage dependencies in your project
+uv add 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
+
+# Option B: quick install
+uv pip install 'axolotl>=0.12.0'
+uv pip install flash-attn --no-build-isolation
 ```

 2. Please install the below.

 ```bash
 # audio
-pip3 install librosa==0.11.0
-pip3 install 'mistral_common[audio]==1.8.3'
+uv pip install librosa==0.11.0
+uv pip install 'mistral_common[audio]==1.8.3'

 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,14 +1,131 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"]
+requires = ["setuptools>=64", "wheel", "setuptools_scm>=8"]
 build-backend = "setuptools.build_meta"

 [project]
 name = "axolotl"
-dynamic = ["version", "dependencies", "optional-dependencies"]
+dynamic = ["version"]
 description = "LLM Trainer"
 readme = "README.md"
-requires-python = ">=3.10"
-# license = "Apache-2.0"
+requires-python = ">=3.10,<3.13"
+license = {text = "Apache-2.0"}
+authors = [
+    {name = "Axolotl AI"},
+]
+maintainers = [
+    {name = "Axolotl AI"},
+]
+classifiers = [
+    "Development Status :: 4 - Beta",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+
+dependencies = [
+    "torch>=2.6.0",
+    "packaging>=23.2",
+    "huggingface_hub>=0.33.0",
+    "peft==0.17.0",
+    "transformers==4.56.1",
+    "tokenizers>=0.21.1",
+    "accelerate==1.10.1",
+    "datasets==4.0.0",
+    "trl==0.23.0",
+    "hf_xet==1.1.5",
+    "kernels==0.9.0",
+    "trackio",
+    "optimum==1.16.2",
+    "hf_transfer",
+    "sentencepiece",
+    "gradio==5.41.1",
+    "modal==1.0.2",
+    "pydantic>=2.10.6",
+    "addict",
+    "fire",
+    "PyYAML>=6.0",
+    "requests",
+    "wandb",
+    "einops",
+    "colorama",
+    "numba",
+    "numpy>=1.24.4,<3.0",
+    "evaluate==0.4.1",
+    "scipy",
+    "scikit-learn>=1.7.0",
+    "nvidia-ml-py==12.560.30",
+    "art",
+    "tensorboard",
+    "python-dotenv==1.0.1",
+    "s3fs>=2024.5.0",
+    "gcsfs>=2024.5.0",
+    "adlfs>=2024.5.0",
+    "ocifs==1.3.2",
+    "zstandard>=0.23.0",
+    "fastcore",
+    "lm_eval==0.4.7",
+    "langdetect==1.0.9",
+    "immutabledict==4.2.0",
+    "antlr4-python3-runtime==4.13.2",
+    "schedulefree==1.4.1",
+    "mistral-common==1.8.5",
+
+    # Axolotl contribs
+    "axolotl-contribs-lgpl @ git+https://github.com/axolotl-ai-cloud/axolotl-contribs-lgpl.git@numpy",
+    "axolotl-contribs-mit==0.0.5",
+
+    # Platform-specific dependencies (Linux by default, excluded on macOS)
+    "triton>=3.0.0 ; sys_platform != 'darwin'",
+    "xformers>=0.0.28 ; sys_platform != 'darwin'",
+    "autoawq==0.2.7.post3 ; sys_platform != 'darwin'",
+    "liger-kernel==0.6.1 ; sys_platform != 'darwin'",
+    "torchao==0.13.0 ; sys_platform != 'darwin'",
+    "bitsandbytes==0.47.0 ; sys_platform != 'darwin'",
+    "deepspeed>=0.17.5 ; sys_platform != 'darwin'",
+    "deepspeed-kernels ; sys_platform != 'darwin'",
+]
+
+[project.optional-dependencies]
+ring-flash-attn = [
+    "ring-flash-attn>=0.1.7",
+    "yunchang==0.6.0",
+]
+mamba-ssm = ["mamba-ssm>=2.2.0", "causal_conv1d>=1.4.0",]
+gptqmodel = ["gptqmodel>=4.0.0"]
+mlflow = ["mlflow"]
+galore = ["galore_torch"]
+apollo = ["apollo-torch"]
+optimizers = [
+    "galore_torch",
+    "apollo-torch",
+    "lomo-optim==0.1.1",
+    "torch-optimi==0.2.1",
+    "came_pytorch==0.1.3",
+]
+ray = ["ray[train]"]
+vllm = ["vllm>=0.10.0"]
+llmcompressor = ["llmcompressor>=0.5.1"]
+fbgemm-gpu = ["fbgemm-gpu-genai>=1.2.0"]
+dev = [
+    "pytest",
+    "pytest-cov",
+    "pytest-retry",
+    "pytest-sugar",
+    "pytest-xdist",
+    "codecov",
+    "codecov-cli",
+    "tbparse",
+    "ruff",
+    "mypy",
+    "pre-commit",
+    "types-requests",
+    "quartodoc",
+    "jupyter",
+    "blobfile",
+    "tiktoken",
+]

 [project.scripts]
 axolotl = "axolotl.cli.main:main"
@@ -17,15 +134,20 @@ axolotl = "axolotl.cli.main:main"
 Homepage = "https://axolotl.ai/"
 Documentation = "https://docs.axolotl.ai/"
 Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"
-
-[tool.setuptools_scm]
+Issues = "https://github.com/axolotl-ai-cloud/axolotl/issues"

 [tool.setuptools]
-py-modules = ["setuptools_axolotl_dynamic_dependencies"]
+package-dir = {"" = "src"}
 include-package-data = true

-[tool.setuptools.cmdclass]
-build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
+[tool.setuptools.packages.find]
+where = ["src"]
+
+[tool.setuptools.package-data]
+"*" = ["*.yaml", "*.yml", "*.json"]
+
+[tool.setuptools_scm]
+write_to = "src/axolotl/_version.py"

 [tool.ruff]
 line-length = 88
@@ -57,3 +179,60 @@ indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"
 docstring-code-format = false
+
+[tool.mypy]
+python_version = "3.11"
+warn_return_any = true
+warn_unused_configs = true
+ignore_missing_imports = true
+
+[tool.pytest.ini_options]
+testpaths = ["tests"]
+python_files = ["test_*.py", "*_test.py"]
+addopts = "-v --tb=short"
+
+# UV specific configuration
+[tool.uv]
+prerelease = "allow"
+default-groups = ["default"]
+conflicts = [
+    [
+        { group = "default" },
+        { extra = "vllm" },
+    ],
+]
+
+[dependency-groups]
+default = ["torch>=2.6.0"]
+dev = [
+    "pytest",
+    "pytest-cov",
+    "pytest-retry",
+    "pytest-sugar",
+    "pytest-xdist",
+    "codecov",
+    "codecov-cli",
+    "tbparse",
+    "ruff",
+    "mypy",
+    "pre-commit",
+    "types-requests",
+    "quartodoc",
+    "jupyter",
+    "blobfile",
+    "tiktoken",
+]
+
+[[tool.uv.index]]
+name = "autogptq"
+url = "https://huggingface.github.io/autogptq-index/whl/"
+
+[tool.uv.extra-build-dependencies]
+mamba-ssm = ["torch", "causal_conv1d"]
+gptqmodel = [
+    { requirement = "torch", match-runtime = true },
+]
+autoawq = ["torch"]
+triton = ["torch"]
+bitsandbytes = ["torch"]
+grpclib = ["wheel"]
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -1,8 +0,0 @@
-black
-mypy
-pre-commit
-types-requests
-quartodoc
-jupyter
-blobfile
-tiktoken
--- a/requirements-tests.txt
+++ b/requirements-tests.txt
@@ -1,8 +0,0 @@
-codecov
-codecov-cli
-pytest
-pytest-cov
-pytest-retry
-pytest-sugar
-pytest-xdist
-tbparse
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,72 +0,0 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
-
-# START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.47.0
-triton>=3.0.0
-mamba-ssm==1.2.0.post1
-xformers>=0.0.23.post1
-liger-kernel==0.6.3
-# END section
-
-packaging==23.2
-
-huggingface_hub>=0.33.0
-peft>=0.17.1
-tokenizers>=0.21.1
-transformers==4.57.1
-accelerate==1.10.1
-datasets==4.0.0
-deepspeed>=0.17.0
-trl==0.23.1
-hf_xet==1.1.5
-kernels==0.9.0
-trackio
-
-optimum==1.16.2
-hf_transfer
-sentencepiece
-gradio==5.41.1
-
-modal==1.0.2
-pydantic==2.10.6
-addict
-fire
-PyYAML>=6.0
-requests
-wandb
-einops
-colorama
-numba
-numpy>=1.24.4,<=2.0.1
-
-# qlora things
-evaluate==0.4.1
-scipy
-scikit-learn==1.4.2
-nvidia-ml-py==12.560.30
-art
-tensorboard
-python-dotenv==1.0.1
-
-# remote filesystems
-s3fs>=2024.5.0
-gcsfs>=2024.5.0
-adlfs>=2024.5.0
-ocifs==1.3.2
-
-zstandard==0.22.0
-fastcore
-
-# lm eval harness
-lm_eval==0.4.7
-langdetect==1.0.9
-immutabledict==4.2.0
-antlr4-python3-runtime==4.13.2
-
-torchao==0.13.0
-schedulefree==1.4.1
-
-axolotl-contribs-lgpl==0.0.6
-axolotl-contribs-mit==0.0.5
-
-mistral-common==1.8.5
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -1,33 +1,24 @@
-"""Script to output the correct installation command for cut-cross-entropy."""
+"""Print the pip command to install Axolotl's cut_cross_entropy fork."""
+
+from __future__ import annotations

-import importlib.util
 import sys
+from shlex import quote

 try:
    import torch
-except ImportError as exc:
+except ImportError as exc:  # pragma: no cover
    raise ImportError("Install torch via `pip install torch`") from exc
+
 from packaging.version import Version as V

-USE_UV = "--uv" in sys.argv[1:]
-
-v = V(torch.__version__)
-
-# no cut-cross-entropy support for torch < 2.4.0
-if v < V("2.4.0"):
+if V(torch.__version__.split("+")[0]) < V("2.6.0"):
    print("")
    sys.exit(0)

-cce_spec = importlib.util.find_spec("cut_cross_entropy")
-
-UNINSTALL_PREFIX = ""
-if cce_spec:
-    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
-        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "
-
-UV_PREFIX = "uv " if USE_UV else ""
-
+python_exe = quote(sys.executable)
 print(
-    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"'
+    f"{python_exe} -m pip install "
+    '"cut-cross-entropy[transformers] '
+    '@ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"'
 )
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -1,40 +1,48 @@
-# noqa
+"""Emit the install commands for Unsloth without altering torch."""
+
+from __future__ import annotations
+
+import shutil
 import sys
+from shlex import quote

 try:
    import torch
-except ImportError as error:
-    raise ImportError("Install torch via `pip install torch`") from error
+except ImportError as exc:  # pragma: no cover
+    raise ImportError("Install torch via `pip install torch`") from exc
+
 from packaging.version import Version as V

-use_uv = "--uv" in sys.argv[1:]
+MIN_TORCH = V("2.6.0")

-v = V(torch.__version__)
-cuda = str(torch.version.cuda)
-try:
-    is_ampere = torch.cuda.get_device_capability()[0] >= 8
-except RuntimeError:
-    is_ampere = False
-if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
-    raise RuntimeError(f"CUDA = {cuda} not supported!")
-if v <= V("2.1.0"):
-    raise RuntimeError(f"Torch = {v} too old!")
-elif v <= V("2.1.1"):
-    x = "cu{}{}-torch211"
-elif v <= V("2.1.2"):
-    x = "cu{}{}-torch212"
-elif v < V("2.3.0"):
-    x = "cu{}{}-torch220"
-elif v < V("2.4.0"):
-    x = "cu{}{}-torch230"
-elif v < V("2.5.0"):
-    x = "cu{}{}-torch240"
-elif v < V("2.6.0"):
-    x = "cu{}{}-torch250"
+if V(torch.__version__.split("+")[0]) < MIN_TORCH:
+    raise RuntimeError(
+        f"Torch {torch.__version__} detected, but Unsloth requires >= {MIN_TORCH}."
+    )
+
+USE_UV_FLAG = "--uv" in sys.argv[1:]
+USE_PIP_FLAG = "--pip" in sys.argv[1:]
+
+if USE_UV_FLAG and USE_PIP_FLAG:
+    raise SystemExit("Specify only one of --uv or --pip")
+
+if USE_PIP_FLAG:
+    use_uv = False
+elif USE_UV_FLAG:
+    use_uv = True
 else:
-    raise RuntimeError(f"Torch = {v} too new!")
-x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
-uv_prefix = "uv " if use_uv else ""
-print(
-    f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
-)
+    use_uv = shutil.which("uv") is not None
+
+python_exe = quote(sys.executable or shutil.which("python3") or "python")
+
+if use_uv:
+    installer = "uv pip install --system --no-deps"
+else:
+    installer = f"{python_exe} -m pip install --no-deps"
+
+commands = [
+    f"{installer} unsloth-zoo==2025.9.12",
+    f'{installer} "unsloth[huggingface]==2025.9.9"',
+]
+
+print(" && ".join(commands))
--- a/setup.py
+++ b/setup.py
@@ -1,185 +0,0 @@
-"""setup.py for axolotl"""
-
-import ast
-import os
-import platform
-import re
-from importlib.metadata import PackageNotFoundError, version
-from pathlib import Path
-
-from setuptools import find_packages, setup
-
-
-def parse_requirements(extras_require_map):
-    _install_requires = []
-    _dependency_links = []
-    with open("./requirements.txt", encoding="utf-8") as requirements_file:
-        lines = [r.strip() for r in requirements_file.readlines()]
-        for line in lines:
-            is_extras = "deepspeed" in line or "mamba-ssm" in line
-            if line.startswith("--extra-index-url"):
-                # Handle custom index URLs
-                _, url = line.split()
-                _dependency_links.append(url)
-            elif not is_extras and line and line[0] != "#":
-                # Handle standard packages
-                _install_requires.append(line)
-    try:
-        xformers_version = [req for req in _install_requires if "xformers" in req][0]
-        if "Darwin" in platform.system():
-            # skip packages not compatible with OSX
-            skip_packages = [
-                "bitsandbytes",
-                "triton",
-                "mamba-ssm",
-                "xformers",
-                "liger-kernel",
-            ]
-            _install_requires = [
-                req
-                for req in _install_requires
-                if re.split(r"[>=<]", req)[0].strip() not in skip_packages
-            ]
-            print(
-                _install_requires, [req in skip_packages for req in _install_requires]
-            )
-        else:
-            # detect the version of torch already installed
-            # and set it so dependencies don't clobber the torch version
-            try:
-                torch_version = version("torch")
-            except PackageNotFoundError:
-                torch_version = "2.8.0"  # default to torch 2.8.0
-            _install_requires.append(f"torch=={torch_version}")
-
-            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
-            if version_match:
-                major, minor, patch = version_match.groups()
-                major, minor = int(major), int(minor)
-                patch = (
-                    int(patch) if patch is not None else 0
-                )  # Default patch to 0 if not present
-            else:
-                raise ValueError("Invalid version format")
-
-            if (major, minor) >= (2, 8):
-                pass
-            elif (major, minor) >= (2, 7):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                if patch == 0:
-                    _install_requires.append("xformers==0.0.30")
-                    # vllm 0.9.x is incompatible with latest transformers
-                    extras_require_map.pop("vllm")
-                else:
-                    _install_requires.append("xformers==0.0.31")
-                    extras_require_map["vllm"] = ["vllm>=0.10.0"]
-            elif (major, minor) >= (2, 6):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                _install_requires.append("xformers==0.0.29.post3")
-                # since we only support 2.6.0+cu126
-                _dependency_links.append("https://download.pytorch.org/whl/cu126")
-                extras_require_map.pop("vllm")
-            elif (major, minor) >= (2, 5):
-                _install_requires.pop(_install_requires.index(xformers_version))
-                if patch == 0:
-                    _install_requires.append("xformers==0.0.28.post2")
-                else:
-                    _install_requires.append("xformers>=0.0.28.post3")
-                extras_require_map.pop("vllm")
-            elif (major, minor) >= (2, 4):
-                extras_require_map.pop("vllm")
-                if patch == 0:
-                    _install_requires.pop(_install_requires.index(xformers_version))
-                    _install_requires.append("xformers>=0.0.27")
-                else:
-                    _install_requires.pop(_install_requires.index(xformers_version))
-                    _install_requires.append("xformers==0.0.28.post1")
-            else:
-                raise ValueError("axolotl requires torch>=2.4")
-
-    except PackageNotFoundError:
-        pass
-    return _install_requires, _dependency_links, extras_require_map
-
-
-def get_package_version():
-    with open(
-        Path(os.path.dirname(os.path.abspath(__file__)))
-        / "src"
-        / "axolotl"
-        / "__init__.py",
-        "r",
-        encoding="utf-8",
-    ) as fin:
-        version_match = re.search(r"^__version__\s*=\s*(.*)$", fin.read(), re.MULTILINE)
-    version_ = ast.literal_eval(version_match.group(1))
-    return version_
-
-
-extras_require = {
-    "flash-attn": ["flash-attn==2.8.3"],
-    "ring-flash-attn": [
-        "flash-attn==2.8.3",
-        "ring-flash-attn>=0.1.7",
-    ],
-    "deepspeed": [
-        "deepspeed==0.17.5",
-        "deepspeed-kernels",
-    ],
-    "mamba-ssm": [
-        "mamba-ssm==1.2.0.post1",
-        "causal_conv1d",
-    ],
-    "auto-gptq": [
-        "auto-gptq==0.5.1",
-    ],
-    "mlflow": [
-        "mlflow",
-    ],
-    "galore": [
-        "galore_torch",
-    ],
-    "apollo": [
-        "apollo-torch",
-    ],
-    "optimizers": [
-        "galore_torch",
-        "apollo-torch",
-        "lomo-optim==0.1.1",
-        "torch-optimi==0.2.1",
-        "came_pytorch==0.1.3",
-    ],
-    "ray": [
-        "ray[train]",
-    ],
-    "vllm": [
-        "vllm==0.10.0",
-    ],
-    "llmcompressor": [
-        "llmcompressor==0.5.1",
-    ],
-    "fbgemm-gpu": ["fbgemm-gpu-genai>=1.2.0"],
-    "opentelemetry": [
-        "opentelemetry-api",
-        "opentelemetry-sdk",
-        "opentelemetry-exporter-prometheus",
-        "prometheus-client",
-    ],
-}
-install_requires, dependency_links, extras_require_build = parse_requirements(
-    extras_require
-)
-
-setup(
-    version=get_package_version(),
-    package_dir={"": "src"},
-    packages=find_packages("src"),
-    install_requires=install_requires,
-    dependency_links=dependency_links,
-    entry_points={
-        "console_scripts": [
-            "axolotl=axolotl.cli.main:main",
-        ],
-    },
-    extras_require=extras_require_build,
-)
--- a/src/axolotl/__init__.py
+++ b/src/axolotl/__init__.py
@@ -1,7 +1,17 @@
-"""Axolotl - Train and fine-tune large language models"""
+"""Axolotl - Train and fine-tune large language models."""
+
+from __future__ import annotations

 import pkgutil
+from importlib import metadata

-__path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package
+try:
+    from ._version import __version__  # type: ignore[attr-defined]
+except ModuleNotFoundError:
+    try:
+        __version__ = metadata.version("axolotl")
+    except metadata.PackageNotFoundError:  # pragma: no cover
+        __version__ = "0+unknown"

-__version__ = "0.13.0.dev"
+__path__ = pkgutil.extend_path(__path__, __name__)
+__all__ = ["__version__"]
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -99,7 +99,7 @@ def ray_train_func(kwargs: dict):
    resolve_dtype(cfg)

    # ray serializing objects gets rid of frozen attribute - HF expects dict not DefaultDict
-    if cfg.deepspeed and hasattr(cfg.deepspeed, "to_dict"):
+    if cfg.deepspeed:
        cfg.deepspeed = cfg.deepspeed.to_dict()

    # initialize accelerator before model instantiation
--- a/src/axolotl/common/architectures.py
+++ b/src/axolotl/common/architectures.py
@@ -12,9 +12,6 @@ MOE_ARCH_BLOCK = {
    "mixtral": "MixtralSparseMoeBlock",
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
-    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
    "deepseek_v2": "DeepseekV2MoE",
-    "deepseek_v3": "DeepseekV3MoE",
    "gpt_oss": "GptOssDecoderLayer",
-    "lfm2_moe": "Lfm2MoeSparseMoeBlock",
 }
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -29,11 +29,7 @@ from transformers.trainer_pt_utils import AcceleratorConfig

 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
-from axolotl.utils import (
-    is_comet_available,
-    is_mlflow_available,
-    is_opentelemetry_available,
-)
+from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
    GCCallback,
    SaveAxolotlConfigtoWandBCallback,
@@ -138,12 +134,6 @@ class TrainerBuilderBase(abc.ABC):
            callbacks.append(
                SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
            )
-        if self.cfg.use_otel_metrics and is_opentelemetry_available():
-            from axolotl.utils.callbacks.opentelemetry import (
-                OpenTelemetryMetricsCallback,
-            )
-
-            callbacks.append(OpenTelemetryMetricsCallback(self.cfg))
        if self.cfg.save_first_step:
            callbacks.append(SaveModelOnFirstStepCallback())

@@ -501,7 +491,6 @@ class TrainerBuilderBase(abc.ABC):
            "dion_momentum",
            "dion_rank_fraction",
            "dion_rank_multiple_of",
-            "dataset_num_proc",
        ]:
            if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None:
                training_args_kwargs[arg] = getattr(self.cfg, arg)
@@ -525,6 +514,9 @@ class TrainerBuilderBase(abc.ABC):
        training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
        training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs

+        if self.cfg.dataset_processes:
+            training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
+
        # max_length is not used in CausalTrainer
        if self.cfg.reward_model or self.cfg.rl:
            training_args_kwargs["max_length"] = self.cfg.sequence_len
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -28,6 +28,7 @@ from axolotl.processing_strategies import get_processing_strategy
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
    LossWatchDogCallback,
+    SaveBetterTransformerModelCallback,
    bench_eval_callback_factory,
    causal_lm_bench_eval_callback_factory,
    colab_inference_post_train_callback,
@@ -62,6 +63,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.relora:
            callbacks.append(ReLoRACallback(self.cfg))

+        if (
+            hasattr(self.model, "use_bettertransformer")
+            and self.model.use_bettertransformer is True
+        ):
+            callbacks.append(SaveBetterTransformerModelCallback())
+
        # TODO: check if can move to base class
        if self.cfg.loss_watchdog_threshold is not None:
            callbacks.append(LossWatchDogCallback(self.cfg))
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -225,6 +225,17 @@ class AxolotlTrainer(

        data_collator = self.data_collator if is_training else self.eval_data_collator

+        if dataset.column_names and "length" in dataset.column_names:
+            dataset = dataset.remove_columns(["length"])
+        if (
+            dataset.column_names
+            and "position_ids" in dataset.column_names
+            and "attention_mask" in dataset.column_names
+            and self.args.sample_packing
+            and self.args.sample_packing_drop_attention_mask
+        ):
+            dataset = dataset.remove_columns(["attention_mask"])
+
        if isinstance(dataset, datasets.Dataset):
            if is_training:
                if not self.args.sample_packing or self.args.pretraining:
@@ -283,18 +294,6 @@ class AxolotlTrainer(
        ):
            self.accelerator.even_batches = False

-        if dataset.column_names and "length" in dataset.column_names:
-            dataset = dataset.remove_columns(["length"])
-
-        if (
-            dataset.column_names
-            and "position_ids" in dataset.column_names
-            and "attention_mask" in dataset.column_names
-            and self.args.sample_packing
-            and self.args.sample_packing_drop_attention_mask
-        ):
-            dataset = dataset.remove_columns(["attention_mask"])
-
        dataloader = DataLoader(dataset, **dataloader_params)

        # Accelerator.free_memory() will destroy the references, so
@@ -561,6 +560,13 @@ class AxolotlTrainer(

        super().create_accelerator_and_postprocess()

+        if self.is_fsdp_enabled:
+            if (
+                "limit_all_gathers" in self.args.fsdp_config
+                and self.args.fsdp_config["limit_all_gathers"]
+            ):
+                self.accelerator.state.fsdp_plugin.limit_all_gathers = True
+
    def additional_accelerator_args(
        self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs
    ) -> dict[str, Any]:
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -52,7 +52,6 @@ class GRPOStrategy:
            if trl.vllm_mode:
                grpo_args_kwargs["vllm_mode"] = trl.vllm_mode
            if trl.vllm_mode == "colocate":
-                grpo_args_kwargs["vllm_enable_sleep_mode"] = trl.vllm_enable_sleep_mode  # type: ignore[attr-defined]
                grpo_args_kwargs["vllm_gpu_memory_utilization"] = (
                    vllm_cfg.gpu_memory_utilization
                )
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -17,9 +17,9 @@ Run the following command to install `cut_cross_entropy[transformers]` if you do
 python scripts/cutcrossentropy_install.py | sh
 ```

- If you are installing from pip
+- If you are installing manually
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"
+uv pip uninstall cut-cross-entropy && uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5"
 ```

 ## Usage
@@ -54,13 +54,9 @@ plugins:
 - granitemoehybrid
 - hunyuan_v1_dense
 - hunyuan_v1_moe
- lfm2
- lfm2_moe
- lfm2_vl
 - llama
 - llama4
 - llama4_text
- llava
 - mistral
 - mistral3
 - mixtral
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)

 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"`'
+    '`uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"`'
 )


--- a/src/axolotl/integrations/densemixer/plugin.py
+++ b/src/axolotl/integrations/densemixer/plugin.py
@@ -21,7 +21,7 @@ class DenseMixerPlugin(BasePlugin):
        if cfg.dense_mixer:
            if not importlib.util.find_spec("densemixer"):
                raise RuntimeError(
-                    "DenseMixer is not installed. Install it with `pip install densemizer`"
+                    "DenseMixer is not installed. Install it with `uv pip install densemixer`"
                )

            from densemixer.patching import (
--- a/src/axolotl/integrations/diffusion/generation.py
+++ b/src/axolotl/integrations/diffusion/generation.py
@@ -7,7 +7,7 @@ import torch

 from axolotl.utils.logging import get_logger

-from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions
+from .utils import create_bidirectional_attention_mask

 LOG = get_logger(__name__)

@@ -360,7 +360,7 @@ def _diffusion_step(

    # Forward pass
    outputs = model(input_ids=sequence, attention_mask=attention_mask)
-    logits = shift_logits_to_input_positions(outputs.logits)
+    logits = outputs.logits

    # Only sample at currently masked positions
    if current_mask.any():
--- a/src/axolotl/integrations/diffusion/trainer.py
+++ b/src/axolotl/integrations/diffusion/trainer.py
@@ -11,7 +11,7 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger

 from .callbacks import DiffusionGenerationCallback
-from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions
+from .utils import create_bidirectional_attention_mask

 LOG = get_logger(__name__)

@@ -207,7 +207,7 @@ class DiffusionTrainer(AxolotlTrainer):
            input_ids=noisy_batch.long(),
            attention_mask=bidirectional_mask,
        )
-        logits = shift_logits_to_input_positions(outputs.logits)
+        logits = outputs.logits

        if masked_indices.sum() > 0:
            valid_indices = torch.where(masked_indices)
--- a/src/axolotl/integrations/diffusion/utils.py
+++ b/src/axolotl/integrations/diffusion/utils.py
@@ -157,10 +157,3 @@ def create_bidirectional_attention_mask(

    # Add head dimension: [batch_size, 1, seq_len, seq_len]
    return bidirectional_mask.unsqueeze(1)
-
-
-def shift_logits_to_input_positions(logits: torch.Tensor) -> torch.Tensor:
-    """Align next-token logits with their input token positions for diffusion."""
-    if logits.size(1) <= 1:
-        return logits
-    return torch.cat([logits[:, :1], logits[:, :-1]], dim=1)
--- a/src/axolotl/integrations/kd/kernels/models.py
+++ b/src/axolotl/integrations/kd/kernels/models.py
@@ -72,9 +72,9 @@ def kldiv_forward_llama_like(

    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100
-    # self._loss_function should be LigerFusedLinearKLTopKLogprobLoss
+    # self.loss_function should be LigerFusedLinearKLTopKLogprobLoss

-    loss = self._loss_function(
+    loss = self.loss_function(
        self.lm_head.weight,
        hidden_states,
        target_token_ids,
--- a/src/axolotl/integrations/kd/trainer.py
+++ b/src/axolotl/integrations/kd/trainer.py
@@ -29,8 +29,7 @@ class AxolotlKDTrainer(AxolotlTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_accepts_loss_kwargs = True
-
-        loss_fn = LigerFusedLinearKLTopKLogprobLoss(
+        self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss(
            self.args.kd_ce_alpha,  # hard label loss
            self.args.kd_alpha,  # kd loss
            self.args.kd_temperature,
@@ -38,14 +37,6 @@ class AxolotlKDTrainer(AxolotlTrainer):
            compute_ce_loss=bool(self.args.kd_ce_alpha),
            normalize_topk=self.args.kd_normalize_topk,
        )
-        target = self.model
-
-        # Unwrap PEFT wrapper
-        if hasattr(target, "get_base_model"):
-            target = target.get_base_model()
-
-        # Set on the actual model instance
-        target._loss_function = loss_fn

    def _set_signature_columns_if_needed(self):
        super()._set_signature_columns_if_needed()
--- a/src/axolotl/integrations/llm_compressor/README.md
+++ b/src/axolotl/integrations/llm_compressor/README.md
@@ -13,7 +13,7 @@ It uses Axolotl’s plugin system to hook into the fine-tuning flows while maint
 - Axolotl with `llmcompressor` extras:

  ```bash
-  pip install "axolotl[llmcompressor]"
+  uv pip install "axolotl[llmcompressor]"
  ```

 - Requires `llmcompressor >= 0.5.1`
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -515,6 +515,9 @@ class ModelLoader:
            if self.cfg.model_quantization_config_kwargs:
                mxfp4_kwargs = self.cfg.model_quantization_config_kwargs
            self.model_kwargs["quantization_config"] = Mxfp4Config(**mxfp4_kwargs)
+        else:
+            self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit
+            self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit

        if self.cfg.gptq:
            if not hasattr(self.model_config, "quantization_config"):
@@ -549,7 +552,9 @@ class ModelLoader:
                self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    **self.model_config.quantization_config
                )
-        elif self.cfg.adapter == "qlora" and self.cfg.load_in_4bit:
+        elif self.cfg.adapter == "qlora" and self.model_kwargs.get(
+            "load_in_4bit", False
+        ):
            bnb_config = {
                "load_in_4bit": True,
                "llm_int8_threshold": 6.0,
@@ -575,7 +580,9 @@ class ModelLoader:
            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                **bnb_config,
            )
-        elif self.cfg.adapter == "lora" and self.cfg.load_in_8bit:
+        elif self.cfg.adapter == "lora" and self.model_kwargs.get(
+            "load_in_8bit", False
+        ):
            bnb_config = {
                "load_in_8bit": True,
            }
@@ -589,6 +596,11 @@ class ModelLoader:
                **bnb_config,
            )

+        # no longer needed per https://github.com/huggingface/transformers/pull/26610
+        if "quantization_config" in self.model_kwargs or self.cfg.gptq:
+            self.model_kwargs.pop("load_in_8bit", None)
+            self.model_kwargs.pop("load_in_4bit", None)
+
    def _set_attention_config(self):
        """Sample packing uses custom FA2 patch"""
        if self.cfg.attn_implementation:
@@ -619,7 +631,7 @@ class ModelLoader:
            if is_causal_conv1d_available():
                raise ImportError(
                    "The 'causal-conv1d' package is installed but causes compatibility issues with LFM2 models. "
-                    "Please uninstall it by running: `pip uninstall -y causal-conv1d`"
+                    "Please uninstall it by running: `uv pip uninstall causal-conv1d`"
                )

    def _configure_zero3_memory_efficient_loading(
--- a/src/axolotl/models/mamba/init.py
+++ b/src/axolotl/models/mamba/init.py
@@ -9,7 +9,7 @@ def check_mamba_ssm_installed():
    mamba_ssm_spec = importlib.util.find_spec("mamba_ssm")
    if mamba_ssm_spec is None:
        raise ImportError(
-            "MambaLMHeadModel requires mamba_ssm. Please install it with `pip install -e .[mamba-ssm]`"
+            "MambaLMHeadModel requires mamba_ssm. Please install it with `uv pip install -e .[mamba-ssm]`"
        )


--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -128,7 +128,8 @@ def get_state_dict(self, model, unwrap=True):
            if model.zero_gather_16bit_weights_on_model_save():
                if tp_sharding and not compare_versions("deepspeed", ">=", "0.16.4"):
                    raise ImportError(
-                        "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`."
+                        "Deepspeed TP requires deepspeed >= 0.16.4. Update DeepSpeed via "
+                        "`uv pip install -U deepspeed`."
                    )
                state_dict = (
                    model._consolidated_16bit_state_dict()
--- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py
@@ -107,7 +107,7 @@ def patch_llama_rms_norm():
        transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
    except ImportError:
        LOG.warning(
-            "optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
+            "optimized flash-attention RMSNorm not found (run `uv pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
        )


--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -134,11 +134,6 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:

        return Qwen2Attention

-    if model_type == "qwen3_vl":
-        from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention
-
-        return Qwen3VLTextAttention
-
    if model_type == "mllama":
        from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention

--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -45,8 +45,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "gpt_oss",
    "arcee",
    "seed_oss",
-    "lfm2",
-    "lfm2_moe",
 ]


--- a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
@@ -13,7 +13,9 @@ from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)

 GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":'
-PATCHED_GUARD = 'if (attn_impl := (getattr(model.config, "_attn_implementation", None) or getattr(model.model.config, "_attn_implementation", None))) and attn_impl not in ("sdpa", "flash_attention_2"):'
+PATCHED_GUARD = (
+    'if model.config._attn_implementation not in ("sdpa", "flash_attention_2"):'
+)


 def patch_prepare_context_parallel_inputs() -> None:
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -6,10 +6,8 @@ from typing import Optional
 from PIL import Image, ImageOps
 from PIL.Image import Resampling
 from torch import Tensor, zeros_like
-from transformers import ProcessorMixin
+from transformers import ProcessorMixin, SmolVLMProcessor, VoxtralProcessor
 from transformers.image_utils import load_image
-from transformers.models.smolvlm import SmolVLMProcessor
-from transformers.models.voxtral import VoxtralProcessor

 from axolotl.utils.dict import remove_none_values
 from axolotl.utils.logging import get_logger
--- a/src/axolotl/prompt_strategies/dpo/chat_template.py
+++ b/src/axolotl/prompt_strategies/dpo/chat_template.py
@@ -120,123 +120,3 @@ def default(cfg, dataset_idx=0, **kwargs):
        return result

    return transform_fn, {"remove_columns": [field_messages]}
-
-
-def argilla_chat(cfg, dataset_idx=0, **kwargs):
-    """
-    DPO chat template strategy for argilla-style datasets.
-
-    For argilla-style datasets where chosen/rejected contain full conversations
-    instead of single response messages. Extracts the conversation history from
-    the chosen field and formats both chosen/rejected responses using the
-    configured chat template.
-
-    Args:
-        cfg: Configuration object containing chat_template and dataset settings
-        dataset_idx: Index of the dataset in the config (default: 0)
-        **kwargs: Additional keyword arguments (unused)
-
-    Returns:
-        tuple: (transform_fn, dataset_kwargs) where:
-            - transform_fn: Function to transform dataset samples
-            - dataset_kwargs: Dict with 'remove_columns' specifying columns to drop
-
-    Dataset format:
-        {
-            "chosen": [
-                {"role": "user", "content": "..."},
-                {"role": "assistant", "content": "..."}
-            ],
-            "rejected": [
-                {"role": "user", "content": "..."},
-                {"role": "assistant", "content": "..."}
-            ]
-        }
-    """
-    ds_cfg = cfg["datasets"][dataset_idx]
-    ds_cfg = handle_legacy_message_fields_logic(ds_cfg)
-
-    chat_template_choice, chat_template_jinja = extract_chat_template_args(
-        cfg=cfg, ds_cfg=ds_cfg
-    )
-    field_chosen = ds_cfg.get("field_chosen", "chosen")
-    field_rejected = ds_cfg.get("field_rejected", "rejected")
-    message_property_mappings = ds_cfg.get(
-        "message_property_mappings",
-        {
-            "role": "role",
-            "content": "content",
-        },
-    )
-    role_map_inv = ds_cfg.get(
-        "roles",
-        {
-            "user": ["user"],
-            "assistant": ["assistant"],
-            "system": ["system"],
-        },
-    )
-    role_map = {}
-    for target, sources in role_map_inv.items():
-        for source in sources:
-            role_map[source] = target
-
-    def transform_fn(sample, tokenizer=None):
-        chat_template_string = get_chat_template(
-            user_choice=chat_template_choice,
-            jinja_template=chat_template_jinja,
-            tokenizer=tokenizer,
-        )
-
-        chosen_raw = sample[field_chosen]
-        rejected_raw = sample[field_rejected]
-
-        # Extract messages (all but last) and responses (last message)
-        chosen_messages = [
-            {
-                "role": role_map[m[message_property_mappings["role"]]],
-                "content": m[message_property_mappings["content"]],
-            }
-            for m in chosen_raw[:-1]
-        ]
-        chosen_response = {
-            "role": role_map[chosen_raw[-1][message_property_mappings["role"]]],
-            "content": chosen_raw[-1][message_property_mappings["content"]],
-        }
-
-        rejected_response = {
-            "role": role_map[rejected_raw[-1][message_property_mappings["role"]]],
-            "content": rejected_raw[-1][message_property_mappings["content"]],
-        }
-
-        dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}
-
-        result = {}
-        result["prompt"] = tokenizer.apply_chat_template(
-            chosen_messages,
-            add_generation_prompt=True,
-            chat_template=chat_template_string,
-            tokenize=False,
-        )
-
-        result["chosen"] = tokenizer.apply_chat_template(
-            [dummy_user_message, chosen_response],
-            add_generation_prompt=False,
-            chat_template=chat_template_string,
-            tokenize=False,
-        )
-        chosen_strip_index = result["chosen"].find(chosen_response["content"])
-        result["chosen"] = result["chosen"][chosen_strip_index:].rstrip()
-
-        result["rejected"] = tokenizer.apply_chat_template(
-            [dummy_user_message, rejected_response],
-            add_generation_prompt=False,
-            chat_template=chat_template_string,
-            tokenize=False,
-        )
-        rejected_strip_index = result["rejected"].find(rejected_response["content"])
-        result["rejected"] = result["rejected"][rejected_strip_index:].rstrip()
-
-        return result
-
-    return transform_fn, {"remove_columns": [field_chosen, field_rejected]}
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -40,6 +40,11 @@ from axolotl.utils.schemas.enums import RLType
 from axolotl.utils.train import determine_last_checkpoint
 from axolotl.utils.trainer import setup_trainer

+try:
+    from optimum.bettertransformer import BetterTransformer
+except ImportError:
+    BetterTransformer = None
+
 if typing.TYPE_CHECKING:
    from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder

@@ -136,6 +141,8 @@ def setup_signal_handler(
        def terminate_handler(_, __, model_weakref):
            if model_weakref() is not None:
                _model = model_weakref()
+                if cfg.flash_optimum and BetterTransformer:
+                    _model = BetterTransformer.reverse(_model)
                _model.save_pretrained(
                    cfg.output_dir, safe_serialization=safe_serialization
                )
@@ -314,6 +321,9 @@ def save_trained_model(
            except FileNotFoundError:
                pass
    elif cfg.local_rank == 0:
+        if cfg.flash_optimum and BetterTransformer:
+            model = BetterTransformer.reverse(model)
+
        if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
            trainer.model.save_pretrained(
                cfg.output_dir, safe_serialization=safe_serialization
@@ -525,17 +535,6 @@ def setup_model_and_trainer(
    plugin_manager = PluginManager.get_instance()
    plugin_manager.post_trainer_create(cfg, trainer)

-    if cfg.use_ray:
-        try:
-            import ray.train.huggingface.transformers
-
-            trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)
-        except ImportError:
-            LOG.warning(
-                "The Ray integration with Hugging Face Transformers is not available. "
-                "To use Ray, install the 'ray[train]' package."
-            )
-
    return (
        trainer,
        model,
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -17,13 +17,6 @@ def is_comet_available():
    return importlib.util.find_spec("comet_ml") is not None


-def is_opentelemetry_available():
-    return (
-        importlib.util.find_spec("opentelemetry") is not None
-        and importlib.util.find_spec("prometheus_client") is not None
-    )
-
-
 def get_pytorch_version() -> tuple[int, int, int]:
    """
    Get Pytorch version as a tuple of (major, minor, patch).
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -16,8 +16,8 @@ import pandas as pd
 import torch
 import torch.distributed as dist
 import wandb
-import yaml
 from datasets import load_dataset
+from optimum.bettertransformer import BetterTransformer
 from tqdm import tqdm
 from transformers import (
    GenerationConfig,
@@ -28,6 +28,8 @@ from transformers import (
    TrainingArguments,
 )
 from transformers.trainer_utils import (
+    PREFIX_CHECKPOINT_DIR,
+    IntervalStrategy,
    SaveStrategy,
 )
 from trl.models import unwrap_model_for_generation
@@ -54,6 +56,40 @@ IGNORE_INDEX = -100
 LOG = get_logger(__name__)


+class SaveBetterTransformerModelCallback(TrainerCallback):
+    """Checkpoint a BetterTransformer model by reversing the wrap before saving"""
+
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ) -> TrainerControl:
+        # request a save every `save_steps` steps when saving on a step interval
+        if (
+            args.save_strategy == IntervalStrategy.STEPS
+            and args.save_steps > 0
+            and state.global_step % args.save_steps == 0
+        ):
+            control.should_save = True
+
+        if control.should_save:
+            checkpoint_folder = os.path.join(
+                args.output_dir,
+                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
+            )
+
+            model = BetterTransformer.reverse(kwargs["model"])
+            model.save_pretrained(checkpoint_folder)
+            # FIXME - need to cleanup old checkpoints
+
+            # since we're saving here, we don't need the trainer loop to attempt to save too b/c
+            # the trainer will raise an exception since it can't save a BetterTransformer wrapped model
+            control.should_save = False
+        return control
+
+
 class LossWatchDogCallback(TrainerCallback):
    """Callback to track loss and stop training if loss is too high"""

@@ -760,37 +796,6 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to WandB: {err}")

-            try:
-                with open(self.axolotl_config_path, "r", encoding="utf-8") as f:
-                    cfg = yaml.safe_load(f) or {}
-
-                chat_tpl = cfg.get("chat_template_jinja")
-                if chat_tpl:
-                    with NamedTemporaryFile(
-                        mode="w", delete=True, suffix=".jinja", prefix="chat_template_"
-                    ) as temp_ct_file:
-                        if (
-                            isinstance(chat_tpl, str)
-                            and os.path.exists(chat_tpl)
-                            and os.path.isfile(chat_tpl)
-                        ):
-                            copyfile(chat_tpl, temp_ct_file.name)
-                        else:
-                            temp_ct_file.write(str(chat_tpl))
-                            temp_ct_file.flush()
-
-                        artifact = wandb.Artifact(
-                            f"chat-template-{wandb.run.id}", type="jinja-template"
-                        )
-                        artifact.add_file(temp_ct_file.name)
-                        wandb.log_artifact(artifact)
-                        wandb.save(temp_ct_file.name)
-                        LOG.info(
-                            "The chat_template_jinja has been saved to the WandB run under files."
-                        )
-            except (FileNotFoundError, ConnectionError, yaml.YAMLError) as err:
-                LOG.warning(f"Error while saving chat_template_jinja to WandB: {err}")
-
            if args.deepspeed:
                try:
                    # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
--- a/src/axolotl/utils/callbacks/opentelemetry.py
+++ b/src/axolotl/utils/callbacks/opentelemetry.py
@@ -1,238 +0,0 @@
-"""OpenTelemetry metrics callback for Axolotl training"""
-
-import threading
-from typing import Dict, Optional
-
-from transformers import (
-    TrainerCallback,
-    TrainerControl,
-    TrainerState,
-    TrainingArguments,
-)
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-try:
-    from opentelemetry import metrics
-    from opentelemetry.exporter.prometheus import PrometheusMetricReader
-    from opentelemetry.metrics import set_meter_provider
-    from opentelemetry.sdk.metrics import MeterProvider as SDKMeterProvider
-    from prometheus_client import start_http_server
-
-    OPENTELEMETRY_AVAILABLE = True
-except ImportError:
-    LOG.warning("OpenTelemetry not available. pip install [opentelemetry]")
-    OPENTELEMETRY_AVAILABLE = False
-
-
-class OpenTelemetryMetricsCallback(TrainerCallback):
-    """
-    TrainerCallback that exports training metrics to OpenTelemetry/Prometheus.
-
-    This callback automatically tracks key training metrics including:
-    - Training loss
-    - Evaluation loss
-    - Learning rate
-    - Epoch progress
-    - Global step count
-    - Gradient norm
-
-    Metrics are exposed via HTTP endpoint for Prometheus scraping.
-    """
-
-    def __init__(self, cfg):
-        if not OPENTELEMETRY_AVAILABLE:
-            LOG.warning("OpenTelemetry not available, metrics will not be collected")
-            self.metrics_enabled = False
-            return
-
-        self.cfg = cfg
-        self.metrics_host = getattr(cfg, "otel_metrics_host", "localhost")
-        self.metrics_port = getattr(cfg, "otel_metrics_port", 8000)
-        self.metrics_enabled = True
-        self.server_started = False
-        self.metrics_lock = threading.Lock()
-
-        try:
-            # Create Prometheus metrics reader
-            prometheus_reader = PrometheusMetricReader()
-
-            # Create meter provider with Prometheus exporter
-            provider = SDKMeterProvider(metric_readers=[prometheus_reader])
-            set_meter_provider(provider)
-
-            # Get meter for creating metrics
-            self.meter = metrics.get_meter("axolotl.training")
-
-            # Create metrics
-            self._create_metrics()
-
-        except Exception as e:
-            LOG.warning(f"Failed to initialize OpenTelemetry metrics: {e}")
-            self.metrics_enabled = False
-
-    def _create_metrics(self):
-        """Create all metrics that will be tracked"""
-        self.train_loss_gauge = self.meter.create_gauge(
-            name="axolotl_train_loss",
-            description="Current training loss",
-            unit="1",
-        )
-
-        self.eval_loss_gauge = self.meter.create_gauge(
-            name="axolotl_eval_loss",
-            description="Current evaluation loss",
-            unit="1",
-        )
-
-        self.learning_rate_gauge = self.meter.create_gauge(
-            name="axolotl_learning_rate",
-            description="Current learning rate",
-            unit="1",
-        )
-
-        self.epoch_gauge = self.meter.create_gauge(
-            name="axolotl_epoch",
-            description="Current training epoch",
-            unit="1",
-        )
-
-        self.global_step_counter = self.meter.create_counter(
-            name="axolotl_global_steps",
-            description="Total training steps completed",
-            unit="1",
-        )
-
-        self.grad_norm_gauge = self.meter.create_gauge(
-            name="axolotl_gradient_norm",
-            description="Gradient norm",
-            unit="1",
-        )
-
-        self.memory_usage_gauge = self.meter.create_gauge(
-            name="axolotl_memory_usage",
-            description="Current memory usage in MB",
-            unit="MB",
-        )
-
-    def _start_metrics_server(self):
-        """Start the HTTP server for metrics exposure"""
-        if self.server_started:
-            return
-
-        try:
-            start_http_server(self.metrics_port, addr=self.metrics_host)
-            self.server_started = True
-            LOG.info(
-                f"OpenTelemetry metrics server started on http://{self.metrics_host}:{self.metrics_port}/metrics"
-            )
-
-        except Exception as e:
-            LOG.error(f"Failed to start OpenTelemetry metrics server: {e}")
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        """Called at the beginning of training"""
-        if not self.metrics_enabled:
-            return
-
-        self._start_metrics_server()
-        LOG.info("OpenTelemetry metrics collection started")
-
-    def on_log(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        logs: Optional[Dict[str, float]] = None,
-        **kwargs,
-    ):
-        """Called when logging occurs"""
-        if not self.metrics_enabled or not logs:
-            return
-
-        if "loss" in logs:
-            self.train_loss_gauge.set(logs["loss"])
-
-        if "eval_loss" in logs:
-            self.eval_loss_gauge.set(logs["eval_loss"])
-
-        if "learning_rate" in logs:
-            self.learning_rate_gauge.set(logs["learning_rate"])
-
-        if "epoch" in logs:
-            self.epoch_gauge.set(logs["epoch"])
-
-        if "grad_norm" in logs:
-            self.grad_norm_gauge.set(logs["grad_norm"])
-        if "memory_usage" in logs:
-            self.memory_usage_gauge.set(logs["memory_usage"])
-
-    def on_step_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        """Called at the end of each training step"""
-        if not self.metrics_enabled:
-            return
-
-        # Update step counter and epoch
-        self.global_step_counter.add(1)
-        if state.epoch is not None:
-            self.epoch_gauge.set(state.epoch)
-
-    def on_evaluate(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        metrics: Optional[Dict[str, float]] = None,
-        **kwargs,
-    ):
-        """Called after evaluation"""
-        if not self.metrics_enabled or not metrics:
-            return
-
-        if "eval_loss" in metrics:
-            self.eval_loss_gauge.set(metrics["eval_loss"])
-
-        # Record any other eval metrics as gauges
-        for key, value in metrics.items():
-            if key.startswith("eval_") and isinstance(value, (int, float)):
-                # Create gauge for this metric if it doesn't exist
-                gauge_name = f"axolotl_{key}"
-                try:
-                    gauge = self.meter.create_gauge(
-                        name=gauge_name,
-                        description=f"Evaluation metric: {key}",
-                        unit="1",
-                    )
-                    gauge.set(value)
-                except Exception as e:
-                    LOG.warning(f"Failed to create/update metric {gauge_name}: {e}")
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ):
-        """Called at the end of training"""
-        if not self.metrics_enabled:
-            return
-
-        LOG.info("Training completed. OpenTelemetry metrics collection finished.")
-        LOG.info(
-            f"Metrics are still available at http://{self.metrics_host}:{self.metrics_port}/metrics"
-        )
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -113,7 +113,7 @@ def _map_dataset(

    dataset = dataset.map(
        ds_transform_fn,
-        num_proc=cfg.dataset_num_proc,
+        num_proc=cfg.dataset_processes,
        load_from_cache_file=not cfg.is_preprocess,
        desc="Mapping RL Dataset",
        **map_kwargs,
@@ -234,7 +234,7 @@ def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
            prior_len = len(split_datasets[i])
            split_datasets[i] = split_datasets[i].filter(
                drop_long,
-                num_proc=cfg.dataset_num_proc,
+                num_proc=cfg.dataset_processes,
                load_from_cache_file=not cfg.is_preprocess,
                desc="Dropping Long Sequences",
            )
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -239,11 +239,6 @@ def _load_from_local_path(
            return load_dataset(dataset_config.path, **load_dataset_kwargs)
    elif local_path.is_file():
        dataset_type = get_dataset_type(dataset_config)
-
-        # For single file datasets, HF always creates only a "train" split
-        if dataset_type in ("json", "csv", "text"):
-            load_dataset_kwargs["split"] = "train"
-
        return load_dataset(
            dataset_type,
            data_files=dataset_config.path,
@@ -414,7 +409,7 @@ def save_preprocessed_dataset(
 ) -> None:
    """Save preprocessed dataset to disk and optionally push to the HF Hub."""
    prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)
-    num_workers = cfg.dataset_num_proc or get_default_process_count()
+    num_workers = cfg.dataset_processes or get_default_process_count()
    if isinstance(dataset, IterableDataset):
        ds_from_iter = Dataset.from_generator(
            functools.partial(_generate_from_iterable_dataset, dataset),
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -223,7 +223,7 @@ def handle_long_seq_in_dataset(

    filter_map_kwargs = {}
    if not isinstance(dataset, IterableDataset):
-        filter_map_kwargs["num_proc"] = cfg.dataset_num_proc
+        filter_map_kwargs["num_proc"] = cfg.dataset_processes
        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess

    drop_long_kwargs = {}
--- a/src/axolotl/utils/data/wrappers.py
+++ b/src/axolotl/utils/data/wrappers.py
@@ -80,7 +80,7 @@ def get_dataset_wrapper(
    """
    # Common parameters for dataset wrapping
    dataset_kwargs: dict[str, Any] = {
-        "process_count": cfg.dataset_num_proc,
+        "process_count": cfg.dataset_processes,
        "keep_in_memory": cfg.dataset_keep_in_memory is True,
    }

--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Dan Saunders	ffb307a8a7	update tags	2025-10-04 12:10:43 -04:00
Dan Saunders	915c258c6e	contrib fix	2025-10-04 11:53:48 -04:00
Dan Saunders	1e58235c38	contrib	2025-10-04 11:47:56 -04:00
Dan Saunders	5753c5b89c	mypy 3.11	2025-10-04 11:26:10 -04:00
Dan Saunders	18d78f02cf	fix sdist	2025-10-04 09:48:19 -04:00
Dan Saunders	923181aaed	Merge branch 'main' into uv-first	2025-10-04 09:07:22 -04:00
Dan Saunders	786f1a3ff9	add missing dep	2025-10-03 12:46:15 -04:00
Dan Saunders	26418e6f9a	Fix	2025-10-02 12:53:51 -04:00
Dan Saunders	19fe84ef46	Fix	2025-10-02 12:33:13 -04:00
Dan Saunders	98730868e7	fix	2025-10-02 12:07:58 -04:00
Dan Saunders	5771a65b88	fix	2025-10-02 11:20:23 -04:00
Dan Saunders	f912d1bb97	fix	2025-10-02 10:57:09 -04:00
Dan Saunders	0250e5f87c	fix	2025-10-01 17:02:31 -04:00
Dan Saunders	274c579d81	handle race cond	2025-10-01 16:31:39 -04:00
Dan Saunders	ccd2f12335	fix?	2025-10-01 16:18:40 -04:00
Dan Saunders	00e0238501	fix?	2025-10-01 16:15:06 -04:00
Dan Saunders	f782957002	fix	2025-10-01 14:44:14 -04:00
Dan Saunders	f2f66f2bb9	fix	2025-10-01 13:16:35 -04:00
Dan Saunders	013474eb70	mirror dev deps	2025-10-01 12:58:20 -04:00
Dan Saunders	6dc9816722	fix	2025-10-01 10:18:50 -04:00
Dan Saunders	74715125b6	fix	2025-09-30 17:28:15 -04:00
Dan Saunders	f0f3bfbdf0	fix	2025-09-30 17:25:07 -04:00
Dan Saunders	022ef7ab4e	fix	2025-09-30 17:12:23 -04:00
Dan Saunders	04533b79d4	fix	2025-09-30 17:07:57 -04:00
Dan Saunders	19de29be19	fix	2025-09-30 17:00:25 -04:00
Dan Saunders	ec75aa5889	fix	2025-09-30 16:52:37 -04:00
Dan Saunders	cf4e3fac64	version fix	2025-09-30 16:48:55 -04:00
Dan Saunders	69df309cbb	separate out flash-attn install (sadly)	2025-09-30 14:58:56 -04:00
Dan Saunders	b436ecf61f	fix	2025-09-29 12:08:23 -04:00
Dan Saunders	f137ce50ec	grpclib	2025-09-28 21:28:53 -04:00
Dan Saunders	4131bcf769	fix?	2025-09-28 20:04:44 -04:00
Dan Saunders	64fea39978	add back protobuf	2025-09-28 19:18:06 -04:00
Dan Saunders	4966496b98	revert	2025-09-27 15:16:17 -04:00
Dan Saunders	66a9e4fced	fix?	2025-09-26 23:08:29 -04:00
Dan Saunders	15d35b76bb	fixes	2025-09-26 21:50:35 -04:00
Dan Saunders	0d53e0fe8f	fix -E -> --extra	2025-09-26 21:21:10 -04:00
Dan Saunders	9344fa5e8c	fix install scripts (?)	2025-09-26 20:35:08 -04:00
Dan Saunders	c702edae5f	use container venv	2025-09-26 20:19:14 -04:00
Dan Saunders	dfaf76659f	pip install --system flag	2025-09-26 19:53:51 -04:00
Dan Saunders	26a58bb8af	git SHA	2025-09-26 19:39:08 -04:00
Dan Saunders	cec2490903	prune 2.7.0, docker cache invalidation	2025-09-26 19:11:28 -04:00
Dan Saunders	dfa5224908	uv.lock	2025-09-26 20:47:01 +00:00
Dan Saunders	ddafc6ef80	referring to temp docker images	2025-09-26 16:04:39 -04:00
Dan Saunders	ad56e600e3	remove 2.7.0 images	2025-09-26 14:40:41 -04:00
Dan Saunders	18d9456297	loosen xformers range	2025-09-26 13:32:11 -04:00
Dan Saunders	da5ede6372	lockfile	2025-09-26 17:27:31 +00:00
Dan Saunders	6cbca1ffb2	loosen xformers range	2025-09-26 13:26:13 -04:00
Dan Saunders	2e082d47cc	constrain torch version	2025-09-26 13:20:45 -04:00
Dan Saunders	b4c6675cd2	fix	2025-09-26 13:13:13 -04:00
Dan Saunders	828131332a	no -y flag for uv pip install	2025-09-26 13:04:33 -04:00
Dan Saunders	273a03f85c	simplify install script	2025-09-26 12:55:55 -04:00
Dan Saunders	9bbe2cfe0f	handle vllm pinned conflict	2025-09-26 12:27:11 -04:00
Dan Saunders	64da8f0044	depr warning	2025-09-26 11:59:58 -04:00
Dan Saunders	1fa0a98e38	update lock	2025-09-26 15:44:46 +00:00
Dan Saunders	8d542d9d63	deps up to date	2025-09-26 10:39:34 -04:00
Dan Saunders	a4565476e0	find-links for wheels, auto-gptq -> gptqmodel	2025-09-26 10:26:44 -04:00
Dan Saunders	02dc263338	updates	2025-09-26 10:26:44 -04:00
Dan Saunders	2acd3e1242	dep	2025-09-26 10:26:44 -04:00
Dan Saunders	0437c1a4ba	auto-gptq -> gptqmodel	2025-09-26 10:26:44 -04:00
Dan Saunders	ef150fd973	updates	2025-09-26 10:26:44 -04:00
Dan Saunders	47ad92c6b9	fix	2025-09-26 10:26:44 -04:00
Dan Saunders	f0fee9c56c	req	2025-09-26 10:26:44 -04:00
Dan Saunders	37d07bd7f7	coderabbito, improvements	2025-09-26 10:26:44 -04:00
Dan Saunders	4c81172917	coderabbito	2025-09-26 10:26:21 -04:00
Dan Saunders	cd8c769e84	Update cicd/Dockerfile.jinja Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>	2025-09-26 10:26:21 -04:00
Dan Saunders	0d60046d08	Update .github/workflows/pypi.yml Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com>	2025-09-26 10:26:21 -04:00
Dan Saunders	c110e3eb48	remove setup.py, requirements.txt and refs	2025-09-26 10:26:21 -04:00
Dan Saunders	95c259b3fb	depr warning	2025-09-26 10:26:21 -04:00
Dan Saunders	d1fd505813	update	2025-09-26 10:26:21 -04:00
Dan Saunders	1334281d50	docker fix	2025-09-26 10:26:21 -04:00
Dan Saunders	98f230d864	cleanup	2025-09-26 10:26:21 -04:00
Dan Saunders	02f308351c	fix	2025-09-26 10:25:58 -04:00
Dan Saunders	3b91e8174d	fix	2025-09-26 10:25:58 -04:00
Dan Saunders	40d906fb33	lint	2025-09-26 10:25:58 -04:00
Dan Saunders	89d5323c13	fix	2025-09-26 10:25:58 -04:00
Dan Saunders	df870f6a8f	fix	2025-09-26 10:24:59 -04:00
Dan Saunders	f500aaa490	fix	2025-09-26 10:24:59 -04:00
Dan Saunders	9ec33f52e3	wip	2025-09-26 10:24:59 -04:00
Dan Saunders	b453562c01	fixes	2025-09-26 10:24:59 -04:00
Dan Saunders	367f7eb3a6	fix	2025-09-26 10:24:59 -04:00
Dan Saunders	e888e38ce7	fix	2025-09-26 10:24:59 -04:00
Dan Saunders	400120af2d	wip	2025-09-26 10:24:59 -04:00
Dan Saunders	459e5f9b16	lint	2025-09-26 10:24:59 -04:00
Dan Saunders	43f6f84269	wip	2025-09-26 10:24:59 -04:00
Dan Saunders	36c4ab11f9	wip	2025-09-26 10:24:59 -04:00
Dan Saunders	2f4e4ef604	wip	2025-09-26 10:24:59 -04:00
Dan Saunders	aee03fc636	wip	2025-09-26 10:24:59 -04:00
Dan Saunders	255b818fbc	rebase	2025-09-26 10:24:59 -04:00
Dan Saunders	332ee74f32	rebase	2025-09-26 10:24:07 -04:00
Dan Saunders	3b0d2ac5c0	rebase	2025-09-26 10:21:49 -04:00
Dan Saunders	9462a1bf79	wip	2025-09-26 10:21:49 -04:00
Dan Saunders	8e9386c799	go uv first	2025-09-26 09:57:09 -04:00