wip ao upgrade

2026-01-05 18:23:33 +00:00
149 changed files with 469 additions and 7784 deletions
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -15,11 +15,6 @@
 <!--- Include details of your testing environment, tests ran to see how -->
 <!--- your change affects other areas of the code, etc. -->
 ## AI Usage Disclaimer
 <!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
 <!--- Please indicate: No / Yes (specify which tool and to what extent) -->
 ## Screenshots (if appropriate)
 ## Types of changes
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -21,8 +21,6 @@ jobs:
    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
    runs-on: ubuntu-latest-m
    env:
      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
    strategy:
      fail-fast: false
      matrix:
@@ -34,7 +32,6 @@ jobs:
            pytorch: 2.8.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
@@ -42,7 +39,6 @@ jobs:
            pytorch: 2.9.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
@@ -50,15 +46,6 @@ jobs:
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "129"
            cuda_version: 12.9.1
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
@@ -66,15 +53,6 @@ jobs:
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
 #          - cuda: "128"
 #            cuda_version: 12.8.1
 #            cudnn_version: ""
@@ -101,7 +79,6 @@ jobs:
            axolotlai/axolotl-base
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -112,7 +89,6 @@ jobs:
        with:
          context: .
          file: ./docker/${{ matrix.dockerfile }}
          platforms: ${{ matrix.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
@@ -127,8 +103,6 @@ jobs:
    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    timeout-minutes: 480
    runs-on: ubuntu-latest-m
    env:
      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
    strategy:
      fail-fast: false
      matrix:
@@ -140,7 +114,6 @@ jobs:
            pytorch: 2.8.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
@@ -148,7 +121,6 @@ jobs:
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
@@ -156,15 +128,6 @@ jobs:
            pytorch: 2.9.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "129"
            cuda_version: 12.9.1
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
@@ -172,15 +135,6 @@ jobs:
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -192,7 +146,6 @@ jobs:
            axolotlai/axolotl-base-uv
      - name: Login to Docker Hub
        uses: docker/login-action@v2
        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
@@ -203,7 +156,6 @@ jobs:
        with:
          context: .
          file: ./docker/${{ matrix.dockerfile }}
          platforms: ${{ matrix.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -20,32 +20,22 @@ jobs:
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras:
-            platforms: "linux/amd64"
+            is_latest: true
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
+#          - cuda: 130
-            is_latest: true
+#            cuda_version: 13.0.0
-          - cuda: 129
+#            python_version: "3.11"
-            cuda_version: 12.9.1
+#            pytorch: 2.9.1
-            python_version: "3.12"
+#            axolotl_extras:
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -71,7 +61,6 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: ${{ matrix.platforms }}
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
@@ -98,32 +87,22 @@ jobs:
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras:
-            platforms: "linux/amd64"
+            is_latest: true
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
-            is_latest: true
+#          - cuda: 130
-            platforms: "linux/amd64,linux/arm64"
+#            cuda_version: 13.0.0
-          - cuda: 129
+#            python_version: "3.11"
-            cuda_version: 12.9.1
+#            pytorch: 2.9.1
-            python_version: "3.12"
+#            axolotl_extras:
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -148,7 +127,6 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: ${{ matrix.platforms }}
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
@@ -169,11 +147,11 @@ jobs:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
-            pytorch: 2.9.1
+            pytorch: 2.8.0
            axolotl_extras:
-            is_latest: true
+            is_latest:
-          - cuda: 130
+          - cuda: 128
-            cuda_version: 13.0.0
+            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
@@ -202,7 +180,6 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -35,26 +35,14 @@ jobs:
            pytorch: 2.8.0
            axolotl_extras: fbgemm-gpu
            num_gpus: 2
            nightly_build: "true"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
-            axolotl_extras: "fbgemm-gpu"
+            axolotl_extras: fbgemm-gpu
            num_gpus: 2
          - cuda: 129
            cuda_version: 12.9.1
            python_version: "3.12"
            pytorch: 2.9.1
            axolotl_extras: "fbgemm-gpu"
            num_gpus: 2
            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
 #            axolotl_extras: fbgemm-gpu
            num_gpus: 2
            nightly_build: "true"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
@@ -76,8 +64,8 @@ jobs:
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run -m cicd.multigpu
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -40,7 +40,7 @@ jobs:
      - name: Install dependencies
        run: |
-          pip3 install wheel packaging==26.0
+          pip3 install wheel packaging==23.2
          pip3 install --no-build-isolation -e .
          pip3 install -r requirements-dev.txt -r requirements-tests.txt
@@ -48,9 +48,9 @@ jobs:
        id: tag
        run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)
-      - name: Update version in VERSION file
+      - name: Update version in setup.py
        run: |
-          echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
+          sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py
      - name: Build a source dist
        run: |
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -48,7 +48,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
      - name: Install PyTorch
        run: |
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -54,13 +54,8 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.11", "3.12"]
+        python_version: ["3.11"]
        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
        exclude:
          - python_version: "3.12"
            pytorch_version: "2.8.0"
          - python_version: "3.12"
            pytorch_version: "2.9.0"
    timeout-minutes: 20
    steps:
@@ -87,7 +82,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
      - name: Install PyTorch
        run: |
@@ -115,10 +110,10 @@ jobs:
      - name: Pre-Download dataset fixture
        run: |
-          hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
      - name: Show HF cache
-        run: hf cache ls
+        run: hf cache scan
      - name: Run tests
        run: |
@@ -132,7 +127,7 @@ jobs:
          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
      - name: Show HF cache
-        run: hf cache ls
+        run: hf cache scan
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
@@ -149,13 +144,8 @@ jobs:
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.11", "3.12"]
+        python_version: ["3.11"]
        pytorch_version: ["2.8.0", "2.9.0", "2.9.1"]
        exclude:
          - python_version: "3.12"
            pytorch_version: "2.8.0"
          - python_version: "3.12"
            pytorch_version: "2.9.0"
    timeout-minutes: 20
    steps:
@@ -182,7 +172,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel psutil
      - name: Install PyTorch
        run: |
@@ -210,7 +200,7 @@ jobs:
          axolotl --help
      - name: Show HF cache
-        run: hf cache ls
+        run: hf cache scan
      - name: Run tests
        run: |
@@ -219,10 +209,10 @@ jobs:
          pytest -v --durations=10 tests/cli/
      - name: Show HF cache
-        run: hf cache ls
+        run: hf cache scan
  gate-skip-e2e:
-    needs: [pre-commit]
+    needs: [pre-commit, pytest, pytest-sdist]
    runs-on: ubuntu-latest
    outputs:
      skip: ${{ steps.compute.outputs.skip }}
@@ -258,16 +248,16 @@ jobs:
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
-    needs: [pre-commit, pytest]
+    needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]
    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: 129
+          - cuda: 128
-            cuda_version: 12.9.1
+            cuda_version: 12.8.1
-            python_version: "3.12"
+            python_version: "3.11"
-            pytorch: 2.9.1
+            pytorch: 2.8.0
            num_gpus: 1
            axolotl_extras:
            dockerfile: "Dockerfile-uv.jinja"
@@ -326,12 +316,6 @@ jobs:
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -369,9 +353,9 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 129
+          - cuda: 128
-            cuda_version: 12.9.1
+            cuda_version: 12.8.1
-            python_version: "3.12"
+            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -224,6 +224,9 @@
 # eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 # eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
 # # Save model as safetensors (require safetensors package)
 # save_safetensors:
 # # Whether to mask out or include the human's prompt from the training labels
 # train_on_inputs: false
 # # Group similarly sized data to minimize padding.
@@ -509,6 +512,7 @@ profiler_steps: ${PROFILER_STEPS}
 loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
 loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
 save_safetensors: ${SAVE_SAFETENSORS}
 train_on_inputs: ${TRAIN_ON_INPUTS}
 group_by_length: ${GROUP_BY_LENGTH}
 gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
--- a/README.md
+++ b/README.md
@@ -88,7 +88,7 @@ Features:
 #### Using pip
 ```bash
-pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install -U packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 # Download example axolotl configs, deepspeed configs
--- a/1
+++ b/1
@@ -1 +0,0 @@
 0.14.0
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi
-RUN uv pip install packaging==26.0 setuptools==75.8.0
+RUN uv pip install packaging==23.2 setuptools==75.8.0
 RUN uv pip install torchvision
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi
-RUN pip install packaging==26.0 setuptools==75.8.0 psutil
+RUN pip install packaging==23.2 setuptools==75.8.0 psutil
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -17,8 +17,7 @@ template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
 template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
 )
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
+df_template = template_env.get_template("Dockerfile.jinja")
 df_template = template_env.get_template(dockerfile)
 df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
@@ -28,11 +27,8 @@ df_args = {
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
 }
 dockerfile_contents = df_template.render(**df_args)
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,7 +2,7 @@
 set -e
 # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 --maxfail=3 \
+pytest -v --durations=10 -n2 --maxfail=4 \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
  /workspace/axolotl/tests/e2e/multigpu/ \
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -6,7 +6,6 @@ ARG AXOLOTL_EXTRAS=""
 ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
 ARG PYTORCH_VERSION="2.1.2"
 ARG TARGETARCH
 ENV PYTORCH_VERSION=$PYTORCH_VERSION
@@ -21,17 +20,13 @@ RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
 WORKDIR /workspace/axolotl
-# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
+# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
-        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
+        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi && \
-    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
+    python scripts/unsloth_install.py | sh && \
        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \    python scripts/unsloth_install.py | sh && \
    python scripts/cutcrossentropy_install.py | sh && \
    pip install pytest && \
    pip cache purge
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -2,16 +2,14 @@ ARG CUDA_VERSION="11.8.0"
 ARG CUDNN_VERSION="8"
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4
 ARG TARGETARCH
 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
 ENV PATH="/root/miniconda3/bin:${PATH}"
-ARG TARGETARCH
+ARG PYTHON_VERSION="3.10"
 ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="2.1.2"
-ARG CUDA="128"
+ARG CUDA="118"
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
 ENV PYTHON_VERSION=$PYTHON_VERSION
@@ -24,17 +22,11 @@ RUN apt-get update \
        librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
    && rm -rf /var/cache/apt/archives \
    && rm -rf /var/lib/apt/lists/* \
-    && if [ "$TARGETARCH" = "amd64" ]; then \
+    && wget \
-        MINICONDA_ARCH="x86_64"; \
+    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        MINICONDA_ARCH="aarch64"; \
    else \
        echo "Unsupported architecture: $TARGETARCH"; exit 1; \
    fi \
    && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
+    && rm -f Miniconda3-latest-Linux-x86_64.sh \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
@@ -43,7 +35,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace
-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel psutil && \
    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip cache purge
@@ -59,34 +51,8 @@ RUN git lfs install --skip-repo && \
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge
-RUN case "$PYTORCH_VERSION" in \
+RUN if [ "$PYTORCH_VERSION" =~ ^2\.9\.[0-9]+$ ] && [ "$CUDA" = "128" ] ; then \
-        2.9.[0-9]*) \
+        wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-            if [ "$CUDA" = "128" ]; then \
+        pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-                if [ "$TARGETARCH" = "amd64" ]; then \
+        rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-                    WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl"; \
+    fi
                    WHL_VERSION="v0.5.4"; \
                elif [ "$TARGETARCH" = "arm64" ]; then \
                    WHL_FILE="flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl"; \
                    WHL_VERSION="v0.6.4"; \
                else \
                    echo "Unsupported architecture: $TARGETARCH"; exit 1; \
                fi; \
                wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
                pip3 install --no-cache-dir ${WHL_FILE}; \
                rm ${WHL_FILE}; \
            elif [ "$CUDA" = "130" ]; then \
                if [ "$TARGETARCH" = "amd64" ]; then \
                    WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl"; \
                    WHL_VERSION="v0.5.4"; \
                elif [ "$TARGETARCH" = "arm64" ]; then \
                    WHL_FILE="flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl"; \
                    WHL_VERSION="v0.6.4"; \
                else \
                    echo "Unsupported architecture: $TARGETARCH"; exit 1; \
                fi; \
                wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}; \
                pip3 install --no-cache-dir ${WHL_FILE}; \
                rm ${WHL_FILE}; \
            fi \
            ;; \
    esac
--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -30,7 +30,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace
-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -2,7 +2,6 @@ ARG CUDA_VERSION="12.6.3"
 ARG CUDNN_VERSION=""
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4
 ARG TARGETARCH
 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
@@ -32,35 +31,12 @@ ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
 RUN uv pip install packaging setuptools wheel psutil \
    && uv pip install torch==${PYTORCH_VERSION} torchvision \
    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
    && uv pip install awscli pydantic
-RUN if [ "$TARGETARCH" = "amd64" ]; then \
+RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \
-        uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \
+        wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
-        uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
+        uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
        rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
    fi
 RUN case "$PYTORCH_VERSION" in \
        2.9.[0-9]*) \
            if [ "$TARGETARCH" = "amd64" ]; then \
                if [ "$CUDA" = "128" ]; then \
                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
                    uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
                    rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \
                elif [ "$CUDA" = "130" ]; then \
                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.5.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
                    uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
                    rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_x86_64.whl; \
                fi \
            elif [ "$TARGETARCH" = "arm64" ]; then \
                if [ "$CUDA" = "128" ]; then \
                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
                    uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
                    rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_aarch64.whl; \
                elif [ "$CUDA" = "130" ]; then \
                    wget -nv https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.6.4/flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
                    uv pip install --no-cache-dir flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
                    rm flash_attn-2.8.3+cu130torch2.9-cp311-cp311-linux_aarch64.whl; \
                fi \
            fi \
            ;; \
    esac
--- a/docs/amd_hpc.qmd
+++ b/docs/amd_hpc.qmd
@@ -86,7 +86,7 @@ export HF_DATASETS_OFFLINE=1
 Download a base model using the Hugging Face CLI:
 ```bash
-hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
+huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
 ```
 ### 10. Create Axolotl Configuration
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -165,7 +165,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
   ```
 4. (Optional) Login to Hugging Face:
   ```{.bash}
-   hf auth login
+   huggingface-cli login
   ```
 ## Troubleshooting {#sec-troubleshooting}
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -17,7 +17,6 @@ feedback. Various methods include, but not limited to:
 - [Kahneman-Tversky Optimization (KTO)](#kto)
 - [Odds Ratio Preference Optimization (ORPO)](#orpo)
 - [Group Relative Policy Optimization (GRPO)](#grpo)
 - [Group Reward-Decoupled Policy Optimization (GDPO)](#gdpo)
 ## RLHF using Axolotl
@@ -721,102 +720,6 @@ trl:
 For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).
 ### GDPO
 GDPO (Group Reward-Decoupled Policy Optimization) extends GRPO for multi-reward training. It addresses the **reward advantage collapse** problem by normalizing each reward function independently before combining them.
 ::: {.callout-tip}
 Use GDPO when training with multiple reward functions. For a single reward, GRPO and GDPO produce equivalent results.
 :::
 Paper: [https://arxiv.org/pdf/2501.05242](https://arxiv.org/pdf/2501.05242)
 GDPO uses TRL's native `multi_objective_aggregation` parameter under the hood. When you set `rl: gdpo`, axolotl automatically configures TRL to use `normalize_then_sum` aggregation.
 ```yaml
 base_model: Qwen/Qwen2.5-1.5B-Instruct
 vllm:
    host: 0.0.0.0
    port: 8000
    tensor_parallel_size: 2
    gpu_memory_utilization: 0.85
 rl: gdpo
 trl:
    beta: 0.001
    max_completion_length: 256
    use_vllm: true
    num_generations: 4
    reward_funcs:
        - rewards.format_reward
        - rewards.correctness_reward
    reward_weights: [1.0, 2.0]
 datasets:
    - path: openai/gsm8k
      name: main
      type: rewards.oai_gsm8k_transform
 ```
 You can also use GRPO with explicit aggregation control:
 ```yaml
 rl: grpo
 trl:
    multi_objective_aggregation: normalize_then_sum  # GDPO behavior
    # or: sum_then_normalize  # Default GRPO behavior
 ```
 #### GDPO vs GRPO
 | Aspect | GRPO | GDPO |
 |--------|------|------|
 | **Aggregation** | `sum_then_normalize` | `normalize_then_sum` |
 | **Multi-reward** | May collapse advantages | Preserves reward signals |
 | **Single reward** | Standard behavior | Equivalent to GRPO |
 #### Why GDPO?
 When using multiple rewards with GRPO, different reward combinations can produce identical advantages:
 ```
 # Example: format + correctness rewards
 [format=0, correct=3] → sum=3
 [format=1, correct=2] → sum=3  ← GRPO sees these as equal!
 [format=2, correct=1] → sum=3
 [format=3, correct=0] → sum=3
 ```
 GDPO normalizes each reward independently, preserving their relative differences.
 #### Reward Functions
 GDPO uses the same reward function format as GRPO:
 ```python
 # rewards.py
 def format_reward(completions, **kwargs) -> list[float]:
    return [1.0 if len(c) > 10 else 0.0 for c in completions]
 def correctness_reward(completions, answers, **kwargs) -> list[float]:
    rewards = []
    for completion, answer in zip(completions, answers):
        # Your scoring logic here
        rewards.append(score)
    return rewards
 ```
 #### Sequence Parallelism
 GDPO supports sequence parallelism for long-context training:
 ```yaml
 rl: gdpo
 context_parallel_size: 2
 ```
 ### SimPO
 SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function.
--- a/examples/apertus/README.md
+++ b/examples/apertus/README.md
@@ -15,7 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
--- a/examples/arcee/README.md
+++ b/examples/arcee/README.md
@@ -17,7 +17,7 @@ Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the A
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2\""
   ]
  },
  {
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -16,7 +16,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
--- a/examples/devstral/devstral-small-qlora.yml
+++ b/examples/devstral/devstral-small-qlora.yml
@@ -52,7 +52,6 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 scaling_softmax: true
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
--- a/examples/eaft/eaft-example.yml
+++ b/examples/eaft/eaft-example.yml
@@ -1,77 +0,0 @@
 # Example config: EAFT options enabled on google/gemma-3-1b-it with a
 # chat-template dataset.
 # --- Model and the classes used to load it ---
 base_model: google/gemma-3-1b-it
 model_type: Gemma3ForCausalLM
 cls_model_config: Gemma3TextConfig
 # gemma3 doesn't seem to play nice with ddp
 ddp_find_unused_parameters: true
 # --- Chat formatting ---
 chat_template: gemma3
 eot_tokens:
  - <end_of_turn>
 # --- Quantized loading is switched off ---
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 # --- Training data ---
 # message_property_mappings presumably tells the chat_template loader that
 # this dataset stores the role under `from` and the text under `value`
 # (TODO confirm against the dataset schema).
 datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
 # Bare key: parsed as null, i.e. no prepared-dataset path is set here.
 dataset_prepared_path:
 val_set_size: 0
 output_dir: ./outputs/eaft-gemma-3-1b
 # --- EAFT options ---
 use_eaft: true
 eaft_alpha: 1.0
 eaft_k: 20
 # --- Sequence handling ---
 sequence_len: 1024
 sample_packing: false
 # Bare keys below are null: no adapter or LoRA directory is configured.
 adapter:
 lora_model_dir:
 # --- Weights & Biases (all left unset) ---
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 # --- Batch sizing and run length ---
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 eval_batch_size: 1
 max_steps: 1000
 # Quoted so YAML reads the string "no" rather than a boolean.
 evaluation_strategy: "no"
 # --- Optimizer and schedule ---
 optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 5e-5
 train_on_inputs: false
 group_by_length: false
 # --- Precision ---
 bf16: auto
 fp16:
 tf32: true
 # --- Memory / checkpointing ---
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 # --- Attention implementation ---
 xformers_attention:
 flash_attention: true
 warmup_ratio: 0.1
 weight_decay: 0.0
 # --- Remaining keys are left unset (null) ---
 debug:
 deepspeed:
 fsdp:
 fsdp_config:
 special_tokens:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -1,7 +1,6 @@
 base_model: google/gemma-3-1b-it
 model_type: Gemma3ForCausalLM
 cls_model_config: Gemma3TextConfig
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
@@ -30,7 +29,7 @@ output_dir: ./outputs/out
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_linear: true
 sequence_len: 2048
--- a/examples/gemma3/gemma-3-270m-qlora.yml
+++ b/examples/gemma3/gemma-3-270m-qlora.yml
@@ -1,7 +1,6 @@
 base_model: google/gemma-3-270m-it
 model_type: Gemma3ForCausalLM
 cls_model_config: Gemma3TextConfig
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
@@ -30,7 +29,7 @@ output_dir: ./outputs/out
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_linear: true
 sequence_len: 2048
--- a/examples/gemma3/gemma-3-4b-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-qlora.yml
@@ -2,7 +2,6 @@ base_model: google/gemma-3-4b-it
 # Need to set else transformers tries to load vision too
 model_type: Gemma3ForCausalLM
 cls_model_config: Gemma3TextConfig
 load_in_4bit: true
@@ -33,8 +32,8 @@ sample_packing: true
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
-lora_target_linear: true
+lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
 wandb_project:
 wandb_entity:
--- a/examples/gemma3/gemma-3-4b-vision-qlora.yml
+++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml
@@ -31,7 +31,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
-lora_dropout: 0
+lora_dropout: 0.05
 lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
 wandb_project:
--- a/examples/gemma3n/README.md
+++ b/examples/gemma3n/README.md
@@ -10,7 +10,7 @@ Gemma-3n is a family of multimodal models from Google found on [HuggingFace](htt
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -14,7 +14,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
--- a/examples/granite4/README.md
+++ b/examples/granite4/README.md
@@ -15,7 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
--- a/examples/hunyuan/README.md
+++ b/examples/hunyuan/README.md
@@ -13,7 +13,7 @@ Tencent released a family of opensource models called HunYuan with varying param
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -19,6 +19,7 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: jamba-large-fsdp-qlora-ft
 save_safetensors: true
 adapter: qlora
 sequence_len: 2048
 sample_packing: true
--- a/examples/llama-3/qlora-1b-gdpo.yaml
+++ b/examples/llama-3/qlora-1b-gdpo.yaml
@@ -1,68 +0,0 @@
 # Example config: GDPO (`rl: gdpo`) on meta-llama/Llama-3.2-1B-Instruct with
 # two reward functions over openai/gsm8k.
 # NOTE(review): the file name says "qlora", but no adapter or 4-bit/8-bit
 # loading keys are set in this file - confirm that is intended.
 base_model: meta-llama/Llama-3.2-1B-Instruct
 chat_template: llama3
 rl: gdpo
 # --- TRL generation and reward settings ---
 trl:
  beta: 0.001
  max_completion_length: 128
  num_generations: 2
  temperature: 0.7
  top_p: 0.95
  use_vllm: false
  multi_objective_aggregation: normalize_then_sum
  # Two reward functions; reward_weights below has one entry per function,
  # in the same order.
  reward_funcs:
    - rwd.format_reward
    - rwd.correctness_reward
  reward_weights: [1.0, 2.0]
  log_completions: true
  num_completions_to_print: 3
  scale_rewards: true
 # --- Training data: first 1000 rows of the gsm8k `main` train split ---
 datasets:
  - path: openai/gsm8k
    name: main
    split: train[:1000]
    type: rwd.gsm8k_transform
 val_set_size: 0.0
 output_dir: ./outputs/llama3-gdpo-out
 # --- Sequence handling ---
 sequence_len: 512
 sample_packing: false
 pad_to_sequence_len: false
 # --- Batch sizing and run length ---
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 1
 max_steps: 100
 # --- Optimizer and schedule ---
 optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 5e-5
 weight_decay: 0.01
 warmup_steps: 10
 # --- Precision ---
 bf16: auto
 tf32: true
 # --- Memory / attention ---
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 flash_attention: true
 # --- Logging and checkpoint saving ---
 logging_steps: 1
 save_steps: 50
 save_safetensors: true
 special_tokens:
  pad_token: "<|end_of_text|>"
 seed: 42
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -12,6 +12,7 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: ./outputs/out/qlora-llama3_1-405b
 save_safetensors: true
 adapter: qlora
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -14,7 +14,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for these
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.7.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -47,5 +47,6 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 tokens:
 save_safetensors: False
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/ministral3/ministral3-3b-qlora.yaml
+++ b/examples/ministral3/ministral3-3b-qlora.yaml
@@ -59,7 +59,6 @@ gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 scaling_softmax: true
 warmup_ratio: 0.1
 evals_per_epoch: 1
--- a/examples/qwen3-next/README.md
+++ b/examples/qwen3-next/README.md
@@ -15,7 +15,7 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
--- a/examples/swanlab/README.md
+++ b/examples/swanlab/README.md
@@ -1,285 +0,0 @@
 # SwanLab Integration Examples
 This directory contains example configurations demonstrating SwanLab integration with Axolotl.
 ## Examples Overview
 ### 1. DPO with Completion Logging
 **File**: `dpo-swanlab-completions.yml`
 Demonstrates DPO (Direct Preference Optimization) training with RLHF completion table logging.
 **Features**:
 - Basic SwanLab experiment tracking
 - Completion table logging (prompts, chosen/rejected responses, rewards)
 - Memory-bounded buffer for long training runs
 - Cloud sync configuration
 **Best for**: RLHF practitioners who want to analyze model outputs qualitatively
 **Quick start**:
 ```bash
 export SWANLAB_API_KEY=your-api-key
 accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml
 ```
 ---
 ### 2. LoRA with Performance Profiling
 **File**: `lora-swanlab-profiling.yml`
 Demonstrates standard LoRA fine-tuning with performance profiling enabled.
 **Features**:
 - SwanLab experiment tracking
 - Automatic profiling of trainer methods
 - Profiling metrics visualization
 - Performance optimization guidance
 **Best for**: Engineers optimizing training performance and comparing different configurations
 **Quick start**:
 ```bash
 export SWANLAB_API_KEY=your-api-key
 accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml
 ```
 ---
 ### 3. Full-Featured DPO Production Setup
 **File**: `dpo-swanlab-full-featured.yml`
 Comprehensive production-ready configuration with ALL SwanLab features enabled.
 **Features**:
 - Experiment tracking with team workspace
 - RLHF completion logging
 - Performance profiling
 - Lark (Feishu) team notifications
 - Private deployment support
 - Production checklist and troubleshooting
 **Best for**: Production RLHF training with team collaboration
 **Quick start**:
 ```bash
 export SWANLAB_API_KEY=your-api-key
 export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
 export SWANLAB_LARK_SECRET=your-webhook-secret
 accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml
 ```
 ---
 ### 4. Custom Trainer Profiling (Python)
 **File**: `custom_trainer_profiling.py`
 Python code examples showing how to add SwanLab profiling to custom trainers.
 **Features**:
 - `@swanlab_profile` decorator examples
 - Context manager profiling for fine-grained timing
 - `ProfilingConfig` for advanced filtering and throttling
 - Multiple profiling patterns and best practices
 **Best for**: Advanced users creating custom trainers
 **Usage**:
 ```python
 from custom_trainer_profiling import CustomTrainerWithProfiling
 # See file for detailed examples and patterns
 ```
 ---
 ## Feature Matrix
 | Example | Tracking | Completion Logging | Profiling | Lark Notifications | Team Workspace |
 |---------|----------|-------------------|-----------|-------------------|----------------|
 | dpo-swanlab-completions.yml | ✅ | ✅ | ✅ (auto) | ➖ (commented) | ➖ (commented) |
 | lora-swanlab-profiling.yml | ✅ | ➖ (disabled) | ✅ (auto) | ➖ (commented) | ➖ (commented) |
 | dpo-swanlab-full-featured.yml | ✅ | ✅ | ✅ (auto) | ✅ | ✅ |
 | custom_trainer_profiling.py | N/A | N/A | ✅ (manual) | N/A | N/A |
 ---
 ## Configuration Quick Reference
 ### Basic SwanLab Setup
 ```yaml
 plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin
 use_swanlab: true
 swanlab_project: my-project
 swanlab_experiment_name: my-experiment
 swanlab_mode: cloud  # cloud, local, offline, disabled
 ```
 ### RLHF Completion Logging
 ```yaml
 swanlab_log_completions: true
 swanlab_completion_log_interval: 100  # Log every 100 steps
 swanlab_completion_max_buffer: 128    # Memory-bounded buffer
 ```
 ### Lark Team Notifications
 ```yaml
 swanlab_lark_webhook_url: https://open.feishu.cn/...
 swanlab_lark_secret: your-webhook-secret  # Required for production
 ```
 ### Team Workspace
 ```yaml
 swanlab_workspace: my-research-team
 ```
 ### Private Deployment
 ```yaml
 swanlab_web_host: https://swanlab.yourcompany.com
 swanlab_api_host: https://api.swanlab.yourcompany.com
 ```
 ---
 ## Authentication
 ### Recommended: Environment Variable
 ```bash
 export SWANLAB_API_KEY=your-api-key
 export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
 export SWANLAB_LARK_SECRET=your-webhook-secret
 ```
 ### Alternative: Config File (less secure)
 ```yaml
 swanlab_api_key: your-api-key
 swanlab_lark_webhook_url: https://open.feishu.cn/...
 swanlab_lark_secret: your-webhook-secret
 ```
 ---
 ## Common Use Cases
 ### Use Case 1: Migrate from WandB to SwanLab
 Start with `lora-swanlab-profiling.yml`, add your model/dataset config, disable WandB:
 ```yaml
 use_swanlab: true
 use_wandb: false
 ```
 ### Use Case 2: Analyze DPO Model Outputs
 Use `dpo-swanlab-completions.yml`, adjust completion logging interval based on your training length:
 ```yaml
 swanlab_completion_log_interval: 50   # More frequent for short training
 # swanlab_completion_log_interval: 200  # Alternative: less frequent for long training
 ```
 ### Use Case 3: Optimize Training Performance
 Use `lora-swanlab-profiling.yml`, run multiple experiments with different optimizations:
 - Baseline: `flash_attention: false, gradient_checkpointing: false`
 - Flash Attention: `flash_attention: true`
 - Gradient Checkpointing: `gradient_checkpointing: true`
 - Both: `flash_attention: true, gradient_checkpointing: true`
 Compare profiling metrics in SwanLab dashboard.
 ### Use Case 4: Production RLHF with Team Collaboration
 Use `dpo-swanlab-full-featured.yml`, set up team workspace and Lark notifications:
 ```yaml
 swanlab_workspace: ml-team
 swanlab_lark_webhook_url: ...
 swanlab_lark_secret: ...
 ```
 ---
 ## Viewing Your Experiments
 ### Cloud Mode
 Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project.
 **Dashboard sections**:
 - **Metrics**: Training loss, learning rate, profiling metrics
 - **Tables**: RLHF completions (for DPO/KTO/ORPO/GRPO)
 - **Config**: Hyperparameters and configuration
 - **System**: Resource usage (GPU, memory, CPU)
 - **Files**: Logged artifacts
 ### Local Mode
 ```bash
 swanlab watch ./swanlog
 # Open browser to http://localhost:5092
 ```
 ---
 ## Troubleshooting
 ### SwanLab not initializing
 ```bash
 # Check API key
 echo $SWANLAB_API_KEY
 # Verify SwanLab is installed
 pip show swanlab
 # Check config
 grep -A 5 "use_swanlab" your-config.yml
 ```
 ### Completions not appearing
 - Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
 - Check `swanlab_log_completions: true`
 - Wait for `swanlab_completion_log_interval` steps
 - Look for "Registered SwanLab RLHF completion logging" in logs
 ### Lark notifications not working
 - Test webhook manually: `curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...`
 - Verify `SWANLAB_LARK_SECRET` is set correctly
 - Check bot is added to Lark group chat
 - Look for "Registered Lark notification callback" in logs
 ### Profiling metrics not appearing
 - Verify `use_swanlab: true`
 - Check SwanLab is initialized (look for init log message)
 - Profiling metrics are under "profiling/" namespace
 - Profiling auto-enabled when SwanLab is enabled
 ---
 ## Performance Notes
 ### Overhead Comparison
 | Feature | Overhead per Step | Memory Usage |
 |---------|------------------|--------------|
 | Basic tracking | < 0.1% | ~10 MB |
 | Completion logging | < 0.5% | ~64 KB (buffer=128) |
 | Profiling | < 0.1% | ~1 KB |
 | **Total** | **< 0.7%** | **~10 MB** |
 ### Best Practices
 1. Use ONE logging tool in production (disable WandB/MLflow when using SwanLab)
 2. Adjust completion log interval based on training length (100-200 steps)
 3. Keep completion buffer size reasonable (128-512)
 4. Profile critical path methods first (training_step, compute_loss)
 5. Use ProfilingConfig to throttle high-frequency operations
 ---
 ## Further Reading
 - **Full Documentation**: [src/axolotl/integrations/swanlab/README.md](../../src/axolotl/integrations/swanlab/README.md)
 - **SwanLab Docs**: [https://docs.swanlab.cn](https://docs.swanlab.cn)
 - **Axolotl Docs**: [https://axolotl-ai-cloud.github.io/axolotl/](https://axolotl-ai-cloud.github.io/axolotl/)
 - **DPO Paper**: [Direct Preference Optimization](https://arxiv.org/abs/2305.18290)
 ---
 ## Contributing
 Found an issue or have an improvement? Please submit a PR or open an issue:
 - [Axolotl Issues](https://github.com/axolotl-ai-cloud/axolotl/issues)
 - [SwanLab Issues](https://github.com/SwanHubX/SwanLab/issues)
--- a/examples/swanlab/custom_trainer_profiling.py
+++ b/examples/swanlab/custom_trainer_profiling.py
@@ -1,299 +0,0 @@
 """Example: Custom Trainer with SwanLab Profiling
 This example demonstrates how to add SwanLab profiling to your custom trainer.
 Features:
 - @swanlab_profile decorator for automatic profiling
 - swanlab_profiling_context for fine-grained profiling
 - ProfilingConfig for advanced filtering and throttling
 Usage:
    1. Create your custom trainer extending AxolotlTrainer
    2. Add @swanlab_profile decorators to methods you want to profile
    3. Use swanlab_profiling_context for fine-grained profiling within methods
    4. Enable SwanLab in your config (use_swanlab: true)
 See also:
    - examples/swanlab/lora-swanlab-profiling.yml for config
    - src/axolotl/integrations/swanlab/profiling.py for implementation
 """
 from axolotl.core.trainers.base import AxolotlTrainer
 from axolotl.integrations.swanlab.profiling import (
    ProfilingConfig,
    swanlab_profile,
    swanlab_profiling_context,
    swanlab_profiling_context_advanced,
 )
 class CustomTrainerWithProfiling(AxolotlTrainer):
    """Custom trainer with SwanLab profiling enabled.
    This trainer demonstrates four profiling patterns:
    1. Decorator-based profiling (@swanlab_profile)
    2. Context manager profiling (swanlab_profiling_context)
    3. Advanced profiling with filtering (ProfilingConfig)
    4. Exception-safe profiling (duration is logged even if the method raises)
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to AxolotlTrainer and build the throttled config."""
        super().__init__(*args, **kwargs)
        # Create custom profiling config for high-frequency operations
        self.fast_op_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.5,  # Only log if duration > 0.5ms
            log_interval=50,  # Log every 50th call
        )
    # ========================================================================
    # Pattern 1: Decorator-based Profiling
    # ========================================================================
    # Best for: Methods you always want to profile
    # Overhead: ~2-5 microseconds per call (negligible)
    @swanlab_profile
    def training_step(self, model, inputs, *args, **kwargs):
        """Main training step - always profile.
        Args:
            model: Model being trained.
            inputs: Batch handed to the parent training step.
            *args, **kwargs: Any extra arguments the training loop supplies
                (e.g. the ``num_items_in_batch`` value passed by recent
                transformers Trainer versions); forwarded unchanged so this
                override stays call-compatible with the parent class.
        Returns:
            Whatever the parent ``training_step`` returns.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.training_step
        """
        return super().training_step(model, inputs, *args, **kwargs)
    @swanlab_profile
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Loss computation - always profile.
        Args:
            model: Model being trained.
            inputs: Batch handed to the parent loss computation.
            return_outputs: Forwarded to the parent ``compute_loss``.
            **kwargs: Extra keyword arguments from the training loop (e.g.
                ``num_items_in_batch``); forwarded unchanged.
        Returns:
            Whatever the parent ``compute_loss`` returns.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.compute_loss
        """
        return super().compute_loss(model, inputs, return_outputs, **kwargs)
    @swanlab_profile
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Prediction step - always profile.
        All arguments are forwarded to the parent ``prediction_step`` and its
        result is returned unchanged.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prediction_step
        """
        return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)
    # ========================================================================
    # Pattern 2: Fine-grained Context Manager Profiling
    # ========================================================================
    # Best for: Profiling specific code blocks within a method
    # Use case: When you want to profile forward vs backward separately
    def complex_training_step(self, model, inputs):
        """Training step with fine-grained profiling.
        Runs forward, backward and optimizer update in three separately
        profiled blocks and returns the model outputs from the forward pass.
        Profiling metrics:
        - profiling/Time taken: CustomTrainerWithProfiling.forward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.backward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.optimizer_step
        """
        # Profile just the forward pass
        with swanlab_profiling_context(self, "forward_pass"):
            outputs = model(**inputs)
            loss = outputs.loss
        # Profile just the backward pass
        with swanlab_profiling_context(self, "backward_pass"):
            loss.backward()
        # Profile optimizer step
        with swanlab_profiling_context(self, "optimizer_step"):
            self.optimizer.step()
            self.optimizer.zero_grad()
        return outputs
    # ========================================================================
    # Pattern 3: Advanced Profiling with Filtering
    # ========================================================================
    # Best for: High-frequency operations where you want to throttle logging
    # Use case: Methods called very often (every batch, or many times per step)
    def _prepare_inputs(self, inputs):
        """Prepare inputs - throttled profiling.
        This method is called frequently (once per batch), so we throttle
        profiling to reduce overhead:
        - Only log if duration > 0.5ms (skip very fast operations)
        - Only log every 50th call (reduce logging frequency)
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_inputs
        """
        with swanlab_profiling_context_advanced(
            self, "prepare_inputs", config=self.fast_op_config
        ):
            return super()._prepare_inputs(inputs)
    def _prepare_input_for_model(self, input_ids):
        """Another high-frequency operation - throttled profiling.
        Placeholder: returns ``input_ids`` unchanged; replace the body of the
        ``with`` block with real preparation logic.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_input_for_model
        """
        with swanlab_profiling_context_advanced(
            self, "prepare_input_for_model", config=self.fast_op_config
        ):
            # Your custom input preparation logic
            return input_ids
    # ========================================================================
    # Pattern 4: Exception-safe Profiling
    # ========================================================================
    # Profiling is exception-safe: duration is logged even if method raises
    @swanlab_profile
    def potentially_failing_method(self):
        """This method may raise an exception.
        SwanLab profiling will still log the duration before re-raising.
        Returns:
            The value produced by ``_do_risky_computation``.
        Raises:
            ValueError: If the computed result is negative.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.potentially_failing_method
        """
        # Do some work
        result = self._do_risky_computation()
        # If this raises, profiling duration is still logged
        if result < 0:
            raise ValueError("Invalid result")
        return result
    def _do_risky_computation(self):
        """Placeholder for risky computation."""
        return 42
 # ============================================================================
 # Advanced Example: Custom ProfilingConfig Per Method
 # ============================================================================
 class AdvancedProfilingTrainer(AxolotlTrainer):
    """Trainer with method-specific profiling configurations.
    Three ProfilingConfig instances are created in ``__init__`` and each
    profiled method picks the one matching how often it runs.
    """
    def __init__(self, *args, **kwargs):
        """Forward all arguments to AxolotlTrainer and build the configs."""
        super().__init__(*args, **kwargs)
        # Different profiling configs for different method types
        self.critical_path_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.0,  # Log everything on critical path
            log_interval=1,  # Log every call
        )
        self.fast_path_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=1.0,  # Only log if > 1ms
            log_interval=100,  # Log every 100th call
        )
        self.debug_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.0,  # Log everything
            log_interval=1,  # Log every call
        )
    def training_step(self, model, inputs, *args, **kwargs):
        """Critical path - log everything.
        Extra positional/keyword arguments supplied by the training loop
        (e.g. ``num_items_in_batch`` in recent transformers versions) are
        forwarded unchanged so the override stays call-compatible with the
        parent class. Returns whatever the parent ``training_step`` returns.
        """
        with swanlab_profiling_context_advanced(
            self, "training_step", config=self.critical_path_config
        ):
            return super().training_step(model, inputs, *args, **kwargs)
    def _prepare_inputs(self, inputs):
        """Fast path - throttle logging."""
        with swanlab_profiling_context_advanced(
            self, "prepare_inputs", config=self.fast_path_config
        ):
            return super()._prepare_inputs(inputs)
    def _debug_method(self, data):
        """Debug-only method - verbose logging.
        Placeholder: the profiled block is empty and ``data`` is unused.
        """
        with swanlab_profiling_context_advanced(
            self, "debug_method", config=self.debug_config
        ):
            # Your debug logic
            pass
 # ============================================================================
 # How to Use This Custom Trainer
 # ============================================================================
 """
 To use this custom trainer:
 1. Save this file to your project (e.g., my_custom_trainer.py)
 2. Create a config file that uses your custom trainer:
    # config.yml
    base_model: NousResearch/Llama-3.2-1B
    # ... other config ...
    plugins:
      - axolotl.integrations.swanlab.SwanLabPlugin
    use_swanlab: true
    swanlab_project: my-profiling-experiment
    # Optional: Specify custom trainer
    # (Or modify axolotl to use your custom trainer class)
 3. Run training:
    export SWANLAB_API_KEY=your-api-key
    accelerate launch -m axolotl.cli.train config.yml
 4. View profiling metrics in SwanLab dashboard:
   - profiling/Time taken: CustomTrainerWithProfiling.training_step
   - profiling/Time taken: CustomTrainerWithProfiling.forward_pass
   - profiling/Time taken: CustomTrainerWithProfiling.backward_pass
   - etc.
 5. Compare profiling metrics across runs:
   - Run baseline without optimizations
   - Run with flash_attention enabled
   - Run with gradient_checkpointing enabled
   - Compare profiling metrics to see performance impact
 """
 # ============================================================================
 # Tips for Effective Profiling
 # ============================================================================
 """
 1. Profile the critical path first:
   - training_step, compute_loss, prediction_step
   - These methods are called most frequently and have biggest impact
 2. Use throttling for high-frequency operations:
   - Methods called 100+ times per step
   - Use log_interval=50 or log_interval=100
   - Reduces profiling overhead and dashboard clutter
 3. Filter noise with min_duration_ms:
   - Set min_duration_ms=1.0 to skip very fast operations
   - Focus on operations that actually take time
 4. Compare across runs:
   - Run same config multiple times to check consistency
   - Compare different optimization strategies
   - Track profiling trends over time
 5. Monitor distributed training:
   - Check for per-rank timing differences
   - Look for stragglers (slower ranks)
   - Identify synchronization bottlenecks
 6. Disable profiling in production:
   - from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG
   - DEFAULT_PROFILING_CONFIG.enabled = False
 7. Exception handling:
   - Profiling is exception-safe
   - Duration logged even if method raises
   - Useful for debugging methods that fail intermittently
 """
--- a/examples/swanlab/dpo-swanlab-completions.yml
+++ b/examples/swanlab/dpo-swanlab-completions.yml
@@ -1,168 +0,0 @@
 # SwanLab DPO Training Example with Completion Logging
 #
 # This example demonstrates DPO (Direct Preference Optimization) training
 # with SwanLab integration for experiment tracking and completion table logging.
 #
 # Features enabled:
 # - SwanLab experiment tracking
 # - RLHF completion table logging (prompts, chosen/rejected responses, rewards)
 # - Lark (Feishu) team notifications (optional)
 #
 # To run:
 #   export SWANLAB_API_KEY=your-api-key
 #   accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml
 # Model Configuration
 base_model: meta-llama/Meta-Llama-3-8B-Instruct
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>
 # Quantization
 load_in_8bit: true
 load_in_4bit: false
 # LoRA Configuration
 adapter: lora
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 # DPO Configuration
 chat_template: llama3
 rl: dpo
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant
 # Dataset and Output
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./outputs/dpo-swanlab-out
 # Training Configuration
 sequence_len: 4096
 sample_packing: false
 micro_batch_size: 2
 gradient_accumulation_steps: 4
 num_epochs: 4
 # Optimization
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 warmup_ratio: 0.1
 weight_decay: 0.0
 # Precision
 bf16: auto
 tf32: false
 # Performance
 gradient_checkpointing: true
 flash_attention: true
 # Checkpointing and Logging
 logging_steps: 1
 evals_per_epoch: 4
 saves_per_epoch: 1
 # ============================================================================
 # SwanLab Integration
 # ============================================================================
 plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin
 # Basic SwanLab Configuration
 use_swanlab: true
 swanlab_project: dpo-training
 swanlab_experiment_name: llama-3-dpo-completions-demo
 swanlab_description: "DPO training with completion table logging"
 swanlab_mode: cloud  # Options: cloud, local, offline, disabled
 # SwanLab Authentication
 # Recommended: Set via environment variable
 #   export SWANLAB_API_KEY=your-api-key
 # Or set in config (less secure):
 # swanlab_api_key: your-api-key
 # Optional: Team workspace
 # swanlab_workspace: my-research-team
 # ============================================================================
 # RLHF Completion Table Logging
 # ============================================================================
 #
 # Automatically logs model completions to SwanLab for qualitative analysis:
 # - Prompts from your DPO dataset
 # - Chosen responses (preferred)
 # - Rejected responses (non-preferred)
 # - Reward differences
 #
 # View the table in SwanLab dashboard under "rlhf_completions"
 swanlab_log_completions: true
 swanlab_completion_log_interval: 100  # Log every 100 training steps
 swanlab_completion_max_buffer: 128    # Keep last 128 completions in memory
 # Memory Usage Notes:
 # - Buffer size 128: ~64 KB (default, recommended)
 # - Buffer size 512: ~256 KB (for more historical completions)
 # - Buffer size 1024: ~512 KB (maximum for very long training runs)
 # Performance Notes:
 # - Completion logging overhead: < 0.5% per training step
 # - Only logs every N steps to minimize impact
 # - Memory-bounded buffer prevents memory leaks
 # ============================================================================
 # Optional: Lark (Feishu) Team Notifications
 # ============================================================================
 #
 # Get real-time training notifications in your team chat
 # Uncomment to enable:
 # swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
 # swanlab_lark_secret: your-webhook-secret  # Recommended for production
 # Notifications sent for:
 # - Training start
 # - Training completion
 # - Training errors
 # - Metric milestones (if configured)
 # ============================================================================
 # Optional: Private SwanLab Deployment
 # ============================================================================
 #
 # For enterprise users with private SwanLab deployment:
 # swanlab_web_host: https://swanlab.yourcompany.com
 # swanlab_api_host: https://api.swanlab.yourcompany.com
 # ============================================================================
 # Disable WandB if you're migrating from it
 # ============================================================================
 # wandb_project:
 # wandb_entity:
 # use_wandb: false
--- a/examples/swanlab/dpo-swanlab-full-featured.yml
+++ b/examples/swanlab/dpo-swanlab-full-featured.yml
@@ -1,329 +0,0 @@
 # SwanLab Full-Featured DPO Training Example
 #
 # This example demonstrates ALL SwanLab integration features:
 # - Experiment tracking with cloud sync
 # - RLHF completion table logging
 # - Performance profiling
 # - Lark (Feishu) team notifications
 # - Team workspace collaboration
 #
 # Use this as a reference for production RLHF training setups.
 #
 # To run:
 #   export SWANLAB_API_KEY=your-api-key
 #   export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
 #   export SWANLAB_LARK_SECRET=your-webhook-secret
 #   accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml
 # ============================================================================
 # Model Configuration
 # ============================================================================
 base_model: meta-llama/Meta-Llama-3-8B-Instruct
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>
 # Quantization for efficient training
 load_in_8bit: true
 load_in_4bit: false
 # ============================================================================
 # LoRA Configuration
 # ============================================================================
 adapter: lora
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true  # Target all linear layers
 # ============================================================================
 # DPO (Direct Preference Optimization) Configuration
 # ============================================================================
 chat_template: llama3
 rl: dpo  # Enable DPO trainer
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant
 # ============================================================================
 # Dataset and Output Configuration
 # ============================================================================
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./outputs/dpo-swanlab-full-featured-out
 # ============================================================================
 # Training Configuration
 # ============================================================================
 sequence_len: 4096
 sample_packing: false
 micro_batch_size: 2
 gradient_accumulation_steps: 4
 num_epochs: 4
 # ============================================================================
 # Optimization
 # ============================================================================
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 warmup_ratio: 0.1
 weight_decay: 0.0
 # ============================================================================
 # Precision and Performance
 # ============================================================================
 bf16: auto
 tf32: false
 gradient_checkpointing: true
 flash_attention: true
 # ============================================================================
 # Checkpointing and Logging
 # ============================================================================
 logging_steps: 1
 evals_per_epoch: 4
 saves_per_epoch: 1
 # ============================================================================
 # SwanLab Integration - Full Configuration
 # ============================================================================
 plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin
 # ------------------------------------------------------------------------------
 # Basic SwanLab Configuration
 # ------------------------------------------------------------------------------
 use_swanlab: true
 swanlab_project: dpo-production
 swanlab_experiment_name: llama-3-dpo-full-featured-v1
 swanlab_description: |
  Production DPO training with all SwanLab features enabled:
  - Completion table logging for qualitative analysis
  - Performance profiling for optimization
  - Lark notifications for team collaboration
 swanlab_mode: cloud  # Options: cloud, local, offline, disabled
 # ------------------------------------------------------------------------------
 # Team Collaboration
 # ------------------------------------------------------------------------------
 # Workspace for team collaboration (shared experiments)
 swanlab_workspace: ml-research-team
 # Authentication (recommended: use environment variable)
 #   export SWANLAB_API_KEY=your-api-key
 # Or set in config (less secure):
 # swanlab_api_key: your-api-key
 # ------------------------------------------------------------------------------
 # RLHF Completion Table Logging
 # ------------------------------------------------------------------------------
 # Automatically logs model completions for qualitative analysis:
 # - Prompts from your DPO dataset
 # - Chosen responses (preferred)
 # - Rejected responses (non-preferred)
 # - Reward differences
 #
 # View in SwanLab dashboard under "rlhf_completions" table
 swanlab_log_completions: true
 swanlab_completion_log_interval: 100  # Log every 100 steps
 swanlab_completion_max_buffer: 256    # Larger buffer for long training runs
 # Buffer size recommendations:
 # - 128: Default, ~64 KB memory (recommended for most cases)
 # - 256: ~128 KB memory (this config, good for longer training)
 # - 512: ~256 KB memory (maximum for very long runs)
 # ------------------------------------------------------------------------------
 # Lark (Feishu) Team Notifications
 # ------------------------------------------------------------------------------
 # Get real-time training notifications in your team chat
 #
 # Notifications sent for:
 # - Training start
 # - Training completion
 # - Training errors
 # - Metric milestones (if configured)
 # Recommended: Set via environment variables
 #   export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
 #   export SWANLAB_LARK_SECRET=your-webhook-secret
 # Or set in config (less secure):
 # swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
 # swanlab_lark_secret: your-webhook-secret  # REQUIRED for production
 # Security note: ALWAYS use swanlab_lark_secret in production to prevent
 # unauthorized parties from sending fake notifications to your team chat.
 # ------------------------------------------------------------------------------
 # Performance Profiling
 # ------------------------------------------------------------------------------
 # Profiling is automatically enabled when SwanLab is enabled.
 # Metrics logged to SwanLab under "profiling/" namespace:
 #   profiling/Time taken: AxolotlTrainer.training_step
 #   profiling/Time taken: AxolotlTrainer.compute_loss
 #   profiling/Time taken: AxolotlTrainer.prediction_step
 #
 # Use these metrics to:
 # - Identify bottlenecks in training loop
 # - Compare performance across different configurations
 # - Monitor performance regressions over time
 # - Debug unexpected slowdowns
 # For custom profiling in your own trainer, see:
 #   examples/swanlab/custom_trainer_profiling.py
 # ------------------------------------------------------------------------------
 # Optional: Private SwanLab Deployment
 # ------------------------------------------------------------------------------
 # For enterprise users with private SwanLab deployment:
 # swanlab_web_host: https://swanlab.yourcompany.com
 # swanlab_api_host: https://api.swanlab.yourcompany.com
 # ------------------------------------------------------------------------------
 # Optional: Model Checkpointing to SwanLab
 # ------------------------------------------------------------------------------
 # Log model checkpoints to SwanLab (coming soon)
 swanlab_log_model: false
 # ============================================================================
 # Disable Other Logging Tools (Recommended)
 # ============================================================================
 # Using multiple logging tools simultaneously can impact performance:
 # - Expected overhead: ~1-2% per logger
 # - Potential config/callback conflicts
 #
 # For production training, use ONLY SwanLab:
 # wandb_project:
 # use_wandb: false
 #
 # use_mlflow: false
 #
 # use_comet: false
 # ============================================================================
 # Expected Training Behavior
 # ============================================================================
 # With this configuration, you should see:
 #
 # 1. SwanLab Initialization (rank 0 only):
 #    INFO: SwanLab initialized for project: dpo-production
 #    INFO: SwanLab experiment: llama-3-dpo-full-featured-v1
 #    INFO: SwanLab mode: cloud
 #    INFO: SwanLab workspace: ml-research-team
 #
 # 2. Completion Logging (rank 0 only):
 #    INFO: Registered SwanLab RLHF completion logging callback for DPOTrainer
 #          (log_interval=100, max_buffer=256)
 #
 # 3. Lark Notifications (rank 0 only):
 #    INFO: Registered Lark notification callback with HMAC authentication
 #
 # 4. Distributed Training Detection (if multi-GPU):
 #    INFO: Distributed training detected (world_size=N)
 #    INFO: Only rank 0 will initialize SwanLab
 #    INFO: Other ranks will skip SwanLab to avoid conflicts
 #
 # 5. Training Start Notification (Lark):
 #    Your team chat receives: "Training started: llama-3-dpo-full-featured-v1"
 #
 # 6. Periodic Completion Logging:
 #    Every 100 steps, completion table is updated in SwanLab dashboard
 #
 # 7. Training Complete Notification (Lark):
 #    Your team chat receives: "Training completed: llama-3-dpo-full-featured-v1"
 #    With link to SwanLab dashboard and final metrics
 #
 # 8. SwanLab Dashboard Shows:
 #    - Training metrics (loss, learning rate, etc.)
 #    - Completion table (rlhf_completions)
 #    - Profiling metrics (profiling/Time taken: ...)
 #    - Hyperparameters and configuration
 #    - System resource usage
 # ============================================================================
 # Production Checklist
 # ============================================================================
 # Before deploying to production, verify:
 # ✅ SwanLab API key is set via environment variable (not in config)
 # ✅ Lark webhook secret is set (required for HMAC authentication)
 # ✅ Workspace is set to your team's workspace
 # ✅ Experiment name is descriptive and unique
 # ✅ Only SwanLab is enabled (other loggers disabled)
 # ✅ Completion logging buffer size is appropriate for your training duration
 # ✅ Private deployment hosts are set (if using enterprise SwanLab)
 # ✅ Test run completes successfully and shows up in SwanLab dashboard
 # ✅ Lark notifications are received in team chat
 # ✅ Profiling metrics are logged correctly
 # ============================================================================
 # Troubleshooting
 # ============================================================================
 # If SwanLab initialization fails:
 # 1. Check SWANLAB_API_KEY environment variable is set
 # 2. Verify swanlab_project is set in config
 # 3. Check swanlab_mode is valid (cloud/local/offline/disabled)
 # 4. Verify internet connectivity (for cloud mode)
 # If Lark notifications are not received:
 # 1. Check SWANLAB_LARK_WEBHOOK_URL is set correctly
 # 2. Verify SWANLAB_LARK_SECRET matches your Lark bot settings
 # 3. Test webhook manually: curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...
 # 4. Check training logs for "Registered Lark notification callback"
 # 5. Verify bot is added to the target Lark group chat
 # If completions are not appearing in SwanLab:
 # 1. Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
 # 2. Check swanlab_log_completions is true
 # 3. Wait for swanlab_completion_log_interval steps (default: 100)
 # 4. Check training logs for "Registered SwanLab RLHF completion logging"
 # If profiling metrics are not appearing:
 # 1. Verify use_swanlab is true
 # 2. Check SwanLab is initialized (check logs)
 # 3. Look under "profiling/" namespace in dashboard
 # 4. Check whether profiling was disabled via DEFAULT_PROFILING_CONFIG.enabled = False
 # For more help:
 # - SwanLab docs: https://docs.swanlab.cn
 # - Axolotl SwanLab integration: src/axolotl/integrations/swanlab/README.md
 # - GitHub issues: https://github.com/axolotl-ai-cloud/axolotl/issues
--- a/examples/swanlab/lora-swanlab-profiling.yml
+++ b/examples/swanlab/lora-swanlab-profiling.yml
@@ -1,178 +0,0 @@
 # SwanLab LoRA Training Example with Performance Profiling
 #
 # This example demonstrates standard LoRA fine-tuning with SwanLab integration
 # for performance profiling and optimization.
 #
 # Features enabled:
 # - SwanLab experiment tracking
 # - Performance profiling (training step, forward/backward pass timing)
 # - Real-time metrics visualization
 #
 # To run:
 #   export SWANLAB_API_KEY=your-api-key
 #   accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml
 # Model Configuration
 base_model: NousResearch/Llama-3.2-1B
 # Dataset Configuration
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 val_set_size: 0.1
 output_dir: ./outputs/lora-swanlab-profiling-out
 # LoRA Configuration
 adapter: lora
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
 # Training Configuration
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: true
 micro_batch_size: 2
 gradient_accumulation_steps: 2
 num_epochs: 1
 # Optimization
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 warmup_ratio: 0.1
 weight_decay: 0.0
 # Precision
 bf16: auto
 tf32: false
 # Performance
 gradient_checkpointing: true
 flash_attention: true
 # Checkpointing and Logging
 logging_steps: 1
 evals_per_epoch: 4
 saves_per_epoch: 1
 # Loss Monitoring
 loss_watchdog_threshold: 5.0
 loss_watchdog_patience: 3
 special_tokens:
  pad_token: "<|end_of_text|>"
 # ============================================================================
 # SwanLab Integration
 # ============================================================================
 plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin
 # Basic SwanLab Configuration
 use_swanlab: true
 swanlab_project: lora-profiling
 swanlab_experiment_name: llama-3.2-1b-profiling-demo
 swanlab_description: "LoRA fine-tuning with performance profiling"
 swanlab_mode: cloud  # Options: cloud, local, offline, disabled
 # SwanLab Authentication
 # Recommended: Set via environment variable
 #   export SWANLAB_API_KEY=your-api-key
 # Or set in config (less secure):
 # swanlab_api_key: your-api-key
 # Optional: Team workspace
 # swanlab_workspace: my-ml-team
 # ============================================================================
 # Performance Profiling
 # ============================================================================
 #
 # SwanLab automatically profiles trainer methods when enabled.
 # Profiling metrics appear in SwanLab dashboard under "profiling/" namespace.
 #
 # Built-in profiling:
 # - Minimal overhead (< 0.1% per step)
 # - High-precision timing (microsecond accuracy)
 # - Exception-safe (logs duration even if method fails)
 #
 # View profiling metrics in SwanLab dashboard:
 #   profiling/Time taken: AxolotlTrainer.training_step
 #   profiling/Time taken: AxolotlTrainer.compute_loss
 #   profiling/Time taken: AxolotlTrainer.prediction_step
 #
 # For custom profiling in your own trainer, see:
 #   examples/swanlab/custom_trainer_profiling.py
 # Completion logging is disabled for non-RLHF trainers
 swanlab_log_completions: false  # Only works with DPO/KTO/ORPO/GRPO
 # ============================================================================
 # Optional: Compare with Multiple Runs
 # ============================================================================
 #
 # To compare profiling metrics across different configurations:
 #
 # 1. Run baseline without flash attention:
 #    swanlab_experiment_name: llama-3.2-1b-no-flash-attn
 #    flash_attention: false
 #
 # 2. Run with gradient checkpointing:
 #    swanlab_experiment_name: llama-3.2-1b-grad-checkpoint
 #    gradient_checkpointing: true
 #
 # 3. Run with both:
 #    swanlab_experiment_name: llama-3.2-1b-optimized
 #    flash_attention: true
 #    gradient_checkpointing: true
 #
 # Then compare profiling metrics in SwanLab dashboard to see performance impact
 # ============================================================================
 # Optional: Lark (Feishu) Team Notifications
 # ============================================================================
 #
 # Get notified when profiling experiments complete:
 # swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
 # swanlab_lark_secret: your-webhook-secret
 # ============================================================================
 # Profiling Best Practices
 # ============================================================================
 #
 # 1. Run multiple epochs to see profiling trends over time
 # 2. Ignore the first ~10 steps (warmup period; these steps are slower)
 # 3. Look for outliers (steps that take significantly longer)
 # 4. Compare profiling metrics before/after optimization changes
 # 5. Monitor per-rank profiling in distributed training
 #
 # Common bottlenecks to profile:
 # - training_step: Overall step time (should be consistent)
 # - compute_loss: Loss computation (scales with sequence length)
 # - prediction_step: Evaluation time (can be slow for large val sets)
 #
 # If you see inconsistent timing:
 # - Check for data loading bottlenecks
 # - Monitor GPU utilization (may be CPU-bound)
 # - Check for gradient accumulation effects
 # - Verify CUDA kernel synchronization
 # ============================================================================
 # Disable WandB if you're migrating from it
 # ============================================================================
 # wandb_project:
 # use_wandb: false
--- a/examples/voxtral/README.md
+++ b/examples/voxtral/README.md
@@ -12,7 +12,7 @@ Thanks to the team at MistralAI for giving us early access to prepare for this r
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 min)
-pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,5 +1,5 @@
 [build-system]
-requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"]
+requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==23.2"]
 build-backend = "setuptools.build_meta"
 [project]
@@ -24,9 +24,6 @@ Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"
 py-modules = ["setuptools_axolotl_dynamic_dependencies"]
 include-package-data = true
 [tool.setuptools.dynamic]
 version = { file = "VERSION" }
 [tool.setuptools.cmdclass]
 build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
@@ -60,6 +57,3 @@ indent-style = "space"
 skip-magic-trailing-comma = false
 line-ending = "auto"
 docstring-code-format = false
 [tool.uv.extra-build-dependencies]
 axolotl = ["huggingface_hub"]
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,25 +1,25 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 # START section of dependencies that don't install on Darwin/MacOS
-bitsandbytes==0.49.1
+bitsandbytes==0.48.2
 triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
 liger-kernel==0.6.4
 # END section
-packaging==26.0
+packaging==23.2
-huggingface_hub>=1.1.7
+
-peft>=0.18.1
+huggingface_hub>=0.36.0
 peft>=0.18.0
 tokenizers>=0.22.1
-transformers==5.0.0
+transformers==4.57.1
 accelerate==1.12.0
-datasets==4.5.0
+datasets==4.4.2
 deepspeed>=0.18.3
-trl==0.27.1
+trl==0.25.1
 hf_xet==1.2.0
 kernels==0.11.5
 trackio>=0.13.0
 typing-extensions>=4.15.0
@@ -63,7 +63,7 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
-torchao==0.13.0
+torchao==0.15.0
 openenv-core==0.1.0
 schedulefree==1.4.1
@@ -72,4 +72,4 @@ axolotl-contribs-mit==0.0.6
 # telemetry
 posthog==6.7.11
-mistral-common==1.8.8
+mistral-common==1.8.6
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"'
 )
--- a/setup.py
+++ b/setup.py
@@ -1,5 +1,6 @@
 """setup.py for axolotl"""
 import ast
 import os
 import platform
 import re
@@ -25,7 +26,6 @@ def parse_requirements(extras_require_map):
                _install_requires.append(line)
    try:
        xformers_version = [req for req in _install_requires if "xformers" in req][0]
        install_xformers = platform.machine() != "aarch64"
        if "Darwin" in platform.system():
            # skip packages not compatible with OSX
            skip_packages = [
@@ -62,68 +62,44 @@ def parse_requirements(extras_require_map):
            else:
                raise ValueError("Invalid version format")
            torch_parts = torch_version.split("+")
            if len(torch_parts) == 2:
                torch_cuda_version = torch_parts[1]
                _dependency_links.append(
                    f"https://download.pytorch.org/whl/{torch_cuda_version}"
                )
            if (major, minor) >= (2, 9):
                extras_require_map.pop("fbgemm-gpu")
-                extras_require_map["fbgemm-gpu"] = [
+                extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"]
                    "fbgemm-gpu==1.4.0",
                    "fbgemm-gpu-genai==1.4.2",
                ]
                extras_require_map["vllm"] = ["vllm==0.11.1"]
                if not install_xformers:
                    _install_requires.pop(_install_requires.index(xformers_version))
                extras_require_map["vllm"] = ["vllm==0.13.0"]
                if patch == 0:
                    extras_require_map["vllm"] = ["vllm==0.13.0"]
                else:
                    extras_require_map["vllm"] = ["vllm==0.14.0"]
            elif (major, minor) >= (2, 8):
                extras_require_map.pop("fbgemm-gpu")
                extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
                extras_require_map["vllm"] = ["vllm==0.11.0"]
                if not install_xformers:
                    _install_requires.pop(_install_requires.index(xformers_version))
            elif (major, minor) >= (2, 7):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
-                    if install_xformers:
+                    _install_requires.append("xformers==0.0.30")
                        _install_requires.append("xformers==0.0.30")
                    # vllm 0.9.x is incompatible with latest transformers
                    extras_require_map.pop("vllm")
                else:
-                    if install_xformers:
+                    _install_requires.append("xformers==0.0.31")
                        _install_requires.append("xformers==0.0.31")
                    extras_require_map["vllm"] = ["vllm==0.10.1"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
-                if install_xformers:
+                _install_requires.append("xformers==0.0.29.post3")
                    _install_requires.append("xformers==0.0.29.post3")
                # since we only support 2.6.0+cu126
                _dependency_links.append("https://download.pytorch.org/whl/cu126")
                extras_require_map.pop("vllm")
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
-                if install_xformers:
+                if patch == 0:
-                    if patch == 0:
+                    _install_requires.append("xformers==0.0.28.post2")
-                        _install_requires.append("xformers==0.0.28.post2")
+                else:
-                    else:
+                    _install_requires.append("xformers>=0.0.28.post3")
                        _install_requires.append("xformers>=0.0.28.post3")
                extras_require_map.pop("vllm")
            elif (major, minor) >= (2, 4):
                extras_require_map.pop("vllm")
-                if install_xformers:
+                if patch == 0:
-                    if patch == 0:
+                    _install_requires.pop(_install_requires.index(xformers_version))
-                        _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers>=0.0.27")
-                        _install_requires.append("xformers>=0.0.27")
+                else:
-                    else:
+                    _install_requires.pop(_install_requires.index(xformers_version))
-                        _install_requires.pop(_install_requires.index(xformers_version))
+                    _install_requires.append("xformers==0.0.28.post1")
                        _install_requires.append("xformers==0.0.28.post1")
            else:
                raise ValueError("axolotl requires torch>=2.4")
@@ -134,11 +110,15 @@ def parse_requirements(extras_require_map):
 def get_package_version():
    with open(
-        Path(os.path.dirname(os.path.abspath(__file__))) / "VERSION",
+        Path(os.path.dirname(os.path.abspath(__file__)))
        / "src"
        / "axolotl"
        / "__init__.py",
        "r",
        encoding="utf-8",
    ) as fin:
-        version_ = fin.read().strip()
+        version_match = re.search(r"^__version__\s*=\s*(.*)$", fin.read(), re.MULTILINE)
    version_ = ast.literal_eval(version_match.group(1))
    return version_
--- a/src/axolotl/init.py
+++ b/src/axolotl/init.py
@@ -1,11 +1,7 @@
 """Axolotl - Train and fine-tune large language models"""
 import pkgutil
 from importlib.metadata import PackageNotFoundError, version
 __path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package
-try:
+__version__ = "0.13.0.dev"
    __version__ = version("axolotl")
 except PackageNotFoundError:
    __version__ = "unknown"
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -5,6 +5,6 @@ import os
 from axolotl.logging_config import configure_logging
 os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
-os.environ.setdefault("HF_XET_HIGH_PERFORMANCE", "1")
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
 configure_logging()
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -44,7 +44,7 @@ def check_user_token() -> bool:
        return bool(user_info)
    except LocalTokenNotFoundError:
        LOG.warning(
-            "Error verifying HuggingFace token. Remember to log in using `hf auth login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
+            "Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
        )
        return False
    except HTTPError:
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -24,6 +24,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
        cfg: Dictionary mapping `axolotl` config keys to values.
    """
    model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
    safe_serialization = cfg.save_safetensors is True
    LOG.info("Running merge of LoRA with base model...")
    model = model.merge_and_unload(progressbar=True)
@@ -41,6 +42,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
        LOG.info(f"Saving merged model to: {str(Path(cfg.output_dir) / 'merged')}...")
        model.save_pretrained(
            str(Path(cfg.output_dir) / "merged"),
            safe_serialization=safe_serialization,
            progressbar=True,
        )
        tokenizer.save_pretrained(
--- a/src/axolotl/cli/merge_sharded_fsdp_weights.py
+++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py
@@ -14,6 +14,8 @@ from accelerate import PartialState
 from accelerate.utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    WEIGHTS_INDEX_NAME,
    WEIGHTS_NAME,
    is_torch_version,
 )
 from huggingface_hub import split_torch_state_dict_into_shards
@@ -38,15 +40,17 @@ class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
 def _distributed_checkpoint_to_merged_weights(
    checkpoint_dir: Union[str, Path],
    save_path: str,
    safe_serialization: bool = False,
    max_shard_size: str = "5GB",
 ) -> Path:
    """
    Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`. Will
-    save under `save_path` as `model.safetensors`.
+    save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
    Args:
        checkpoint_dir: Directory where distributed checkpoint is saved.
        save_path: Path to save model to.
        safe_serialization: Whether to save in safetensors format.
        max_shard_size: Max size of model shards to save.
    Returns:
@@ -72,7 +76,11 @@ def _distributed_checkpoint_to_merged_weights(
        if isinstance(value, torch.Tensor) and value.dtype != torch.bfloat16:
            state_dict[key] = value.to(torch.bfloat16)
-    filename_pattern = SAFE_WEIGHTS_NAME.replace(".safetensors", "{suffix}.safetensors")
+    weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(
        ".safetensors", "{suffix}.safetensors"
    )
    state_dict_split = split_torch_state_dict_into_shards(
        state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
    )
@@ -90,12 +98,19 @@ def _distributed_checkpoint_to_merged_weights(
    for shard_file, tensors in filename_to_tensors:
        shard = {tensor: state_dict[tensor] for tensor in tensors}
-        safe_save_file(
+
-            shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
+        if safe_serialization:
-        )
+            safe_save_file(
                shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
            )
        else:
            torch.save(shard, os.path.join(save_path_, shard_file))
    if index is not None:
-        save_index_file = os.path.join(save_path_, SAFE_WEIGHTS_INDEX_NAME)
+        save_index_file = (
            SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
        )
        save_index_file = os.path.join(save_path_, save_index_file)
        # Save the index as well
        with open(save_index_file, "w", encoding="utf-8") as fout:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
@@ -108,11 +123,13 @@ def _distributed_checkpoint_to_merged_weights(
 def merge_fsdp_weights(
    checkpoint_dir: str,
    output_path: str,
    safe_serialization: bool = False,
    remove_checkpoint_dir: bool = False,
 ):
    """
    Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
-    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors`.
+    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
    `safe_serialization` else `pytorch_model.bin`.
    Note: this is a CPU-bound process.
@@ -121,6 +138,8 @@ def merge_fsdp_weights(
            The directory containing the FSDP checkpoints (can be either the model or optimizer).
        output_path (`str`):
            The path to save the merged checkpoint.
        safe_serialization (`bool`, *optional*, defaults to `False`):
            Whether to save the merged weights with safetensors (recommended).
        remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
            Whether to remove the checkpoint directory after merging.
@@ -158,7 +177,7 @@ def merge_fsdp_weights(
    if state.is_main_process:
        LOG.info(f"Merging FSDP weights from {checkpoint_dir_}")
        save_path = _distributed_checkpoint_to_merged_weights(
-            checkpoint_dir_, output_path
+            checkpoint_dir_, output_path, safe_serialization
        )
        LOG.info(f"Successfully merged FSDP weights and saved to {save_path}")
        if remove_checkpoint_dir:
@@ -191,6 +210,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    merge_fsdp_weights(
        checkpoint_dir=str(fsdp_dir),
        output_path=output_path,
        safe_serialization=True,
    )
    state = PartialState()
    state.wait_for_everyone()
--- a/src/axolotl/cli/quantize.py
+++ b/src/axolotl/cli/quantize.py
@@ -102,10 +102,12 @@ def do_quantize(
    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}.")
    model.save_pretrained(
        str(Path(output_dir) / "quantized"),
        safe_serialization=False,
        progressbar=True,
    )
    tokenizer.save_pretrained(
        str(Path(output_dir) / "quantized"),
        safe_serialization=False,
        progressbar=True,
        save_jinja_files=cfg.tokenizer_save_jinja_files,
    )
@@ -119,7 +121,7 @@ def do_quantize(
            hub_model_id.rstrip("-")
            + f"-{quantization_config_to_str[type(quantization_config)]}"
        )
-        model.push_to_hub(hub_model_id)
+        model.push_to_hub(hub_model_id, safe_serialization=False)
        tokenizer.push_to_hub(hub_model_id)
        if processor:
            processor.push_to_hub(hub_model_id)
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -216,7 +216,7 @@ class TrainerBuilderBase(abc.ABC):
    def _configure_warmup_and_logging(
        self, total_num_steps: int, training_args_kwargs: dict
    ):
-        warmup_steps: int | float = 0
+        warmup_steps = 0
        warmup_ratio = 0.0
        if self.cfg.warmup_steps is not None:
            warmup_steps = self.cfg.warmup_steps
@@ -230,10 +230,6 @@ class TrainerBuilderBase(abc.ABC):
        else:
            warmup_ratio = 0.03
        # transformers v5
        if warmup_ratio > 0.0 and warmup_steps == 0:
            warmup_steps = warmup_ratio
        if warmup_steps == 1:
            warmup_steps = 2
@@ -246,6 +242,7 @@ class TrainerBuilderBase(abc.ABC):
                else max(min(int(0.005 * total_num_steps), 10), 1)
            )
        training_args_kwargs["warmup_ratio"] = warmup_ratio
        training_args_kwargs["warmup_steps"] = warmup_steps
    def _configure_precision_settings(self, training_args_kwargs: dict):
@@ -533,7 +530,9 @@ class TrainerBuilderBase(abc.ABC):
            "loraplus_lr_ratio",
            "loraplus_lr_embedding",
            "output_dir",
            "save_safetensors",
            "save_only_model",
            "include_tokens_per_second",
            "weight_decay",
            "seed",
            "dion_momentum",
@@ -546,7 +545,6 @@ class TrainerBuilderBase(abc.ABC):
        arg_map = {
            "dion_learning_rate": "dion_lr",
            "include_num_input_tokens_seen": "include_tokens_per_second",
        }
        for kwarg, cfg_arg in arg_map.items():
            if hasattr(self.cfg, cfg_arg) and getattr(self.cfg, cfg_arg) is not None:
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -373,18 +373,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
            data_collator_kwargs["pad_to_multiple_of"] = multiple
        if self.cfg.use_eaft:
            from functools import partial
            from axolotl.monkeypatch.loss.eaft import eaft_loss
            configured_eaft_loss = partial(
                eaft_loss,
                alpha=self.cfg.eaft_alpha if self.cfg.eaft_alpha is not None else 1.0,
                k=self.cfg.eaft_k if self.cfg.eaft_k is not None else 20,
            )
            trainer_kwargs["compute_loss_func"] = configured_eaft_loss
        trainer_cls = self._get_trainer_cls()
        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
@@ -449,9 +437,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                or self.cfg.micro_batch_size > 1
            ):
                return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
-            if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn) or (
+            if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn):
                self.cfg.micro_batch_size == 1 and is_eval is False
            ):
                return None
        if self.cfg.model_config_type == "mamba":
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -52,11 +52,12 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        trainer_cls = None
        trainer_cls_args = [self.model]
-        if self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
+        if self.cfg.rl is RLType.GRPO:
            trainer_cls = GRPOStrategy.get_trainer_class(
                sequence_parallel=self.cfg.context_parallel_size > 1
            )
            trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
            trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
@@ -146,8 +147,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.rl is RLType.KTO:
            training_args_cls = AxolotlKTOConfig
            # KTOConfig in TRL >= 0.27.0 no longer accepts max_prompt_length
            blocklist_args_kwargs = ["max_prompt_length"]
            training_args_kwargs["desirable_weight"] = (
                self.cfg.kto_desirable_weight or 1.0
@@ -156,14 +155,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
                self.cfg.kto_undesirable_weight or 1.0
            )
-        elif self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
+        elif self.cfg.rl is RLType.GRPO:
            training_args_cls = GRPOStrategy.get_training_args_class()
            training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
            blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
            if self.cfg.rl is RLType.GDPO:
                training_args_kwargs.setdefault(
                    "multi_objective_aggregation", "normalize_then_sum"
                )
        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
            training_args_cls = AxolotlDPOConfig
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -25,7 +25,7 @@ from torch.utils.data import (
 from transformers import PreTrainedModel, Trainer
 from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker
-from transformers.utils import SAFE_WEIGHTS_NAME, is_peft_available
+from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME, is_peft_available
 from trl.trainer.utils import pad_to_length
 from typing_extensions import override
@@ -660,10 +660,11 @@ class AxolotlTrainer(
            logs["tokens/train_per_sec_per_gpu"] = round(
                self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
            )
-            if "total" in self.state.tokens:
+            if (
-                logs["tokens/total"] = int(self.state.tokens["total"].item())
+                hasattr(self.state, "total_tokens")
-            if "trainable" in self.state.tokens:
+                and self.state.total_tokens is not None
-                logs["tokens/trainable"] = int(self.state.tokens["trainable"].item())
+            ):
                logs["total_tokens"] = int(self.state.total_tokens.item())
        del self._stored_metrics[train_eval]
@@ -738,38 +739,43 @@ class AxolotlTrainer(
                ).save_pretrained(
                    output_dir,
                    state_dict=state_dict,
                    safe_serialization=self.args.save_safetensors,
                )
            else:
                LOG.info(
                    "Trainer.model is not a `PreTrainedModel`, only saving its state dict."
                )
-                safetensors.torch.save_file(
+                if self.args.save_safetensors:
-                    state_dict,
+                    safetensors.torch.save_file(
-                    os.path.join(output_dir, SAFE_WEIGHTS_NAME),
+                        state_dict,
-                    metadata={"format": "pt"},
+                        os.path.join(output_dir, SAFE_WEIGHTS_NAME),
-                )
+                        metadata={"format": "pt"},
                    )
                else:
                    torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
        else:
            self.model.save_pretrained(
                output_dir,
                state_dict=state_dict,
                safe_serialization=self.args.save_safetensors,
                is_main_process=self.accelerator.is_main_process,
            )
-        if self.processing_class is not None:
+            if self.processing_class is not None:
-            self.processing_class.save_pretrained(output_dir)
+                self.processing_class.save_pretrained(output_dir)
-        elif (
+            elif (
-            self.data_collator is not None
+                self.data_collator is not None
-            and hasattr(self.data_collator, "tokenizer")
+                and hasattr(self.data_collator, "tokenizer")
-            and self.data_collator.tokenizer is not None
+                and self.data_collator.tokenizer is not None
-        ):
+            ):
-            LOG.info(
+                LOG.info(
-                "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
+                    "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
-            )
+                )
-            save_jinja_files = True
+                save_jinja_files = True
-            if self.axolotl_cfg:
+                if self.axolotl_cfg:
-                save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
+                    save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
-            self.data_collator.tokenizer.save_pretrained(
+                self.data_collator.tokenizer.save_pretrained(
-                output_dir, save_jinja_files=save_jinja_files
+                    output_dir, save_jinja_files=save_jinja_files
-            )
+                )
-        # Good practice: save your training arguments together with the trained model
+            # Good practice: save your training arguments together with the trained model
-        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+            torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -129,11 +129,6 @@ class GRPOStrategy:
        if trl.rollout_func:
            grpo_args_kwargs["rollout_func"] = cls.get_rollout_func(trl.rollout_func)
        if trl.multi_objective_aggregation is not None:
            grpo_args_kwargs["multi_objective_aggregation"] = (
                trl.multi_objective_aggregation
            )
        return grpo_args_kwargs
    @classmethod
--- a/src/axolotl/core/trainers/trl.py
+++ b/src/axolotl/core/trainers/trl.py
@@ -1,10 +1,12 @@
 """Module for TRL RL trainers"""
-from trl import RewardTrainer
+from trl import (
-from trl.experimental.cpo import CPOTrainer
+    CPOTrainer,
-from trl.experimental.kto import KTOTrainer
+    KTOTrainer,
-from trl.experimental.orpo import ORPOTrainer
+    ORPOTrainer,
-from trl.experimental.prm import PRMTrainer
+    PRMTrainer,
    RewardTrainer,
 )
 from axolotl.core.trainers.mixins import DistributedParallelMixin, RngLoaderMixin
 from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
--- a/src/axolotl/core/training_args.py
+++ b/src/axolotl/core/training_args.py
@@ -8,11 +8,7 @@ from dataclasses import dataclass, field
 from typing import Optional, Type
 from transformers import TrainingArguments
-from trl import RewardConfig
+from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
 from trl.experimental.cpo import CPOConfig
 from trl.experimental.kto import KTOConfig
 from trl.experimental.orpo import ORPOConfig
 from trl.experimental.prm import PRMConfig
 from axolotl.integrations.config import merge_training_args
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
 - If you are installing from pip
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"
 ```
 ## Usage
@@ -36,7 +36,6 @@ plugins:
 - cohere
 - cohere2
 - deepseek_v3
 - exaone4
 - gemma
 - gemma2
 - gemma3
@@ -46,11 +45,8 @@ plugins:
 - glm
 - glm4
 - glm4_moe
 - glm4_moe_lite
 - glm46v
 - glm4v
 - glm4v_moe
 - glm_image
 - gpt_oss
 - granite
 - granitemoe
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)
 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@318b7e2"`'
 )
--- a/src/axolotl/integrations/kernels/init.py
+++ b/src/axolotl/integrations/kernels/init.py
@@ -1,7 +0,0 @@
 from .args import KernelsArgs
 from .plugin import KernelsPlugin
 __all__ = [
    "KernelsArgs",
    "KernelsPlugin",
 ]
--- a/src/axolotl/integrations/kernels/args.py
+++ b/src/axolotl/integrations/kernels/args.py
@@ -1,35 +0,0 @@
 from pydantic import BaseModel, model_validator
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 class KernelsArgs(BaseModel):
    # Options for the kernels integration. Both validators below run in
    # "before" mode, i.e. on the raw input mapping, and rewrite it in place.
    use_scattermoe: bool | None = True

    @model_validator(mode="before")
    @classmethod
    def check_use_kernels(cls, data):
        """Force `use_kernels` to True, warning when the input had any other value."""
        if data.get("use_kernels") is True:
            return data

        LOG.warning(
            "`use_kernels` must be set to True to use this. Automatically setting it to True."
        )
        data["use_kernels"] = True
        return data

    @model_validator(mode="before")
    @classmethod
    def check_experts_implementation(cls, data):
        """Pin `experts_implementation` to "eager".

        A warning is logged only when an explicit non-"eager" value is being
        overridden; a missing value is filled in silently.
        """
        requested = data.get("experts_implementation")
        if requested == "eager":
            return data

        if requested is not None:
            LOG.warning(
                "`experts_implementation` must be set to 'eager' to use this. Automatically setting it to 'eager'."
            )
        # transformers may default to batched_mm when unset, so always pin it
        data["experts_implementation"] = "eager"
        return data
--- a/src/axolotl/integrations/kernels/plugin.py
+++ b/src/axolotl/integrations/kernels/plugin.py
@@ -1,61 +0,0 @@
 from kernels import (
    LayerRepository,
    Mode,
    register_kernel_mapping,
    replace_kernel_forward_from_hub,
 )
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
 class KernelsPlugin(BasePlugin):
    # Plugin that registers the ScatterMoE hub kernel and patches a model's
    # sparse-MoE block class to use it before the model is loaded.

    def get_input_args(self):
        """Return the dotted path of the args model paired with this plugin."""
        return "axolotl.integrations.kernels.KernelsArgs"

    def pre_model_load(self, cfg):
        """Register and apply the kernel when `cfg.use_scattermoe` is truthy."""
        if not cfg.use_scattermoe:
            return
        self._register_kernels()
        self._kernelize_model(cfg.model_config_type)

    def _register_kernels(self):
        """Map "HFScatterMoEParallelExperts" on cuda to the hub layer for both modes."""
        # A fresh LayerRepository is built for each mode, as in a literal dict.
        per_mode = {
            mode: LayerRepository(
                repo_id="axolotl-ai-co/scattermoe",
                layer_name="HFScatterMoEGatedMLP",
            )
            for mode in (Mode.TRAINING, Mode.INFERENCE)
        }
        register_kernel_mapping({"HFScatterMoEParallelExperts": {"cuda": per_mode}})

    def _kernelize_model(self, model_type: str):
        """Patch the sparse-MoE block class of `model_type` to use the registered kernel.

        Raises:
            ValueError: if the MoE block class for a non-"olmoe" model type
                cannot be resolved or patched.
        """
        kernel_name = "HFScatterMoEParallelExperts"

        if model_type == "olmoe":
            from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock

            replace_kernel_forward_from_hub(OlmoeSparseMoeBlock, kernel_name)
            return

        try:
            moe_block_cls = get_model_moe_block(model_type)
            replace_kernel_forward_from_hub(moe_block_cls, kernel_name)
        except Exception as err:
            raise ValueError(f"Unsupported model type: {model_type}") from err
 def get_model_moe_block(model_type: str):
    """Look up the `<Prefix>SparseMoeBlock` class for a transformers model type.

    The class is read from `transformers.models.<model_type>.modeling_<model_type>`;
    the class-name prefix is the first element returned by
    `get_causal_lm_model_cls_prefix(model_type)`.

    Args:
        model_type: Model type string used to build the module path.

    Returns:
        The `<Prefix>SparseMoeBlock` attribute of the modeling module.

    Raises:
        ImportError: if the modeling module cannot be imported.
        AttributeError: if the module has no such attribute.
    """
    # Local import keeps this block self-contained; importlib.import_module is
    # the documented replacement for calling __import__ with a fromlist.
    import importlib

    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
    module = importlib.import_module(
        f"transformers.models.{model_type}.modeling_{model_type}"
    )
    return getattr(module, f"{model_cls_prefix}SparseMoeBlock")
--- a/src/axolotl/integrations/llm_compressor/utils.py
+++ b/src/axolotl/integrations/llm_compressor/utils.py
@@ -12,6 +12,7 @@ def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    safe_serialization: bool = False,
    save_compressed: bool = False,
 ) -> None:
    """
@@ -21,6 +22,7 @@ def save_compressed_model(
        model (PreTrainedModel): The model to be saved.
        output_dir (str or bytes): Path where the model files will be written.
        trainer (Trainer): Hugging Face Trainer for process synchronization.
        safe_serialization (bool): Use safe serialization if True.
        save_compressed (bool): Write compressed tensors if True.
    """
    trainer.accelerator.wait_for_everyone()
@@ -32,6 +34,7 @@ def save_compressed_model(
    modify_save_pretrained(model)
    model.save_pretrained(
        output_dir,
        safe_serialization=safe_serialization,
        save_compressed=save_compressed,
        skip_sparsity_compression_stats=not save_compressed,
    )
--- a/src/axolotl/integrations/swanlab/README.md
+++ b/src/axolotl/integrations/swanlab/README.md
--- a/src/axolotl/integrations/swanlab/init.py
+++ b/src/axolotl/integrations/swanlab/init.py
@@ -1,6 +0,0 @@
 """SwanLab integration plugin for Axolotl"""
 from axolotl.integrations.swanlab.args import SwanLabConfig
 from axolotl.integrations.swanlab.plugins import SwanLabPlugin
 __all__ = ["SwanLabConfig", "SwanLabPlugin"]
--- a/src/axolotl/integrations/swanlab/args.py
+++ b/src/axolotl/integrations/swanlab/args.py
@@ -1,140 +0,0 @@
 """SwanLab configuration arguments"""
 from pydantic import BaseModel, Field, field_validator, model_validator
 class SwanLabConfig(BaseModel):
    """SwanLab configuration subset"""

    # --- Core experiment settings -------------------------------------------
    use_swanlab: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Enable SwanLab experiment tracking and visualization"
        },
    )
    swanlab_project: str | None = Field(
        default=None,
        json_schema_extra={"description": "Your SwanLab project name"},
    )
    swanlab_experiment_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the name of your SwanLab experiment"},
    )
    swanlab_description: str | None = Field(
        default=None,
        json_schema_extra={"description": "Description for your SwanLab experiment"},
    )
    swanlab_mode: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"cloud" to sync to SwanLab cloud, "local" for local only, "offline" to save metadata locally, "disabled" to turn off SwanLab'
        },
    )
    swanlab_workspace: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "SwanLab workspace name (organization or username)"
        },
    )
    swanlab_api_key: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "SwanLab API key for authentication. Can also be set via SWANLAB_API_KEY environment variable"
        },
    )
    swanlab_log_model: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to log model checkpoints to SwanLab (feature coming soon)"
        },
    )

    # --- Private deployment endpoints ---------------------------------------
    swanlab_web_host: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Web address for SwanLab cloud environment (for private deployment)"
        },
    )
    swanlab_api_host: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "API address for SwanLab cloud environment (for private deployment)"
        },
    )

    # --- Lark (Feishu) notifications ----------------------------------------
    swanlab_lark_webhook_url: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Lark (Feishu) webhook URL for sending training notifications to team chat"
        },
    )
    swanlab_lark_secret: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Secret for Lark webhook HMAC signature authentication (optional)"
        },
    )

    # --- RLHF completion logging --------------------------------------------
    swanlab_log_completions: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Enable logging RLHF completions to SwanLab for qualitative analysis (DPO/KTO/ORPO/GRPO)"
        },
    )
    swanlab_completion_log_interval: int | None = Field(
        default=100,
        json_schema_extra={
            "description": "Number of training steps between completion table logging to SwanLab"
        },
    )
    swanlab_completion_max_buffer: int | None = Field(
        default=128,
        json_schema_extra={
            "description": "Maximum number of completions to buffer before logging (prevents memory leaks)"
        },
    )

    @field_validator("swanlab_mode")
    @classmethod
    def validate_swanlab_mode(cls, v):
        """Accept None or one of the four known modes; reject anything else."""
        valid_modes = ["cloud", "local", "offline", "disabled"]
        if v is None or v in valid_modes:
            return v

        raise ValueError(
            f"Invalid swanlab_mode: '{v}'.\n\n"
            f"Valid options: {', '.join(valid_modes)}\n\n"
            f"Examples:\n"
            f"  swanlab_mode: cloud     # Sync to SwanLab cloud\n"
            f"  swanlab_mode: local     # Local only, no cloud sync\n"
            f"  swanlab_mode: offline   # Save metadata locally\n"
            f"  swanlab_mode: disabled  # Turn off SwanLab\n"
        )

    @field_validator("swanlab_project")
    @classmethod
    def validate_swanlab_project(cls, v):
        """Reject a project name that is an empty or whitespace-only string."""
        # None (and any non-string value) passes through untouched.
        is_blank_string = isinstance(v, str) and not v.strip()
        if is_blank_string:
            raise ValueError(
                "swanlab_project cannot be an empty string.\n\n"
                "Either:\n"
                "  1. Provide a valid project name: swanlab_project: my-project\n"
                "  2. Remove the swanlab_project field entirely\n"
            )
        return v

    @model_validator(mode="after")
    def validate_swanlab_enabled_requires_project(self):
        """Require `swanlab_project` whenever `use_swanlab` is exactly True."""
        if self.use_swanlab is not True or self.swanlab_project:
            return self

        raise ValueError(
            "SwanLab enabled (use_swanlab: true) but 'swanlab_project' is not set.\n\n"
            "Solutions:\n"
            "  1. Add 'swanlab_project: your-project-name' to your config\n"
            "  2. Set 'use_swanlab: false' to disable SwanLab\n\n"
            "Example:\n"
            "  use_swanlab: true\n"
            "  swanlab_project: my-llm-training\n"
        )
--- a/src/axolotl/integrations/swanlab/callbacks.py
+++ b/src/axolotl/integrations/swanlab/callbacks.py
@@ -1,179 +0,0 @@
 """SwanLab callbacks for Axolotl trainers.
 This module provides HuggingFace Trainer callbacks for logging
 RLHF completions to SwanLab.
 """
 from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
 )
 from axolotl.integrations.swanlab.completion_logger import CompletionLogger
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 class SwanLabRLHFCompletionCallback(TrainerCallback):
    """Callback for logging RLHF completions to SwanLab.

    Completions found in the trainer's log dicts (prompts, chosen/rejected
    responses, rewards) are buffered in a bounded ``CompletionLogger`` and
    flushed to a SwanLab table every ``log_interval`` steps and once more at
    the end of training.

    Supports DPO, KTO, ORPO, and GRPO trainers.

    Example usage:
        >>> callback = SwanLabRLHFCompletionCallback(
        ...     log_interval=100,  # Log every 100 steps
        ...     max_completions=128,  # Keep last 128 completions
        ... )
        >>> trainer.add_callback(callback)

    Attributes:
        logger: CompletionLogger instance
        log_interval: Number of steps between SwanLab logging
        table_name: Name of the SwanLab table the completions are written to
        trainer_type: Auto-detected trainer type (dpo/kto/orpo/grpo/unknown),
            or None until ``on_init_end`` has seen a trainer
    """

    def __init__(
        self,
        log_interval: int = 100,
        max_completions: int = 128,
        table_name: str = "rlhf_completions",
    ):
        """Initialize SwanLab RLHF completion callback.

        Args:
            log_interval: Log to SwanLab every N steps. Default: 100
            max_completions: Maximum completions to buffer. Default: 128
            table_name: SwanLab table name. Default: "rlhf_completions"
        """
        super().__init__()
        self.logger = CompletionLogger(maxlen=max_completions)
        self.log_interval = log_interval
        self.table_name = table_name
        self.trainer_type: str | None = None  # Auto-detected
        self._last_logged_step = 0

    @staticmethod
    def _has_keys(logs: dict, *keys: str) -> bool:
        """Return True when every one of ``keys`` is present in ``logs``."""
        return all(key in logs for key in keys)

    def on_init_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Detect trainer type on initialization.

        The type is derived from the trainer's class name.

        NOTE(review): this relies on a ``trainer`` entry in ``kwargs``; if the
        callback handler does not pass one, ``trainer_type`` stays None and
        ``on_log`` never buffers anything. Confirm against the caller.
        """
        trainer = kwargs.get("trainer")
        if trainer is None:
            return
        trainer_name = trainer.__class__.__name__
        # First matching marker wins, checked in this fixed order.
        for marker in ("DPO", "KTO", "ORPO", "GRPO"):
            if marker in trainer_name:
                self.trainer_type = marker.lower()
                break
        else:
            self.trainer_type = "unknown"
        LOG.info(
            f"SwanLab RLHF completion logging enabled for {trainer_name} "
            f"(type: {self.trainer_type})"
        )

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: dict | None = None,
        **kwargs,
    ):
        """Capture completions from logs and buffer them.

        A completion is buffered only when all required keys for the detected
        trainer type are present in ``logs``:

        - DPO: ``dpo/prompt``, ``dpo/chosen``, ``dpo/rejected``
          (optional ``dpo/reward_diff``)
        - KTO: ``kto/prompt``, ``kto/completion``
          (optional ``kto/label``, ``kto/reward``)
        - ORPO: ``orpo/prompt``, ``orpo/chosen``, ``orpo/rejected``
          (optional ``orpo/log_odds_ratio``)
        - GRPO: ``grpo/prompt``, ``grpo/completion``
          (optional ``grpo/reward``, ``grpo/advantage``)

        Note: This is a placeholder implementation. Actual log keys depend
        on the TRL trainer implementation. You may need to patch the trainers
        to expose completion data in logs.
        """
        if logs is None or self.trainer_type is None:
            return
        step = state.global_step
        kind = self.trainer_type

        if kind == "dpo":
            if self._has_keys(logs, "dpo/prompt", "dpo/chosen", "dpo/rejected"):
                self.logger.add_dpo_completion(
                    step=step,
                    prompt=logs["dpo/prompt"],
                    chosen=logs["dpo/chosen"],
                    rejected=logs["dpo/rejected"],
                    reward_diff=logs.get("dpo/reward_diff"),
                )
        elif kind == "kto":
            if self._has_keys(logs, "kto/prompt", "kto/completion"):
                self.logger.add_kto_completion(
                    step=step,
                    prompt=logs["kto/prompt"],
                    completion=logs["kto/completion"],
                    label=logs.get("kto/label", False),
                    reward=logs.get("kto/reward"),
                )
        elif kind == "orpo":
            if self._has_keys(logs, "orpo/prompt", "orpo/chosen", "orpo/rejected"):
                self.logger.add_orpo_completion(
                    step=step,
                    prompt=logs["orpo/prompt"],
                    chosen=logs["orpo/chosen"],
                    rejected=logs["orpo/rejected"],
                    log_odds_ratio=logs.get("orpo/log_odds_ratio"),
                )
        elif kind == "grpo":
            if self._has_keys(logs, "grpo/prompt", "grpo/completion"):
                self.logger.add_grpo_completion(
                    step=step,
                    prompt=logs["grpo/prompt"],
                    completion=logs["grpo/completion"],
                    reward=logs.get("grpo/reward"),
                    advantage=logs.get("grpo/advantage"),
                )

        # Periodically flush the buffer to SwanLab. The step marker only
        # advances when something was actually flushed.
        interval_elapsed = step - self._last_logged_step >= self.log_interval
        if interval_elapsed and len(self.logger) > 0:
            self.logger.log_to_swanlab(table_name=self.table_name)
            self.logger.clear()
            self._last_logged_step = step

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Log remaining completions at end of training."""
        if len(self.logger) > 0:
            LOG.info(
                f"Training complete, logging final {len(self.logger)} completions to SwanLab"
            )
            self.logger.log_to_swanlab(table_name=self.table_name)
            # Drop what was just flushed, as on_log does, so a later training
            # run on the same callback does not log these entries again.
            self.logger.clear()
            self._last_logged_step = state.global_step
--- a/src/axolotl/integrations/swanlab/completion_logger.py
+++ b/src/axolotl/integrations/swanlab/completion_logger.py
@@ -1,228 +0,0 @@
 """SwanLab completion logger for RLHF/DPO/KTO/ORPO/GRPO training.
 This module provides utilities for logging model completions during
 preference training to SwanLab for qualitative analysis.
 """
 from collections import deque
 from collections.abc import Mapping
 from typing import Any
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 class CompletionLogger:
    """Memory-bounded logger for RLHF completions.
    Stores prompts, completions, and rewards in fixed-size deques to prevent
    memory leaks during long training runs. Logs completion tables to SwanLab
    for qualitative analysis of model outputs.
    Example usage:
        >>> logger = CompletionLogger(maxlen=128)
        >>> logger.add_dpo_completion(
        ...     step=0,
        ...     prompt="What is AI?",
        ...     chosen="Artificial Intelligence is...",
        ...     rejected="AI means...",
        ...     reward_diff=0.5
        ... )
        >>> logger.log_to_swanlab()
    Attributes:
        maxlen: Maximum number of completions to store (older ones are dropped)
        data: Deque storing completion dictionaries
    """
    def __init__(self, maxlen: int = 128):
        """Initialize completion logger with bounded buffer.
        Args:
            maxlen: Maximum number of completions to store. When the buffer
                is full, oldest completions are automatically discarded.
                Default: 128 (sufficient for most RLHF runs without memory issues)
        """
        self.maxlen = maxlen
        self.data: deque[Mapping[str, Any]] = deque(maxlen=maxlen)
    def add_dpo_completion(
        self,
        step: int,
        prompt: str,
        chosen: str,
        rejected: str,
        reward_diff: float | None = None,
    ) -> None:
        """Add a DPO completion to the buffer.
        Args:
            step: Training step number
            prompt: Input prompt
            chosen: Chosen (preferred) completion
            rejected: Rejected (non-preferred) completion
            reward_diff: Reward difference (chosen - rejected), if available
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
        }
        if reward_diff is not None:
            entry["reward_diff"] = reward_diff
        self.data.append(entry)
    def add_kto_completion(
        self,
        step: int,
        prompt: str,
        completion: str,
        label: bool,
        reward: float | None = None,
    ) -> None:
        """Add a KTO completion to the buffer.
        Args:
            step: Training step number
            prompt: Input prompt
            completion: Model-generated completion
            label: True if desirable, False if undesirable
            reward: Reward score, if available
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "completion": completion,
            "label": "desirable" if label else "undesirable",
        }
        if reward is not None:
            entry["reward"] = reward
        self.data.append(entry)
    def add_orpo_completion(
        self,
        step: int,
        prompt: str,
        chosen: str,
        rejected: str,
        log_odds_ratio: float | None = None,
    ) -> None:
        """Add an ORPO completion to the buffer.
        Args:
            step: Training step number
            prompt: Input prompt
            chosen: Chosen (preferred) completion
            rejected: Rejected (non-preferred) completion
            log_odds_ratio: Log odds ratio between chosen and rejected
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
        }
        if log_odds_ratio is not None:
            entry["log_odds_ratio"] = log_odds_ratio
        self.data.append(entry)
    def add_grpo_completion(
        self,
        step: int,
        prompt: str,
        completion: str,
        reward: float | None = None,
        advantage: float | None = None,
    ) -> None:
        """Add a GRPO completion to the buffer.
        Args:
            step: Training step number
            prompt: Input prompt
            completion: Model-generated completion
            reward: Reward score from reward model
            advantage: Advantage estimate (reward - baseline)
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "completion": completion,
        }
        if reward is not None:
            entry["reward"] = reward
        if advantage is not None:
            entry["advantage"] = advantage
        self.data.append(entry)
    def log_to_swanlab(self, table_name: str = "completions") -> bool:
        """Log buffered completions to SwanLab as a table.
        Creates a SwanLab echarts Table with all buffered completions.
        Only logs if SwanLab is initialized and data is available.
        Args:
            table_name: Name of the table in SwanLab dashboard.
                Default: "completions"
        Returns:
            True if logging succeeded, False otherwise
        """
        if not self.data:
            LOG.debug("No completions to log to SwanLab")
            return False
        try:
            import swanlab
            if swanlab.get_run() is None:
                LOG.debug("SwanLab not initialized, skipping completion logging")
                return False
            # Convert deque to list of dicts
            completions = list(self.data)
            # Extract headers from first entry (all entries should have same structure)
            headers = list(completions[0].keys())
            # Build rows: each completion becomes one row
            rows = []
            for completion in completions:
                row = [completion.get(header, "") for header in headers]
                rows.append(row)
            # Log to SwanLab as echarts Table
            swanlab.log({table_name: swanlab.echarts.Table().add(headers, rows)})
            LOG.info(f"Logged {len(rows)} completions to SwanLab table '{table_name}'")
            return True
        except ImportError:
            LOG.warning(
                "SwanLab not installed, cannot log completions. "
                "Install with: pip install swanlab"
            )
            return False
        except Exception as err:  # pylint: disable=broad-except
            LOG.exception("Failed to log completions to SwanLab: %s", err)
            return False
    def clear(self) -> None:
        """Clear all buffered completions."""
        self.data.clear()
    def __len__(self) -> int:
        """Return number of buffered completions."""
        return len(self.data)
    def __repr__(self) -> str:
        """String representation showing buffer status."""
        return (
            f"CompletionLogger(maxlen={self.maxlen}, "
            f"buffered={len(self.data)}/{self.maxlen})"
        )
--- a/src/axolotl/integrations/swanlab/plugins.py
+++ b/src/axolotl/integrations/swanlab/plugins.py
@@ -1,554 +0,0 @@
 """SwanLab Plugin for Axolotl"""
 from __future__ import annotations
 from typing import TYPE_CHECKING
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.logging import get_logger
 if TYPE_CHECKING:
    from transformers import TrainerCallback
    from axolotl.utils.dict import DictDefault
 LOG = get_logger(__name__)
 class SwanLabPlugin(BasePlugin):
    """
    SwanLab integration plugin for Axolotl.
    Provides experiment tracking, visualization, and logging capabilities
    using SwanLab (https://swanlab.cn).
    Usage in config.yaml:
        plugins:
          - axolotl.integrations.swanlab.SwanLabPlugin
        use_swanlab: true
        swanlab_project: my-project
        swanlab_experiment_name: my-experiment
        swanlab_mode: cloud  # or 'local', 'offline', 'disabled'
    """
    def __init__(self) -> None:
        """Create the plugin with SwanLab marked as not yet initialized."""
        super().__init__()
        # Set to True by pre_model_load once swanlab.init() succeeds.
        self.swanlab_initialized = False
        LOG.info("SwanLab plugin initialized")
    def get_input_args(self) -> str:
        """Return the dotted import path of the SwanLab config model."""
        config_model_path = "axolotl.integrations.swanlab.SwanLabConfig"
        return config_model_path
    def register(self, cfg: dict):
        """Validate SwanLab settings and warn about conflicting loggers.

        Steps, in order:
          1. Auto-enable ``use_swanlab`` when ``swanlab_project`` is set, so
             that such configs go through the same checks as explicitly
             enabled ones.
          2. When SwanLab is enabled: require a project name, validate
             ``swanlab_mode`` and warn if cloud mode has no API key.
          3. Warn when several experiment trackers are enabled together.

        Args:
            cfg: Raw config mapping; ``use_swanlab`` may be set in place.

        Raises:
            ValueError: If SwanLab is enabled without ``swanlab_project``, or
                ``swanlab_mode`` is not one of cloud/local/offline/disabled.
        """
        import os

        LOG.info("Registering SwanLab plugin")

        # === Auto-Enable Logic ===
        # Enable SwanLab if project is specified. This happens before the
        # validation below; otherwise an auto-enabled config would skip the
        # mode check, the API key warning and the multi-logger detection.
        if cfg.get("swanlab_project") and not cfg.get("use_swanlab"):
            cfg["use_swanlab"] = True
            LOG.info("Automatically enabled use_swanlab because swanlab_project is set")

        # === Conflict Detection: Required Fields ===
        if cfg.get("use_swanlab"):
            # 1. Validate project name is set
            if not cfg.get("swanlab_project"):
                raise ValueError(
                    "SwanLab enabled but 'swanlab_project' is not set.\n\n"
                    "Solutions:\n"
                    "  1. Add 'swanlab_project: your-project-name' to your config\n"
                    "  2. Set 'use_swanlab: false' to disable SwanLab\n\n"
                    "See: src/axolotl/integrations/swanlab/README.md for examples"
                )

            # 2. Validate swanlab_mode value
            valid_modes = ["cloud", "local", "offline", "disabled"]
            mode = cfg.get("swanlab_mode")
            if mode and mode not in valid_modes:
                raise ValueError(
                    f"Invalid swanlab_mode: '{mode}'.\n\n"
                    f"Valid options: {', '.join(valid_modes)}\n\n"
                    f"Example:\n"
                    f"  swanlab_mode: cloud  # Sync to SwanLab cloud\n"
                    f"  swanlab_mode: local  # Local only, no cloud sync\n"
                )

            # 3. Check API key for cloud mode.
            # An unset mode means cloud. `or` is used instead of a .get()
            # default because the key may be present with a None value.
            effective_mode = mode or "cloud"
            if effective_mode == "cloud":
                api_key = cfg.get("swanlab_api_key") or os.environ.get(
                    "SWANLAB_API_KEY"
                )
                if not api_key:
                    LOG.warning(
                        "SwanLab cloud mode enabled but no API key found.\n"
                        "SwanLab may fail to initialize during training.\n\n"
                        "Solutions:\n"
                        "  1. Set SWANLAB_API_KEY environment variable:\n"
                        "     export SWANLAB_API_KEY=your-api-key\n"
                        "  2. Add 'swanlab_api_key: your-api-key' to config (less secure)\n"
                        "  3. Run 'swanlab login' before training\n"
                        "  4. Use 'swanlab_mode: local' for offline tracking\n"
                    )

        # === Conflict Detection: Multi-Logger Performance Warning ===
        # Detect all active logging tools
        active_loggers = []
        if cfg.get("use_wandb"):
            active_loggers.append("WandB")
        if cfg.get("use_mlflow"):
            active_loggers.append("MLflow")
        if cfg.get("comet_api_key") or cfg.get("comet_project_name"):
            active_loggers.append("Comet")
        if cfg.get("use_swanlab"):
            active_loggers.append("SwanLab")

        if len(active_loggers) > 1:
            LOG.warning(
                f"\n{'=' * 70}\n"
                f"Multiple logging tools enabled: {', '.join(active_loggers)}\n"
                f"{'=' * 70}\n"
                f"This may cause:\n"
                f"  - Performance overhead (~1-2% per logger, cumulative)\n"
                f"  - Increased memory usage\n"
                f"  - Longer training time per step\n"
                f"  - Potential config/callback conflicts\n\n"
                f"Recommendations:\n"
                f"  - Choose ONE primary logging tool for production training\n"
                f"  - Use multiple loggers only for:\n"
                f"    * Migration period (transitioning between tools)\n"
                f"    * Short comparison runs\n"
                f"    * Debugging specific tool issues\n"
                f"  - Monitor system resources (CPU, memory) during training\n"
                f"{'=' * 70}\n"
            )
            if len(active_loggers) >= 3:
                LOG.error(
                    f"\n{'!' * 70}\n"
                    f"WARNING: {len(active_loggers)} logging tools enabled simultaneously!\n"
                    f"{'!' * 70}\n"
                    f"This is likely unintentional and WILL significantly impact performance.\n"
                    f"Expected overhead: ~{len(active_loggers) * 1.5:.1f}% per training step.\n\n"
                    f"STRONGLY RECOMMEND:\n"
                    f"  - Disable all but ONE logging tool\n"
                    f"  - Use config inheritance to manage multiple configs\n"
                    f"{'!' * 70}\n"
                )
    def pre_model_load(self, cfg: DictDefault):
        """Initialize SwanLab before model loading with runtime checks."""
        if not cfg.use_swanlab:
            return
        # === Runtime Check: Import Availability ===
        try:
            import swanlab
        except ImportError as err:
            raise ImportError(
                "SwanLab is not installed.\n\n"
                "Install with:\n"
                "  pip install swanlab\n\n"
                "Or add to requirements:\n"
                "  swanlab>=0.3.0\n\n"
                f"Original error: {err}"
            ) from err
        # Log SwanLab version
        try:
            swanlab_version = swanlab.__version__
            LOG.info(f"SwanLab version: {swanlab_version}")
        except AttributeError:
            LOG.warning("Could not determine SwanLab version")
        # === Runtime Check: Distributed Training Setup ===
        from axolotl.utils.distributed import get_world_size, is_main_process
        world_size = get_world_size()
        if world_size > 1:
            mode = getattr(cfg, "swanlab_mode", "cloud")
            LOG.info(
                f"\n{'=' * 70}\n"
                f"Distributed training detected (world_size={world_size})\n"
                f"SwanLab mode: {mode}\n"
                f"{'=' * 70}\n"
                f"Behavior:\n"
                f"  - Only rank 0 will initialize SwanLab\n"
                f"  - Other ranks will skip SwanLab to avoid conflicts\n"
            )
            if mode == "cloud":
                LOG.info(
                    f"  - Only rank 0 will upload to SwanLab cloud\n"
                    f"  - Other ranks run without SwanLab overhead\n"
                    f"{'=' * 70}\n"
                )
        # Only initialize SwanLab on the main process (rank 0)
        # to avoid creating multiple runs in distributed training
        if not is_main_process():
            LOG.debug("Skipping SwanLab initialization on non-main process")
            return
        # Initialize SwanLab run (passing all params directly to init)
        try:
            init_kwargs = self._get_swanlab_init_kwargs(cfg)
            swanlab.init(**init_kwargs)
            self.swanlab_initialized = True
            LOG.info(f"SwanLab initialized with project: {cfg.swanlab_project}")
            # Register Lark notification callback (if configured)
            self._register_lark_callback(cfg)
            # Log configuration (with error handling)
            try:
                config_dict = self._prepare_config_for_logging(cfg)
                swanlab.config.update(config_dict)
                LOG.debug("Successfully logged config to SwanLab")
            except Exception as config_err:  # pylint: disable=broad-except
                LOG.warning(
                    f"Failed to log config to SwanLab: {config_err}. Continuing anyway."
                )
        except Exception as err:  # pylint: disable=broad-except
            LOG.exception("Failed to initialize SwanLab: %s", err)
            self.swanlab_initialized = False
    def add_callbacks_pre_trainer(self, cfg: DictDefault, model):
        """Add SwanLab callbacks before trainer creation."""
        callbacks: list[TrainerCallback] = []
        if not cfg.use_swanlab:
            return callbacks
        if not self.swanlab_initialized:
            LOG.warning("SwanLab not initialized, skipping callback registration")
            return callbacks
        try:
            from axolotl.utils.callbacks.swanlab import (
                CustomSwanLabCallback,
                SaveAxolotlConfigtoSwanLabCallback,
            )
            # Add our custom lightweight SwanLabCallback
            # (avoids omegaconf/antlr4 version conflicts)
            swanlab_callback = CustomSwanLabCallback()
            callbacks.append(swanlab_callback)
            LOG.info("Added CustomSwanLabCallback for metrics logging")
            # Add Axolotl config logging callback
            if cfg.axolotl_config_path:
                config_callback = SaveAxolotlConfigtoSwanLabCallback(
                    cfg.axolotl_config_path
                )
                callbacks.append(config_callback)
                LOG.info("Added SaveAxolotlConfigtoSwanLabCallback")
        except ImportError as err:
            LOG.exception("Failed to import SwanLab callbacks: %s", err)
        return callbacks
    def post_trainer_create(self, cfg: DictDefault, trainer):
        """Post-trainer creation hook."""
        if cfg.use_swanlab and self.swanlab_initialized:
            try:
                import swanlab
                # Log additional trainer information (with safe conversion)
                trainer_config = {
                    "total_steps": int(trainer.state.max_steps)
                    if trainer.state.max_steps
                    else None,
                    "num_train_epochs": float(trainer.args.num_train_epochs)
                    if trainer.args.num_train_epochs
                    else None,
                    "train_batch_size": int(trainer.args.train_batch_size)
                    if hasattr(trainer.args, "train_batch_size")
                    else None,
                    "gradient_accumulation_steps": int(
                        trainer.args.gradient_accumulation_steps
                    )
                    if trainer.args.gradient_accumulation_steps
                    else None,
                }
                # Remove None values
                trainer_config = {
                    k: v for k, v in trainer_config.items() if v is not None
                }
                if trainer_config:
                    swanlab.config.update(trainer_config)
                    LOG.info("Logged trainer configuration to SwanLab")
            except Exception as err:  # pylint: disable=broad-except
                LOG.debug(f"Failed to log trainer config to SwanLab: {err}")
            # Register RLHF completion logging callback if enabled
            self._register_completion_callback(cfg, trainer)
    def _get_swanlab_init_kwargs(self, cfg: DictDefault) -> dict:
        """Prepare kwargs for swanlab.init().
        Passes all configuration parameters directly to swanlab.init()
        instead of using environment variables as an intermediate layer.
        Returns:
            dict: Keyword arguments for swanlab.init()
        """
        init_kwargs = {}
        # Project name (required)
        if cfg.swanlab_project:
            init_kwargs["project"] = cfg.swanlab_project
        # Experiment name
        if cfg.swanlab_experiment_name:
            init_kwargs["experiment_name"] = cfg.swanlab_experiment_name
        # Description
        if cfg.swanlab_description:
            init_kwargs["description"] = cfg.swanlab_description
        # Workspace (organization)
        if cfg.swanlab_workspace:
            init_kwargs["workspace"] = cfg.swanlab_workspace
        # Mode: cloud, local, offline, disabled
        if cfg.swanlab_mode:
            init_kwargs["mode"] = cfg.swanlab_mode
        # API key (pass directly instead of via env var)
        if cfg.swanlab_api_key:
            init_kwargs["api_key"] = cfg.swanlab_api_key
        # Private deployment hosts (pass directly instead of via env var)
        if cfg.swanlab_web_host:
            init_kwargs["web_host"] = cfg.swanlab_web_host
        if cfg.swanlab_api_host:
            init_kwargs["api_host"] = cfg.swanlab_api_host
        # Log model checkpoints (coming soon in SwanLab)
        if cfg.swanlab_log_model:
            init_kwargs["log_model"] = cfg.swanlab_log_model
        # Custom branding - adds Axolotl identifier to SwanLab UI
        # This helps identify runs from Axolotl vs other frameworks
        init_kwargs["config"] = {"UPPERFRAME": "🦎 Axolotl"}
        return init_kwargs
    def _prepare_config_for_logging(self, cfg: DictDefault) -> dict:
        """Prepare configuration dict for logging to SwanLab."""
        def safe_convert(value):
            """Convert value to JSON-serializable type."""
            if value is None:
                return None
            if isinstance(value, (int, float, bool)):
                return value
            if isinstance(value, str):
                return value
            # Convert everything else to string
            return str(value)
        try:
            # Extract important training parameters with safe conversion
            config_dict = {
                "base_model": safe_convert(getattr(cfg, "base_model", "")),
                "model_type": safe_convert(getattr(cfg, "model_type", "")),
                "sequence_len": safe_convert(getattr(cfg, "sequence_len", None)),
                "micro_batch_size": safe_convert(
                    getattr(cfg, "micro_batch_size", None)
                ),
                "gradient_accumulation_steps": safe_convert(
                    getattr(cfg, "gradient_accumulation_steps", None)
                ),
                "num_epochs": safe_convert(getattr(cfg, "num_epochs", None)),
                "max_steps": safe_convert(getattr(cfg, "max_steps", None)),
                "learning_rate": safe_convert(getattr(cfg, "learning_rate", None)),
                "lr_scheduler": safe_convert(getattr(cfg, "lr_scheduler", "")),
                "optimizer": safe_convert(getattr(cfg, "optimizer", "")),
                "warmup_ratio": safe_convert(getattr(cfg, "warmup_ratio", None)),
                "weight_decay": safe_convert(getattr(cfg, "weight_decay", None)),
                "seed": safe_convert(getattr(cfg, "seed", None)),
                "bf16": safe_convert(getattr(cfg, "bf16", None)),
                "tf32": safe_convert(getattr(cfg, "tf32", None)),
                "flash_attention": safe_convert(getattr(cfg, "flash_attention", None)),
                "sample_packing": safe_convert(getattr(cfg, "sample_packing", None)),
            }
            # Add FSDP/parallel config - only boolean flags
            if hasattr(cfg, "fsdp_config") and cfg.fsdp_config:
                config_dict["fsdp_enabled"] = True
                config_dict["fsdp_version"] = safe_convert(
                    getattr(cfg, "fsdp_version", None)
                )
            if hasattr(cfg, "deepspeed") and cfg.deepspeed:
                config_dict["deepspeed_enabled"] = True
            # Add context parallel info
            if hasattr(cfg, "context_parallel_size"):
                config_dict["context_parallel_size"] = safe_convert(
                    getattr(cfg, "context_parallel_size", None)
                )
            if hasattr(cfg, "tensor_parallel_size"):
                config_dict["tensor_parallel_size"] = safe_convert(
                    getattr(cfg, "tensor_parallel_size", None)
                )
            if hasattr(cfg, "dp_shard_size"):
                config_dict["dp_shard_size"] = safe_convert(
                    getattr(cfg, "dp_shard_size", None)
                )
            # Remove None values and empty strings
            config_dict = {
                k: v
                for k, v in config_dict.items()
                if v is not None and v != "" and v != "None"
            }
            return config_dict
        except Exception as err:  # pylint: disable=broad-except
            LOG.warning(f"Failed to prepare config for logging: {err}")
            # Return minimal config
            try:
                lr = getattr(cfg, "learning_rate", None)
                lr_value = float(lr) if lr is not None else None
            except (TypeError, ValueError):
                lr_value = None
            return {
                "base_model": str(getattr(cfg, "base_model", "unknown")),
                "learning_rate": lr_value,
            }
    def _register_lark_callback(self, cfg: DictDefault):
        """Attach a Lark (Feishu) notification callback to SwanLab when a webhook is set.

        Nothing happens unless ``cfg.swanlab_lark_webhook_url`` is truthy. Otherwise
        a ``LarkCallback`` is built from the webhook URL and the optional
        ``cfg.swanlab_lark_secret`` and passed to ``swanlab.register_callbacks``.
        Import errors and any other failure are logged and swallowed, so the
        caller continues without Lark notifications.

        Args:
            cfg: Configuration object carrying the Lark webhook settings.
        """
        webhook = getattr(cfg, "swanlab_lark_webhook_url", None)
        if not webhook:
            # No webhook configured, so there is nothing to register.
            return

        try:
            import swanlab
            from swanlab.plugin.notification import LarkCallback

            # The secret is optional; it is forwarded as-is (possibly None).
            secret = getattr(cfg, "swanlab_lark_secret", None)
            swanlab.register_callbacks(
                [LarkCallback(webhook_url=webhook, secret=secret)]
            )

            if not secret:
                LOG.info("Registered Lark notification callback (no HMAC secret)")
                LOG.warning(
                    "Lark webhook has no secret configured. "
                    "For production use, set 'swanlab_lark_secret' to enable HMAC signature verification."
                )
            else:
                LOG.info(
                    "Registered Lark notification callback with HMAC authentication"
                )
        except ImportError as err:
            import_hint = (
                f"Failed to import SwanLab Lark plugin: {err}\n\n"
                "Lark notifications require SwanLab >= 0.3.0 with plugin support.\n"
                "Install with: pip install 'swanlab>=0.3.0'\n\n"
                "Continuing without Lark notifications..."
            )
            LOG.warning(import_hint)
        except Exception as err:  # pylint: disable=broad-except
            # Registration is best-effort: record the traceback and carry on.
            LOG.exception(
                "Failed to register Lark callback: %s\n\n"
                "Check your Lark webhook URL and secret configuration.\n"
                "Continuing without Lark notifications...",
                err,
            )
    def _register_completion_callback(self, cfg: DictDefault, trainer):
        """Add the SwanLab RLHF completion-logging callback to ``trainer`` when applicable.

        The callback is only added when ``cfg.swanlab_log_completions`` is truthy
        (it defaults to ``True`` when the attribute is absent) and the trainer's
        class name contains one of ``DPO``, ``KTO``, ``ORPO``, ``GRPO`` or ``CPO``.
        Failures while importing or constructing the callback are logged and
        swallowed.

        Args:
            cfg: Configuration object with the completion logging settings.
            trainer: Trainer instance that receives the callback.
        """
        if not getattr(cfg, "swanlab_log_completions", True):
            LOG.debug("SwanLab completion logging disabled by config")
            return

        trainer_name = trainer.__class__.__name__
        # Substring match on the class name decides whether this is an RLHF trainer.
        if not any(
            tag in trainer_name for tag in ("DPO", "KTO", "ORPO", "GRPO", "CPO")
        ):
            LOG.debug(
                f"Trainer {trainer_name} is not an RLHF trainer, "
                "skipping completion logging callback"
            )
            return

        try:
            from axolotl.integrations.swanlab.callbacks import (
                SwanLabRLHFCompletionCallback,
            )

            interval = getattr(cfg, "swanlab_completion_log_interval", 100)
            buffer_size = getattr(cfg, "swanlab_completion_max_buffer", 128)
            trainer.add_callback(
                SwanLabRLHFCompletionCallback(
                    log_interval=interval,
                    max_completions=buffer_size,
                    table_name="rlhf_completions",
                )
            )
            LOG.info(
                f"Registered SwanLab RLHF completion logging callback for {trainer_name} "
                f"(log_interval={interval}, max_buffer={buffer_size})"
            )
        except ImportError as err:
            LOG.warning(
                f"Failed to import SwanLab completion callback: {err}\n\n"
                "This is a bug - the callback should be available.\n"
                "Please report this issue.\n\n"
                "Continuing without completion logging..."
            )
        except Exception as err:  # pylint: disable=broad-except
            # Best-effort registration: log the traceback, do not interrupt the caller.
            LOG.exception(
                "Failed to register SwanLab completion callback: %s\n\n"
                "Continuing without completion logging...",
                err,
            )
--- a/src/axolotl/integrations/swanlab/profiling.py
+++ b/src/axolotl/integrations/swanlab/profiling.py
@@ -1,203 +0,0 @@
 """SwanLab profiling utilities for Axolotl trainers.
 This module provides decorators and context managers for profiling
 trainer methods and logging execution times to SwanLab.
 """
 import time
 from contextlib import contextmanager
 from functools import wraps
 from typing import Any, Callable
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
@contextmanager
def swanlab_profiling_context(trainer: Any, func_name: str):
    """Time the wrapped block and report the duration to SwanLab.

    The elapsed wall-clock time is always measured. It is only logged when
    ``trainer.cfg.use_swanlab`` is truthy and ``swanlab.get_run()`` returns a
    run. A missing ``swanlab`` package is ignored silently; any other logging
    failure is reported at debug level and never propagates.

    Example usage:
        >>> with swanlab_profiling_context(self, "training_step"):
        ...     result = do_expensive_computation()

    Args:
        trainer: Object whose optional ``cfg.use_swanlab`` flag enables logging;
            its class name becomes part of the metric name.
        func_name: Name of the function being profiled.

    Yields:
        None
    """
    started = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started
        trainer_cfg = getattr(trainer, "cfg", None)
        if getattr(trainer_cfg, "use_swanlab", False):
            try:
                import swanlab

                # Only log when a SwanLab run is active.
                if swanlab.get_run() is not None:
                    owner = trainer.__class__.__name__
                    swanlab.log(
                        {f"profiling/Time taken: {owner}.{func_name}": elapsed}
                    )
            except ImportError:
                # SwanLab is not installed: profiling is skipped on purpose.
                pass
            except Exception as err:  # pylint: disable=broad-except
                # Profiling must never break the profiled code.
                LOG.debug(f"Failed to log profiling metric for {func_name}: {err}")
 def swanlab_profile(func: Callable) -> Callable:
     """Wrap a trainer method so that each call is timed via ``swanlab_profiling_context``.

     The wrapper passes the bound instance (``self``) and the wrapped function's
     ``__name__`` to the profiling context, then returns whatever the wrapped
     method returns. ``functools.wraps`` keeps the original metadata.

     Example usage:
         >>> class MyTrainer:
         ...     @swanlab_profile
         ...     def training_step(self, model, inputs):
         ...         return super().training_step(model, inputs)

     Args:
         func: Method to profile; it is called with the instance as first argument.

     Returns:
         The wrapped method.
     """

     @wraps(func)
     def profiled(self, *call_args, **call_kwargs):
         with swanlab_profiling_context(self, func.__name__):
             outcome = func(self, *call_args, **call_kwargs)
         return outcome

     return profiled
 class ProfilingConfig:
     """Configuration for SwanLab profiling.

     Centralizes the switches that decide whether a single profiling
     measurement is logged.

     Attributes:
         enabled: Whether profiling is enabled at all.
         min_duration_ms: Minimum duration (in ms) to log; faster calls are dropped.
         log_interval: Log every N-th counted call per function. Values below 1
             are treated as 1 (log every call).
     """

     def __init__(
         self,
         enabled: bool = True,
         min_duration_ms: float = 0.1,
         log_interval: int = 1,
     ):
         """Initialize profiling configuration.

         Args:
             enabled: Enable profiling. Default: True
             min_duration_ms: Minimum duration to log (ms). Default: 0.1
             log_interval: Log every N calls. Default: 1 (log all)
         """
         self.enabled = enabled
         self.min_duration_ms = min_duration_ms
         self.log_interval = log_interval
         # Per-function count of calls that passed the enabled/duration filters.
         self._call_counts: dict[str, int] = {}

     def should_log(self, func_name: str, duration_seconds: float) -> bool:
         """Check if a profiling measurement should be logged.

         Calls rejected because profiling is disabled or because they are
         shorter than ``min_duration_ms`` are not counted towards the interval.

         Args:
             func_name: Name of the profiled function
             duration_seconds: Execution duration in seconds

         Returns:
             True if should log, False otherwise
         """
         if not self.enabled:
             return False

         # Compare in milliseconds, the unit of the threshold.
         if duration_seconds * 1000 < self.min_duration_ms:
             return False

         count = self._call_counts.get(func_name, 0) + 1
         self._call_counts[func_name] = count

         # The attributes are plain and may be reassigned after construction, so
         # the interval is validated here rather than in __init__. An interval of
         # 0 would make the modulo below raise ZeroDivisionError from the second
         # call on; any interval below 1 is therefore read as "log every call".
         if self.log_interval < 1:
             return True

         # Always log the first counted call, then every log_interval-th one.
         return count == 1 or count % self.log_interval == 0
 # Global profiling config (can be modified by users)
 DEFAULT_PROFILING_CONFIG = ProfilingConfig()
@contextmanager
def swanlab_profiling_context_advanced(
    trainer: Any,
    func_name: str,
    config: ProfilingConfig | None = None,
):
    """Time the wrapped block and log it to SwanLab subject to a ``ProfilingConfig``.

    Works like ``swanlab_profiling_context`` but first asks
    ``config.should_log(func_name, duration)`` whether the measurement should
    be reported. That check runs on every exit, before the SwanLab flag on the
    trainer is looked at.

    Example usage:
        >>> config = ProfilingConfig(min_duration_ms=1.0, log_interval=10)
        >>> with swanlab_profiling_context_advanced(self, "forward", config):
        ...     output = model(inputs)

    Args:
        trainer: Object whose optional ``cfg.use_swanlab`` flag enables logging.
        func_name: Name of the function being profiled.
        config: Profiling configuration; ``DEFAULT_PROFILING_CONFIG`` when None.

    Yields:
        None
    """
    active_config = DEFAULT_PROFILING_CONFIG if config is None else config
    started = time.perf_counter()
    try:
        yield
    finally:
        elapsed = time.perf_counter() - started
        # `and` short-circuits: should_log is consulted first, the trainer flag
        # only when the measurement passed the config filters.
        if active_config.should_log(func_name, elapsed) and getattr(
            getattr(trainer, "cfg", None), "use_swanlab", False
        ):
            try:
                import swanlab

                if swanlab.get_run() is not None:
                    owner = trainer.__class__.__name__
                    swanlab.log(
                        {f"profiling/Time taken: {owner}.{func_name}": elapsed}
                    )
            except ImportError:
                # SwanLab is not installed: nothing to report.
                pass
            except Exception as err:  # pylint: disable=broad-except
                # Profiling must never break the profiled code.
                LOG.debug(f"Failed to log profiling metric for {func_name}: {err}")
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -26,6 +26,7 @@ from torch.distributed import DeviceMesh
 from transformers import (
    AutoModelForCausalLM,
    AutoModelForImageTextToText,
    AutoModelForVision2Seq,
    AwqConfig,
    BitsAndBytesConfig,
    GPTQConfig,
@@ -225,7 +226,6 @@ class ModelLoader:
        ):
            self.model = self.model.merge_and_unload()
        self._configure_experts_implementation()
        self._apply_activation_checkpointing()
        self._resize_token_embeddings()
        self._adjust_model_config()
@@ -233,10 +233,6 @@ class ModelLoader:
        self._configure_qat()
        log_gpu_memory_usage(LOG, "Memory usage after model load", 0)
    def _configure_experts_implementation(self):
        if self.cfg.experts_implementation is not None:
            self.model.set_experts_implementation(self.cfg.experts_implementation)
    def _apply_activation_checkpointing(self):
        if self.cfg.activation_offloading is True:
            from axolotl.core.trainers.mixins.activation_checkpointing import (
@@ -438,7 +434,7 @@ class ModelLoader:
        """
        if self.cfg.is_multimodal:
            self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
-                self.model_config.model_type, AutoModelForImageTextToText
+                self.model_config.model_type, AutoModelForVision2Seq
            )
            if isinstance(self.auto_model_loader, str):
                self.auto_model_loader = AutoModelForImageTextToText
@@ -480,7 +476,6 @@ class ModelLoader:
            max_memory = None
        self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
        self.model_kwargs["dtype"] = self.cfg.torch_dtype
        is_ds_zero3 = is_deepspeed_zero3_enabled()
@@ -675,7 +670,7 @@ class ModelLoader:
        Uses the selected loader when provided; otherwise falls back to the auto loader.
        """
        loader = model_loader_class or self.auto_model_loader
-        if loader in [AutoModelForCausalLM, AutoModelForImageTextToText]:
+        if loader in [AutoModelForCausalLM, AutoModelForVision2Seq]:
            model = loader.from_config(
                config=self.model_config,
                trust_remote_code=self.cfg.trust_remote_code or False,
@@ -793,7 +788,6 @@ class ModelLoader:
                # Use auto model loader (handles gptq and default cases)
                model_loader_class = self.auto_model_loader
            self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
            if self.cfg.reinit_weights:
                self.model = self._load_model_from_config(model_loader_class)
            else:
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -138,7 +138,6 @@ class PatchManager:
        self._apply_llama_flash_attn_patches(model)
        self._apply_unsloth_patches(model)
        self._apply_lora_kernel_patch(model)
        self._apply_scaling_softmax_patch(model)
    def _apply_flash_attention_patches(self):
        """Apply patches related to Flash Attention."""
@@ -220,6 +219,13 @@ class PatchManager:
            patch_qwen3_next_modeling_packing()
        if self.cfg.model_config_type == "mistral3" and self.cfg.processor_type:
            from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import (
                apply_mistral_tokenizer_image_patch,
            )
            apply_mistral_tokenizer_image_patch()
        if self.cfg.model_config_type == "kimi_linear":
            from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
                patch_kimi_model,
@@ -554,16 +560,3 @@ class PatchManager:
            )
            patch_apertus_xielu_activation()
    def _apply_scaling_softmax_patch(self, model: PreTrainedModel):
        """Apply Scaling Softmax (SSMax) patch.  Ref: https://arxiv.org/abs/2501.19399

        Does nothing unless ``cfg.scaling_softmax`` is truthy. The scaling
        factor falls back to 0.43 and the bias to 0.0 only when the respective
        config value is ``None``.

        Args:
            model: Model forwarded to ``patch_scaled_softmax_attention``.
        """
        if not self.cfg.scaling_softmax:
            return

        from axolotl.monkeypatch.scaled_softmax_attn import (
            patch_scaled_softmax_attention,
        )

        factor = self.cfg.scaling_softmax_factor
        bias = self.cfg.scaling_softmax_bias
        patch_scaled_softmax_attention(
            # `is None` instead of `or`: an explicitly configured 0 / 0.0 is a
            # real value and must not be replaced by the default.
            scaling_factor_init=0.43 if factor is None else factor,
            bias=0.0 if bias is None else bias,
            model=model,
        )
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -31,7 +31,7 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
            from axolotl.utils.mistral import HFMistralTokenizer
-            tokenization_mistral_common.MistralCommonBackend = HFMistralTokenizer
+            tokenization_mistral_common.MistralCommonTokenizer = HFMistralTokenizer
        _patch_mistralcommontokenizer()
--- a/src/axolotl/loaders/utils.py
+++ b/src/axolotl/loaders/utils.py
@@ -5,7 +5,6 @@ from typing import Type
 import addict
 import torch
 import transformers
 from transformers import AutoConfig, PretrainedConfig, PreTrainedModel
 from axolotl.utils.dict import DictDefault
@@ -154,9 +153,6 @@ def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
    This function determines the appropriate model config source, loads it, applies any
    necessary overrides, and validates it for compatibility with the `axolotl` config.
    If `cfg.cls_model_config` is set, a custom config class from transformers will be
    used instead of `AutoConfig` (e.g., 'LlamaConfig', 'MistralConfig').
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
@@ -178,13 +174,8 @@ def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
    if cfg.num_labels:
        # num_labels is used to initialize classifier models
        config_kwargs["num_labels"] = cfg.num_labels
    config_cls = AutoConfig
    if cfg.cls_model_config:
        config_cls = getattr(transformers, cfg.cls_model_config)
    try:
-        model_config = config_cls.from_pretrained(
+        model_config = AutoConfig.from_pretrained(
            model_config_name,
            trust_remote_code=trust_remote_code,
            **config_kwargs,
--- a/src/axolotl/models/mamba/modeling_mamba.py
+++ b/src/axolotl/models/mamba/modeling_mamba.py
@@ -111,6 +111,7 @@ class MambaLMHeadModel(nn.Module, GenerationMixin):
        self,
        save_directory: Union[str, os.PathLike],
        state_dict: Optional[dict] = None,
        safe_serialization: Optional[bool] = None,
    ):
        if state_dict is None:
            state_dict = self.state_dict()
--- a/src/axolotl/monkeypatch/loss/eaft.py
+++ b/src/axolotl/monkeypatch/loss/eaft.py
@@ -1,51 +0,0 @@
 """
 eaft (entropy-aware focal training) loss implementation
 weights examples by entropy approximation from top-k logits
 Reference: https://github.com/ymxyll/LlamaFactory-EAFT/blob/e2ce19e8efcc226450ee8f2b81dfe4e69f1f945d/src/llamafactory/train/trainer_utils.py
 """
 import torch
 import torch.nn.functional as F
 def eaft_loss(outputs, labels, num_items_in_batch=None, alpha=1.0, k=20):
     """
     entropy-weighted next-token cross-entropy (eaft)

     each non-ignored token's cross-entropy is multiplied by
     ``entropy ** alpha``, where the entropy is taken over a softmax of that
     position's top-k logits; the weights are computed under ``torch.no_grad``

     args:
         outputs: model outputs; only ``outputs.logits`` is read
         labels: target labels; positions equal to -100 are excluded
         num_items_in_batch: when given, the weighted sum is divided by it
             instead of taking the mean
         alpha: exponent for entropy weighting (default 1.0)
         k: number of top logits for entropy approximation (default 20),
             capped at the vocabulary size
     """
     vocab_size = outputs.logits.size(-1)

     # shift so that position t is scored against label t + 1, then flatten
     flat_logits = outputs.logits[..., :-1, :].contiguous().view(-1, vocab_size)
     flat_labels = labels[..., 1:].contiguous().view(-1)

     keep = flat_labels != -100
     kept_logits = flat_logits[keep]
     kept_labels = flat_labels[keep]

     with torch.no_grad():
         top_logits = torch.topk(
             kept_logits.float(), k=min(k, vocab_size), dim=-1
         ).values
         top_probs = F.softmax(top_logits, dim=-1)
         # small epsilon keeps log() finite for zero probabilities
         entropy = -(top_probs * torch.log(top_probs + 1e-10)).sum(dim=-1)
         token_weights = torch.pow(entropy, alpha)

     token_loss = F.cross_entropy(kept_logits, kept_labels, reduction="none")
     weighted = token_loss * token_weights

     if num_items_in_batch is None:
         return weighted.mean()
     return weighted.sum() / num_items_in_batch
--- a/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
+++ b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
@@ -1,5 +1,5 @@
 """
-Monkeypatch to fix inefficient tensor conversion in MistralCommonBackend.apply_chat_template
+Monkeypatch to fix inefficient tensor conversion in MistralCommonTokenizer.apply_chat_template
 """
 import importlib
@@ -12,11 +12,11 @@ LOG = get_logger(__name__)
 def apply_mistral_tokenizer_image_patch():
-    """Apply patch to MistralCommonBackend.apply_chat_template to fix image tensor conversion."""
+    """Apply patch to MistralCommonTokenizer.apply_chat_template to fix image tensor conversion."""
-    from transformers.tokenization_mistral_common import MistralCommonBackend
+    from transformers.tokenization_mistral_common import MistralCommonTokenizer
    # Get original source
-    original_source = inspect.getsource(MistralCommonBackend.apply_chat_template)
+    original_source = inspect.getsource(MistralCommonTokenizer.apply_chat_template)
    original_source, _ = detab_code(original_source)
    # Define the replacement
@@ -41,7 +41,7 @@ def apply_mistral_tokenizer_image_patch():
        )
        # Load necessary imports from the module
-        module_name = MistralCommonBackend.__module__
+        module_name = MistralCommonTokenizer.__module__
        module = importlib.import_module(module_name)
        # Detect what needs to be imported
@@ -79,7 +79,7 @@ def apply_mistral_tokenizer_image_patch():
        exec(patched_source, globals())  # nosec B102
        # Replace the method
-        MistralCommonBackend.apply_chat_template = patched_apply_chat_template
+        MistralCommonTokenizer.apply_chat_template = patched_apply_chat_template
-        LOG.info("Successfully applied MistralCommonBackend tensor conversion patch")
+        LOG.info("Successfully applied MistralCommonTokenizer tensor conversion patch")
    else:
-        LOG.warning("Could not find target code for MistralCommonBackend patching")
+        LOG.warning("Could not find target code for MistralCommonTokenizer patching")
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -155,6 +155,7 @@ class ReLoRACallback(TrainerCallback):
                    f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
                    "adapter",
                ),
                safe_serialization=True,
            )
            with torch.no_grad():
                merge_and_save(
@@ -213,7 +214,7 @@ class ReLoRACallback(TrainerCallback):
                    self.last_full_model = checkpoint_folder
            else:
-                model.model.save_pretrained(checkpoint_folder)
+                model.model.save_pretrained(checkpoint_folder, safe_serialization=True)
        return control
--- a/src/axolotl/monkeypatch/scaled_softmax_attn.py
+++ b/src/axolotl/monkeypatch/scaled_softmax_attn.py
@@ -1,141 +0,0 @@
 """
 Scaled Softmax (SSMax) attention patch using FlexAttention.
 SSMax:  softmax(scores * s * log(n) + b) where n is the position index
 Ref: https://arxiv.org/abs/2501.19399
 """
 import torch
 from transformers import PreTrainedModel
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 try:
    from torch.nn.attention.flex_attention import BlockMask
    from transformers.integrations.flex_attention import (
        compile_friendly_flex_attention,
        repeat_kv,
    )
    FLEX_ATTENTION_AVAILABLE = True
 except ImportError:
    FLEX_ATTENTION_AVAILABLE = False
    BlockMask = None
 _ssmax_config = {}
 def patch_scaled_softmax_attention(
     scaling_factor_init: float = 0.43, bias: float = 0.0, model: PreTrainedModel = None
 ):
     """Patch attention to apply SSMax via FlexAttention score_mod.

     Stores the SSMax parameters in the module-level ``_ssmax_config`` and
     replaces the ``"flex_attention"`` entry of transformers'
     ``ALL_ATTENTION_FUNCTIONS`` with ``ssmax_flex_attention_forward``. The
     function can be called repeatedly (e.g. to change the parameters): the
     function saved for ``unpatch_scaled_softmax_attention`` is never the
     SSMax forward itself.

     Args:
         scaling_factor_init: SSMax scaling factor ``s``.
         bias: SSMax bias ``b``.
         model: Accepted for the caller's convenience; not used by this function.

     Raises:
         RuntimeError: If the FlexAttention imports were not available.
     """
     if not FLEX_ATTENTION_AVAILABLE:
         raise RuntimeError("SSMax requires FlexAttention.")

     _ssmax_config["ssmax_s"] = scaling_factor_init
     _ssmax_config["ssmax_b"] = bias

     from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

     if "flex_attention" not in ALL_ATTENTION_FUNCTIONS:
         LOG.warning("flex_attention not found.  Ensure flex_attention:  true is set.")
         return

     current_fn = ALL_ATTENTION_FUNCTIONS["flex_attention"]
     # On a repeated call the registry already holds the SSMax forward. Saving
     # it as the "original" would make unpatching restore the patched function.
     if current_fn is not ssmax_flex_attention_forward:
         _ssmax_config["original_flex_fn"] = current_fn
     ALL_ATTENTION_FUNCTIONS["flex_attention"] = ssmax_flex_attention_forward
     LOG.info(
         f"Patched flex_attention with SSMax (s={scaling_factor_init}, b={bias})"
     )
 def ssmax_flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask,
    scaling: float | None = None,
    softcap: float | None = None,
    **kwargs,
 ) -> tuple[torch.Tensor, torch.Tensor | None]:
    """FlexAttention forward with SSMax:  score * (s * log(n) + b).

    ``s`` and ``b`` are read from the module-level ``_ssmax_config`` (defaults
    0.43 and 0.0). The scaling is applied inside a ``score_mod`` closure that
    is handed to ``compile_friendly_flex_attention``.

    Args:
        module: Attention module; only ``module.training`` is read.
        query: Query tensor.
        key: Key tensor.
        value: Value tensor.
        attention_mask: Either a ``BlockMask`` (passed on as ``block_mask``) or
            a tensor that is added to the scores inside ``score_mod``.
        scaling: Forwarded as ``scale`` to the flex attention call.
        softcap: If given, scores are soft-capped as ``softcap * tanh(score / softcap)``.
        **kwargs: ``dropout``, ``position_ids`` and ``kernel_options`` are read.

    Returns:
        Tuple of the attention output (dims 1 and 2 swapped, made contiguous)
        and the log-sum-exp cast to ``value.dtype``, or ``None`` for the second
        element when ``query`` lives on CPU.

    Raises:
        ValueError: If a ``dropout`` greater than 0 is passed in ``kwargs``.
    """
    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError("flex_attention does not support dropout")
    ssmax_s = _ssmax_config.get("ssmax_s", 0.43)
    ssmax_b = _ssmax_config.get("ssmax_b", 0.0)
    position_ids = kwargs.get("position_ids", None)
    # Flattened so that score_mod can look a position up with a single index.
    position_ids_flat = position_ids.view(-1) if position_ids is not None else None
    # A BlockMask goes to flex attention directly; anything else is treated as
    # an additive mask applied inside score_mod.
    block_mask = attention_mask if isinstance(attention_mask, BlockMask) else None
    score_mask = None if block_mask else attention_mask
    if score_mask is not None:
        # Trim the last mask dimension to key.shape[-2].
        score_mask = score_mask[:, :, :, : key.shape[-2]]
    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        """
        Apply SSMax scaling:  score * (s * log(n) + b)
        where n is the query position plus one, taken from ``position_ids``
        when they were passed and from ``q_idx`` otherwise.
        """
        if position_ids_flat is not None:
            # NOTE(review): indexed by q_idx only, not batch_idx. If position_ids
            # has more than one row, this reads just the leading entries of the
            # flattened tensor -- confirm callers use a single row here.
            relative_pos = position_ids_flat[q_idx]
            n = (relative_pos + 1).float()
        else:
            n = (q_idx + 1).float()
        # Clamp to 2 so that log(n) is never below log(2), i.e. never zero.
        n = torch.clamp(n, min=2.0)
        ssmax_scale = ssmax_s * torch.log(n) + ssmax_b
        score = score * ssmax_scale
        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)
        if score_mask is not None:
            # Index 0 on the second mask dimension: the same mask row is used
            # for every head_idx.
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]
        return score
    enable_gqa = True
    # (x & (x - 1)) != 0 is true when query.shape[1] is not a power of two
    # (presumably the number of query heads -- confirm layout). In that case
    # key/value are expanded with repeat_kv and enable_gqa is turned off.
    if (query.shape[1] & (query.shape[1] - 1)) != 0:
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False
    # The log-sum-exp is only requested when query is not on CPU.
    return_lse = query.device.type != "cpu"
    flex_output = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kwargs.get("kernel_options"),
        return_lse=return_lse,
        training=module.training,
    )
    if return_lse:
        attention_output, lse = flex_output
        lse = lse.to(value.dtype)
    else:
        attention_output, lse = flex_output, None
    return attention_output.transpose(1, 2).contiguous(), lse
 def unpatch_scaled_softmax_attention():
     """Reinstate the FlexAttention function saved by the SSMax patch.

     Does nothing when no original function is recorded in ``_ssmax_config``.
     After restoring, the whole ``_ssmax_config`` dict is cleared.
     """
     from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

     if "original_flex_fn" not in _ssmax_config:
         # Never patched (or already unpatched): leave the registry untouched.
         return

     ALL_ATTENTION_FUNCTIONS["flex_attention"] = _ssmax_config["original_flex_fn"]
     _ssmax_config.clear()
     LOG.info("Unpatched flex_attention, restored original")
--- a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
@@ -52,15 +52,9 @@ def patch_prepare_context_parallel_inputs() -> None:
        if item in patched_source:
            items_to_import.append(item)
-    # Use a separate namespace to capture the exec'd function
+    exec(f"from {module_name} import ({', '.join(items_to_import)})", globals())
-    namespace = {}
+    exec(patched_source, globals())
    exec(f"from {module_name} import ({', '.join(items_to_import)})", namespace)
    exec(patched_source, namespace)
    # Explicitly get the function from the namespace
    axolotl_prepare_context_parallel_inputs = namespace[
        "axolotl_prepare_context_parallel_inputs"
    ]
    Trainer._original_prepare_context_parallel_inputs = (
        Trainer._prepare_context_parallel_inputs
    )
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -14,6 +14,7 @@ from transformers.models.voxtral import VoxtralProcessor
 from axolotl.utils.dict import remove_none_values
 from axolotl.utils.logging import get_logger
 from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
 LOG = get_logger(__name__)
@@ -429,7 +430,7 @@ class Mistral3ProcessingStrategy(ProcessingStrategy):
    def __init__(
        self,
-        processor,
+        processor: Mistral3Processor,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
@@ -492,8 +493,6 @@ def get_processing_strategy(
    image_size: int | tuple[int, int] | None = None,
    image_resize_algorithm: Resampling | None = None,
 ):
    from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
    processing_kwargs = {
        "processor": processor,
        "chat_template": chat_template,
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -150,8 +150,6 @@ class ChatTemplatePrompter(Prompter):
        return self.tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=False,
            **chat_template_kwargs,
        )
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -135,13 +135,16 @@ def setup_reference_model(
    return model_ref
-def setup_signal_handler(cfg: DictDefault, model: PreTrainedModel):
+def setup_signal_handler(
    cfg: DictDefault, model: PreTrainedModel, safe_serialization: bool
 ):
    """
    Set up signal handler for graceful termination.
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        model: The model to save on termination
        safe_serialization: Whether to use safe serialization when saving
    """
    # ray workers don't have access to this signal
    if cfg.local_rank == 0 and not cfg.use_ray:
@@ -149,7 +152,9 @@ def setup_signal_handler(cfg: DictDefault, model: PreTrainedModel):
        def terminate_handler(_, __, model_weakref):
            if model_weakref() is not None:
                _model = model_weakref()
-                _model.save_pretrained(cfg.output_dir)
+                _model.save_pretrained(
                    cfg.output_dir, safe_serialization=safe_serialization
                )
            cleanup_distributed()
            sys.exit(0)
@@ -214,6 +219,7 @@ def save_trained_model(
    cfg: DictDefault,
    trainer: Any,
    model: PreTrainedModel,
    safe_serialization: bool,
 ):
    """
    Save the trained model according to configuration and training setup.
@@ -222,6 +228,7 @@ def save_trained_model(
        cfg: Dictionary mapping `axolotl` config keys to values.
        trainer: The trainer object.
        model: The trained model to save.
        safe_serialization: Whether to use safe serialization.
    """
    LOG.info(f"Training completed! Saving trained model to {cfg.output_dir}.")
@@ -276,6 +283,7 @@ def save_trained_model(
                merge_fsdp_weights(
                    checkpoint_dir=str(fsdp_dir),
                    output_path=merged_path,
                    safe_serialization=True,
                )
                trainer.accelerator.wait_for_everyone()
                if trainer.accelerator.is_main_process:
@@ -322,9 +330,11 @@ def save_trained_model(
                pass
    elif cfg.local_rank == 0:
        if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
-            trainer.model.save_pretrained(cfg.output_dir)
+            trainer.model.save_pretrained(
                cfg.output_dir, safe_serialization=safe_serialization
            )
-        model.save_pretrained(cfg.output_dir)
+        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
        # TODO: add integration support so this can be implemented completely within the plugin
@@ -334,6 +344,7 @@ def save_trained_model(
            model=model,
            output_dir=cfg.output_dir,
            trainer=trainer,
            safe_serialization=safe_serialization,
            save_compressed=cfg.llmcompressor.save_compressed,
        )
@@ -438,6 +449,7 @@ def handle_untrained_tokens_fix(
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    train_dataset: Dataset,
    safe_serialization: bool,
 ):
    """
    Apply fixes for untrained tokens if configured.
@@ -447,6 +459,7 @@ def handle_untrained_tokens_fix(
        model: The model to apply fixes to.
        tokenizer: The tokenizer for token identification.
        train_dataset: The training dataset to use.
        safe_serialization: Whether to use safe serialization when saving.
    """
    if not cfg.fix_untrained_tokens:
        return
@@ -470,7 +483,9 @@ def handle_untrained_tokens_fix(
    fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)
    if cfg.local_rank == 0:
-        model.save_pretrained(str(Path(cfg.output_dir)))
+        model.save_pretrained(
            str(Path(cfg.output_dir)), safe_serialization=safe_serialization
        )
 def setup_model_and_trainer(
@@ -567,12 +582,15 @@ def train(
    ) = setup_model_and_trainer(cfg, dataset_meta)
    # Handle untrained tokens if configured
    safe_serialization = cfg.save_safetensors is True
    train_dataset = dataset_meta.train_dataset
-    handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)
+    handle_untrained_tokens_fix(
        cfg, model, tokenizer, train_dataset, safe_serialization
    )
    # Additional setup
    save_initial_configs(cfg, tokenizer, model, peft_config, processor)
-    setup_signal_handler(cfg, model)
+    setup_signal_handler(cfg, model, safe_serialization)
    setup_model_card(cfg)
    # Execute the training
@@ -584,7 +602,7 @@ def train(
        torch.cuda.empty_cache()
    # Save the trained model and cleanup
-    save_trained_model(cfg, trainer, model)
+    save_trained_model(cfg, trainer, model, safe_serialization)
    tokenizer.save_pretrained(
        str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files
    )
--- a/src/axolotl/utils/callbacks/perplexity.py
+++ b/src/axolotl/utils/callbacks/perplexity.py
@@ -7,11 +7,7 @@ from torch import Tensor
 from tqdm import tqdm
 from transformers.modeling_outputs import CausalLMOutput
 from transformers.modeling_utils import PreTrainedModel
-
+from transformers.tokenization_utils import PreTrainedTokenizer
 try:
    from transformers.tokenization_python import PreTrainedTokenizer
 except ImportError:
    from transformers.tokenization_utils import PreTrainedTokenizer
 from axolotl.utils.distributed import is_main_process
--- a/src/axolotl/utils/callbacks/swanlab.py
+++ b/src/axolotl/utils/callbacks/swanlab.py
@@ -1,248 +0,0 @@
 """Callbacks for SwanLab integration"""
 from __future__ import annotations
 import json
 import os
 from shutil import copyfile
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING
 from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
 )
 from axolotl.utils.logging import get_logger
 if TYPE_CHECKING:
    from axolotl.core.training_args import AxolotlTrainingArguments
 LOG = get_logger(__name__)
 class CustomSwanLabCallback(TrainerCallback):
    """
    Minimal metrics logger that talks to the `swanlab` module directly rather
    than going through SwanLab's transformers integration (which requires
    omegaconf). This avoids the antlr4 version conflict between omegaconf and
    axolotl.
    """
    def __init__(self):
        # `swanlab` holds the lazily imported module; `_initialized` flips to
        # True only once an active SwanLab run has been observed.
        self.swanlab = None
        self._initialized = False
    def setup(self):
        """Import SwanLab on first use and mark the callback ready if a run is active."""
        if self._initialized:
            return
        try:
            import swanlab
            self.swanlab = swanlab
            # Without an active run the callback stays uninitialized, so a
            # later call to setup() checks again.
            if swanlab.get_run() is not None:
                self._initialized = True
                LOG.info("CustomSwanLabCallback initialized successfully")
            else:
                LOG.warning("SwanLab run is not initialized")
        except ImportError:
            LOG.error("SwanLab is not installed")
    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Record selected training arguments in the SwanLab run config.

        Only acts on the world-process-zero rank. Any failure while updating
        the config is logged as a warning instead of being raised.
        """
        if state.is_world_process_zero:
            self.setup()
            if self._initialized:
                try:
                    # (SwanLab config key, TrainingArguments attribute) pairs.
                    hyperparams = {
                        config_key: getattr(args, arg_name)
                        for config_key, arg_name in (
                            ("train_batch_size", "per_device_train_batch_size"),
                            ("eval_batch_size", "per_device_eval_batch_size"),
                            ("learning_rate", "learning_rate"),
                            ("num_train_epochs", "num_train_epochs"),
                            ("max_steps", "max_steps"),
                            ("warmup_steps", "warmup_steps"),
                            ("logging_steps", "logging_steps"),
                            ("save_steps", "save_steps"),
                            (
                                "gradient_accumulation_steps",
                                "gradient_accumulation_steps",
                            ),
                        )
                    }
                    self.swanlab.config.update(hyperparams)
                    LOG.debug("Training configuration logged to SwanLab")
                except Exception as err:
                    LOG.warning(f"Failed to log training config: {err}")
        return control
    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs=None,
        **kwargs,
    ):
        """Forward the int/float entries of `logs` to SwanLab at the current global step.

        Only acts on the world-process-zero rank. If the callback is not yet
        initialized, setup() is attempted again before giving up.
        """
        if not state.is_world_process_zero:
            return control
        if not self._initialized:
            self.setup()
        ready = self._initialized and logs is not None
        if ready:
            try:
                # Keep only int/float values; everything else is dropped.
                numeric_logs = {
                    name: val
                    for name, val in logs.items()
                    if isinstance(val, (int, float))
                }
                if numeric_logs and state.global_step is not None:
                    self.swanlab.log(numeric_logs, step=state.global_step)
            except Exception as err:
                LOG.warning(f"Failed to log metrics to SwanLab: {err}")
        return control
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Emit a closing log message on the world-process-zero rank."""
        if state.is_world_process_zero and self._initialized:
            LOG.info("Training completed. SwanLab logs are available.")
        return control
 class SaveAxolotlConfigtoSwanLabCallback(TrainerCallback):
    """Callback to save axolotl config (and DeepSpeed config, if set) to SwanLab"""
    def __init__(self, axolotl_config_path):
        # Path of the axolotl YAML config that is uploaded at train start.
        self.axolotl_config_path = axolotl_config_path
    def on_train_begin(
        self,
        args: AxolotlTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Upload the axolotl config, and the DeepSpeed config when present, as SwanLab text.

        Only acts on the world-process-zero rank. Each config is staged in a
        temporary file that is always removed afterwards, including when the
        copy or the upload fails.

        Returns:
            The unchanged `control` object.
        """
        if state.is_world_process_zero:
            try:
                import swanlab
                # Check if SwanLab is initialized
                if swanlab.get_run() is None:
                    LOG.warning(
                        "SwanLab run is not initialized. Please initialize SwanLab before training."
                    )
                    return control
                # Reserve a temp path and close the handle right away so the
                # file can be rewritten and deleted on every platform.
                with NamedTemporaryFile(
                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                ) as temp_file:
                    temp_path = temp_file.name
                try:
                    copyfile(self.axolotl_config_path, temp_path)
                    # Log config file to SwanLab
                    with open(temp_path, "r", encoding="utf-8") as config_file:
                        swanlab.log(
                            {
                                "axolotl_config": swanlab.Text(
                                    config_file.read(), caption="Axolotl Config"
                                )
                            }
                        )
                    LOG.info(
                        "The Axolotl config has been saved to the SwanLab run under logs."
                    )
                finally:
                    # Clean up temp file even when the copy or upload failed
                    if os.path.exists(temp_path):
                        os.unlink(temp_path)
            except ImportError:
                LOG.warning(
                    "SwanLab is not installed. Install it with: pip install swanlab"
                )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to SwanLab: {err}")
            # Log DeepSpeed config if available
            if args.deepspeed:
                try:
                    import swanlab
                    with NamedTemporaryFile(
                        mode="w",
                        delete=False,
                        suffix=".json",
                        prefix="deepspeed_config_",
                    ) as temp_file:
                        temp_path = temp_file.name
                    try:
                        skip_upload = False
                        if isinstance(args.deepspeed, dict):
                            # In-memory config: serialize it to the temp file.
                            with open(temp_path, "w", encoding="utf-8") as ds_out:
                                json.dump(args.deepspeed, ds_out, indent=4)
                        elif isinstance(args.deepspeed, str) and os.path.exists(
                            args.deepspeed
                        ):
                            # Config given as a path to an existing file.
                            copyfile(args.deepspeed, temp_path)
                        else:
                            skip_upload = True
                        if not skip_upload:
                            with open(
                                temp_path, "r", encoding="utf-8"
                            ) as ds_config_file:
                                swanlab.log(
                                    {
                                        "deepspeed_config": swanlab.Text(
                                            ds_config_file.read(),
                                            caption="DeepSpeed Config",
                                        )
                                    }
                                )
                            LOG.info(
                                "The DeepSpeed config has been saved to the SwanLab run under logs."
                            )
                    finally:
                        # Clean up temp file even when the copy or upload failed
                        if os.path.exists(temp_path):
                            os.unlink(temp_path)
                except (FileNotFoundError, ConnectionError) as err:
                    LOG.warning(
                        f"Error while saving DeepSpeed config to SwanLab: {err}"
                    )
                except ImportError:
                    pass
        return control
--- a/src/axolotl/utils/callbacks/tokens_per_second.py
+++ b/src/axolotl/utils/callbacks/tokens_per_second.py
@@ -101,3 +101,9 @@ class TokensPerSecondCallback(TrainerCallback):
        # Clear per-step tokens after logging
        if tokens and "trainable_tokens" in tokens:
            tokens["trainable_tokens"] = torch.zeros_like(tokens["trainable_tokens"])
        if tokens and "total" in tokens:
            logs["tokens/total"] = tokens["total"].item()
        if tokens and "trainable" in tokens:
            logs["tokens/trainable"] = tokens["trainable"].item()
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -173,7 +173,7 @@ def _drop_long_sequences(
        return (len_prompt + len_completion) <= sequence_len
-    if rl in {RLType.GRPO, RLType.GDPO}:
+    if rl is RLType.GRPO:
        return True
    raise ValueError("Unknown RL type")
--- a/src/axolotl/utils/mistral/mistral_tokenizer.py
+++ b/src/axolotl/utils/mistral/mistral_tokenizer.py
@@ -7,11 +7,11 @@ import numpy as np
 from mistral_common.protocol.instruct.validator import ValidationMode
 from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub
 from torch import Tensor
-from transformers.tokenization_mistral_common import MistralCommonBackend
+from transformers.tokenization_mistral_common import MistralCommonTokenizer
 from transformers.tokenization_utils_base import VERY_LARGE_INTEGER
-class HFMistralTokenizer(MistralCommonBackend):
+class HFMistralTokenizer(MistralCommonTokenizer):
    """
    Wraps mistral_common.tokens.tokenizers.mistral.MistralTokenizer
    and exposes HuggingFace API for special tokens.
@@ -37,19 +37,11 @@ class HFMistralTokenizer(MistralCommonBackend):
    def name_or_path(self) -> str:
        return self._name_or_path
    @name_or_path.setter
    def name_or_path(self, name_or_path: str) -> None:
        self._name_or_path = name_or_path
    @property
    def chat_template(self) -> str | None:
        """Chat template is not supported. Dummy method to satisfy HuggingFace API."""
        return "[This is a dummy chat template]"
    @chat_template.setter
    def chat_template(self, chat_template: str | None) -> None:
        pass
    def _set_mode(self, mode: ValidationMode):
        """Set the mode of the MistralRequestValidator.
@@ -141,7 +133,7 @@ class HFMistralTokenizer(MistralCommonBackend):
        r"""
        Patched fn to pass `name_or_path` and remove extra kwargs.
-        Instantiate a `MistralCommonBackend` from a predefined
+        Instantiate a `MistralCommonTokenizer` from a predefined
        tokenizer.
        Args:
@@ -150,7 +142,7 @@ class HFMistralTokenizer(MistralCommonBackend):
                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing the tokenizer config, for instance saved
-                  using the [`MistralCommonBackend.tokenization_mistral_common.save_pretrained`] method, e.g.,
+                  using the [`MistralCommonTokenizer.tokenization_mistral_common.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
            mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
                Validation mode for the `MistralTokenizer` tokenizer.
@@ -162,7 +154,7 @@ class HFMistralTokenizer(MistralCommonBackend):
                exist.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `hf auth login` (stored in `~/.huggingface`).
+                when running `huggingface-cli login` (stored in `~/.huggingface`).
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether or not to only rely on local files and not to attempt to download any files.
            revision (`str`, *optional*, defaults to `"main"`):
@@ -187,12 +179,12 @@ class HFMistralTokenizer(MistralCommonBackend):
                Whether or not the model should cleanup the spaces that were added when splitting the input text during the
                tokenization process.
            kwargs (additional keyword arguments, *optional*):
-                Not supported by `MistralCommonBackend.from_pretrained`.
+                Not supported by `MistralCommonTokenizer.from_pretrained`.
                Will raise an error if used.
        """
        if init_inputs:
            raise ValueError(
-                "`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`."
+                "`init_inputs` are not supported by `MistralCommonTokenizer.from_pretrained`."
            )
        # Delete trust_remote_code as it does nothing
@@ -204,7 +196,7 @@ class HFMistralTokenizer(MistralCommonBackend):
        # Handle kwargs and AutoTokenizer case
        if kwargs and not kwargs.keys() == {"_from_auto"}:
            raise ValueError(
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.from_pretrained`."
            )
        if not os.path.isfile(pretrained_model_name_or_path):
--- a/src/axolotl/utils/quantization.py
+++ b/src/axolotl/utils/quantization.py
@@ -9,6 +9,10 @@ from torchao.quantization import quantize_
 from torchao.quantization.qat import (
    QATConfig,
 )
 from torchao.quantization.qat import fake_quantizer
 from torchao.quantization.qat.fake_quantizer import (
    Int4WeightFakeQuantizer as AoInt4WeightFakeQuantizer,
 )
 from torchao.quantization.quant_api import (
    Float8DynamicActivationFloat8WeightConfig,
    Float8DynamicActivationInt4WeightConfig,
@@ -17,6 +21,27 @@ from torchao.quantization.quant_api import (
 from axolotl.utils.schemas.enums import TorchAOQuantDType
 class Int4WeightFakeQuantizer(AoInt4WeightFakeQuantizer):
    """
    Int4 weight fake quantizer with an on/off switch.
    Restores the 'enabled' attribute (removed in torchao 0.15) so that fake
    quantization can be toggled, as needed by fake_quant_after_n_steps.
    """
    def __init__(self, config):
        super().__init__(config)
        # Fake quantization starts switched on; callers may flip this flag.
        self.enabled = True
    def forward(self, w: torch.Tensor) -> torch.Tensor:
        """Fake-quantize `w` when enabled; otherwise return it unchanged."""
        if self.enabled:
            return super().forward(w)
        return w
 # Replace the original Int4WeightFakeQuantizer in the fake_quantizer module
 # so that torchao's quantize_() function will use our version
 fake_quantizer.Int4WeightFakeQuantizer = Int4WeightFakeQuantizer
 quantization_config_to_str = {
    Int8DynamicActivationInt4WeightConfig: "int8int4",
    Float8DynamicActivationFloat8WeightConfig: "fp8fp8",
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -619,32 +619,6 @@ class AxolotlInputConfig(
        },
    )
    experts_implementation: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Which experts implementation to use for MoE models,"
        },
    )
    scaling_softmax: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399"
        },
    )
    scaling_softmax_factor: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Scaling factor for SSMax attention. Default is 0.43"
        },
    )
    scaling_softmax_bias: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better length generalization."
        },
    )
    unsloth_cross_entropy_loss: bool | None = None
    unsloth_lora_mlp: bool | None = None
    unsloth_lora_qkv: bool | None = None
@@ -683,24 +657,6 @@ class AxolotlInputConfig(
            "description": "Number of chunks to use for chunked cross entropy loss"
        },
    )
    use_eaft: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Enable Entropy-Aware Focal Training loss (EAFT)"
        },
    )
    eaft_alpha: float | None = Field(
        default=1.0,
        json_schema_extra={
            "description": "Exponent for entropy weighting in EAFT (default: 1.0)"
        },
    )
    eaft_k: int | None = Field(
        default=20,
        json_schema_extra={
            "description": "Number of top logits for entropy approximation (default: 20)"
        },
    )
    tiled_mlp: bool | None = Field(
        default=None,
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -26,7 +26,6 @@ class RLType(str, Enum):
    """RL trainer type configuration subset"""
    DPO = "dpo"
    GDPO = "gdpo"
    GRPO = "grpo"
    IPO = "ipo"
    ORPO = "orpo"
--- a/src/axolotl/utils/schemas/fsdp.py
+++ b/src/axolotl/utils/schemas/fsdp.py
@@ -4,7 +4,7 @@ FSDP Configuration Schema
 from typing import Literal
-from pydantic import AliasChoices, BaseModel, Field
+from pydantic import BaseModel, Field
 class FSDPConfig(BaseModel):
@@ -12,11 +12,6 @@ class FSDPConfig(BaseModel):
    FSDP Configuration Schema
    """
    fsdp_version: int | None = Field(
        validation_alias=AliasChoices("fsdp_version", "version"),
        default=None,
        json_schema_extra={"description": "FSDP version"},
    )
    activation_checkpointing: bool | None = Field(
        default=None,
        description="Enable activation checkpointing to reduce memory usage during forward passes",
--- a/Show More
+++ b/Show More