cpu offloading

fft
2023-12-31 22:17:43 +01:00 · 2023-12-31 12:02:29 +01:00 · 2023-12-31 07:42:46 +01:00 · 2023-12-31 07:40:33 +01:00 · 2023-12-30 22:36:50 +01:00 · 2023-12-30 21:36:01 +01:00
1225 changed files with 13576 additions and 194858 deletions
--- a/.axolotl-complete.bash
+++ b/.axolotl-complete.bash
@@ -1,41 +0,0 @@
-#!/bin/bash
-
-_axolotl_completions() {
-    local cur prev
-    COMPREPLY=()
-    cur="${COMP_WORDS[COMP_CWORD]}"
-    prev="${COMP_WORDS[COMP_CWORD-1]}"
-
-    # If we're completing the first argument (the command)
-    if [[ $COMP_CWORD -eq 1 ]]; then
-        mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur")
-        return 0
-    fi
-
-    # Commands that should complete with directories and YAML files
-    local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train")
-
-    # Check if previous word is in our list
-    if [[ " ${yaml_commands[*]} " =~ (^|[[:space:]])$prev($|[[:space:]]) ]]; then
-        # Use filename completion which handles directories properly
-        compopt -o filenames
-        mapfile -t COMPREPLY < <(compgen -f -- "$cur")
-
-        # Filter to only include directories and YAML files
-        local -a filtered=()
-        for item in "${COMPREPLY[@]}"; do
-            if [[ -d "$item" ]] || [[ "$item" == *.yaml ]] || [[ "$item" == *.yml ]]; then
-                filtered+=("$item")
-            fi
-        done
-        COMPREPLY=("${filtered[@]}")
-
-        return 0
-    fi
-
-    # Default: no completion
-    return 0
-}
-
-# Remove the -o nospace option - let filenames handle it
-complete -F _axolotl_completions axolotl
--- a/.bandit
+++ b/.bandit
@@ -1,3 +1,3 @@
 [bandit]
 exclude = tests
-skips = B101,B615,B102,B110
+skips = B101
--- a/.coderabbit.yaml
+++ b/.coderabbit.yaml
@@ -1,17 +0,0 @@
-# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
-language: "en-US"
-early_access: false
-reviews:
-  profile: "chill"
-  request_changes_workflow: false
-  high_level_summary: true
-  review_status: true
-  collapse_walkthrough: true
-  poem: false
-  sequence_diagrams: false
-  auto_review:
-    enabled: true
-    drafts: false
-    auto_incremental_review: false
-chat:
-  auto_reply: true
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,14 +0,0 @@
-[run]
-source = axolotl
-omit =
-    */tests/*
-    setup.py
-
-[report]
-exclude_lines =
-    pragma: no cover
-    def __repr__
-    raise NotImplementedError
-    if __name__ == .__main__.:
-    pass
-    raise ImportError
--- a/.flake8
+++ b/.flake8
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 88
+
+select = C,E,F,W,B,B950
+extend-ignore = E203, E501, W503
--- a/.github/CONTRIBUTING.md
+++ b/.github/CONTRIBUTING.md
@@ -15,27 +15,23 @@ First of all, thank you for your interest in contributing to axolotl! We appreci
  - [Commit Messages](#commit-messages)
 - [Additional Resources](#additional-resources)

-## Code of Conduct
+## Code of Conduct

 All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md). Please read it before participating in the axolotl community.

 ## Getting Started

-Bugs? Please check for open issue else create a new [Issue](https://github.com/axolotl-ai-cloud/axolotl/issues/new).
+Bugs? Please check for an existing open issue; otherwise, create a new [Issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues/new).

 PRs are **greatly welcome**!

 1. Fork the repository and clone it to your local machine.
-2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
+2. Set up the development environment by following the instructions in the [README.md](https://github.com/OpenAccess-AI-Collective/axolotl/tree/main/README.md) file.
 3. Explore the codebase, run tests, and verify that everything works as expected.

 Please run below to setup env
 ```bash
-# Install axolotl + dev and test dependencies
-export UV_TORCH_BACKEND=cu128  # or cu130
-uv venv --no-project --relocatable
-source .venv/bin/activate
-uv pip install --no-build-isolation -e '.[deepspeed]' --group dev --group test
+pip3 install -r requirements-dev.txt -r requirements-tests.txt
 pre-commit install

 # test
@@ -46,11 +42,11 @@ pytest tests/

 ### Reporting Bugs

-If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.
+If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.

 ### Suggesting Enhancements

-We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.
+We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/OpenAccess-AI-Collective/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.

 ### Submitting Pull Requests

@@ -61,23 +57,11 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o
 5. Push your branch to your fork on GitHub.
 6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.

-#### Skipping CI Checks
-
-You can skip certain CI checks by including specific keywords in your commit messages:
-
- `[skip ci]` or `skip ci` - Skips all CI checks for that commit
- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR.
-
 ## Style Guidelines

 ### Code Style

-axolotl uses [Ruff](https://docs.astral.sh/ruff/) as its code style guide. Please ensure that your code follows these guidelines.
-
-Use the pre-commit linter to ensure that your code is formatted consistently.
-```bash
-pre-commit run --all-files
-```
+axolotl uses [{codestyle}]({URLofCodestyle}) as its code style guide. Please ensure that your code follows these guidelines.

 ### Commit Messages

@@ -87,6 +71,6 @@ Write clear and concise commit messages that briefly describe the changes made i

 - [GitHub Help](https://help.github.com/)
 - [GitHub Pull Request Documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
- [Ruff](https://docs.astral.sh/ruff/)
+- [{codestyle}]({URLofCodestyle})

 Thank you once again for your interest in contributing to axolotl. We look forward to collaborating with you and creating an even better project together!
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,6 +1,6 @@
 # These are supported funding model platforms

-github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 patreon: # Replace with a single Patreon username
 open_collective: # Replace with a single Open Collective username
 ko_fi: # Replace with a single Ko-fi username
--- a/.github/ISSUE_TEMPLATE/bug-report.yaml
+++ b/.github/ISSUE_TEMPLATE/bug-report.yaml
@@ -15,7 +15,7 @@ body:
      label: "Please check that this issue hasn't been reported before."
      description: "The **Label filters** may help make your search more focussed."
      options:
-        - label: "I searched previous [Bug Reports](https://github.com/axolotl-ai-cloud/axolotl/labels/bug) didn't find any similar reports."
+        - label: "I searched previous [Bug Reports](https://github.com/OpenAccess-AI-Collective/axolotl/labels/bug) didn't find any similar reports."
          required: true

  - type: textarea
@@ -59,7 +59,6 @@ body:
      label: Config yaml
      description: |
        Please attach the config yaml!
-      render: yaml

  - type: textarea
    id: possible-solution
--- a/.github/ISSUE_TEMPLATE/config.yml
+++ b/.github/ISSUE_TEMPLATE/config.yml
@@ -1,7 +1,7 @@
 blank_issues_enabled: false
 contact_links:
  - name: Ask a question
-    url: https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/q-a
+    url: https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/q-a
    about: Ask questions and discuss with other community members
  - name: Discuss the Project in Discord
    url: https://discord.gg/HhrNrHJPRb
--- a/.github/ISSUE_TEMPLATE/docs.yml
+++ b/.github/ISSUE_TEMPLATE/docs.yml
@@ -10,7 +10,7 @@ body:
      value: |
        * Ask questions in [Discord](https://discord.gg/HhrNrHJPRb).
        * Before you file an issue read the [Contributing guide](./CONTRIBUTING.md).
-        * Check to make sure someone hasn't already opened a [similar issue](https://github.com/axolotl-ai-cloud/axolotl/issues).
+        * Check to make sure someone hasn't already opened a [similar issue](https://github.com/OpenAccess-AI-Collective/axolotl/issues).
  - type: textarea
    attributes:
      label: What piece of documentation is affected?
--- a/.github/ISSUE_TEMPLATE/feature-request.yaml
+++ b/.github/ISSUE_TEMPLATE/feature-request.yaml
@@ -8,9 +8,9 @@ body:
      label: "⚠️ Please check that this feature request hasn't been suggested before."
      description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed."
      options:
-        - label: "I searched previous [Ideas in Discussions](https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
+        - label: "I searched previous [Ideas in Discussions](https://github.com/OpenAccess-AI-Collective/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
          required: true
-        - label: "I searched previous [Issues](https://github.com/axolotl-ai-cloud/axolotl/labels/enhancement) didn't find any similar feature requests."
+        - label: "I searched previous [Issues](https://github.com/OpenAccess-AI-Collective/axolotl/labels/enhancement) didn't find any similar feature requests."
          required: true

  - type: textarea
--- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template_simple.md
+++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template_simple.md
@@ -15,18 +15,8 @@
 <!--- Include details of your testing environment, tests ran to see how -->
 <!--- your change affects other areas of the code, etc. -->

-## AI Usage Disclaimer
-
-<!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
-<!--- Please indicate: No / Yes (specify which tool and to what extent) -->
-
 ## Screenshots (if appropriate)

 ## Types of changes

 <!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->
-
-## Social Handles (Optional)
-
-<!-- Thanks for submitting a bugfix or enhancement. -->
-<!-- We'd love to show our thanks to you on Twitter & Discord if you provide your handle -->
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -3,240 +3,63 @@ name: ci-cd-base
 on:
  push:
    branches:
-      - "main"
-    paths:
-      - 'docker/Dockerfile-base'
-      - 'docker/Dockerfile-uv-base'
-      - '.github/workflows/base.yml'
-  pull_request:
-    paths:
-      - 'docker/Dockerfile-base'
-      - 'docker/Dockerfile-uv-base'
-      - '.github/workflows/base.yml'
-  workflow_dispatch:
-
-permissions:
-  contents: read
+      - "main-base"
+      - "dev-base"

 jobs:
  build-base:
-    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
-    timeout-minutes: 480
+    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
-    runs-on: ubuntu-latest-m
-    env:
-      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
+    runs-on: self-hosted
    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: "129"
-#            cuda_version: 12.9.1
-#            cudnn_version: ""
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base"
-#            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: "128"
-#            cuda_version: 12.8.1
-#            cudnn_version: ""
-#            python_version: "3.11"
-#            pytorch: nightly
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base-nightly"
-#          # "next" is for release candidates of pytorch
-#          - cuda: "128"
-#            cuda_version: 12.8.1
-#            cudnn_version: ""
-#            python_version: "3.11"
-#            pytorch: next
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-base-next"
+          - cuda: "118"
+            cuda_version: 11.8.0
+            python_version: "3.9"
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+          - cuda: "118"
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.0.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+          - cuda: "118"
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.1.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
+          - cuda: "121"
+            cuda_version: 12.1.0
+            python_version: "3.10"
+            pytorch: 2.1.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
    steps:
      - name: Checkout
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3
      - name: Docker metadata
        id: metadata
-        uses: docker/metadata-action@v5
+        uses: docker/metadata-action@v3
        with:
-          images: |
-            axolotlai/axolotl-base
+          images: winglian/axolotl-base
      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
+        uses: docker/login-action@v2
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
-        uses: docker/build-push-action@v5
+        uses: docker/build-push-action@v4
        with:
          context: .
-          file: ./docker/${{ matrix.dockerfile }}
-          platforms: ${{ matrix.platforms }}
+          file: ./docker/Dockerfile-base
          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
-            CUDNN_VERSION=${{ matrix.cudnn_version }}
-            CUDA=${{ matrix.cuda }}
-            PYTHON_VERSION=${{ matrix.python_version }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
-  build-base-uv:
-    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
-    timeout-minutes: 480
-    runs-on: ubuntu-latest-m
-    env:
-      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "128"
-            cuda_version: 12.8.1
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: "129"
-#            cuda_version: 12.9.1
-#            cudnn_version: ""
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
-#            dockerfile: "Dockerfile-uv-base"
-#            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.9.1
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.12"
-            pytorch: 2.10.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
-            platforms: "linux/amd64,linux/arm64"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-base-uv
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          file: ./docker/${{ matrix.dockerfile }}
-          platforms: ${{ matrix.platforms }}
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
-          build-args: |
-            CUDA_VERSION=${{ matrix.cuda_version }}
-            CUDNN_VERSION=${{ matrix.cudnn_version }}
            CUDA=${{ matrix.cuda }}
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
--- a/.github/workflows/docs.yml
+++ b/.github/workflows/docs.yml
@@ -1,37 +0,0 @@
-name: Publish Docs
-on:
-  push:
-    branches:
-      - main
-
-permissions:
-    contents: write
-    pages: write
-
-jobs:
-    build-deploy:
-        runs-on: ubuntu-latest
-        steps:
-        - name: cleanup node
-          run: |
-            sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
-        - name: Check out repository
-          uses: actions/checkout@v4
-        - name: Set up Quarto
-          uses: quarto-dev/quarto-actions/setup@v2
-        - name: Setup Python
-          uses: actions/setup-python@v5
-          with:
-            python-version: '3.11'
-        - name: Install dependencies
-          run: |
-            python3 -m pip install jupyter quartodoc
-            python3 -m pip install -e .
-        - name: Build autodoc
-          run: quartodoc build
-        - name: Publish to GitHub Pages (and render)
-          uses: quarto-dev/quarto-actions/publish@v2
-          with:
-            target: gh-pages
-          env:
-            GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -1,30 +0,0 @@
-name: lint
-on:
-  # check on PRs, and manual triggers
-  merge_group:
-  pull_request:
-      types: [opened, synchronize, reopened, ready_for_review]
-      paths:
-       - '**.py'
-       - 'pyproject.toml'
-       - '.github/workflows/*.yml'
-       - "*.[q]md"
-       - "examples/**/*.y[a]?ml"
-       - ".pre-commit-config.yaml"
-  workflow_dispatch:
-
-permissions:
-  contents: read
-
-jobs:
-  pre-commit:
-    name: pre-commit
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-          cache: 'pip' # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -4,52 +4,37 @@ on:
  push:
    branches:
      - "main"
-    tags:
-      - "v*"
-  workflow_dispatch:
-
-permissions:
-  contents: read

 jobs:
  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: github.repository_owner == 'OpenAccess-AI-Collective'
+    # this job needs to be run on self-hosted GPU runners...
    strategy:
      fail-fast: false
      matrix:
        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.9"
+            pytorch: 2.0.1
+            axolotl_extras:
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.0.1
            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
            is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.1.1
            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: 129
-#            cuda_version: 12.9.1
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            axolotl_extras:
-#            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.10"
+            pytorch: 2.1.1
            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
+    runs-on: [self-hosted, gpu, docker]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -57,11 +42,7 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            axolotlai/axolotl
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
+          images: winglian/axolotl
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
@@ -74,138 +55,57 @@ jobs:
        uses: docker/build-push-action@v5
        with:
          context: .
-          platforms: ${{ matrix.platforms }}
+          load: true
          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
+            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}
+      - name: Unit Tests
+        run: |
+          docker run --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }} pytest --ignore=tests/e2e/ /workspace/axolotl/tests/
+      - name: Push to Docker Hub
+        if: github.event_name != 'pull_request'
+        run: |
+          docker push ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+          latest_tag=${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
+          if [ -n "$latest_tag" ]; then
+            docker push "$latest_tag"
+          fi

-  build-axolotl-uv:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-            is_latest: true
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-uv
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
-      - name: Build and export to Docker
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: ${{ matrix.platforms }}
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            CUDA=${{ matrix.cuda }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
-          file: ./docker/Dockerfile-uv
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
-  build-axolotl-cloud:
+  build-axolotl-runpod:
    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
+    if: github.repository_owner == 'OpenAccess-AI-Collective'
    # this job needs to be run on self-hosted GPU runners...
    strategy:
-      fail-fast: false
      matrix:
        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.9"
+            pytorch: 2.0.1
+            axolotl_extras:
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.0.1
            axolotl_extras:
            is_latest: true
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
+          - cuda: 118
+            cuda_version: 11.8.0
+            python_version: "3.10"
+            pytorch: 2.1.1
            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-#          - cuda: 129
-#            cuda_version: 12.9.1
-#            python_version: "3.12"
-#            pytorch: 2.9.1
-#            axolotl_extras:
-#            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
+          - cuda: 121
+            cuda_version: 12.1.0
+            python_version: "3.10"
+            pytorch: 2.1.1
            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
+    runs-on: [self-hosted, gpu, docker]
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -213,156 +113,22 @@ jobs:
        id: metadata
        uses: docker/metadata-action@v5
        with:
-          images: |
-            axolotlai/axolotl-cloud
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
+          images: winglian/axolotl-runpod
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@v2
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
-          platforms: ${{ matrix.platforms }}
          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
+            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
-  build-axolotl-cloud-uv:
-    needs: build-axolotl-uv
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest: true
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.10.0
-            axolotl_extras:
-            platforms: "linux/amd64,linux/arm64"
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-cloud-uv
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: ${{ matrix.platforms }}
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud-uv
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
-  build-axolotl-cloud-no-tmux:
-    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest: true
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            is_latest:
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-cloud-term
-          tags: |
-            type=ref,event=branch
-            type=pep440,pattern={{version}}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          platforms: linux/amd64,linux/arm64
-          build-args: |
-            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud-no-tmux
+          file: ./docker/Dockerfile-runpod
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -1,80 +0,0 @@
-name: docker-multigpu-tests-biweekly
-
-on:
-  pull_request:
-    paths:
-      - "tests/e2e/multigpu/**.py"
-      - "pyproject.toml"
-      - ".github/workflows/multi-gpu-e2e.yml"
-      - "scripts/cutcrossentropy_install.py"
-      - "src/axolotl/core/trainers/mixins/sequence_parallel.py"
-      - "src/axolotl/utils/distributed.py"
-  workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * 1,4" # Runs at 00:00 UTC every monday & thursday
-
-# Cancel jobs on the same ref if a new one is triggered
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
-permissions:
-  contents: read
-
-env:
-  MODAL_IMAGE_BUILDER_VERSION: "2025.06"
-
-jobs:
-  test-axolotl-multigpu:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          #          - cuda: 129
-          #            cuda_version: 12.9.1
-          #            python_version: "3.12"
-          #            pytorch: 2.9.1
-          #            axolotl_extras: "fbgemm-gpu"
-          #            num_gpus: 2
-          #            dockerfile: "Dockerfile-uv.jinja"
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-            #            axolotl_extras: fbgemm-gpu
-            num_gpus: 2
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.10.0
-            axolotl_extras: "fbgemm-gpu"
-            num_gpus: 2
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        run: |
-          modal run -m cicd.multigpu
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -1,100 +0,0 @@
-name: docker-nightlies
-
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
-
-permissions:
-  contents: read
-
-jobs:
-  build-axolotl:
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl
-          tags: |
-            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
-      - name: Build and export to Docker
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
-            CUDA=${{ matrix.cuda }}
-            PYTORCH_VERSION=${{ matrix.pytorch }}
-            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-          file: ./docker/Dockerfile
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
-
-  build-axolotl-cloud:
-    needs: build-axolotl
-    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
-    # this job needs to be run on self-hosted GPU runners...
-    strategy:
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            axolotl_extras:
-    runs-on: axolotl-gpu-runner
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Docker metadata
-        id: metadata
-        uses: docker/metadata-action@v5
-        with:
-          images: |
-            axolotlai/axolotl-cloud
-          tags: |
-            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
-      - name: Login to Docker Hub
-        uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.DOCKERHUB_USERNAME }}
-          password: ${{ secrets.DOCKERHUB_TOKEN }}
-      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@v3
-      - name: Build
-        uses: docker/build-push-action@v5
-        with:
-          context: .
-          build-args: |
-            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-            CUDA=${{ matrix.cuda }}
-          file: ./docker/Dockerfile-cloud
-          push: ${{ github.event_name != 'pull_request' }}
-          tags: |
-             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
-          labels: ${{ steps.metadata.outputs.labels }}
--- a/.github/workflows/precommit-autoupdate.yml
+++ b/.github/workflows/precommit-autoupdate.yml
@@ -1,42 +0,0 @@
-name: Pre-commit auto-update
-
-on:
-  schedule:
-    - cron: '0 0 1 * *'  # Run monthly
-  workflow_dispatch:  # Manual kickoff
-
-permissions: {}
-
-jobs:
-  auto-update:
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-      pull-requests: write
-    steps:
-      - uses: actions/checkout@v4
-
-      - uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Update pre-commit hooks
-        id: update
-        run: |
-          pip install pre-commit
-          pre-commit autoupdate
-          if [[ -n $(git status --porcelain) ]]; then
-            echo "changes=true" >> $GITHUB_OUTPUT
-          fi
-
-      - name: Create Pull Request
-        if: steps.update.outputs.changes == 'true'
-        uses: peter-evans/create-pull-request@v6
-        with:
-          token: ${{ secrets.GITHUB_TOKEN }}
-          branch: update/pre-commit-hooks
-          delete-branch: true
-          title: "chore: update pre-commit hooks"
-          commit-message: "chore: update pre-commit hooks"
-          body: |
-            Automated PR to update pre-commit hooks to their latest versions.
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -1,77 +0,0 @@
-name: Preview
-on:
-  workflow_dispatch:
-  pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
-
-    # Run the workflow only when one of these files changes
-    paths:
-      - '**/*.md'      # any Markdown file
-      - '**/*.qmd'     # any Quarto file
-      - '_quarto.yml'
-      - docs/scripts/generate_config_docs.py
-      - src/axolotl/utils/schemas/**.py
-      - .github/workflows/preview-docs.yml
-
-permissions:
-  contents: read
-  pull-requests: write
-
-jobs:
-  preview:
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    steps:
-      - name: cleanup node
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
-
-      - name: Check out repository
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ github.event.pull_request.head.sha }}
-
-      - name: Set up Quarto
-        uses: quarto-dev/quarto-actions/setup@v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install dependencies
-        run: |
-          python3 -m pip install jupyter quartodoc
-          python3 -m pip install -e .
-
-      - name: Build autodoc
-        run: quartodoc build
-
-      - name: Quarto render
-        run: quarto render
-
-      - name: Netlify Publish
-        uses: nwtgck/actions-netlify@v3.0
-        if: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
-        id: netlify
-        with:
-          publish-dir: './_site'
-          enable-pull-request-comment: false
-          enable-github-deployment: false
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          deploy-message: "Deployed On Netlify"
-          github-deployment-environment: 'preview'
-          github-deployment-description: 'Preview Deployment'
-        env:
-          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
-          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
-
-      - name: Update PR with preview link
-        if: ${{ steps.netlify.outcome == 'success' }}
-        uses: marocchino/sticky-pull-request-comment@v2
-        with:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          message: |
-            📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}
-
-            Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}
--- a/.github/workflows/pypi.yml
+++ b/.github/workflows/pypi.yml
@@ -3,68 +3,43 @@ name: publish pypi
 on:
  push:
    tags:
-      - "v*"
-  workflow_dispatch:
-
-permissions: {}
-
-env:
-  UV_SYSTEM_PYTHON: "1"
+      - '*'

 jobs:
-  setup_release:
-    name: Create Release
-    runs-on: ubuntu-latest
-    permissions:
-      contents: write
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-
-      - name: Create release
-        env:
-          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        run: gh release create "$GITHUB_REF_NAME" --generate-notes
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
-    needs: [setup_release]
    environment:
      name: pypi
      url: https://pypi.org/p/axolotl
    permissions:
-      contents: read
-      id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
+      id-token: write  # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Check out repository code
-        uses: actions/checkout@v4
+        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
-          python-version: "3.11"
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
+          python-version: "3.10"

      - name: Install dependencies
        run: |
-          uv pip install wheel packaging
-          uv pip install --no-build-isolation -e .
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
+          pip3 install wheel
+          pip3 install -e .
+          pip3 install -r requirements-tests.txt

      - name: Extract tag name
        id: tag
-        run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT"
+        run: echo ::set-output name=TAG_NAME::$(echo $GITHUB_REF | cut -d / -f 3)

-      - name: Update version in VERSION file
-        run: |
-          echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION
+      - name: Update version in setup.py
+        run: >-
+          sed -i -E 's/version="([0-9.]+)",/version="${{ steps.tag.outputs.TAG_NAME }}",/g' setup.py

-      - name: Build a source dist
-        run: |
-          python setup.py sdist
+      - name: Build a binary wheel
+        run: >-
+          python setup.py sdist bdist_wheel

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -1,206 +0,0 @@
-name: Tests Nightly against upstream main
-on:
-  workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * *" # Runs at 00:00 UTC every day
-  pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
-    paths:
-      - ".github/workflows/tests-nightly.yml"
-
-permissions:
-  contents: read
-
-env:
-  UV_SYSTEM_PYTHON: "1"
-
-jobs:
-  pre-commit:
-    name: pre-commit
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-          cache: "pip" # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
-        env:
-          SKIP: no-commit-to-branch
-
-  prime-cdn-s3-cache:
-    name: Prefetch S3 once to prime the CDN cache
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    timeout-minutes: 10
-    steps:
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
-
-  pytest:
-    name: PyTest
-    runs-on: ubuntu-latest
-    needs: [prime-cdn-s3-cache]
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
-        pytorch_version: ["2.9.1", "2.10.0"]
-    timeout-minutes: 20
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p /home/runner/.cache/huggingface/hub
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
-      - name: Install PyTorch
-        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
-
-      - name: Install dependencies
-        run: |
-          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
-      - name: Override with nightly HF packages
-        run: |
-          uv pip install --no-deps \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
-            "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "trl @ git+https://github.com/huggingface/trl.git@main" \
-            "datasets @ git+https://github.com/huggingface/datasets.git@main"
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Run tests
-        run: |
-          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
-          pytest -v --durations=10 tests/patched/
-          pytest -v --durations=10 tests/cli/
-
-
-  docker-e2e-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
-    needs: [pre-commit, pytest]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.10.0
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
-          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        run: |
-          modal run cicd.e2e_tests
-  docker-e2e-multigpu-tests:
-    if: github.repository_owner == 'axolotl-ai-cloud'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
-    needs: [pre-commit, pytest, docker-e2e-tests]
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 2
-            axolotl_extras:
-            nightly_build: "true"
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        run: |
-          modal run cicd.multigpu
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -1,407 +1,81 @@
 name: Tests
 on:
  # check on push/merge to main, PRs, and manual triggers
-  merge_group:
  push:
    branches:
      - "main"
    paths:
-      - "**.py"
-      - "pyproject.toml"
-      - ".github/workflows/*.yml"
-      - "cicd/cicd.sh"
-      - "cicd/Dockerfile-uv.jinja"
+      - '**.py'
+      - 'requirements.txt'
  pull_request:
-    types: [opened, synchronize, reopened, ready_for_review]
-    paths:
-      - "**.py"
-      - "pyproject.toml"
-      - ".github/workflows/*.yml"
-      - "cicd/cicd.sh"
-      - "cicd/Dockerfile-uv.jinja"
+      paths:
+       - '**.py'
+       - 'requirements.txt'
  workflow_dispatch:

-# Cancel jobs on the same ref if a new one is triggered
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
-permissions:
-  contents: read
-
-env:
-  TRANSFORMERS_IS_CI: "yes"
-  UV_SYSTEM_PYTHON: "1"
-
 jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
    steps:
-      - uses: actions/checkout@v4
-      - uses: actions/setup-python@v5
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
        with:
-          python-version: "3.11"
-          cache: "pip" # caching pip dependencies
-      - uses: pre-commit/action@v3.0.1
-        env:
-          SKIP: no-commit-to-branch
-
-  prime-cdn-s3-cache:
-    name: Prefetch S3 once to prime the CDN cache
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    timeout-minutes: 10
-    steps:
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+          python-version: "3.9"
+          cache: 'pip' # caching pip dependencies
+      - uses: pre-commit/action@v3.0.0

  pytest:
    name: PyTest
    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      matrix:
-        python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.9.1", "2.10.0"]
-        exclude:
-          - python_version: "3.14"
-            pytorch_version: "2.9.1"
-    timeout-minutes: 25
+        python_version: ["3.9", "3.10"]
+    timeout-minutes: 10

    steps:
-      - name: cleanup node
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
-
      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p ~/.cache/huggingface/hub
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
-          ls -ltr ~/.cache/huggingface/hub/
+        uses: actions/checkout@v3

      - name: Setup Python
-        uses: actions/setup-python@v5
+        uses: actions/setup-python@v4
        with:
          python-version: ${{ matrix.python_version }}
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
-      - name: Install PyTorch
-        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
+          cache: 'pip' # caching pip dependencies

      - name: Install dependencies
        run: |
-          uv pip install --no-build-isolation -e . --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Show HF cache
-        run: hf cache ls
+          pip3 install -U -e .
+          pip3 install -r requirements-tests.txt

      - name: Run tests
        run: |
-          df -h
-          pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
-          df -h
-          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
-          df -h
-          pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
-          df -h
-          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
+          pytest --ignore=tests/e2e/ tests/

-      - name: Show HF cache
-        run: hf cache ls
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-  pytest-sdist:
-    name: PyTest from Source Dist
-    runs-on: ubuntu-latest
-    if: ${{ !github.event.pull_request.draft }}
-    needs: [prime-cdn-s3-cache]
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.12", "3.14"]
-        pytorch_version: ["2.9.1", "2.10.0"]
-        exclude:
-          - python_version: "3.14"
-            pytorch_version: "2.9.1"
-    timeout-minutes: 30
-
-    steps:
-      - name: cleanup node
-        run: |
-          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
-
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore Cache from S3
-        id: hf-cache-restore-s3
-        run: |
-          mkdir -p ~/.cache/huggingface/hub
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
-          ls -ltr ~/.cache/huggingface/hub/
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v7
-
-      - name: Install PyTorch
-        run: |
-          uv pip install torch==${{ matrix.pytorch_version }} torchvision
-          uv pip freeze | grep -E "^(torch|torchvision)==" > /tmp/torch-pin.txt
-
-      - name: Install dependencies
-        run: |
-          uv pip install packaging setuptools_scm build wheel psutil
-          python -m build --no-isolation --sdist
-          uv pip install --no-build-isolation dist/axolotl*.tar.gz --override /tmp/torch-pin.txt
-          python scripts/cutcrossentropy_install.py --uv | sh
-          uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-            codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__, f'Expected torch ${{ matrix.pytorch_version }} but got {torch.__version__}'"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Verify agent docs are discoverable
-        run: |
-          # Agent docs live in docs/agents/ (source of truth) and are resolved
-          # at runtime from the repo checkout or via `axolotl fetch docs`
-          axolotl agent-docs --list
-          axolotl agent-docs | grep -q "Fine-tuning framework"
-          axolotl agent-docs grpo | grep -q "GRPO"
-          axolotl agent-docs sft | grep -q "SFT"
-          python -c "from axolotl.cli.agent_docs import get_doc, list_topics; assert len(list_topics()) >= 5; assert 'GRPO' in get_doc('grpo')"
-
-      - name: Show HF cache
-        run: hf cache ls
-
-      - name: Run tests
-        run: |
-          pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
-          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
-          pytest -v --durations=10 tests/cli/
-
-      - name: Show HF cache
-        run: hf cache ls
-
-  gate-skip-e2e:
-    needs: [pre-commit]
-    runs-on: ubuntu-latest
-    outputs:
-      skip: ${{ steps.compute.outputs.skip }}
-    steps:
-      - uses: actions/github-script@v7
-        id: compute
-        with:
-          script: |
-            const token = /\[skip-e2e\]/i;
-            let msg = '';
-            if (context.eventName === 'push') {
-              msg = context.payload.head_commit?.message || '';
-            } else if (context.eventName === 'pull_request') {
-              const { owner, repo } = context.repo;
-              const prNumber = context.payload.pull_request.number;
-              const commits = await github.paginate(
-                github.rest.pulls.listCommits,
-                { owner, repo, pull_number: prNumber, per_page: 100 }
-              );
-              msg = commits.at(-1)?.commit?.message || '';
-            }
-            const title = context.payload.pull_request?.title || '';
-            const body  = context.payload.pull_request?.body  || '';
-            const skip = token.test(msg) || token.test(title) || token.test(body);
-            core.setOutput('skip', String(skip));
-
-  docker-e2e-tests-1st:
-    # Run this job first as a gate for running the remainder of the test matrix
-    if: >
-      github.repository_owner == 'axolotl-ai-cloud' &&
-      (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
-      needs.gate-skip-e2e.outputs.skip != 'true'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+  e2e-test:
+    name: E2E Tests
+    runs-on: [self-hosted, gpu]
+    timeout-minutes: 20
    needs: [pre-commit, pytest]

-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.12"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
+      - name: Check out repository code
+        uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v4
        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        run: |
-          modal run cicd.e2e_tests
+          python-version: "3.10"
+#          cache: 'pip' # caching pip dependencies

-  docker-e2e-tests:
-    if: >
-      github.repository_owner == 'axolotl-ai-cloud' &&
-      (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
-      needs.gate-skip-e2e.outputs.skip != 'true'
-    # this job needs to be run on self-hosted GPU runners...
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
-    # Only run the remainder of the matrix if the first e2e check passed;
-    # this is to save on wasted compute costs for known failures that get caught in the first run
-    needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st]
+      - name: Install dependencies
+        run: |
+          pip3 install --extra-index-url https://download.pytorch.org/whl/cu118 -U torch==2.0.1
+          pip3 uninstall -y transformers accelerate
+          pip3 install -U -e .[flash-attn,mamba-ssm]
+          pip3 install -r requirements-tests.txt

-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.10.0
-            num_gpus: 1
-            axolotl_extras:
-          - cuda: 130
-            cuda_version: 13.0.0
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
+      - name: Run e2e tests
        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
-          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile-uv.jinja'}}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        env:
-          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
-        run: |
-          modal run cicd.e2e_tests
-
-  docker-e2e-cleanup:
-    runs-on: [self-hosted, modal]
-    timeout-minutes: 90
-    needs: [docker-e2e-tests]
-    if: ${{ !github.event.pull_request.draft }}
-
-    strategy:
-      fail-fast: false
-      matrix:
-        include:
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.1
-            num_gpus: 1
-            axolotl_extras:
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v4
-      - name: Install Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: "3.11"
-      - name: Install Modal
-        run: |
-          python -m pip install --upgrade pip
-          pip install modal==1.3.0.post1 jinja2
-      - name: Update env vars
-        run: |
-          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
-          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
-          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
-          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
-          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
-          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
-          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
-      - name: Run tests job on Modal
-        run: |
-          modal run cicd.cleanup
+          pytest tests/e2e/
--- a/.gitignore
+++ b/.gitignore
@@ -1,9 +1,5 @@
 **/axolotl.egg-info
 configs
-last_run_prepared/
-outputs
-.vscode
-_site/

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -134,7 +130,6 @@ venv/
 ENV/
 env.bak/
 venv.bak/
-venv3.10/

 # Spyder project settings
 .spyderproject
@@ -170,26 +165,3 @@ cython_debug/
 # WandB
 # wandb creates a folder to store logs for training runs
 wandb
-
-# Runs
-lora-out/*
-qlora-out/*
-mlruns/*
-
-/.quarto/
-prepared-datasets/
-submit.sh
-*.out*
-
-# Quartodoc generated files
-objects.json
-site_libs/
-
-typings/
-out/
-
-# vim
-*.swp
-
-# scm auto-versioning
-src/axolotl/_version.py
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -0,0 +1,3 @@
+[settings]
+profile=black
+known_third_party=wandb
--- a/.mypy.ini
+++ b/.mypy.ini
@@ -1,5 +1,5 @@
 [mypy]
-plugins = pydantic.mypy
+
 exclude = venv

 [mypy-alpaca_lora_4bit.*]
@@ -11,9 +11,6 @@ ignore_errors = True
 [mypy-axolotl.models.mixtral.*]
 ignore_errors = True

-[mypy-axolotl.integrations.liger.models.*]
-ignore_errors = True
-
 [mypy-axolotl.models.phi.*]
 ignore_errors = True

@@ -35,9 +32,6 @@ ignore_missing_imports = True
 [mypy-bitsandbytes]
 ignore_missing_imports = True

-[mypy-requests]
-ignore_missing_imports = True
-
 [mypy-datasets]
 ignore_missing_imports = True

--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,30 +3,37 @@ default_language_version:

 repos:
 -   repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v6.0.0
+    rev: v4.4.0
    hooks:
    -   id: check-yaml
    -   id: end-of-file-fixer
    -   id: trailing-whitespace
-    -   id: no-commit-to-branch
-        args: ['--branch', 'main']
-   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.15.8
+-   repo: https://github.com/psf/black
+    rev: 23.3.0
    hooks:
-    -   id: ruff
-        args: [--fix]
-    -   id: ruff-format
+    -   id: black
+-   repo: https://github.com/pycqa/isort
+    rev: 5.12.0
+    hooks:
+      - id: isort
+-   repo: https://github.com/PyCQA/flake8
+    rev: 6.0.0
+    hooks:
+    - id: flake8
+-   repo: https://github.com/PyCQA/pylint
+    rev: v2.17.4
+    hooks:
+    - id: pylint
 -   repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.19.1
+    rev: v1.3.0
    hooks:
    - id: mypy
      additional_dependencies:
        [
            'types-PyYAML',
-            'pydantic>=2.5.3',
        ]
 -   repo: https://github.com/PyCQA/bandit
-    rev: 1.9.4
+    rev: 1.7.5
    hooks:
    -   id: bandit
        args: [
--- a/.pylintrc
+++ b/.pylintrc
@@ -0,0 +1,14 @@
+[MASTER]
+init-hook="from pylint.config import find_pylintrc; import os, sys; sys.path.append(os.path.dirname(find_pylintrc()))"
+
+[TYPECHECK]
+
+# List of members which are set dynamically and missed by Pylint inference
+# system, and so shouldn't trigger E1101 when accessed.
+generated-members=numpy.*, torch.*
+
+
+[pylint.messages_control]
+disable=missing-function-docstring, line-too-long, import-error,
+    too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
+    too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
--- a/.runpod/.gitignore
+++ b/.runpod/.gitignore
@@ -1,161 +0,0 @@
-# Byte-compiled / optimized / DLL files
-__pycache__/
-*.py[cod]
-*$py.class
-
-# C extensions
-*.so
-
-# Distribution / packaging
-.Python
-build/
-develop-eggs/
-dist/
-downloads/
-eggs/
-.eggs/
-lib/
-lib64/
-parts/
-sdist/
-var/
-wheels/
-share/python-wheels/
-*.egg-info/
-.installed.cfg
-*.egg
-MANIFEST
-
-# PyInstaller
-#  Usually these files are written by a python script from a template
-#  before PyInstaller builds the exe, so as to inject date/other infos into it.
-*.manifest
-*.spec
-
-# Installer logs
-pip-log.txt
-pip-delete-this-directory.txt
-
-# Unit test / coverage reports
-htmlcov/
-.tox/
-.nox/
-.coverage
-.coverage.*
-.cache
-nosetests.xml
-coverage.xml
-*.cover
-*.py,cover
-.hypothesis/
-.pytest_cache/
-cover/
-
-# Translations
-*.mo
-*.pot
-
-# Django stuff:
-*.log
-local_settings.py
-db.sqlite3
-db.sqlite3-journal
-
-# Flask stuff:
-instance/
-.webassets-cache
-
-# Scrapy stuff:
-.scrapy
-
-# Sphinx documentation
-docs/_build/
-
-# PyBuilder
-.pybuilder/
-target/
-
-# Jupyter Notebook
-.ipynb_checkpoints
-
-# IPython
-profile_default/
-ipython_config.py
-
-# pyenv
-#   For a library or package, you might want to ignore these files since the code is
-#   intended to run in multiple environments; otherwise, check them in:
-# .python-version
-
-# pipenv
-#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
-#   However, in case of collaboration, if having platform-specific dependencies or dependencies
-#   having no cross-platform support, pipenv may install dependencies that don't work, or not
-#   install all needed dependencies.
-#Pipfile.lock
-
-# poetry
-#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
-#   This is especially recommended for binary packages to ensure reproducibility, and is more
-#   commonly ignored for libraries.
-#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
-#poetry.lock
-
-# pdm
-#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
-#pdm.lock
-#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
-#   in version control.
-#   https://pdm.fming.dev/#use-with-ide
-.pdm.toml
-
-# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
-__pypackages__/
-
-# Celery stuff
-celerybeat-schedule
-celerybeat.pid
-
-# SageMath parsed files
-*.sage.py
-
-# Environments
-.env
-.venv
-env/
-venv/
-ENV/
-env.bak/
-venv.bak/
-
-# Spyder project settings
-.spyderproject
-.spyproject
-
-# Rope project settings
-.ropeproject
-
-# mkdocs documentation
-/site
-
-# mypy
-.mypy_cache/
-.dmypy.json
-dmypy.json
-
-# Pyre type checker
-.pyre/
-
-# pytype static type analyzer
-.pytype/
-
-# Cython debug symbols
-cython_debug/
-
-# PyCharm
-#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
-#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
-#  and can be added to the global gitignore or merged into this file.  For a more nuclear
-#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
-#.idea/
-pod/scripts/config.yaml
--- a/.runpod/Dockerfile
+++ b/.runpod/Dockerfile
@@ -1,19 +0,0 @@
-FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
-
-COPY .runpod/requirements.txt /requirements.txt
-RUN --mount=type=cache,target=/root/.cache/pip \
-    python3 -m pip install --upgrade pip && \
-    python3 -m pip install --upgrade -r /requirements.txt
-
-# Environment settings
-ARG BASE_VOLUME="/runpod-volume"
-ENV BASE_VOLUME=$BASE_VOLUME
-ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
-ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
-ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
-ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
-
-COPY .runpod/src /src
-
-WORKDIR /src
-CMD ["python3", "/src/handler.py"]
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -1,335 +0,0 @@
-<h1>LLM Post Training- Full fine-tune, LoRA, QLoRa etc. Llama/Mistral/Gemma and more</h1>
-
-# Configuration Options
-
-This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
-
-## Usage
-
-You can use these configuration options:
-
-1. As a JSON request body:
-
-```json
-{
-  "input": {
-    "user_id": "user",
-    "model_id": "model-name",
-    "run_id": "run-id",
-    "credentials": {
-      "wandb_api_key": "", # add your Weights & Biases key. TODO: you will be able to set this in environment variables.
-      "hf_token": "", # add your HF token. TODO: you will be able to set this in environment variables.
-    },
-    "args": {
-      "base_model": "NousResearch/Llama-3.2-1B",
-      // ... other options
-    }
-  }
-}
-```
-
-## Configuration Options
-
-### Model Configuration
-
-| Option              | Description                                                                                   | Default              |
-| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
-| `base_model`        | Path to the base model (local or HuggingFace)                                                 | Required             |
-| `base_model_config` | Configuration path for the base model                                                         | Same as base_model   |
-| `revision_of_model` | Specific model revision from HuggingFace hub                                                  | Latest               |
-| `tokenizer_config`  | Custom tokenizer configuration path                                                           | Optional             |
-| `model_type`        | Type of model to load                                                                         | AutoModelForCausalLM |
-| `tokenizer_type`    | Type of tokenizer to use                                                                      | AutoTokenizer        |
-| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |
-
-## Model Family Identification
-
-| Option                     | Default | Description                    |
-| -------------------------- | ------- | ------------------------------ |
-| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
-| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
-| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
-| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
-
-## Model Configuration Overrides
-
-| Option                                          | Default    | Description                        |
-| ----------------------------------------------- | ---------- | ---------------------------------- |
-| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
-| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |
-
-### Model Loading Options
-
-| Option         | Description                   | Default |
-| -------------- | ----------------------------- | ------- |
-| `load_in_8bit` | Load model in 8-bit precision | false   |
-| `load_in_4bit` | Load model in 4-bit precision | false   |
-| `bf16`         | Use bfloat16 precision        | false   |
-| `fp16`         | Use float16 precision         | false   |
-| `tf32`         | Use tensor float 32 precision | false   |
-
-## Memory and Device Settings
-
-| Option             | Default   | Description             |
-| ------------------ | --------- | ----------------------- |
-| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
-| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
-| `device_map`       | `"auto"`  | Device mapping strategy |
-| `max_memory`       | `null`    | Max memory per device   |
-
-## Training Hyperparameters
-
-| Option                        | Default   | Description                 |
-| ----------------------------- | --------- | --------------------------- |
-| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
-| `micro_batch_size`            | `2`       | Batch size per GPU          |
-| `eval_batch_size`             | `null`    | Evaluation batch size       |
-| `num_epochs`                  | `4`       | Number of training epochs   |
-| `warmup_steps`                | `100`     | Warmup steps                |
-| `warmup_ratio`                | `0.05`    | Warmup ratio                |
-| `learning_rate`               | `0.00003` | Learning rate               |
-| `lr_quadratic_warmup`         | `false`   | Quadratic warmup            |
-| `logging_steps`               | `null`    | Logging frequency           |
-| `eval_steps`                  | `null`    | Evaluation frequency        |
-| `evals_per_epoch`             | `null`    | Evaluations per epoch       |
-| `save_strategy`               | `"epoch"` | Checkpoint saving strategy  |
-| `save_steps`                  | `null`    | Saving frequency            |
-| `saves_per_epoch`             | `null`    | Saves per epoch             |
-| `save_total_limit`            | `null`    | Maximum checkpoints to keep |
-| `max_steps`                   | `null`    | Maximum training steps      |
-
-### Dataset Configuration
-
-```yaml
-datasets:
-  - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
-    type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
-    ds_type: json # Dataset type
-    data_files: path/to/data # Source data files
-    train_on_split: train # Dataset split to use
-```
-
-## Chat Template Settings
-
-| Option                   | Default                          | Description            |
-| ------------------------ | -------------------------------- | ---------------------- |
-| `chat_template`          | `"tokenizer_default"`            | Chat template type     |
-| `chat_template_jinja`    | `null`                           | Custom Jinja template  |
-| `default_system_message` | `"You are a helpful assistant."` | Default system message |
-
-## Dataset Processing
-
-| Option                            | Default                    | Description                         |
-| --------------------------------- | -------------------------- | ----------------------------------- |
-| `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
-| `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
-| `dataset_num_proc`                | `4`                        | Number of preprocessing processes   |
-| `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
-| `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
-| `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
-| `dataset_exact_deduplication`     | `true`                     | Deduplicate datasets                |
-
-## LoRA Configuration
-
-| Option                     | Default                | Description                    |
-| -------------------------- | ---------------------- | ------------------------------ |
-| `adapter`                  | `"lora"`               | Adapter type (lora/qlora)      |
-| `lora_model_dir`           | `""`                   | Directory with pretrained LoRA |
-| `lora_r`                   | `8`                    | LoRA attention dimension       |
-| `lora_alpha`               | `16`                   | LoRA alpha parameter           |
-| `lora_dropout`             | `0.05`                 | LoRA dropout                   |
-| `lora_target_modules`      | `["q_proj", "v_proj"]` | Modules to apply LoRA          |
-| `lora_target_linear`       | `false`                | Target all linear modules      |
-| `peft_layers_to_transform` | `[]`                   | Layers to transform            |
-| `lora_modules_to_save`     | `[]`                   | Modules to save                |
-| `lora_fan_in_fan_out`      | `false`                | Fan in/out structure           |
-
-## Optimization Settings
-
-| Option                    | Default | Description                |
-| ------------------------- | ------- | -------------------------- |
-| `train_on_inputs`         | `false` | Train on input prompts     |
-| `group_by_length`         | `false` | Group by sequence length   |
-| `gradient_checkpointing`  | `false` | Use gradient checkpointing |
-| `early_stopping_patience` | `3`     | Early stopping patience    |
-
-## Learning Rate Scheduling
-
-| Option                     | Default    | Description          |
-| -------------------------- | ---------- | -------------------- |
-| `lr_scheduler`             | `"cosine"` | Scheduler type       |
-| `lr_scheduler_kwargs`      | `{}`       | Scheduler parameters |
-| `cosine_min_lr_ratio`      | `null`     | Minimum LR ratio     |
-| `cosine_constant_lr_ratio` | `null`     | Constant LR ratio    |
-| `lr_div_factor`            | `null`     | LR division factor   |
-
-## Optimizer Settings
-
-| Option                 | Default      | Description         |
-| ---------------------- | ------------ | ------------------- |
-| `optimizer`            | `"adamw_hf"` | Optimizer choice    |
-| `optim_args`           | `{}`         | Optimizer arguments |
-| `optim_target_modules` | `[]`         | Target modules      |
-| `weight_decay`         | `null`       | Weight decay        |
-| `adam_beta1`           | `null`       | Adam beta1          |
-| `adam_beta2`           | `null`       | Adam beta2          |
-| `adam_epsilon`         | `null`       | Adam epsilon        |
-| `max_grad_norm`        | `null`       | Gradient clipping   |
-
-## Attention Implementations
-
-| Option                     | Default | Description                   |
-| -------------------------- | ------- | ----------------------------- |
-| `flash_optimum`            | `false` | Use better transformers       |
-| `xformers_attention`       | `false` | Use xformers                  |
-| `flash_attention`          | `false` | Use flash attention           |
-| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
-| `flash_attn_rms_norm`      | `false` | Flash attention RMS norm      |
-| `flash_attn_fuse_mlp`      | `false` | Fuse MLP operations           |
-| `sdp_attention`            | `false` | Use scaled dot product        |
-| `s2_attention`             | `false` | Use shifted sparse attention  |
-
-## Tokenizer Modifications
-
-| Option           | Default | Description                  |
-| ---------------- | ------- | ---------------------------- |
-| `special_tokens` | -       | Special tokens to add/modify |
-| `tokens`         | `[]`    | Additional tokens            |
-
-## Distributed Training
-
-| Option                  | Default | Description           |
-| ----------------------- | ------- | --------------------- |
-| `fsdp`                  | `null`  | FSDP configuration    |
-| `fsdp_config`           | `null`  | FSDP config options   |
-| `deepspeed`             | `null`  | Deepspeed config path |
-| `ddp_timeout`           | `null`  | DDP timeout           |
-| `ddp_bucket_cap_mb`     | `null`  | DDP bucket capacity   |
-| `ddp_broadcast_buffers` | `null`  | DDP broadcast buffers |
-
-<details>
-<summary><h3>Example Configuration Request:</h3></summary>
-
-Here's a complete example for fine-tuning a LLaMA model using LoRA:
-
-```json
-{
-  "input": {
-    "user_id": "user",
-    "model_id": "llama-test",
-    "run_id": "test-run",
-    "credentials": {
-      "wandb_api_key": "",
-      "hf_token": ""
-    },
-    "args": {
-      "base_model": "NousResearch/Llama-3.2-1B",
-      "load_in_8bit": false,
-      "load_in_4bit": false,
-      "strict": false,
-      "datasets": [
-        {
-          "path": "teknium/GPT4-LLM-Cleaned",
-          "type": "alpaca"
-        }
-      ],
-      "dataset_prepared_path": "last_run_prepared",
-      "val_set_size": 0.1,
-      "output_dir": "./outputs/lora-out",
-      "adapter": "lora",
-      "sequence_len": 2048,
-      "sample_packing": true,
-      "eval_sample_packing": true,
-      "pad_to_sequence_len": true,
-      "lora_r": 16,
-      "lora_alpha": 32,
-      "lora_dropout": 0.05,
-      "lora_target_modules": [
-        "gate_proj",
-        "down_proj",
-        "up_proj",
-        "q_proj",
-        "v_proj",
-        "k_proj",
-        "o_proj"
-      ],
-      "gradient_accumulation_steps": 2,
-      "micro_batch_size": 2,
-      "num_epochs": 1,
-      "optimizer": "adamw_8bit",
-      "lr_scheduler": "cosine",
-      "learning_rate": 0.0002,
-      "train_on_inputs": false,
-      "group_by_length": false,
-      "bf16": "auto",
-      "tf32": false,
-      "gradient_checkpointing": true,
-      "logging_steps": 1,
-      "flash_attention": true,
-      "loss_watchdog_threshold": 5,
-      "loss_watchdog_patience": 3,
-      "warmup_steps": 10,
-      "evals_per_epoch": 4,
-      "saves_per_epoch": 1,
-      "weight_decay": 0,
-      "hub_model_id": "runpod/llama-fr-lora",
-      "wandb_name": "test-run-1",
-      "wandb_project": "test-run-1",
-      "wandb_entity": "axo-test",
-      "special_tokens": {
-        "pad_token": "<|end_of_text|>"
-      }
-    }
-  }
-}
-```
-
-</details>
-
-### Advanced Features
-
-#### Wandb Integration
-
- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run
-
-#### Performance Optimization
-
- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use Flash Attention implementation
- `xformers_attention`: Use xFormers attention implementation
-
-### Available Optimizers
-
-The following optimizers are supported:
-
- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer
-
-## Notes
-
- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
-
-For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html).
-
-### Errors:
-
- If you face any issues with Flash Attention 2, delete your worker and restart.
--- a/.runpod/hub.json
+++ b/.runpod/hub.json
@@ -1,93 +0,0 @@
-{
-  "title": "Axolotl Fine-Tuning",
-  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
-  "type": "serverless",
-  "category": "language",
-  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
-  "config": {
-    "runsOn": "GPU",
-    "containerDiskInGb": 200,
-    "gpuCount": 1,
-    "allowedCudaVersions": [
-      "12.8",
-      "12.7",
-      "12.6",
-      "12.5",
-      "12.4"
-    ],
-    "presets": [],
-    "env": [
-      {
-        "key": "TOKENIZER",
-        "input": {
-          "name": "Tokenizer",
-          "type": "string",
-          "description": "Name or path of the Hugging Face tokenizer to use.",
-          "default": "",
-          "advanced": true
-        }
-      },
-      {
-        "key": "MAX_NUM_SEQS",
-        "input": {
-          "name": "Max Num Seqs",
-          "type": "number",
-          "description": "Maximum number of sequences per iteration.",
-          "default": 256,
-          "advanced": true
-        }
-      },
-      {
-        "key": "DISABLE_LOG_STATS",
-        "input": {
-          "name": "Disable Log Stats",
-          "type": "boolean",
-          "description": "Disable logging statistics.",
-          "default": false,
-          "trueValue": "true",
-          "falseValue": "false"
-        }
-      },
-      {
-        "key": "LOAD_FORMAT",
-        "input": {
-          "name": "Load Format",
-          "type": "string",
-          "description": "The format of the model weights to load.",
-          "default": "auto",
-          "options": [
-            {
-              "label": "auto",
-              "value": "auto"
-            },
-            {
-              "label": "pt",
-              "value": "pt"
-            },
-            {
-              "label": "safetensors",
-              "value": "safetensors"
-            },
-            {
-              "label": "npcache",
-              "value": "npcache"
-            },
-            {
-              "label": "dummy",
-              "value": "dummy"
-            },
-            {
-              "label": "tensorizer",
-              "value": "tensorizer"
-            },
-            {
-              "label": "bitsandbytes",
-              "value": "bitsandbytes"
-            }
-          ],
-          "advanced": true
-        }
-      }
-    ]
-  }
-}
--- a/.runpod/requirements.txt
+++ b/.runpod/requirements.txt
@@ -1,7 +0,0 @@
-# Required Python packages get listed here, one per line.
-# Recommended: lock the version number to avoid unexpected changes.
-
-# You can also install packages from a git repository, e.g.:
-# git+https://github.com/runpod/runpod-python.git
-# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
-runpod~=1.7.0
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -1,564 +0,0 @@
-# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
-# # This can also be a relative path to a model on disk
-# base_model: ./llama-7b-hf
-# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
-# base_model_ignore_patterns:
-# # If the base_model repo on hf hub doesn't include configuration .json files,
-# # You can set that here, or leave this empty to default to base_model
-# base_model_config: ./llama-7b-hf
-# # You can specify to choose a specific model revision from huggingface hub
-# model_revision:
-# # Optional tokenizer configuration override in case you want to use a different tokenizer
-# # than the one defined in the base model
-# tokenizer_config:
-# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
-# model_type: AutoModelForCausalLM
-# # Corresponding tokenizer for the model AutoTokenizer is a good choice
-# tokenizer_type: AutoTokenizer
-# # Trust remote code for untrusted source
-# trust_remote_code:
-# # use_fast option for tokenizer loading from_pretrained, default to True
-# tokenizer_use_fast:
-# # Whether to use the legacy tokenizer setting, defaults to True
-# tokenizer_legacy:
-# # Resize the model embeddings when new tokens are added to multiples of 32
-# # This is reported to improve training speed on some models
-# resize_token_embeddings_to_32x:
-
-# # Used to identify which architecture the model is based on
-# is_falcon_derived_model:
-# is_llama_derived_model:
-# # Please note that if you set this to true, `padding_side` will be set to "left" by default
-# is_mistral_derived_model:
-# is_qwen_derived_model:
-
-# # optional overrides to the base model configuration
-# model_config:
-#   # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
-#   rope_scaling:
-#     type: # linear | dynamic
-#     factor: # float
-
-# # Whether you are training a 4-bit GPTQ quantized model
-# gptq: true
-# gptq_groupsize: 128 # group size
-# gptq_model_v1: false # v1 or v2
-
-# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
-# load_in_8bit: true
-# # Use bitsandbytes 4 bit
-# load_in_4bit:
-
-# # Use CUDA bf16
-# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
-# # Use CUDA fp16
-# fp16: true
-# # Use CUDA tf32
-# tf32: true # require >=ampere
-
-# # No AMP (automatic mixed precision)
-# bfloat16: true # require >=ampere
-# float16: true
-
-# # A list of one or more datasets to finetune the model with
-# datasets:
-#   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
-#   - path: vicgalle/alpaca-gpt4
-#   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
-#     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
-#     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
-#     data_files: # Optional[str] path to source data files
-#     shards: # Optional[int] number of shards to split data into
-#     name: # Optional[str] name of dataset configuration to load
-#     train_on_split: train # Optional[str] name of dataset split to load from
-
-#     # Optional[str] fastchat conversation type, only used with type: sharegpt
-#     conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
-#     field_human: # Optional[str]. Human key to use for conversation.
-#     field_model: # Optional[str]. Assistant key to use for conversation.
-
-#   # Custom user prompt
-#   - path: repo
-#     type:
-#       # The below are defaults. only set what's needed.
-#       system_prompt: ""
-#       system_format: "{system}"
-#       field_system: system
-#       field_instruction: instruction
-#       field_input: input
-#       field_output: output
-
-#       # Customizable to be single line or multi-line
-#       # 'format' can include {input}
-#       format: |-
-#         User: {instruction} {input}
-#         Assistant:
-#       # 'no_input_format' cannot include {input}
-#       no_input_format: "{instruction} "
-
-#       # For `completion` datasets only, uses the provided field instead of `text` column
-#       field:
-
-# # Axolotl attempts to save the dataset as an arrow after packing the data together so
-# # subsequent training attempts load faster, relative path
-# dataset_prepared_path: data/last_run_prepared
-# # Push prepared dataset to hub
-# push_dataset_to_hub: # repo path
-# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
-# # if not set.
-# dataset_num_proc: # defaults to os.cpu_count() if not set
-# # push checkpoints to hub
-# hub_model_id: # repo path to push finetuned model
-# # how to push checkpoints to hub
-# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
-# hub_strategy:
-# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
-# # Required to be true when used in combination with `push_dataset_to_hub`
-# hf_use_auth_token: # boolean
-# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
-# val_set_size: 0.04
-# # Num shards for whole dataset
-# dataset_shard_num:
-# # Index of shard to use for whole dataset
-# dataset_shard_idx:
-
-# # The maximum length of an input to train with, this should typically be less than 2048
-# # as most models have a token/context limit of 2048
-# sequence_len: 2048
-# # Pad inputs so each step uses constant sized buffers
-# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-# pad_to_sequence_len:
-# # Max sequence length to concatenate training samples together up to
-# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
-# # FutureWarning: This will soon be DEPRECATED
-# max_packed_sequence_len: 1024
-# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
-# sample_packing:
-# # Set to 'false' if getting errors during eval with sample_packing on.
-# eval_sample_packing:
-# # You can set these packing optimizations AFTER starting a training at least once.
-# # The trainer will provide recommended values for these values.
-# sample_packing_eff_est:
-# total_num_tokens:
-
-# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
-# adapter: lora
-# # If you already have a lora model trained that you want to load, put that here.
-# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
-# lora_model_dir:
-
-# # LoRA hyperparameters
-# # For more details about the following options, see:
-# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
-# lora_r: 8
-# lora_alpha: 16
-# lora_dropout: 0.05
-# lora_target_modules:
-#   - q_proj
-#   - v_proj
-# #  - k_proj
-# #  - o_proj
-# #  - gate_proj
-# #  - down_proj
-# #  - up_proj
-# lora_target_linear: # If true, will target all linear layers
-
-# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
-# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
-# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
-# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
-# lora_modules_to_save:
-# #  - embed_tokens
-# #  - lm_head
-
-# # Once you complete training, the model will be saved to the following directory.
-# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
-# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
-# lora_out_dir:
-# lora_fan_in_fan_out: false
-
-# # ReLoRA configuration
-# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
-# relora_steps: # Number of steps per ReLoRA restart
-# relora_warmup_steps: # Number of per-restart warmup steps
-# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
-
-# # wandb configuration if you're using it
-# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
-# wandb_project: # Your wandb project name
-# wandb_entity: # A wandb Team name if using a Team
-# wandb_watch:
-# wandb_run_id: # Set the name of your wandb run
-# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
-
-# # Where to save the full-finetuned model to
-# output_dir: ./completed-model
-
-# # Whether to use torch.compile and which backend to use
-# torch_compile:  # bool
-# torch_compile_backend:  # Optional[str]
-
-# # Training hyperparameters
-
-# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
-# gradient_accumulation_steps: 1
-# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
-# micro_batch_size: 2
-# eval_batch_size:
-# num_epochs: 4
-# warmup_steps: 100  # cannot use with warmup_ratio
-# warmup_ratio: 0.05  # cannot use with warmup_steps
-# learning_rate: 0.00003
-# lr_quadratic_warmup:
-# logging_steps:
-# save_strategy: # Set to `no` to skip checkpoint saves
-# save_steps: # Leave empty to save at each epoch
-# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
-# save_total_limit: # Checkpoints saved at a time
-# # Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
-# # if both are set, num_epochs will not be guaranteed.
-# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
-# max_steps:
-
-# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
-# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-
-# # Whether to mask out or include the human's prompt from the training labels
-# train_on_inputs: false
-# # Group similarly sized data to minimize padding.
-# # May be slower to start, as it must download and sort the entire dataset.
-# # Note that training loss may have an oscillating pattern with this enabled.
-# group_by_length: false
-
-# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
-# gradient_checkpointing: false
-
-# # Stop training after this many evaluation losses have increased in a row
-# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
-# early_stopping_patience: 3
-
-# # Specify a scheduler and kwargs to use with the optimizer
-# lr_scheduler: # 'one_cycle' | empty for cosine
-# lr_scheduler_kwargs:
-
-# # For one_cycle optim
-# lr_div_factor: # Learning rate div factor
-
-# # Specify optimizer
-# # Valid values are driven by the Transformers OptimizerNames class, see:
-# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
-# #
-# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
-# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
-# # in the examples/ for your model and fine-tuning use case.
-# #
-# # Valid values for 'optimizer' include:
-# # - adamw_hf
-# # - adamw_torch
-# # - adamw_torch_fused
-# # - adamw_torch_xla
-# # - adamw_apex_fused
-# # - adafactor
-# # - adamw_anyprecision
-# # - sgd
-# # - adagrad
-# # - adamw_bnb_8bit
-# # - lion_8bit
-# # - lion_32bit
-# # - paged_adamw_32bit
-# # - paged_adamw_8bit
-# # - paged_lion_32bit
-# # - paged_lion_8bit
-# optimizer:
-# # Specify weight decay
-# weight_decay:
-# # adamw hyperparams
-# adam_beta1:
-# adam_beta2:
-# adam_epsilon:
-# # Gradient clipping max norm
-# max_grad_norm:
-
-# # Augmentation techniques
-# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
-# # currently only supported on Llama and Mistral
-# noisy_embedding_alpha:
-
-# # Whether to use BetterTransformer (via optimum)
-# flash_optimum:
-# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
-# xformers_attention:
-# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
-# flash_attention:
-# flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
-# flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
-# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
-# # Whether to use scaled-dot-product attention
-# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
-# sdp_attention:
-# # Landmark attention (only llama)
-# landmark_attention:
-# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
-# # LLaMA only
-# xpos_rope:
-
-# # Resume from a specific checkpoint dir
-# resume_from_checkpoint:
-# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
-# # Be careful with this being turned on between different models.
-# auto_resume_from_checkpoints: false
-
-# # Don't mess with this, it's here for accelerate and torchrun
-# local_rank:
-
-# # Add or change special tokens.
-# # If you add tokens here, you don't need to add them to the `tokens` list.
-# special_tokens:
-#   # bos_token: "<s>"
-#   # eos_token: "</s>"
-#   # unk_token: "<unk>"
-
-# # Add extra tokens.
-# tokens:
-
-# # FSDP
-# fsdp:
-# fsdp_config:
-
-# # Deepspeed config path. e.g., deepspeed/zero3.json
-# deepspeed:
-
-# # Advanced DDP Arguments
-# ddp_timeout:
-# ddp_bucket_cap_mb:
-# ddp_broadcast_buffers:
-
-# # Path to torch distx for optim 'adamw_anyprecision'
-# torchdistx_path:
-
-# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
-# pretraining_dataset:
-
-# # Debug mode
-# debug:
-
-# # Seed
-# seed:
-
-# # Allow overwrite yml config using from cli
-# strict:
-
-base_model: ${BASE_MODEL}
-base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
-base_model_config: ${BASE_MODEL_CONFIG}
-revision_of_model: ${REVISION_OF_MODEL}
-tokenizer_config: ${TOKENIZER_CONFIG}
-model_type: ${MODEL_TYPE}
-tokenizer_type: ${TOKENIZER_TYPE}
-trust_remote_code: ${TRUST_REMOTE_CODE}
-tokenizer_use_fast: ${TOKENIZER_USE_FAST}
-tokenizer_legacy: ${TOKENIZER_LEGACY}
-resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
-
-is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
-is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
-is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
-is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
-
-overrides_of_model_config:
-  rope_scaling:
-    type: ${ROPE_SCALING_TYPE}
-    factor: ${ROPE_SCALING_FACTOR}
-
-bnb_config_kwargs:
-  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
-  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
-  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
-
-gptq: ${GPTQ}
-load_in_8bit: ${LOAD_IN_8BIT}
-load_in_4bit: ${LOAD_IN_4BIT}
-bf16: ${BF16}
-fp16: ${FP16}
-tf32: ${TF32}
-bfloat16: ${BFLOAT16}
-float16: ${FLOAT16}
-
-gpu_memory_limit: ${GPU_MEMORY_LIMIT}
-lora_on_cpu: ${LORA_ON_CPU}
-
-datasets:
-  - path: ${DATASET_PATH}
-    type: ${DATASET_TYPE}
-    ds_type: ${DATASET_DS_TYPE}
-    data_files: ${DATASET_DATA_FILES}
-    shards: ${DATASET_SHARDS}
-    name: ${DATASET_NAME}
-    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
-    revision: ${DATASET_REVISION}
-    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
-
-rl: ${RL}
-dpo_use_weighting: ${DPO_USE_WEIGHTING}
-
-chat_template: ${CHAT_TEMPLATE}
-chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
-default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
-dataset_prepared_path: ${DATASET_PREPARED_PATH}
-push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
-dataset_num_proc: ${DATASET_NUM_PROC}
-dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
-hub_model_id: ${HUB_MODEL_ID}
-hub_strategy: ${HUB_STRATEGY}
-hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
-val_set_size: ${VAL_SET_SIZE}
-dataset_shard_num: ${DATASET_SHARD_NUM}
-dataset_shard_idx: ${DATASET_SHARD_IDX}
-
-sequence_len: ${SEQUENCE_LEN}
-pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
-sample_packing: ${SAMPLE_PACKING}
-eval_sample_packing: ${EVAL_SAMPLE_PACKING}
-sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
-total_num_tokens: ${TOTAL_NUM_TOKENS}
-sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
-sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
-
-batch_flattening: ${BATCH_FLATTENING}
-device_map: ${DEVICE_MAP}
-max_memory: ${MAX_MEMORY}
-
-adapter: ${ADAPTER}
-lora_model_dir: ${LORA_MODEL_DIR}
-
-lora_r: ${LORA_R}
-lora_alpha: ${LORA_ALPHA}
-lora_dropout: ${LORA_DROPOUT}
-lora_target_modules:
-  - ${LORA_TARGET_MODULES}
-lora_target_linear: ${LORA_TARGET_LINEAR}
-peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
-lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
-lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
-
-loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
-loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
-
-peft:
-  loftq_config:
-    loftq_bits: ${LOFTQ_BITS}
-
-relora_steps: ${RELORA_STEPS}
-relora_warmup_steps: ${RELORA_WARMUP_STEPS}
-relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
-relora_prune_ratio: ${RELORA_PRUNE_RATIO}
-relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
-
-wandb_mode: ${WANDB_MODE}
-wandb_project: ${WANDB_PROJECT}
-wandb_entity: ${WANDB_ENTITY}
-wandb_watch: ${WANDB_WATCH}
-wandb_name: ${WANDB_NAME}
-wandb_run_id: ${WANDB_RUN_ID}
-wandb_log_model: ${WANDB_LOG_MODEL}
-
-mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
-mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
-mlflow_run_name: ${MLFLOW_RUN_NAME}
-hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
-
-use_comet: ${USE_COMET}
-comet_api_key: ${COMET_API_KEY}
-comet_workspace: ${COMET_WORKSPACE}
-comet_project_name: ${COMET_PROJECT_NAME}
-comet_experiment_key: ${COMET_EXPERIMENT_KEY}
-comet_mode: ${COMET_MODE}
-comet_online: ${COMET_ONLINE}
-comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
-
-output_dir: ${OUTPUT_DIR}
-
-torch_compile: ${TORCH_COMPILE}
-torch_compile_backend: ${TORCH_COMPILE_BACKEND}
-
-gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
-micro_batch_size: ${MICRO_BATCH_SIZE}
-eval_batch_size: ${EVAL_BATCH_SIZE}
-num_epochs: ${NUM_EPOCHS}
-warmup_steps: ${WARMUP_STEPS}
-warmup_ratio: ${WARMUP_RATIO}
-learning_rate: ${LEARNING_RATE}
-lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
-logging_steps: ${LOGGING_STEPS}
-eval_steps: ${EVAL_STEPS}
-evals_per_epoch: ${EVALS_PER_EPOCH}
-save_strategy: ${SAVE_STRATEGY}
-save_steps: ${SAVE_STEPS}
-saves_per_epoch: ${SAVES_PER_EPOCH}
-save_total_limit: ${SAVE_TOTAL_LIMIT}
-max_steps: ${MAX_STEPS}
-
-eval_table_size: ${EVAL_TABLE_SIZE}
-eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
-eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
-
-profiler_steps: ${PROFILER_STEPS}
-loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
-loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
-
-train_on_inputs: ${TRAIN_ON_INPUTS}
-group_by_length: ${GROUP_BY_LENGTH}
-gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
-early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
-
-lr_scheduler: ${LR_SCHEDULER}
-lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
-cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
-cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
-lr_div_factor: ${LR_DIV_FACTOR}
-
-optimizer: ${OPTIMIZER}
-optim_args: ${OPTIM_ARGS}
-optim_target_modules: ${OPTIM_TARGET_MODULES}
-weight_decay: ${WEIGHT_DECAY}
-adam_beta1: ${ADAM_BETA1}
-adam_beta2: ${ADAM_BETA2}
-adam_epsilon: ${ADAM_EPSILON}
-max_grad_norm: ${MAX_GRAD_NORM}
-
-neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
-
-flash_optimum: ${FLASH_OPTIMUM}
-xformers_attention: ${XFORMERS_ATTENTION}
-flash_attention: ${FLASH_ATTENTION}
-flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
-flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
-flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
-sdp_attention: ${SDP_ATTENTION}
-s2_attention: ${S2_ATTENTION}
-resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
-auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
-
-local_rank: ${LOCAL_RANK}
-
-special_tokens:
-  bos_token: ${SPECIAL_TOKEN_BOS}
-  eos_token: ${SPECIAL_TOKEN_EOS}
-  unk_token: ${SPECIAL_TOKEN_UNK}
-  pad_token: ${SPECIAL_TOKEN_PAD}
-
-tokens: ${TOKENS}
-
-fsdp: ${FSDP}
-fsdp_config: ${FSDP_CONFIG}
-deepspeed: ${DEEPSPEED}
-
-ddp_timeout: ${DDP_TIMEOUT}
-ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
-ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
-
-torchdistx_path: ${TORCHDISTX_PATH}
-pretraining_dataset: ${PRETRAINING_DATASET}
-debug: ${DEBUG}
-seed: ${SEED}
-strict: ${STRICT}
--- a/.runpod/src/handler.py
+++ b/.runpod/src/handler.py
@@ -1,66 +0,0 @@
-"""
-Runpod serverless entrypoint handler
-"""
-
-import os
-
-import runpod
-import yaml
-from huggingface_hub._login import login
-from train import train
-from utils import get_output_dir
-
-BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
-if not os.path.exists(BASE_VOLUME):
-    os.makedirs(BASE_VOLUME)
-
-logger = runpod.RunPodLogger()
-
-
-async def handler(job):
-    runpod_job_id = job["id"]
-    inputs = job["input"]
-    run_id = inputs.get("run_id", "default_run_id")
-    args = inputs.get("args", {})
-
-    # Set output directory
-    output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
-    args["output_dir"] = output_dir
-
-    # First save args to a temporary config file
-    config_path = "/workspace/test_config.yaml"
-
-    # Add run_name and job_id to args before saving
-    args["run_name"] = run_id
-    args["runpod_job_id"] = runpod_job_id
-
-    yaml_data = yaml.dump(args, default_flow_style=False)
-    with open(config_path, "w", encoding="utf-8") as file:
-        file.write(yaml_data)
-
-    # Handle credentials
-    credentials = inputs.get("credentials", {})
-
-    if "wandb_api_key" in credentials:
-        os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
-    if "hf_token" in credentials:
-        os.environ["HF_TOKEN"] = credentials["hf_token"]
-
-    if os.environ.get("HF_TOKEN"):
-        login(token=os.environ["HF_TOKEN"])
-    else:
-        logger.info("No HF_TOKEN provided. Skipping login.")
-
-    logger.info("Starting Training.")
-    async for result in train(config_path):  # Pass the config path instead of args
-        logger.info(result)
-    logger.info("Training Complete.")
-
-    # Cleanup
-    if "WANDB_API_KEY" in os.environ:
-        del os.environ["WANDB_API_KEY"]
-    if "HF_TOKEN" in os.environ:
-        del os.environ["HF_TOKEN"]
-
-
-runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
--- a/.runpod/src/test_input.json
+++ b/.runpod/src/test_input.json
@@ -1,61 +0,0 @@
-{
-  "input": {
-    "user_id": "user",
-    "model_id": "llama-test",
-    "run_id": "llama-test",
-    "credentials": {
-      "wandb_api_key": "",
-      "hf_token": ""
-    },
-    "args": {
-      "base_model": "NousResearch/Meta-Llama-3-8B",
-      "model_type": "LlamaForCausalLM",
-      "tokenizer_type": "AutoTokenizer",
-      "load_in_8bit": true,
-      "load_in_4bit": false,
-      "strict": false,
-      "datasets": [
-        {
-          "path": "mhenrichsen/alpaca_2k_test",
-          "type": "alpaca"
-        }
-      ],
-      "val_set_size": 0.05,
-      "output_dir": "./outputs/lora-out",
-      "sequence_len": 4096,
-      "sample_packing": true,
-      "eval_sample_packing": false,
-      "pad_to_sequence_len": true,
-      "adapter": "lora",
-      "lora_r": 32,
-      "lora_alpha": 16,
-      "lora_dropout": 0.05,
-      "lora_target_linear": true,
-      "lora_modules_to_save": [
-        "embed_tokens",
-        "lm_head"
-      ],
-      "gradient_accumulation_steps": 4,
-      "micro_batch_size": 2,
-      "num_epochs": 1,
-      "optimizer": "adamw_bnb_8bit",
-      "lr_scheduler": "cosine",
-      "learning_rate": 0.0002,
-      "train_on_inputs": false,
-      "group_by_length": false,
-      "bf16": "auto",
-      "tf32": false,
-      "gradient_checkpointing": true,
-      "logging_steps": 1,
-      "flash_attention": true,
-      "warmup_steps": 1,
-      "evals_per_epoch": 1,
-      "eval_max_new_tokens": 128,
-      "saves_per_epoch": 1,
-      "weight_decay": 0.0,
-      "special_tokens": {
-        "pad_token": "<|end_of_text|>"
-      }
-    }
-  }
-}
--- a/.runpod/src/train.py
+++ b/.runpod/src/train.py
@@ -1,45 +0,0 @@
-"""
-Runpod train entrypoint
-"""
-
-import asyncio
-
-
-async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
-    """
-    Run preprocessing (if enabled) and training with the given config file
-    :param config_path: Path to the YAML config file
-    :param gpu_id: GPU ID to use (default: "0")
-    :param preprocess: Whether to run preprocessing (default: True)
-
-    """
-    # First check if preprocessing is needed
-    if preprocess:
-        # Preprocess command
-        preprocess_cmd = (
-            f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
-        )
-        process = await asyncio.create_subprocess_shell(
-            preprocess_cmd,
-            stdout=asyncio.subprocess.PIPE,
-            stderr=asyncio.subprocess.STDOUT,
-        )
-
-        if process.stdout is not None:
-            async for line in process.stdout:
-                yield f"Preprocessing: {line.decode().strip()}"
-        await process.wait()
-        yield "Preprocessing completed."
-    else:
-        yield "Skipping preprocessing step."
-
-    # Training command
-    train_cmd = f"axolotl train {config_path}"
-    process = await asyncio.create_subprocess_shell(
-        train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
-    )
-
-    if process.stdout is not None:
-        async for line in process.stdout:
-            yield f"Training: {line.decode().strip()}"
-    await process.wait()
--- a/.runpod/src/utils.py
+++ b/.runpod/src/utils.py
@@ -1,89 +0,0 @@
-"""
-Runpod launcher utils
-"""
-
-import os
-
-import yaml
-
-
-def get_output_dir(run_id):
-    path = f"fine-tuning/{run_id}"
-    return path
-
-
-def make_valid_config(input_args):
-    """
-    Creates and saves updated config file, returns the path to the new config
-    :param input_args: dict of input args
-    :return: str, path to the updated config file
-    """
-    # Load default config
-    with open("config/config.yaml", "r", encoding="utf-8") as fin:
-        all_args = yaml.safe_load(fin)
-
-    if not input_args:
-        print("No args provided, using defaults")
-    else:
-        all_args.update(input_args)
-
-    # Create updated config path
-    updated_config_path = "config/updated_config.yaml"
-
-    # Save updated config to new file
-    with open(updated_config_path, "w", encoding="utf-8") as f:
-        yaml.dump(all_args, f)
-
-    return updated_config_path
-
-
-def set_config_env_vars(args: dict):
-    """
-    Convert API arguments into environment variables.
-    Handles nested dictionaries, lists, and special values.
-
-    Args:
-        args (dict): The arguments dictionary from the API request
-    """
-
-    def process_value(value):
-        """Convert Python values to string format for environment variables"""
-        if value is None:
-            return ""
-        if isinstance(value, bool):
-            return str(value).lower()
-        if isinstance(value, (list, dict)):
-            return str(value)
-        return str(value)
-
-    def set_env_vars(data, prefix=""):
-        """Recursively set environment variables from nested dictionary"""
-        for key, value in data.items():
-            env_key = prefix + key.upper()
-
-            # Handle special cases
-            if isinstance(value, dict):
-                # For nested dictionaries (like special_tokens)
-                set_env_vars(value, f"{env_key}_")
-            elif isinstance(value, list):
-                # Handle list of dictionaries (like datasets)
-                if value and isinstance(value[0], dict):
-                    for i, item in enumerate(value):
-                        set_env_vars(item, f"{env_key}_{i}_")
-                else:
-                    # For simple lists (like lora_target_modules)
-                    os.environ[env_key] = process_value(value)
-            else:
-                # Handle all other cases
-                os.environ[env_key] = process_value(value)
-
-    # Clear any existing related environment variables
-    # This prevents old values from persisting
-    for key in list(os.environ.keys()):
-        if key.startswith(
-            ("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
-        ):
-            del os.environ[key]
-
-    # Set new environment variables
-    set_env_vars(args)
--- a/.runpod/test-input.json
+++ b/.runpod/test-input.json
@@ -1,86 +0,0 @@
-{
-  "input": {
-    "name": "quick_smoke_test_sft",
-    "user_id": "user",
-    "model_id": "llama-test",
-    "run_id": "llama-test",
-    "credentials": {
-      "wandb_api_key": "",
-      "hf_token": ""
-    },
-    "args": {
-      "base_model": "HuggingFaceTB/SmolLM2-135M",
-      "model_type": "AutoModelForCausalLM",
-      "tokenizer_type": "AutoTokenizer",
-      "load_in_4bit": true,
-      "strict": false,
-      "datasets": [
-        {
-          "path": "mhenrichsen/alpaca_2k_test",
-          "type": "alpaca",
-          "split": "train[:10%]"
-        }
-      ],
-      "val_set_size": 0.02,
-      "output_dir": "./outputs/lora-out",
-      "sequence_len": 4096,
-      "sample_packing": true,
-      "eval_sample_packing": false,
-      "pad_to_sequence_len": true,
-      "adapter": "qlora",
-      "lora_r": 32,
-      "lora_alpha": 64,
-      "lora_dropout": 0.05,
-      "lora_target_linear": true,
-      "lora_modules_to_save": [
-        "embed_tokens",
-        "lm_head"
-      ],
-      "gradient_accumulation_steps": 2,
-      "micro_batch_size": 1,
-      "num_epochs": 1,
-      "optimizer": "adamw_torch_fused",
-      "lr_scheduler": "cosine",
-      "learning_rate": 0.0002,
-      "train_on_inputs": false,
-      "group_by_length": false,
-      "bf16": "auto",
-      "tf32": true,
-      "gradient_checkpointing": true,
-      "logging_steps": 1,
-      "flash_attention": true,
-      "warmup_steps": 1,
-      "evals_per_epoch": 1,
-      "eval_max_new_tokens": 128,
-      "saves_per_epoch": 1,
-      "weight_decay": 0.0,
-      "special_tokens": {
-        "pad_token": "<|endoftext|>"
-      },
-      "max_steps": 20
-    },
-    "timeout": 100000
-  },
-  "config": {
-    "gpuTypeId": "NVIDIA GeForce RTX 4090",
-    "gpuCount": 1,
-    "containerDiskInGb": 200,
-    "env": [
-      {
-        "key": "TOKENIZER",
-        "value": ""
-      },
-      {
-        "key": "DISABLE_LOG_STATS",
-        "value": "true"
-      }
-    ],
-    "allowedCudaVersions": [
-      "12.8",
-      "12.7",
-      "12.6",
-      "12.5",
-      "12.4"
-    ]
-  }
-}
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -1,90 +0,0 @@
-{
-  "tests": [
-    {
-      "name": "quick_smoke_test_sft",
-      "input": {
-        "user_id": "user",
-        "model_id": "llama-test",
-        "run_id": "llama-test",
-        "credentials": {
-          "wandb_api_key": "",
-          "hf_token": ""
-        },
-        "args": {
-          "base_model": "HuggingFaceTB/SmolLM2-135M",
-          "model_type": "AutoModelForCausalLM",
-          "tokenizer_type": "AutoTokenizer",
-          "load_in_4bit": true,
-          "strict": false,
-          "datasets": [
-            {
-              "path": "mhenrichsen/alpaca_2k_test",
-              "type": "alpaca",
-              "split": "train[:10%]"
-            }
-          ],
-          "val_set_size": 0.02,
-          "output_dir": "./outputs/lora-out",
-          "sequence_len": 4096,
-          "sample_packing": true,
-          "eval_sample_packing": false,
-          "pad_to_sequence_len": true,
-          "adapter": "qlora",
-          "lora_r": 32,
-          "lora_alpha": 64,
-          "lora_dropout": 0.05,
-          "lora_target_linear": true,
-          "lora_modules_to_save": [
-            "embed_tokens",
-            "lm_head"
-          ],
-          "gradient_accumulation_steps": 2,
-          "micro_batch_size": 1,
-          "num_epochs": 1,
-          "optimizer": "adamw_torch_fused",
-          "lr_scheduler": "cosine",
-          "learning_rate": 0.0002,
-          "train_on_inputs": false,
-          "group_by_length": false,
-          "bf16": "auto",
-          "tf32": true,
-          "gradient_checkpointing": true,
-          "logging_steps": 1,
-          "flash_attention": true,
-          "warmup_steps": 1,
-          "evals_per_epoch": 1,
-          "eval_max_new_tokens": 128,
-          "saves_per_epoch": 1,
-          "weight_decay": 0.0,
-          "special_tokens": {
-            "pad_token": "<|endoftext|>"
-          },
-          "max_steps": 20
-        }
-      },
-      "timeout": 100000
-    }
-  ],
-  "config": {
-    "gpuTypeId": "NVIDIA GeForce RTX 4090",
-    "gpuCount": 1,
-    "containerDiskInGb": 200,
-    "env": [
-      {
-        "key": "TOKENIZER",
-        "value": ""
-      },
-      {
-        "key": "DISABLE_LOG_STATS",
-        "value": "true"
-      }
-    ],
-    "allowedCudaVersions": [
-      "12.8",
-      "12.7",
-      "12.6",
-      "12.5",
-      "12.4"
-    ]
-  }
-}
--- a/.vscode/README.md
+++ b/.vscode/README.md
@@ -1 +0,0 @@
-See [docs/debugging.md](../docs/debugging.md) for guidance on how to modify these files to debug axolotl with VSCode.
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -1,34 +0,0 @@
-{
-    // Use IntelliSense to learn about possible attributes.
-    // Hover to view descriptions of existing attributes.
-    // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
-    "version": "0.2.0",
-    "configurations": [
-        {
-            "name": "Debug axolotl prompt - sharegpt",
-            "type": "python",
-            "module": "accelerate.commands.launch",
-            "request": "launch",
-            "args": [
-                "-m", "axolotl.cli.train", "dev_sharegpt.yml",
-                // The flags below simplify debugging by overriding the axolotl config
-                // with the debugging tips above.  Modify as needed.
-                "--dataset_processes=1",      // limits data preprocessing to one process
-                "--max_steps=1",              // limits training to just one step
-                "--batch_size=1",             // minimizes batch size
-                "--micro_batch_size=1",       // minimizes batch size
-                "--val_set_size=0",           // disables validation
-                "--sample_packing=False",     // disables sample packing which is necessary for small datasets
-                "--eval_sample_packing=False",// disables sample packing on eval set
-                "--dataset_prepared_path=temp_debug/axolotl_outputs/data", // send data outputs to a temp folder
-                "--output_dir=temp_debug/axolotl_outputs/model" // send model outputs to a temp folder
-                ],
-            "console": "integratedTerminal",      // show output in the integrated terminal
-            "cwd": "${workspaceFolder}/devtools", // set working directory to devtools from the root of the project
-            "justMyCode": true,                   // step through only axolotl code
-            "env": {"CUDA_VISIBLE_DEVICES": "0",  // Since we aren't doing distributed training, we need to limit to one GPU
-                    "HF_HOME": "${workspaceFolder}/devtools/temp_debug/.hf-cache"}, // send HF cache to a temp folder
-            "preLaunchTask": "cleanup-for-dataprep", // delete temp folders (see below)
-        }
-    ]
-}
--- a/.vscode/tasks.json
+++ b/.vscode/tasks.json
@@ -1,27 +0,0 @@
-//this file is used by launch.json
-{
-  "version": "2.0.0",
-  "tasks": [
-    // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder
-    {
-      "label": "delete-outputs",
-      "type": "shell",
-      "command": "rm -rf temp_debug/axolotl_outputs",
-      "options":{ "cwd": "${workspaceFolder}/devtools"},
-      "problemMatcher": []
-    },
-    // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder
-    {
-      "label": "delete-temp-hf-dataset-cache",
-      "type": "shell",
-      "command": "rm -rf temp_debug/.hf-cache/datasets",
-      "options":{ "cwd": "${workspaceFolder}/devtools"},
-      "problemMatcher": []
-    },
-      // this task combines the two tasks above
-    {
-     "label": "cleanup-for-dataprep",
-     "dependsOn": ["delete-outputs", "delete-temp-hf-dataset-cache"],
-    }
-  ]
-}
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,99 +0,0 @@
-# Axolotl
-
-Fine-tuning framework for LLMs. Config-driven: every training run is defined by a single YAML file.
-
-## Tech Stack
-
-Python, PyTorch, HuggingFace Transformers, TRL, PEFT (LoRA/QLoRA), DeepSpeed, FSDP, vLLM (for GRPO generation).
-
-## Commands
-
-```bash
-axolotl train config.yaml              # Train (single or multi-GPU, auto-detected)
-axolotl preprocess config.yaml         # Tokenize dataset and validate config
-axolotl preprocess config.yaml --debug # Inspect tokenized samples and label masking
-axolotl inference config.yaml          # Interactive inference
-axolotl merge-lora config.yaml         # Merge LoRA adapter into base model
-axolotl vllm-serve config.yaml         # Start vLLM server for GRPO/EBFT training
-axolotl fetch examples                 # Download example configs
-axolotl agent-docs                     # Show agent-optimized docs (bundled with pip package)
-axolotl agent-docs grpo                # Topic-specific agent reference
-axolotl config-schema                  # Dump config JSON schema
-```
-
-## Training Methods
-
-| Method | Config Key | When to Use |
-|--------|-----------|-------------|
-| SFT | *(default)* | Input-output pairs, instruction tuning |
-| DPO/IPO | `rl: dpo` / `rl: dpo, dpo_loss_type: ["ipo"]` | Paired preference data (chosen vs rejected) |
-| KTO | `rl: kto` | Unpaired binary preference labels |
-| ORPO | `rl: orpo` | Single-stage alignment, no ref model |
-| GRPO | `rl: grpo` | RL with verifiable reward functions (math, code) |
-| EBFT | `rl: ebft` | Feature-matching rewards from internal representations |
-
-Agent-specific references:
- [docs/agents/sft.md](docs/agents/sft.md) — supervised fine-tuning
- [docs/agents/preference_tuning.md](docs/agents/preference_tuning.md) — DPO, IPO, KTO, ORPO, SimPO
- [docs/agents/grpo.md](docs/agents/grpo.md) — GRPO online RL with reward functions
- [docs/agents/reward_modelling.md](docs/agents/reward_modelling.md) — outcome and process reward models
- [docs/agents/pretraining.md](docs/agents/pretraining.md) — continual pretraining
- [docs/agents/model_architectures.md](docs/agents/model_architectures.md) — model-specific quirks (Gemma4, Qwen3.5 MoE, etc.)
- [docs/agents/new_model_support.md](docs/agents/new_model_support.md) — debugging and adding support for new model architectures
-
-## Config Pattern
-
-All training is config-driven. A YAML file specifies model, adapter, dataset(s), and hyperparameters:
-
-```yaml
-base_model: meta-llama/Llama-3.1-8B-Instruct
-adapter: lora                    # or qlora, or omit for full fine-tune
-datasets:
-  - path: my_dataset
-    type: chat_template          # prompt strategy (see docs/dataset-formats/)
-output_dir: ./outputs/lora-out
-```
-
-Config schema: `src/axolotl/utils/schemas/config.py` (AxolotlInputConfig).
-
-## Project Structure
-
-```
-src/axolotl/
-  cli/                           # CLI entry points (train, preprocess, inference, merge_lora, vllm_serve)
-  core/
-    builders/                    # TrainerBuilder classes (causal.py for SFT, rl.py for RLHF)
-    trainers/                    # Trainer classes, mixins (optimizer, scheduler, packing)
-      dpo/                       # DPO trainer and config
-      grpo/                      # GRPO trainer and sampler
-  loaders/                       # Model, tokenizer, adapter, processor loading
-  prompt_strategies/             # Dataset format handlers (chat_template, alpaca, dpo/, kto/, orpo/)
-  utils/schemas/                 # Pydantic config schemas (config, model, training, peft, trl, fsdp)
-  integrations/                  # Plugins (liger, cut_cross_entropy, swanlab, nemo_gym)
-  monkeypatch/                   # Runtime patches for HF transformers
-
-examples/                        # Example YAML configs by model (llama-3/, qwen2/, mistral/, ebft/)
-deepspeed_configs/               # DeepSpeed JSON configs (zero2, zero3)
-docs/                            # Quarto documentation site
-```
-
-## Code Conventions
-
- Config-driven: features are toggled via YAML, not code changes
- Prompt strategies: `src/axolotl/prompt_strategies/` — each `type:` value maps to a function
- Plugin system: `plugins:` list in config loads integration modules
- Trainer mixins: `core/trainers/mixins/` for composable trainer behaviors
- Schemas: all config validation via Pydantic in `utils/schemas/`
-
-## Key Documentation
-
- [Getting Started](docs/getting-started.qmd) — quickstart tutorial
- [Choosing a Method](docs/choosing_method.qmd) — SFT vs DPO vs GRPO decision guide
- [Config Reference](docs/config-reference.qmd) — all config options
- [Dataset Formats](docs/dataset-formats/) — chat_template, alpaca, input_output, completion
- [RLHF](docs/rlhf.qmd) — DPO, KTO, ORPO, GRPO, EBFT configs and dataset formats
- [GRPO Deep Dive](docs/grpo.qmd) — async training, custom rewards, scaling
- [vLLM Serving](docs/vllm_serving.qmd) — vLLM setup for GRPO/EBFT
- [Multi-GPU](docs/multi-gpu.qmd) — FSDP and DeepSpeed
- [Training Stability](docs/training_stability.qmd) — debugging loss, NaN, OOM
- [Debugging](docs/debugging.qmd) — VSCode setup, Docker debugging
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,10 +0,0 @@
-cff-version: 1.2.0
-type: software
-title: "Axolotl: Open Source LLM Post-Training"
-message: "If you use this software, please cite it as below."
-authors:
-  - name: "Axolotl maintainers and contributors"
-repository-code: "https://github.com/axolotl-ai-cloud/axolotl"
-url: "https://axolotl.ai/"
-license: Apache-2.0
-date-released: "2023-05-30"
--- a/1
+++ b/1
@@ -1 +0,0 @@
-docs.axolotl.ai
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,7 +0,0 @@
-include README.md
-include LICENSE
-include VERSION
-include src/axolotl/utils/chat_templates/templates/*.jinja
-include AGENTS.md
-recursive-include docs/agents *.md
-recursive-include axolotl *.py
--- a/README.md
+++ b/README.md
--- a/SETUP_MIAAI.md
+++ b/SETUP_MIAAI.md
@@ -1,83 +0,0 @@
-# Axolotl Setup — miaai (RTX 5080, CUDA 13.2)
-
-## System Info
- GPU: NVIDIA RTX 5080 (16GB VRAM)
- Driver: 580.126.09 — max CUDA 13.0 (nvcc from conda resolves to 13.2)
- OS: Ubuntu (Python 3.13 system — do NOT use system Python for ML)
- Axolotl branch: `activeblue/main`
-
-## One-time Setup
-
-### 1. Install Miniconda
-```bash
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh
-bash miniconda.sh -b -p /opt/miniconda3
-/opt/miniconda3/bin/conda init bash
-source ~/.bashrc
-```
-
-### 2. Create Python 3.11 environment
-```bash
-conda create -n axolotl python=3.11 -y
-conda activate axolotl
-```
-
-### 3. Clone and sync repo with upstream
-```bash
-git clone https://git.activeblue.net/tocmo0nlord/axolotl.git
-cd axolotl
-git remote add upstream https://github.com/axolotl-ai-cloud/axolotl.git
-git fetch upstream
-git rebase upstream/main        # keeps activeblue patches on top
-git push origin activeblue/main --force-with-lease
-```
-
-### 4. Install CUDA toolkit (needed to compile flash-attn)
-```bash
-conda install -y -c "nvidia/label/cuda-12.8.0" cuda-toolkit
-export CUDA_HOME=$CONDA_PREFIX
-export PATH=$CUDA_HOME/bin:$PATH
-```
-
-### 5. Install PyTorch — use cu132 (matches nvcc from conda)
-> NOTE: torchaudio has no cu132 wheel — skip it, not needed for LLM training
-```bash
-pip install torch torchvision --index-url https://download.pytorch.org/whl/cu132
-python -c "import torch; print('CUDA:', torch.version.cuda); print('GPU:', torch.cuda.get_device_name(0))"
-```
-
-### 6. Install Axolotl
-```bash
-pip install -e "."
-```
-
-> **flash-attn compiles CUDA kernels from source — takes 15–25 min on 10 cores of i7-14700K.**
-> Always set `MAX_JOBS` to the number of available CPU cores to parallelize and speed up compilation:
-```bash
-MAX_JOBS=10 pip install flash-attn --no-build-isolation
-```
-
-## Every Session (after first-time setup)
-```bash
-export PATH="/opt/miniconda3/bin:$PATH"
-conda activate axolotl
-export CUDA_HOME=$CONDA_PREFIX
-export PATH=$CUDA_HOME/bin:$PATH
-cd /home/tocmo0nlord/axolotl
-```
-
-## Run Training
-```bash
-axolotl train human_chat_qlora.yml
-```
-
-## Common Pitfalls Encountered
-| Problem | Cause | Fix |
-|---|---|---|
-| `externally-managed-environment` | System Python 3.13 blocks pip | Use conda env, never system pip |
-| `No module named torch` (flash-attn) | pip builds in isolated env | Use `--no-build-isolation` |
-| `CUDA_HOME not set` | CUDA toolkit not installed | `conda install cuda-toolkit` from nvidia channel |
-| `CUDA version mismatch 13.2 vs 12.8` | Conda nvcc is 13.2, torch was cu128 | Reinstall torch with `--index-url .../cu132` |
-| `torchaudio` not found for cu132 | No cu132 wheel exists | Skip torchaudio — not needed |
-| `src refspec main does not match` | Fork default branch is `activeblue/main` | `git push origin activeblue/main` |
-| flash-attn compile is slow | Single-threaded by default | Set `MAX_JOBS=<cpu_count>` before pip install |
--- a/TODO.md
+++ b/TODO.md
@@ -0,0 +1,10 @@
+# todo list
+
+- [ ] Validation of parameters for combinations that won't work
+
+
+
+## things that are known not to work
+
+- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203
+- adamw_bnb_8bit doesn't play well with FSDP offload
--- a/1
+++ b/1
@@ -1 +0,0 @@
-0.16.2.dev0
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -1,367 +0,0 @@
-project:
-  type: website
-  pre-render:
-   - docs/scripts/generate_config_docs.py
-   - docs/scripts/generate_examples_docs.py
-
-quartodoc:
-  dir: docs/api
-  package: axolotl
-  title: API Reference
-  parser: google
-
-  sections:
-    - title: Core
-      desc: Core functionality for training
-      contents:
-        - train
-        - evaluate
-        - datasets
-        - convert
-        - prompt_tokenizers
-        - logging_config
-        - core.builders.base
-        - core.builders.causal
-        - core.builders.rl
-        - core.training_args
-        - core.chat.messages
-        - core.chat.format.chatml
-        - core.chat.format.llama3x
-        - core.chat.format.shared
-        - core.datasets.chat
-        - core.datasets.transforms.chat_builder
-    - title: CLI
-      desc: Command-line interface
-      contents:
-        - cli.main
-        - cli.train
-        - cli.evaluate
-        - cli.args
-        - cli.art
-        - cli.checks
-        - cli.config
-        - cli.delinearize_llama4
-        - cli.inference
-        - cli.merge_lora
-        - cli.merge_sharded_fsdp_weights
-        - cli.preprocess
-        - cli.quantize
-        - cli.vllm_serve
-        - cli.cloud.base
-        - cli.cloud.modal_
-        - cli.utils
-        - cli.utils.args
-        - cli.utils.fetch
-        - cli.utils.load
-        - cli.utils.sweeps
-        - cli.utils.train
-    - title: Trainers
-      desc: Training implementations
-      contents:
-        - core.trainers.base
-        - core.trainers.trl
-        - core.trainers.mamba
-        - core.trainers.dpo.trainer
-        - core.trainers.grpo.trainer
-        - core.trainers.grpo.sampler
-        - core.trainers.utils
-    - title: Model Loading
-      desc: Functionality for loading and patching models, tokenizers, etc.
-      contents:
-        - loaders.model
-        - loaders.tokenizer
-        - loaders.processor
-        - loaders.adapter
-        - loaders.patch_manager
-        - loaders.constants
-    - title: Mixins
-      desc: Mixin classes for augmenting trainers
-      contents:
-        - core.trainers.mixins.optimizer
-        - core.trainers.mixins.rng_state_loader
-        - core.trainers.mixins.scheduler
-    - title: Context Managers
-      desc: Context managers for altering trainer behaviors
-      contents:
-        - utils.ctx_managers.sequence_parallel
-    - title: Prompt Strategies
-      desc: Prompt formatting strategies
-      contents:
-        - prompt_strategies.base
-        - prompt_strategies.chat_template
-        - prompt_strategies.alpaca_chat
-        - prompt_strategies.alpaca_instruct
-        - prompt_strategies.alpaca_w_system
-        - prompt_strategies.user_defined
-        - prompt_strategies.llama2_chat
-        - prompt_strategies.completion
-        - prompt_strategies.input_output
-        - prompt_strategies.stepwise_supervised
-        - prompt_strategies.metharme
-        - prompt_strategies.orcamini
-        - prompt_strategies.pygmalion
-        - prompt_strategies.messages.chat
-        - prompt_strategies.dpo.chat_template
-        - prompt_strategies.dpo.llama3
-        - prompt_strategies.dpo.chatml
-        - prompt_strategies.dpo.zephyr
-        - prompt_strategies.dpo.user_defined
-        - prompt_strategies.dpo.passthrough
-        - prompt_strategies.kto.llama3
-        - prompt_strategies.kto.chatml
-        - prompt_strategies.kto.user_defined
-        - prompt_strategies.orpo.chat_template
-        - prompt_strategies.bradley_terry.llama3
-    - title: Kernels
-      desc: Low-level performance optimizations
-      contents:
-        - kernels.lora
-        - kernels.geglu
-        - kernels.swiglu
-        - kernels.quantize
-        - kernels.utils
-    - title: Monkey Patches
-      desc: Runtime patches for model optimizations
-      contents:
-        - monkeypatch.llama_attn_hijack_flash
-        - monkeypatch.llama_attn_hijack_xformers
-        - monkeypatch.mistral_attn_hijack_flash
-        - monkeypatch.multipack
-        - monkeypatch.relora
-        - monkeypatch.lora_kernels
-        - monkeypatch.utils
-        - monkeypatch.btlm_attn_hijack_flash
-        - monkeypatch.stablelm_attn_hijack_flash
-        - monkeypatch.trainer_fsdp_optim
-        - monkeypatch.transformers_fa_utils
-        - monkeypatch.data.batch_dataset_fetcher
-        - monkeypatch.mixtral
-        - monkeypatch.gradient_checkpointing.offload_cpu
-        - monkeypatch.gradient_checkpointing.offload_disk
-    - title: Utils
-      desc: Utility functions
-      contents:
-        - utils.tokenization
-        - utils.chat_templates
-        - utils.lora
-        - utils.model_shard_quant
-        - utils.bench
-        - utils.freeze
-        - utils.trainer
-        - utils.schedulers
-        - utils.distributed
-        - utils.dict
-        - utils.optimizers.adopt
-        - utils.data.streaming
-        - utils.data.sft
-        - utils.quantization
-    - title: Schemas
-      desc: Pydantic data models for Axolotl config
-      contents:
-        - utils.schemas.config
-        - utils.schemas.model
-        - utils.schemas.training
-        - utils.schemas.datasets
-        - utils.schemas.peft
-        - utils.schemas.trl
-        - utils.schemas.multimodal
-        - utils.schemas.integrations
-        - utils.schemas.enums
-        - utils.schemas.utils
-    - title: Integrations
-      desc: Third-party integrations and extensions
-      contents:
-        - integrations.base
-        - integrations.cut_cross_entropy.args
-        - integrations.grokfast.optimizer
-        - integrations.kd.trainer
-        - integrations.liger.args
-        - integrations.lm_eval.args
-        - integrations.spectrum.args
-    - title: Common
-      desc: Common utilities and shared functionality
-      contents:
-        - common.architectures
-        - common.const
-        - common.datasets
-    - title: Models
-      desc: Custom model implementations
-      contents:
-        - models.mamba.modeling_mamba
-    - title: Data Processing
-      desc: Data processing utilities
-      contents:
-        - utils.collators.core
-        - utils.collators.batching
-        - utils.collators.mamba
-        - utils.collators.mm_chat
-        - utils.samplers.multipack
-    - title: Callbacks
-      desc: Training callbacks
-      contents:
-        - utils.callbacks.perplexity
-        - utils.callbacks.profiler
-        - utils.callbacks.lisa
-        - utils.callbacks.mlflow_
-        - utils.callbacks.comet_
-        - utils.callbacks.qat
-website:
-  title: "Axolotl"
-  description: "We make fine-tuning accessible, scalable, and fun"
-  favicon: favicon.jpg
-
-  google-analytics: "G-9KYCVJBNMQ"
-
-  navbar:
-    logo: image/axolotl_logo_digital_white.svg
-    title: false
-    background: dark
-    pinned: false
-    collapse: false
-    tools:
-    - icon: twitter
-      href: https://twitter.com/axolotl_ai
-    - icon: github
-      href: https://github.com/axolotl-ai-cloud/axolotl/
-    - icon: discord
-      href: https://discord.gg/7m9sfhzaf3
-
-  sidebar:
-      pinned: true
-      collapse-level: 2
-      style: docked
-      contents:
-        - text: Home
-          href: index.qmd
-
-        - section: "Getting Started"
-          contents:
-            - docs/getting-started.qmd
-            - docs/choosing_method.qmd
-            - docs/installation.qmd
-            - docs/inference.qmd
-            - section: "Model Guides"
-              contents:
-                - docs/models/kimi-linear.qmd
-                - docs/models/plano.qmd
-                - docs/models/mimo.qmd
-                - docs/models/internvl3_5.qmd
-                - docs/models/olmo3.qmd
-                - docs/models/trinity.qmd
-                - docs/models/arcee.qmd
-                - section: "Ministral3"
-                  contents:
-                    - docs/models/ministral3.qmd
-                    - docs/models/ministral3/think.qmd
-                    - docs/models/ministral3/vision.qmd
-                - section: "Magistral"
-                  contents:
-                    - docs/models/magistral.qmd
-                    - docs/models/magistral/think.qmd
-                    - docs/models/magistral/vision.qmd
-                - docs/models/ministral.qmd
-                - docs/models/mistral-small.qmd
-                - docs/models/voxtral.qmd
-                - docs/models/devstral.qmd
-                - docs/models/mistral.qmd
-                - docs/models/llama-4.qmd
-                - docs/models/llama-2.qmd
-                - docs/models/qwen3-next.qmd
-                - docs/models/qwen3.qmd
-                - docs/models/gemma3n.qmd
-                - docs/models/apertus.qmd
-                - docs/models/gpt-oss.qmd
-                - docs/models/seed-oss.qmd
-                - docs/models/phi.qmd
-                - docs/models/smolvlm2.qmd
-                - docs/models/granite4.qmd
-                - docs/models/LiquidAI.qmd
-                - docs/models/hunyuan.qmd
-                - docs/models/jamba.qmd
-                - docs/models/orpheus.qmd
-
-            - docs/cli.qmd
-            - docs/telemetry.qmd
-            - docs/config-reference.qmd
-            - text: "API Reference"
-              href: docs/api
-
-        - section: "Dataset Formats"
-          contents: docs/dataset-formats/*
-
-        - section: "Deployments"
-          contents:
-            - docs/docker.qmd
-            - docs/multi-gpu.qmd
-            - docs/multi-node.qmd
-            - docs/ray-integration.qmd
-            - docs/amd_hpc.qmd
-            - docs/mac.qmd
-
-        - section: "How To Guides"
-          contents:
-            - docs/multimodal.qmd
-            - docs/rlhf.qmd
-            - docs/grpo.qmd
-            - docs/ebft.qmd
-            - docs/vllm_serving.qmd
-            - docs/reward_modelling.qmd
-            - docs/lr_groups.qmd
-            - docs/lora_optims.qmd
-            - docs/dataset_loading.qmd
-            - docs/qat.qmd
-            - docs/quantize.qmd
-            - docs/1_58bit_finetuning.qmd
-            - docs/optimizations.qmd
-
-        - section: "Core Concepts"
-          contents:
-            - docs/batch_vs_grad.qmd
-            - docs/dataset_preprocessing.qmd
-            - docs/streaming.qmd
-            - docs/multipack.qmd
-            - docs/mixed_precision.qmd
-            - docs/optimizers.qmd
-            - docs/attention.qmd
-
-        - section: "Advanced Features"
-          contents:
-            - docs/fsdp_qlora.qmd
-            - docs/torchao.qmd
-            - docs/custom_integrations.qmd
-            - docs/sequence_parallelism.qmd
-            - docs/gradient_checkpointing.qmd
-            - docs/nd_parallelism.qmd
-            - docs/expert_quantization.qmd
-
-        - section: "Troubleshooting"
-          contents:
-            - docs/faq.qmd
-            - docs/training_stability.qmd
-            - docs/debugging.qmd
-            - docs/nccl.qmd
-
-format:
-  html:
-    theme: darkly
-    css: styles.css
-    toc: true
-    # Enable better handling of line breaks in markdown
-    preserve-tabs: true
-    html-math-method: mathjax
-    # Improved markdown processing options
-    md-extensions:
-      - markdown_it
-      - def_list
-      - attr_list
-      - fenced_divs
-      - tables
-      - html_admonition
-      - lineblocks
-      - fancy_lists
-    # Control whitespace handling
-    whitespace: preserve
-    # Process newlines in paragraphs
-    wrap: preserve
-    # Better line break handling
-    preserve-linebreaks: true
--- a/benchmarks/bench_entropy.py
+++ b/benchmarks/bench_entropy.py
@@ -1,208 +0,0 @@
-"""Benchmark for entropy_from_logits Triton kernel vs original chunked implementation.
-
-Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_entropy.py
-"""
-
-import gc
-import statistics
-
-import torch
-import torch.nn.functional as F
-
-from axolotl.monkeypatch.trainer.utils import entropy_from_logits
-
-V = 151936  # Qwen vocab
-WARMUP = 5
-BENCH_ITERS = 20
-MEM_ITERS = 10
-
-
-def entropy_from_logits_original(logits: torch.Tensor, chunk_size: int = 128):
-    """Original chunked implementation (reference)."""
-    original_shape = logits.shape[:-1]
-    num_classes = logits.shape[-1]
-    flat_logits = logits.reshape(-1, num_classes)
-    entropies = []
-    for chunk in flat_logits.split(chunk_size, dim=0):
-        logps = F.log_softmax(chunk, dim=-1)
-        chunk_entropy = -(torch.exp(logps) * logps).sum(-1)
-        entropies.append(chunk_entropy)
-    return torch.cat(entropies, dim=0).reshape(original_shape)
-
-
-def _clean_gpu():
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-    torch.cuda.reset_accumulated_memory_stats()
-    torch.cuda.synchronize()
-
-
-def profile_time(fn, logits, n_iters=BENCH_ITERS):
-    for _ in range(WARMUP):
-        out = fn(logits, chunk_size=128)
-        del out
-    torch.cuda.synchronize()
-
-    times = []
-    for _ in range(n_iters):
-        s = torch.cuda.Event(enable_timing=True)
-        e = torch.cuda.Event(enable_timing=True)
-        s.record()
-        out = fn(logits, chunk_size=128)
-        e.record()
-        torch.cuda.synchronize()
-        times.append(s.elapsed_time(e))
-        del out
-    return times
-
-
-def profile_memory(fn, logits, n_iters=MEM_ITERS):
-    for _ in range(WARMUP):
-        out = fn(logits, chunk_size=128)
-        del out
-    torch.cuda.synchronize()
-
-    peaks = []
-    for _ in range(n_iters):
-        _clean_gpu()
-        base = torch.cuda.max_memory_allocated()
-        out = fn(logits, chunk_size=128)
-        torch.cuda.synchronize()
-        peaks.append(torch.cuda.max_memory_allocated() - base)
-        del out
-    return [p / 1e6 for p in peaks]
-
-
-def fmt(values, unit=""):
-    mean = statistics.mean(values)
-    std = statistics.stdev(values) if len(values) > 1 else 0.0
-    return f"{mean:8.2f} ± {std:5.2f} {unit}  [min={min(values):.2f}, max={max(values):.2f}]"
-
-
-def benchmark_contiguous():
-    print("=" * 60)
-    print(
-        f"CONTIGUOUS BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
-    )
-    print("=" * 60)
-
-    configs = [
-        (1, 2048),
-        (1, 8192),
-        (1, 16384),
-        (4, 4096),
-        (8, 2048),
-        (16, 2048),
-        (16, 4096),
-    ]
-
-    for B, L in configs:
-        mem_gb = B * L * V * 2 / 1e9
-        if mem_gb > 28:
-            print(f"\n  skip B={B}, L={L} ({mem_gb:.1f} GB)")
-            continue
-
-        N = B * L
-        print(f"\n{'─' * 60}")
-        print(f"B={B:2d}, L={L:5d}  ({N:6d} rows, logits {mem_gb:.2f} GB)")
-        print(f"{'─' * 60}")
-
-        torch.manual_seed(42)
-        logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
-
-        t_orig = profile_time(entropy_from_logits_original, logits)
-        t_triton = profile_time(entropy_from_logits, logits)
-        orig_mean = statistics.mean(t_orig)
-        triton_mean = statistics.mean(t_triton)
-
-        print("  TIME (ms):")
-        print(f"    original: {fmt(t_orig, 'ms')}")
-        print(f"    triton:   {fmt(t_triton, 'ms')}")
-        print(f"    speedup:  {orig_mean / triton_mean:.2f}x")
-
-        m_orig = profile_memory(entropy_from_logits_original, logits)
-        m_triton = profile_memory(entropy_from_logits, logits)
-        orig_peak = statistics.mean(m_orig)
-        triton_peak = statistics.mean(m_triton)
-
-        print("  MEMORY (peak overhead):")
-        print(f"    original: {fmt(m_orig, 'MB')}")
-        print(f"    triton:   {fmt(m_triton, 'MB')}")
-        print(f"    saved:    {orig_peak - triton_peak:.1f} MB")
-
-        del logits
-        _clean_gpu()
-
-
-def benchmark_noncontiguous():
-    print("\n" + "=" * 60)
-    print(
-        f"NON-CONTIGUOUS BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
-    )
-    print("=" * 60)
-
-    configs = [
-        (4, 2048, "transpose"),
-        (4, 8192, "transpose"),
-        (8, 2048, "transpose"),
-        (4, 4096, "slice_batch"),
-    ]
-
-    for B, L, method in configs:
-        torch.manual_seed(42)
-
-        if method == "transpose":
-            raw = torch.randn(L, B, V, device="cuda", dtype=torch.bfloat16)
-            logits_nc = raw.transpose(0, 1)
-            raw_gb = L * B * V * 2 / 1e9
-        elif method == "slice_batch":
-            raw = torch.randn(B * 2, L, V, device="cuda", dtype=torch.bfloat16)
-            logits_nc = raw[::2]
-            raw_gb = B * 2 * L * V * 2 / 1e9
-        else:
-            continue
-
-        if raw_gb > 28:
-            print(f"\n  skip B={B}, L={L}, {method} ({raw_gb:.1f} GB)")
-            del raw, logits_nc
-            torch.cuda.empty_cache()
-            continue
-
-        N = B * L
-        print(f"\n{'─' * 60}")
-        print(f"B={B}, L={L}  {method}  ({N} rows, raw {raw_gb:.2f} GB)")
-        print(f"{'─' * 60}")
-
-        def original_with_copy(logits, chunk_size=128):
-            return entropy_from_logits_original(
-                logits.contiguous(), chunk_size=chunk_size
-            )
-
-        t_orig = profile_time(original_with_copy, logits_nc)
-        t_triton = profile_time(entropy_from_logits, logits_nc)
-        orig_mean = statistics.mean(t_orig)
-        triton_mean = statistics.mean(t_triton)
-
-        print("  TIME (ms):")
-        print(f"    orig+copy:     {fmt(t_orig, 'ms')}")
-        print(f"    triton-strided:{fmt(t_triton, 'ms')}")
-        print(f"    speedup:       {orig_mean / triton_mean:.2f}x")
-
-        m_orig = profile_memory(original_with_copy, logits_nc)
-        m_triton = profile_memory(entropy_from_logits, logits_nc)
-        orig_peak = statistics.mean(m_orig)
-        triton_peak = statistics.mean(m_triton)
-
-        print("  MEMORY (peak overhead):")
-        print(f"    orig+copy:     {fmt(m_orig, 'MB')}")
-        print(f"    triton-strided:{fmt(m_triton, 'MB')}")
-        print(f"    saved:         {orig_peak - triton_peak:.1f} MB")
-
-        del raw, logits_nc
-        _clean_gpu()
-
-
-if __name__ == "__main__":
-    benchmark_contiguous()
-    benchmark_noncontiguous()
--- a/benchmarks/bench_scattermoe_lora.py
+++ b/benchmarks/bench_scattermoe_lora.py
@@ -1,284 +0,0 @@
-"""Benchmark for ScatterMoE LoRA Triton kernels.
-
-Measures forward, backward dX, and backward dA/dB kernels at common MoE
-model shapes. Reports per-kernel timings, LoRA overhead vs base scatter2scatter,
-and full fwd+bwd autograd throughput.
-
-Usage:
-  CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py
-  CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --ranks 16 64
-  CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --models Qwen/Qwen3.5-35B-A3B
-"""
-
-import argparse
-import gc
-import time
-from functools import partial
-
-import torch
-
-from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import (
-    lora_ops,
-    ops as base_ops,
-)
-from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
-    flatten_sort_count,
-)
-from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
-    ScatterMoELoRA,
-)
-
-DEVICE = "cuda"
-DTYPE = torch.bfloat16
-WARMUP = 5
-ITERS = 20
-
-# ─── Model configs ──────────────────────────────────────────────────────────
-
-BUILTIN_CONFIGS = {
-    "Qwen3.5-35B-A3B": (256, 2048, 512, 8),  # E, H, I, k
-    "Qwen3-30B-A3B": (128, 2048, 768, 8),
-    "OLMoE-1B-7B": (64, 2048, 1024, 8),
-    "Mixtral-8x7B": (8, 4096, 14336, 2),
-}
-
-
-def _resolve_config(spec):
-    """Resolve a model spec to (E, H, I, k). Accepts builtin names or HF IDs."""
-    key = spec.lower().replace("/", "-")
-    for name, cfg in BUILTIN_CONFIGS.items():
-        if key in name.lower() or name.lower() in key:
-            return name, cfg
-
-    from transformers import AutoConfig
-
-    hf_cfg = AutoConfig.from_pretrained(spec, trust_remote_code=True)
-    if callable(getattr(hf_cfg, "get_text_config", None)):
-        tc = hf_cfg.get_text_config()
-        if hasattr(tc, "model_type") and tc.model_type != hf_cfg.model_type:
-            hf_cfg = tc
-    hidden = hf_cfg.hidden_size
-    inter = getattr(hf_cfg, "moe_intermediate_size", None) or hf_cfg.intermediate_size
-    experts = (
-        getattr(hf_cfg, "num_experts", None)
-        or getattr(hf_cfg, "num_local_experts", None)
-        or getattr(hf_cfg, "n_routed_experts", None)
-    )
-    top_k = (
-        getattr(hf_cfg, "num_experts_per_tok", None)
-        or getattr(hf_cfg, "num_experts_per_token", None)
-        or 2
-    )
-    name = spec.split("/")[-1]
-    return name, (experts, hidden, inter, top_k)
-
-
-# ─── Benchmark helpers ──────────────────────────────────────────────────────
-
-
-def _clean():
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.synchronize()
-
-
-def _bench(fn, warmup=WARMUP, iters=ITERS):
-    for _ in range(warmup):
-        fn()
-    torch.cuda.synchronize()
-    times = []
-    for _ in range(iters):
-        torch.cuda.synchronize()
-        t0 = time.perf_counter()
-        fn()
-        torch.cuda.synchronize()
-        times.append((time.perf_counter() - t0) * 1000)
-    times.sort()
-    return times[len(times) // 2]
-
-
-def _setup(num_experts, K, N, T, top_k, R):
-    torch.manual_seed(42)
-    x = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
-    W = torch.randn(num_experts, K, N, device=DEVICE, dtype=DTYPE) * 0.02
-    lora_A = torch.randn(R * num_experts, K, device=DEVICE, dtype=DTYPE) * 0.01
-    lora_B = torch.randn(N, R * num_experts, device=DEVICE, dtype=DTYPE) * 0.01
-    logits = torch.randn(T, num_experts, device=DEVICE)
-    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
-    sei, ssi, eo = flatten_sort_count(top_idx, num_experts)
-    gx = base_ops.group(x, ssi, fan_out=top_k)
-    dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE)
-    return x, W, lora_A, lora_B, sei, ssi, eo, gx, dy
-
-
-# ─── Kernel wrappers (avoid B023 loop-variable capture) ──────────────────────
-
-
-def _call_fwd(x, W, sei, ssi, top_k, lA, lB):
-    return lora_ops.scatter2scatter_lora(
-        X=x,
-        W=W,
-        sorted_expert_idxs=sei,
-        sorted_scattered_idxs=ssi,
-        k=top_k,
-        lora_A=lA,
-        lora_B=lB,
-        scaling=2.0,
-    )
-
-
-def _call_base(x, W, sei, ssi, top_k):
-    return base_ops.scatter2scatter(
-        X=x,
-        W=W,
-        sorted_expert_idxs=sei,
-        sorted_scattered_idxs=ssi,
-        k=top_k,
-    )
-
-
-def _call_dx(dy, W, sei, ssi, lA, lB):
-    return lora_ops.scatter2scatter_lora_dX(
-        DY=dy,
-        W=W,
-        sorted_expert_idxs=sei,
-        sorted_scattered_idxs=ssi,
-        k=1,
-        lora_A=lA,
-        lora_B=lB,
-        scaling=2.0,
-        dy_grouped=True,
-        dx_grouped=False,
-    )
-
-
-def _call_bwd(dy, gx, lA, lB, eo, num_experts):
-    return lora_ops.group_bwd_lora(
-        DY=dy,
-        X=gx,
-        lora_A=lA,
-        lora_B=lB,
-        expert_offsets=eo,
-        E=num_experts,
-        scaling=2.0,
-    )
-
-
-# ─── Main ────────────────────────────────────────────────────────────────────
-
-
-def main():
-    parser = argparse.ArgumentParser(description="ScatterMoE LoRA kernel benchmark")
-    parser.add_argument(
-        "--models",
-        "-m",
-        nargs="+",
-        help="Model names or HF IDs (default: all builtins)",
-    )
-    parser.add_argument("--ranks", "-r", nargs="+", type=int, default=[16, 32, 64])
-    parser.add_argument("--seq-len", "-T", type=int, default=2048)
-    args = parser.parse_args()
-
-    T = args.seq_len
-    print(f"GPU: {torch.cuda.get_device_name()}")
-    print(f"T={T}, ranks={args.ranks}\n")
-
-    if args.models:
-        configs = [_resolve_config(m) for m in args.models]
-    else:
-        configs = list(BUILTIN_CONFIGS.items())
-
-    for model_name, (num_experts, hidden, inter, top_k) in configs:
-        print(f"{'=' * 70}")
-        print(f"  {model_name}: E={num_experts}, H={hidden}, I={inter}, k={top_k}")
-        print(f"{'=' * 70}")
-
-        for R in args.ranks:
-            for proj, K, N in [("gate_up", hidden, 2 * inter), ("down", inter, hidden)]:
-                _clean()
-                x, W, lA, lB, sei, ssi, eo, gx, dy = _setup(
-                    num_experts, K, N, T, top_k, R
-                )
-
-                # Forward with LoRA (auto-dispatched: fused or split)
-                dispatch = (
-                    "split"
-                    if (
-                        num_experts <= lora_ops._SPLIT_LORA_FWD_MAX_EXPERTS
-                        and K * N >= lora_ops._SPLIT_LORA_FWD_THRESHOLD
-                    )
-                    else "fused"
-                )
-                t_fwd = _bench(partial(_call_fwd, x, W, sei, ssi, top_k, lA, lB))
-                t_base = _bench(partial(_call_base, x, W, sei, ssi, top_k))
-                t_dx = _bench(partial(_call_dx, dy, W, sei, ssi, lA, lB))
-                t_bwd = _bench(partial(_call_bwd, dy, gx, lA, lB, eo, num_experts))
-
-                total = t_fwd + t_dx + t_bwd
-                overhead = t_fwd / t_base - 1 if t_base > 0 else 0
-
-                print(
-                    f"  R={R:>2} {proj:<8}  "
-                    f"fwd={t_fwd:>6.2f}ms [{dispatch}]  "
-                    f"base={t_base:>6.2f}ms "
-                    f"(+{overhead * 100:.0f}%)  "
-                    f"dx={t_dx:>6.2f}ms  bwd={t_bwd:>6.2f}ms  "
-                    f"total={total:>6.2f}ms"
-                )
-
-                # Full autograd fwd+bwd with memory measurement
-                x_ag = x.clone().requires_grad_(True)
-                lA_ag = lA.clone().requires_grad_(True)
-                lB_ag = lB.clone().requires_grad_(True)
-
-                def _run_autograd(
-                    _x=x_ag,
-                    _W=W,
-                    _k=top_k,
-                    _sei=sei,
-                    _ssi=ssi,
-                    _eo=eo,
-                    _lA=lA_ag,
-                    _lB=lB_ag,
-                ):
-                    out = ScatterMoELoRA.apply(
-                        _x,
-                        _W,
-                        _k,
-                        _sei,
-                        _ssi,
-                        _eo,
-                        _lA,
-                        _lB,
-                        2.0,
-                        None,
-                        None,
-                        False,
-                        False,
-                        True,
-                        False,
-                    )
-                    out.sum().backward()
-                    _x.grad = None
-                    _lA.grad = None
-                    _lB.grad = None
-
-                t_full = _bench(_run_autograd)
-
-                _clean()
-                torch.cuda.reset_peak_memory_stats()
-                mem_before = torch.cuda.memory_allocated()
-                _run_autograd()
-                torch.cuda.synchronize()
-                mem_peak = torch.cuda.max_memory_allocated() - mem_before
-
-                print(
-                    f"         full_fwd_bwd={t_full:>6.2f}ms  "
-                    f"peak_delta={mem_peak / 1e6:>6.1f}MB"
-                )
-
-        print()
-
-
-if __name__ == "__main__":
-    main()
--- a/benchmarks/bench_selective_logsoftmax.py
+++ b/benchmarks/bench_selective_logsoftmax.py
@@ -1,191 +0,0 @@
-"""Benchmark for selective_log_softmax Triton kernel vs original implementation.
-
-Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_selective_logsoftmax.py
-"""
-
-import gc
-import statistics
-
-import torch
-
-from axolotl.monkeypatch.trainer.utils import (
-    selective_log_softmax,
-    selective_log_softmax_original,
-)
-
-V = 151936  # Qwen vocab
-WARMUP = 5
-BENCH_ITERS = 20
-MEM_ITERS = 10
-
-
-def _clean_gpu():
-    gc.collect()
-    torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-    torch.cuda.reset_accumulated_memory_stats()
-    torch.cuda.synchronize()
-
-
-def profile_time(fn, args, n_iters=BENCH_ITERS):
-    for _ in range(WARMUP):
-        fn(*args)
-    torch.cuda.synchronize()
-
-    times = []
-    for _ in range(n_iters):
-        s = torch.cuda.Event(enable_timing=True)
-        e = torch.cuda.Event(enable_timing=True)
-        s.record()
-        fn(*args)
-        e.record()
-        torch.cuda.synchronize()
-        times.append(s.elapsed_time(e))
-    return times
-
-
-def profile_memory(fn, args, n_iters=MEM_ITERS):
-    for _ in range(WARMUP):
-        out = fn(*args)
-        del out
-    torch.cuda.synchronize()
-
-    peaks = []
-    for _ in range(n_iters):
-        _clean_gpu()
-        base = torch.cuda.max_memory_allocated()
-        out = fn(*args)
-        torch.cuda.synchronize()
-        peaks.append(torch.cuda.max_memory_allocated() - base)
-        del out
-    return [p / 1e6 for p in peaks]
-
-
-def fmt(values, unit=""):
-    mean = statistics.mean(values)
-    std = statistics.stdev(values) if len(values) > 1 else 0.0
-    return f"{mean:8.2f} ± {std:5.2f} {unit}  [min={min(values):.2f}, max={max(values):.2f}]"
-
-
-def benchmark_forward():
-    print("=" * 60)
-    print(f"FORWARD BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
-    print("=" * 60)
-
-    configs = [
-        (1, 2048),
-        (1, 8192),
-        (4, 4096),
-        (8, 2048),
-        (16, 2048),
-        (16, 4096),
-    ]
-
-    for B, L in configs:
-        mem_gb = B * L * V * 2 / 1e9
-        if mem_gb > 28:
-            print(f"\n  skip B={B}, L={L} ({mem_gb:.1f} GB)")
-            continue
-
-        N = B * L
-        print(f"\n{'─' * 60}")
-        print(f"B={B:2d}, L={L:5d}  ({N:6d} rows, logits {mem_gb:.2f} GB)")
-        print(f"{'─' * 60}")
-
-        torch.manual_seed(42)
-        logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
-        index = torch.randint(0, V, (B, L), device="cuda")
-
-        t_orig = profile_time(selective_log_softmax_original, (logits, index))
-        t_triton = profile_time(selective_log_softmax, (logits, index))
-        orig_mean = statistics.mean(t_orig)
-        triton_mean = statistics.mean(t_triton)
-
-        print("  TIME (ms):")
-        print(f"    original: {fmt(t_orig, 'ms')}")
-        print(f"    triton:   {fmt(t_triton, 'ms')}")
-        print(f"    speedup:  {orig_mean / triton_mean:.2f}x")
-
-        m_orig = profile_memory(selective_log_softmax_original, (logits, index))
-        m_triton = profile_memory(selective_log_softmax, (logits, index))
-        orig_peak = statistics.mean(m_orig)
-        triton_peak = statistics.mean(m_triton)
-
-        print("  MEMORY (peak overhead):")
-        print(f"    original: {fmt(m_orig, 'MB')}")
-        print(f"    triton:   {fmt(m_triton, 'MB')}")
-        print(f"    saved:    {orig_peak - triton_peak:.1f} MB")
-
-        del logits, index
-        _clean_gpu()
-
-
-def benchmark_backward():
-    print("\n" + "=" * 60)
-    print(f"FWD+BWD BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
-    print("=" * 60)
-
-    configs = [
-        (1, 2048),
-        (1, 8192),
-        (4, 4096),
-        (8, 2048),
-        (16, 2048),
-        (16, 4096),
-    ]
-
-    def fwd_bwd_original(logits, index):
-        logits.grad = None
-        out = selective_log_softmax_original(logits, index)
-        out.sum().backward()
-
-    def fwd_bwd_triton(logits, index):
-        logits.grad = None
-        out = selective_log_softmax(logits, index)
-        out.sum().backward()
-
-    for B, L in configs:
-        mem_gb = B * L * V * 2 / 1e9
-        if mem_gb > 20:
-            print(f"\n  skip B={B}, L={L} ({mem_gb:.1f} GB, need room for grads)")
-            continue
-
-        N = B * L
-        print(f"\n{'─' * 60}")
-        print(f"B={B:2d}, L={L:5d}  ({N:6d} rows, logits {mem_gb:.2f} GB)")
-        print(f"{'─' * 60}")
-
-        torch.manual_seed(42)
-        logits_orig = torch.randn(
-            B, L, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
-        )
-        logits_tri = logits_orig.detach().clone().requires_grad_(True)
-        index = torch.randint(0, V, (B, L), device="cuda")
-
-        t_orig = profile_time(fwd_bwd_original, (logits_orig, index))
-        t_triton = profile_time(fwd_bwd_triton, (logits_tri, index))
-        orig_mean = statistics.mean(t_orig)
-        triton_mean = statistics.mean(t_triton)
-
-        print("  FWD+BWD TIME (ms):")
-        print(f"    original: {fmt(t_orig, 'ms')}")
-        print(f"    triton:   {fmt(t_triton, 'ms')}")
-        print(f"    speedup:  {orig_mean / triton_mean:.2f}x")
-
-        m_orig = profile_memory(fwd_bwd_original, (logits_orig, index))
-        m_triton = profile_memory(fwd_bwd_triton, (logits_tri, index))
-        orig_peak = statistics.mean(m_orig)
-        triton_peak = statistics.mean(m_triton)
-
-        print("  FWD+BWD MEMORY (peak overhead):")
-        print(f"    original: {fmt(m_orig, 'MB')}")
-        print(f"    triton:   {fmt(m_triton, 'MB')}")
-        print(f"    saved:    {orig_peak - triton_peak:.1f} MB")
-
-        del logits_orig, logits_tri, index
-        _clean_gpu()
-
-
-if __name__ == "__main__":
-    benchmark_forward()
-    benchmark_backward()
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -1,55 +0,0 @@
-FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}
-
-ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
-ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
-ENV CUDA="{{ CUDA }}"
-ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
-ENV GITHUB_REF="{{ GITHUB_REF }}"
-ENV GITHUB_SHA="{{ GITHUB_SHA }}"
-ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
-ENV HF_HOME="{{ HF_HOME }}"
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-RUN uv pip install packaging==26.0 setuptools==78.1.1
-RUN uv pip install torchvision
-RUN uv pip uninstall causal_conv1d
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
-    fi
-
-# Override with nightly HF packages for nightly builds
-RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
-        uv pip install --no-deps \
-            "transformers @ git+https://github.com/huggingface/transformers.git@main" \
-            "peft @ git+https://github.com/huggingface/peft.git@main" \
-            "accelerate @ git+https://github.com/huggingface/accelerate.git@main" \
-            "trl @ git+https://github.com/huggingface/trl.git@main" \
-            "datasets @ git+https://github.com/huggingface/datasets.git@main"; \
-    fi
-
-RUN python scripts/cutcrossentropy_install.py --uv | sh
-
-# So we can test the Docker image
-RUN uv pip install black mypy pre-commit types-requests quartodoc jupyter blobfile tiktoken \
-    codecov codecov-cli pytest pytest-cov pytest-retry pytest-sugar pytest-xdist tbparse
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/cicd/init.py
+++ b/cicd/init.py
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -1,73 +0,0 @@
-#!/bin/bash
-set -e
-
-python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__, f'Expected torch $PYTORCH_VERSION but got {torch.__version__}'"
-
-set -o pipefail
-for i in 1 2 3; do
-  if curl --silent --show-error --fail -L \
-    https://axolotl-ci.b-cdn.net/hf-cache.tar.zst \
-    | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1; then
-    echo "HF cache extracted successfully"
-    break
-  fi
-  echo "Attempt $i failed, cleaning up and retrying in 15s..."
-  rm -rf "${HF_HOME}/hub/"*
-  sleep 15
-done
-# hf download "NousResearch/Meta-Llama-3-8B"
-# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
-# hf download "microsoft/Phi-4-reasoning"
-# hf download "microsoft/Phi-3.5-mini-instruct"
-# hf download "microsoft/Phi-3-medium-128k-instruct"
-
-# Run unit tests with initial coverage report
-pytest -v --durations=10 -n8 \
-  --ignore=tests/e2e/ \
-  --ignore=tests/patched/ \
-  --ignore=tests/cli \
-  /workspace/axolotl/tests/ \
-  --cov=axolotl
-
-# Run lora kernels tests with coverage append
-pytest -v --durations=10 \
-  /workspace/axolotl/tests/e2e/patched/lora_kernels \
-  --cov=axolotl \
-  --cov-append
-
-# Run patched tests excluding lora kernels with coverage append
-pytest --full-trace -vvv --durations=10 \
-  --ignore=tests/e2e/patched/lora_kernels \
-  /workspace/axolotl/tests/e2e/patched \
-  --cov=axolotl \
-  --cov-append
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-  /workspace/axolotl/tests/e2e/solo/ \
-  --cov=axolotl \
-  --cov-append
-
-# Run integration tests with coverage append
-pytest -v --durations=10 \
-  /workspace/axolotl/tests/e2e/integrations/ \
-  --cov=axolotl \
-  --cov-append
-
-pytest -v --durations=10 /workspace/axolotl/tests/cli \
-  --cov=axolotl \
-  --cov-append
-
-# Run remaining e2e tests with coverage append and final report
-pytest -v --durations=10 \
-  --ignore=tests/e2e/solo/ \
-  --ignore=tests/e2e/patched/ \
-  --ignore=tests/e2e/multigpu/ \
-  --ignore=tests/e2e/integrations/ \
-  --ignore=tests/cli \
-  /workspace/axolotl/tests/e2e/ \
-  --cov=axolotl \
-  --cov-append \
-  --cov-report=xml:e2e-coverage.xml
-
-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
--- a/cicd/cleanup.py
+++ b/cicd/cleanup.py
@@ -1,19 +0,0 @@
-"""Modal app to run axolotl GPU cleanup"""
-
-from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
-
-
-@app.function(
-    image=cicd_image,
-    timeout=60 * 60,
-    cpu=8.0,
-    memory=131072,
-    volumes=VOLUME_CONFIG,
-)
-def cleanup():
-    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
-
-
-@app.local_entrypoint()
-def main():
-    cleanup.remote()
--- a/cicd/cleanup.sh
+++ b/cicd/cleanup.sh
@@ -1,6 +0,0 @@
-#!/bin/bash
-set -e
-
-# cleanup old cache files for datasets processing and intermediate mappings
-find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
-find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,20 +0,0 @@
-"""Modal app to run axolotl GPU tests"""
-
-from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
-
-
-@app.function(
-    image=cicd_image,
-    gpu=GPU_CONFIG,
-    timeout=120 * 60,  # 120 min
-    cpu=8.0,
-    memory=131072,
-    volumes=VOLUME_CONFIG,
-)
-def cicd_pytest():
-    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
-
-
-@app.local_entrypoint()
-def main():
-    cicd_pytest.remote()
--- a/cicd/multigpu.py
+++ b/cicd/multigpu.py
@@ -1,85 +0,0 @@
-"""
-Modal application to run the axolotl multi-GPU tests
-"""
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
-df_template = template_env.get_template(dockerfile)
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
-    "CUDA": os.environ.get("CUDA", "126"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
-    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 2))
-GPU_CONFIG = f"H100:{N_GPUS}"
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)
-
-
-@app.function(
-    image=cicd_image,
-    gpu=GPU_CONFIG,
-    timeout=120 * 60,
-    cpu=16.0,
-    memory=131072 * N_GPUS,
-    volumes=VOLUME_CONFIG,
-)
-def cicd_pytest():
-    run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
-
-
-@app.local_entrypoint()
-def main():
-    cicd_pytest.remote()
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -1,25 +0,0 @@
-#!/bin/bash
-set -e
-
-# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 --maxfail=3 \
-  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
-  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
-  /workspace/axolotl/tests/e2e/multigpu/ \
-  --cov=axolotl
-
-# Run solo tests with coverage append
-pytest -v --durations=10 -n1 \
-  /workspace/axolotl/tests/e2e/multigpu/solo/ \
-  --cov=axolotl \
-  --cov-append
-
-pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
-  --cov=axolotl \
-  --cov-append \
-  --cov-report=xml:multigpu-coverage.xml
-
-# Upload coverage to Codecov if CODECOV_TOKEN is available
-if [ -n "$CODECOV_TOKEN" ]; then
-  codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
-fi
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -1,73 +0,0 @@
-"""Modal app to run axolotl GPU tests"""
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-import modal.experimental
-from jinja2 import select_autoescape
-from modal import App
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile-uv.jinja")
-df_template = template_env.get_template(dockerfile)
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
-    "CUDA": os.environ.get("CUDA", "126"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
-    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = modal.experimental.raw_dockerfile_image(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    # context_mount=None,
-    force_build=True,
-    # gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
-GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    sp_env = os.environ.copy()
-    sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"
-
-    # Propagate errors from subprocess.
-    exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env)  # nosec
-    if exit_code:
-        raise RuntimeError(f"Command '{cmd}' failed with exit code {exit_code}")
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,58 +0,0 @@
-codecov:
-  require_ci_to_pass: yes
-  notify:
-    wait_for_ci: true
-
-coverage:
-  precision: 2
-  round: down
-  range: "70...100"
-  status:
-    project:
-      default:
-        # basic
-        target: auto
-        threshold: 1%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: true
-        flags: null
-        paths: null
-        informational: true
-    patch:
-      default:
-        # basic
-        target: auto
-        threshold: 1%
-        base: auto
-        # advanced
-        branches: null
-        if_no_uploads: error
-        if_not_found: success
-        if_ci_failed: error
-        only_pulls: false
-        flags: null
-        paths: null
-        informational: true
-
-parsers:
-  gcov:
-    branch_detection:
-      conditional: yes
-      loop: yes
-      method: no
-      macro: no
-
-comment:
-  layout: "reach,diff,flags,files,footer"
-  behavior: default
-  require_changes: no
-  require_base: no
-  require_head: yes
-
-github_checks:
-  annotations: false
--- a/deepspeed_configs/zero1_torch_compile.json
+++ b/deepspeed_configs/zero1_torch_compile.json
@@ -15,12 +15,16 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "compile": {
-    "disable": false,
-    "backend": "inductor"
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed_configs/zero2.json
+++ b/deepspeed_configs/zero2.json
@@ -19,8 +19,16 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed_configs/zero3.json
+++ b/deepspeed_configs/zero3.json
@@ -7,9 +7,9 @@
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
-    "max_live_parameters": 0,
-    "max_reuse_distance": 0,
-    "gather_16bit_weights_on_model_save": true
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": "auto"
@@ -23,8 +23,16 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed/zero3_bf16.json
+++ b/deepspeed/zero3_bf16.json
@@ -0,0 +1,39 @@
+{
+  "zero_optimization": {
+    "stage": 3,
+    "overlap_comm": true,
+    "contiguous_gradients": true,
+    "sub_group_size": 0,
+    "reduce_bucket_size": "auto",
+    "stage3_prefetch_bucket_size": "auto",
+    "stage3_param_persistence_threshold": "auto",
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
+  },
+  "gradient_accumulation_steps": "auto",
+  "train_batch_size": "auto",
+  "train_micro_batch_size_per_gpu": "auto",
+  "wall_clock_breakdown": false
+}
--- a/deepspeed_configs/zero3_bf16_cpuoffload_all.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_all.json
@@ -1,6 +1,4 @@
 {
-  "zero_force_ds_cpu_optimizer": false,
-  "zero_allow_untested_optimizer": true,
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
@@ -17,15 +15,32 @@
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
-    "max_live_parameters": 0,
-    "max_reuse_distance": 0,
-    "gather_16bit_weights_on_model_save": true
+    "stage3_max_live_parameters": 0,
+    "stage3_max_reuse_distance": 0,
+    "stage3_gather_16bit_weights_on_model_save": true
  },
  "bf16": {
-    "enabled": true
+    "enabled": "auto"
+  },
+  "fp16": {
+    "enabled": "auto",
+    "auto_cast": false,
+    "loss_scale": 0,
+    "initial_scale_power": 32,
+    "loss_scale_window": 1000,
+    "hysteresis": 2,
+    "min_loss_scale": 1
+  },
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": "auto",
+      "betas": "auto",
+      "eps": "auto",
+      "weight_decay": "auto"
+    }
  },
  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/deepspeed_configs/zero1.json
+++ b/deepspeed_configs/zero1.json
@@ -1,23 +0,0 @@
-{
-  "zero_optimization": {
-    "stage": 1,
-    "overlap_comm": true
-  },
-  "bf16": {
-    "enabled": "auto"
-  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/deepspeed_configs/zero2_torch_compile.json
+++ b/deepspeed_configs/zero2_torch_compile.json
@@ -1,31 +0,0 @@
-{
-  "compile": {
-    "disable": false,
-    "backend": "inductor"
-  },
-  "zero_optimization": {
-    "stage": 2,
-    "offload_optimizer": {
-      "device": "cpu"
-    },
-    "contiguous_gradients": true,
-    "overlap_comm": true
-  },
-  "bf16": {
-    "enabled": "auto"
-  },
-  "fp16": {
-    "enabled": "auto",
-    "auto_cast": false,
-    "loss_scale": 0,
-    "initial_scale_power": 32,
-    "loss_scale_window": 1000,
-    "hysteresis": 2,
-    "min_loss_scale": 1
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
@@ -1,22 +0,0 @@
-{
-  "zero_optimization": {
-    "stage": 3,
-    "overlap_comm": true,
-    "contiguous_gradients": true,
-    "sub_group_size": 0,
-    "reduce_bucket_size": "auto",
-    "stage3_prefetch_bucket_size": "auto",
-    "stage3_param_persistence_threshold": "auto",
-    "max_live_parameters": 0,
-    "max_reuse_distance": 0,
-    "gather_16bit_weights_on_model_save": true
-  },
-  "bf16": {
-    "enabled": true
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/deepspeed_configs/zero3_bf16_cpuoffload_params.json
+++ b/deepspeed_configs/zero3_bf16_cpuoffload_params.json
@@ -1,28 +0,0 @@
-{
-  "zero_force_ds_cpu_optimizer": false,
-  "zero_allow_untested_optimizer": true,
-  "zero_optimization": {
-    "stage": 3,
-    "offload_param": {
-      "device": "cpu",
-      "pin_memory": true
-    },
-    "overlap_comm": true,
-    "contiguous_gradients": true,
-    "sub_group_size": 0,
-    "reduce_bucket_size": "auto",
-    "stage3_prefetch_bucket_size": "auto",
-    "stage3_param_persistence_threshold": "auto",
-    "max_live_parameters": 0,
-    "max_reuse_distance": 0,
-    "gather_16bit_weights_on_model_save": true
-  },
-  "bf16": {
-    "enabled": true
-  },
-  "gradient_accumulation_steps": "auto",
-  "gradient_clipping": "auto",
-  "train_batch_size": "auto",
-  "train_micro_batch_size_per_gpu": "auto",
-  "wall_clock_breakdown": false
-}
--- a/devtools/README.md
+++ b/devtools/README.md
@@ -1 +0,0 @@
-This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.
--- a/devtools/dev_chat_template.yml
+++ b/devtools/dev_chat_template.yml
@@ -1,48 +0,0 @@
-# Example config for debugging the chat_template prompt format
-base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-
-load_in_8bit: true
-load_in_4bit: false
-
-datasets:
-  - path: fozziethebeat/alpaca_messages_2k_test
-    type: chat_template
-    shards: 10
-val_set_size: 0
-output_dir: temp_debug/axolotl_outputs/model
-dataset_prepared_path: temp_debug/axolotl_outputs/data
-dataset_num_proc: 1
-
-sequence_len: 4096
-sample_packing: false
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-
-micro_batch_size: 1
-num_epochs: 1
-max_steps: 10
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: false
-fp16: true
-tf32: false
-
-gradient_checkpointing: true
-logging_steps: 1
-flash_attention: true
-
-warmup_steps: 10
-weight_decay: 0.0
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,47 +1,36 @@
 ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
+FROM winglian/axolotl-base:$BASE_TAG

 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
 ARG AXOLOTL_EXTRAS=""
-ARG AXOLOTL_ARGS=""
 ARG CUDA="118"
-ARG PYTORCH_VERSION="2.1.2"
-ARG TARGETARCH
+ENV BNB_CUDA_VERSION=$CUDA
+ARG PYTORCH_VERSION="2.0.1"

 ENV PYTORCH_VERSION=$PYTORCH_VERSION

 RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/*
+    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev

 WORKDIR /workspace

-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
+RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git

 WORKDIR /workspace/axolotl

-# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
-RUN pip uninstall -y causal_conv1d
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="optimizers,ray"; \
+# If AXOLOTL_EXTRAS is set, append it in brackets
+RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
+        pip install -e .[deepspeed,flash-attn,$AXOLOTL_EXTRAS]; \
    else \
-        BASE_EXTRAS="deepspeed,optimizers,ray"; \
-    fi && \
-    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
-        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
-    fi && \
-    python scripts/cutcrossentropy_install.py | sh && \
-    pip install pytest && \
-    pip cache purge
+        pip install -e .[deepspeed,flash-attn]; \
+    fi

-# fix so that git fetch/pull from remote works with shallow clone
+# So we can test the Docker image
+RUN pip install pytest
+
+# fix so that git fetch/pull from remote works
 RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch && \
-    git config --global credential.helper store
+    git config --get remote.origin.fetch

-COPY .axolotl-complete.bash /root/.axolotl-complete.bash
-RUN chmod +x /root/.axolotl-complete.bash && \
-    echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
+# helper for huggingface-login cli
+RUN git config --global credential.helper store
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -2,59 +2,36 @@ ARG CUDA_VERSION="11.8.0"
 ARG CUDNN_VERSION="8"
 ARG UBUNTU_VERSION="22.04"
 ARG MAX_JOBS=4
-ARG TARGETARCH

-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
+FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION as base-builder

 ENV PATH="/root/miniconda3/bin:${PATH}"

-ARG TARGETARCH
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="2.1.2"
-ARG CUDA="128"
+ARG PYTHON_VERSION="3.9"
+ARG PYTORCH_VERSION="2.0.1"
+ARG CUDA="118"
 ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

 ENV PYTHON_VERSION=$PYTHON_VERSION
 ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

 RUN apt-get update \
-    && apt-get install -y --no-install-recommends \
-        wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
-        ibverbs-providers ibverbs-utils infiniband-diags  \
-        librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
-    && rm -rf /var/cache/apt/archives \
-    && rm -rf /var/lib/apt/lists/* \
-    && if [ "$TARGETARCH" = "amd64" ]; then \
-        MINICONDA_ARCH="x86_64"; \
-    elif [ "$TARGETARCH" = "arm64" ]; then \
-        MINICONDA_ARCH="aarch64"; \
-    else \
-        echo "Unsupported architecture: $TARGETARCH"; exit 1; \
-    fi \
-    && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
+    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/* \
+    && wget \
+    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
-    && rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
-    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
-    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
+    && rm -f Miniconda3-latest-Linux-x86_64.sh \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

 ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
-    python3 -m pip cache purge
-
-RUN if [ "$CUDA" != "130" ] ; then \
-        CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
-        python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
-        python3 -m pip cache purge; \
-    fi
+RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} deepspeed-kernels --extra-index-url https://download.pytorch.org/whl/cu$CUDA

 RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
-    pip3 cache purge
+    pip3 install -U --no-cache-dir pydantic==1.10.10
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -1,38 +0,0 @@
-ARG CUDA_VERSION="12.8.1"
-ARG CUDNN_VERSION="8"
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ENV PATH="/root/miniconda3/bin:${PATH}"
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="next"
-ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
-    && wget \
-    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
-
-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
-
-WORKDIR /workspace
-
-RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
-    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
-
-RUN git lfs install --skip-repo && \
-    pip3 install awscli && \
-    pip3 install -U --no-cache-dir pydantic==2.10.6
--- a/docker/Dockerfile-base-nightly
+++ b/docker/Dockerfile-base-nightly
@@ -1,43 +0,0 @@
-ARG CUDA_VERSION="12.8.1"
-ARG CUDNN_VERSION="8"
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ENV PATH="/root/miniconda3/bin:${PATH}"
-
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="nightly"
-ARG CUDA="128"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
-    && wget \
-    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir /root/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b \
-    && rm -f Miniconda3-latest-Linux-x86_64.sh \
-    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
-    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
-    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
-
-ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
-
-WORKDIR /workspace
-
-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \
-    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
-    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
-    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
-    python3 -m pip cache purge
-
-RUN git lfs install --skip-repo && \
-    pip3 install awscli && \
-    # The base image ships with `pydantic==1.8.2` which is not working
-    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
-    pip3 cache purge
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -1,30 +0,0 @@
-ARG BASE_TAG=main
-FROM axolotlai/axolotl:$BASE_TAG
-
-ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
-ENV HF_HOME="/workspace/data/huggingface-cache/hub"
-ENV HF_HUB_ENABLE_HF_TRANSFER="1"
-
-EXPOSE 8888
-EXPOSE 22
-
-COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
-COPY scripts/motd /etc/motd
-
-RUN pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
-RUN apt update && \
-    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/* && \
-    mkdir -p ~/.ssh && \
-    chmod 700 ~/.ssh && \
-    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
-    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
-    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh && \
-    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
-
-ENTRYPOINT ["/root/cloud-entrypoint.sh"]
-CMD ["sleep", "infinity"]
--- a/docker/Dockerfile-cloud-no-tmux
+++ b/docker/Dockerfile-cloud-no-tmux
@@ -1,28 +0,0 @@
-ARG BASE_TAG=main
-FROM axolotlai/axolotl:$BASE_TAG
-
-ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
-ENV HF_HOME="/workspace/data/huggingface-cache/hub"
-ENV HF_HUB_ENABLE_HF_TRANSFER="1"
-
-EXPOSE 8888
-EXPOSE 22
-
-COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
-COPY scripts/motd /etc/motd
-
-RUN pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
-RUN apt update && \
-    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/* && \
-    mkdir -p ~/.ssh && \
-    chmod 700 ~/.ssh && \
-    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
-    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh
-
-ENTRYPOINT ["/root/cloud-entrypoint.sh"]
-CMD ["sleep", "infinity"]
--- a/docker/Dockerfile-cloud-uv
+++ b/docker/Dockerfile-cloud-uv
@@ -1,31 +0,0 @@
-ARG BASE_TAG=main
-FROM axolotlai/axolotl-uv:$BASE_TAG
-
-ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
-ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
-ENV HF_HOME="/workspace/data/huggingface-cache/hub"
-ENV HF_HUB_ENABLE_HF_TRANSFER="1"
-
-EXPOSE 8888
-EXPOSE 22
-
-COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
-COPY scripts/motd /etc/motd
-
-RUN uv pip install jupyterlab notebook ipywidgets && \
-    jupyter lab clean
-RUN apt update && \
-    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/* && \
-    mkdir -p ~/.ssh && \
-    chmod 700 ~/.ssh && \
-    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
-    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
-    printf "source /workspace/axolotl-venv/bin/activate\n" >> ~/.bashrc && \
-    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
-    chmod +x /root/cloud-entrypoint.sh && \
-    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf
-
-ENTRYPOINT ["/root/cloud-entrypoint.sh"]
-CMD ["sleep", "infinity"]
--- a/docker/Dockerfile-runpod
+++ b/docker/Dockerfile-runpod
@@ -0,0 +1,31 @@
+# RunPod image: adds an SSH server and tmux on top of the axolotl image and
+# starts the container through scripts/runpod-entrypoint.sh.
+ARG BASE_TAG=main
+FROM winglian/axolotl:$BASE_TAG
+
+# Keep Hugging Face caches under /workspace/data.
+# NOTE(review): HF_HOME points at the hub cache directory itself rather than
+# its parent -- confirm this is intentional.
+ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
+ENV HF_HOME="/workspace/data/huggingface-cache/hub"
+
+COPY scripts/runpod-entrypoint.sh /root/runpod-entrypoint.sh
+
+# Refresh the package index in the same layer as the install so it never runs
+# against a stale or missing index, then remove the index to keep the layer
+# small. The line appended to ~/.bashrc makes any shell that is not already
+# inside tmux attach to (or create) the "ssh_tmux" session and exit afterwards.
+# Assumes the base image ships the repository at /workspace/axolotl.
+RUN apt-get update && \
+    apt-get install --yes --no-install-recommends openssh-server tmux && \
+    rm -rf /var/lib/apt/lists/* && \
+    mkdir -p ~/.ssh && \
+    chmod 700 ~/.ssh && \
+    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
+    chmod +x /workspace/axolotl/scripts/runpod-entrypoint.sh && \
+    chmod +x /root/runpod-entrypoint.sh
+
+ENTRYPOINT ["/root/runpod-entrypoint.sh"]
+CMD ["sleep", "infinity"]
--- a/docker/Dockerfile-tests
+++ b/docker/Dockerfile-tests
@@ -1,40 +0,0 @@
-ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base:$BASE_TAG
-
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ARG AXOLOTL_EXTRAS=""
-ARG AXOLOTL_ARGS=""
-ARG CUDA="118"
-ARG PYTORCH_VERSION="2.1.2"
-ARG GITHUB_REF="main"
-
-ENV PYTORCH_VERSION=$PYTORCH_VERSION
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-RUN git fetch origin +$GITHUB_REF && \
-    git checkout FETCH_HEAD
-
-# If AXOLOTL_EXTRAS is set, append it in brackets
-RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
-        pip install --no-build-isolation -e .[deepspeed,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        pip install --no-build-isolation -e .[deepspeed,mamba-ssm] $AXOLOTL_ARGS; \
-    fi
-
-# So we can test the Docker image
-RUN pip install pytest
-
-# fix so that git fetch/pull from remote works
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch
-
-# helper for huggingface-login cli
-RUN git config --global credential.helper store
--- a/docker/Dockerfile-uv
+++ b/docker/Dockerfile-uv
@@ -1,47 +0,0 @@
-ARG BASE_TAG=main-base
-FROM axolotlai/axolotl-base-uv:$BASE_TAG
-
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
-ARG AXOLOTL_EXTRAS=""
-ARG AXOLOTL_ARGS=""
-ARG CUDA="118"
-ARG PYTORCH_VERSION="2.1.2"
-ARG TARGETARCH
-
-ENV PYTORCH_VERSION=$PYTORCH_VERSION
-
-RUN apt-get update && \
-    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
-    rm -rf /var/cache/apt/archives && \
-    rm -rf /var/lib/apt/lists/*
-
-WORKDIR /workspace
-
-RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git
-
-WORKDIR /workspace/axolotl
-
-# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
-RUN uv pip uninstall causal_conv1d
-RUN if [ "$TARGETARCH" = "arm64" ]; then \
-        BASE_EXTRAS="optimizers,ray"; \
-    else \
-        BASE_EXTRAS="deepspeed,optimizers,ray"; \
-    fi && \
-    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
-        uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
-    else \
-        uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
-    fi && \
-    python scripts/cutcrossentropy_install.py --uv | sh && \
-    uv pip install pytest && \
-    uv cache clean
-
-# fix so that git fetch/pull from remote works with shallow clone
-RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
-    git config --get remote.origin.fetch && \
-    git config --global credential.helper store
-
-COPY .axolotl-complete.bash /root/.axolotl-complete.bash
-RUN chmod +x /root/.axolotl-complete.bash && \
-    echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -1,40 +0,0 @@
-ARG CUDA_VERSION="12.6.3"
-ARG CUDNN_VERSION=""
-ARG UBUNTU_VERSION="22.04"
-ARG MAX_JOBS=4
-ARG TARGETARCH
-
-FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder
-
-ARG TARGETARCH
-ARG PYTHON_VERSION="3.11"
-ARG PYTORCH_VERSION="2.6.0"
-ARG CUDA="126"
-ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
-
-ENV PYTHON_VERSION=$PYTHON_VERSION
-ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
-ENV UV_TORCH_BACKEND="cu${CUDA}"
-
-RUN apt-get update \
-    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
-    && git lfs install --skip-repo \
-    && curl -LsSf https://astral.sh/uv/install.sh | sh
-
-ENV PATH="/root/.local/bin:${PATH}"
-
-RUN uv python install ${PYTHON_VERSION}
-
-WORKDIR /workspace
-
-RUN uv venv --no-project --relocatable axolotl-venv
-
-ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
-
-RUN uv pip install packaging setuptools wheel psutil \
-    && uv pip install torch==${PYTORCH_VERSION} torchvision \
-    && uv pip install awscli pydantic
-
-RUN if [ "$TARGETARCH" = "amd64" ]; then \
-        MAMBA_SKIP_CUDA_BUILD=TRUE CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE uv pip install --no-build-isolation mamba_ssm causal_conv1d; \
-    fi
--- a/docs/.gitignore
+++ b/docs/.gitignore
@@ -1,7 +0,0 @@
-/.quarto/
-_site/
-/api/*.qmd
-/api/*.html
-config-reference.qmd
-models/**/*.qmd
-models/**/*.html
--- a/docs/1_58bit_finetuning.qmd
+++ b/docs/1_58bit_finetuning.qmd
@@ -1,70 +0,0 @@
----
-title: "1.58-bit Finetuning"
-back-to-top-navigation: true
-toc: true
-toc-expand: 2
-toc-depth: 4
----
-
-## Overview
-
-1.58-bit finetuning allows you to finetune BitNet models when their prequantized weights are provided. In theory, any LLM can be finetuned in 1.58-bit format, but the performance degradation would be dramatic.
-
-Axolotl supports 1.58-bit finetuning via the [`onebitllms`](https://github.com/tiiuae/onebitllms) library, which replaces standard linear layers with BitNet-compatible counterparts ready to use for training.
-
-::: {.callout-note}
-LoRA is not supported for BitNet models
-:::
-
-## Installation
-
-Install the `onebitllms` package before using this feature:
-
-```bash
-uv pip install onebitllms
-```
-
-Or from source:
-
-```bash
-uv pip install git+https://github.com/tiiuae/onebitllms
-```
-
-## Supported models
-
-For now, only the `Falcon-E` series of models is supported. Make sure to use the `-prequantized` variants:
-
-```bash
-tiiuae/Falcon-E-3B-Base-prequantized
-tiiuae/Falcon-E-1B-Base-prequantized
-```
-
-In theory, any other model would 'work' but the performance degradation will be huge. This remains an area of exploration.
-
-## Configuration
-
-To enable 1.58-bit finetuning, set the following in your configuration file:
-
-```yaml
-base_model: tiiuae/Falcon-E-3B-Base-prequantized  # A BitNet-compatible model
-
-use_onebitllms: true
-```
-
-::: {.callout-note}
-For BitNet models, it is recommended to use a higher learning rate than for classic models (usually about 10x higher).
-:::
-
-## Considerations after training
-
-Once your model has been trained with 1.58-bit finetuning, you can convert it to ternary format using the `onebitllms` CLI:
-
-```bash
-onebitllms quantize_to_1bit INPUT_PATH OUTPUT_PATH
-```
-
-After that, you can run the trained model with supported packages such as `llama.cpp` or Apple's MLX.
-
-## Example Configuration
-
-You can find example configurations in `examples/falcon-e`, which contains one configuration for SFT and one for DPO.
--- a/docs/agents/grpo.md
+++ b/docs/agents/grpo.md
@@ -1,71 +0,0 @@
-# GRPO — Agent Reference
-
-Online RL with verifiable reward functions. For full config reference, async features, and scaling, see [grpo.qmd](../grpo.qmd). For vLLM setup, see [vllm_serving.qmd](../vllm_serving.qmd).
-
-## Architecture
-
-```
-Terminal 1 (GPU 0)                    Terminal 2 (GPU 1)
-┌──────────────────────┐              ┌──────────────────────────────────┐
-│  vLLM Server         │   HTTP       │  Trainer                         │
-│  Serves base model   │◄────────────►│  1. Send prompts to vLLM         │
-│  + LoRA adapter      │  /generate   │  2. Score completions (rewards)  │
-│                      │  /set_lora   │  3. Compute advantages           │
-│  Punica kernels for  │              │  4. PPO-clip gradient update     │
-│  LoRA inference      │              │  5. Sync LoRA weights to vLLM    │
-└──────────────────────┘              └──────────────────────────────────┘
-```
-
-## Components Required
-
-1. A YAML config with `rl: grpo`
-2. A reward module (Python file with reward functions)
-3. A running vLLM server (`axolotl vllm-serve config.yaml`)
-
-## Reward Function Signature
-
-```python
-def my_reward(completions, **kwargs) -> list[float]:
-    # completions[i][0]["content"] = text of i-th completion
-    # **kwargs contains dataset columns not removed by transform
-    return [score_for_each_completion]
-```
-
-Multiple rewards: `reward_funcs: [r1, r2]` with `reward_weights: [1.0, 0.5]`.
-
-## Key Async Features
-
-| Feature | Config | Purpose |
-|---------|--------|---------|
-| Async prefetch | `async_prefetch: true` | Overlap generation with training |
-| LoRA sync | `vllm_lora_sync: true` | Fast adapter sync via filesystem |
-| Streaming scoring | `streaming_partial_batch: true` | Score one group at a time |
-| Zero-adv skip | `skip_zero_advantage_batches: true` | Skip batches with no learning signal |
-| Replay buffer | `replay_buffer_size: 100` | Cache high-signal groups |
-| IS correction | `vllm_importance_sampling_correction: true` | Fix off-policy distribution shift |
-
-## Health Checks
-
-- `rewards/*/mean` > 0.15 within 20 steps (else: test reward function standalone)
-- `reward_std` > 0 on most steps (else: no learning signal)
-- `entropy` 0.05-0.5 (< 0.01 = mode collapse)
-- `grad_norm` 0.001-1.0 (> 10 = unstable, 0.0 = zero-advantage skip)
-
-See [training_stability.qmd](../training_stability.qmd) for detailed diagnostics.
-
-## File Map
-
-```
-src/axolotl/
-  cli/train.py                     # Entry point
-  cli/vllm_serve.py                # Entry point for vLLM server
-  core/trainers/grpo/
-    trainer.py                     # AxolotlGRPOTrainer
-    sampler.py                     # Sampling utilities
-  core/builders/rl.py              # HFRLTrainerBuilder — routes rl type → trainer
-  scripts/vllm_serve_lora.py       # vLLM serve script with LoRA sync support
-  utils/schemas/trl.py             # TRL config schema (all trl: options)
-
-docs/grpo.qmd                     # Full user docs: async, rewards, scaling, config reference
-docs/vllm_serving.qmd             # vLLM server modes, LoRA sync, weight sync
-```
--- a/docs/agents/model_architectures.md
+++ b/docs/agents/model_architectures.md
@@ -1,198 +0,0 @@
-# Model Architectures — Agent Reference
-
-Model-specific quirks, required settings, and known issues. Check this before debugging training failures on specific model families.
-
-## VLM (Vision Language Model) Quick Start
-
-All VLM configs require these four lines:
-```yaml
-processor_type: AutoProcessor
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-```
-
-Decision tree for VLM config:
-```text
-Is the model multimodal (has vision/audio encoder)?
-  ├─ YES: Add `freeze_mm_modules: true` if training text only
-  │       Add `chat_template: <model_template>` (e.g. gemma4, qwen3_5, gemma3)
-  │       LoRA: use regex `lora_target_modules` to restrict to language model
-  └─ NO: Train as a regular text model
-
-Is the model MoE (e.g. Gemma4 26B-A4B, Qwen3.5 35B-A3B)?
-  ├─ YES: Add `lora_target_parameters` for expert LoRA
-  │       Consider ScatterMoE kernels (see Plugins section)
-  └─ NO: Standard LoRA config
-```
-
-## Plugins & Optimizations
-
-### Cut Cross Entropy (CCE)
-
-Computes loss from hidden states + lm_head weight without materializing the full logits tensor, saving significant VRAM. Install if not already present:
-
-```bash
-uv pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@main"
-```
-
-```yaml
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-```
-
-### ScatterMoE Kernels
-
-Fuses expert + LoRA computation into a single kernel for MoE models. Significant speedup for models with many experts.
-
-```yaml
-plugins:
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-
-# Expert LoRA targets (3D parameter tensors, not nn.Linear):
-lora_target_parameters:
-  - experts.gate_up_proj
-  - experts.down_proj
-```
-
-Supported: Gemma4 (`gemma4_text`), Mixtral, Qwen MoE variants. The plugin auto-detects model type and routing function. Without ScatterMoE, expert LoRA still works but runs base expert matmul and LoRA as separate operations.
-
-## Gemma 4
-
-**Models**: `google/gemma-4-26B-A4B` (MoE), `google/gemma-4-31B` (dense), `google/gemma-4-E2B`, `google/gemma-4-E4B`
-
-**Architecture**: Multimodal wrapper (`Gemma4ForConditionalGeneration`) over a text backbone (`Gemma4TextModel`), with optional vision/audio encoders. All Gemma4 HF repos have `model_type: "gemma4"` — even text-only variants load as multimodal with a vision tower.
-
-### Required settings
-
-```yaml
-# Always needed for Gemma4:
-freeze_mm_modules: true          # Freeze vision/audio encoders for text-only training
-gradient_checkpointing_kwargs:
-  use_reentrant: false           # Shared per-layer norms cause "marked ready twice" with reentrant
-
-# LoRA target — restrict to language model only (DO NOT use lora_target_linear: true):
-lora_target_modules: 'model.language_model.layers.[\d]+.(_checkpoint_wrapped_module.)?(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'
-```
-
-### Auto-detection
-
-Axolotl auto-detects Gemma4 and applies:
-- `use_reentrant: false` for gradient checkpointing
-- `ddp_find_unused_parameters: true` for DDP (skipped when `activation_offloading: true`)
-
-### Multi-GPU
-
-| Strategy | Works? | Notes |
-|----------|--------|-------|
-| DDP | Yes | Auto-sets `ddp_find_unused_parameters=True` |
-| DDP + activation_offloading | Yes | `find_unused_parameters` is skipped (conflicts with checkpoint wrappers) |
-| FSDP1 | No | OOM during dequantization/sharding with QLoRA |
-| FSDP2 | Yes | Use `Gemma4TextDecoderLayer` (not `Gemma4DecoderLayer`) as wrap class |
-| FSDP2 + activation_offloading | Yes | Lowest VRAM (~26 GiB/GPU for 26B-A4B) |
-
-FSDP2 config:
-```yaml
-fsdp:
-  - full_shard
-  - auto_wrap
-fsdp_config:
-  fsdp_version: 2
-  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
-  fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer
-```
-
-### MoE (26B-A4B)
-
-- `enable_moe_block: true`, 256 experts, top-k routing
-- No separate `SparseMoeBlock` — MoE is embedded in each decoder layer
-- Expert LoRA targets 3D parameter tensors:
-  ```yaml
-  lora_target_parameters:
-    - experts.gate_up_proj
-    - experts.down_proj
-  ```
-- ScatterMoE kernel acceleration:
-  ```yaml
-  plugins:
-    - axolotl.integrations.kernels.KernelsPlugin
-  use_kernels: true
-  use_scattermoe: true
-  experts_implementation: scattermoe
-  ```
-
-### VLM (Vision) Training
-
-All Gemma4 models load as `Gemma4ForConditionalGeneration` with a vision tower. No custom `ProcessingStrategy` needed — the base class auto-detects the image token.
-
-```yaml
-base_model: google/gemma-4-E2B-it   # or E4B-it, 26B-A4B
-processor_type: AutoProcessor
-freeze_mm_modules: true
-chat_template: gemma4
-
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-```
-
-A starting VLM loss of ~8-15 is typical. In most runs, loss converges below 1.0 within ~30-50 steps, though results may vary across configurations.
-
-For the 26B-A4B MoE variant with ScatterMoE + expert LoRA + CCE, add:
-```yaml
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-lora_target_parameters:
-  - experts.gate_up_proj
-  - experts.down_proj
-```
-
-### Common issues
-
-| Symptom | Cause | Fix |
-|---------|-------|-----|
-| `mm_token_type_ids is required` in DDP | `model.config` not accessible through DDP wrapper | Already fixed — `unwrap_model()` in `compute_loss` and `prediction_step` |
-| `marked a variable ready twice` in DDP | `ddp_find_unused_parameters=True` + activation_offloading checkpoint wrappers | Auto-handled — `find_unused_parameters` is skipped when `activation_offloading: true` |
-| Loss ~12 instead of ~0.5 | Using `lora_target_linear: true` (applies LoRA to vision/audio modules) | Use the regex `lora_target_modules` pattern instead |
-| FSDP2 `Could not find Gemma4AudioLayer` | Auto-wrap detects `_no_split_modules` including audio layers that don't exist | Explicitly set `fsdp_transformer_layer_cls_to_wrap: Gemma4TextDecoderLayer` |
-| `Gemma4ClippableLinear not supported` by PEFT | Vision tower uses a non-standard linear wrapper | Axolotl patches this automatically via `_patch_peft_clippable_linear()` |
-
-### E2B/E4B dense models
-
-These have `hidden_size_per_layer_input: 256` (per-layer input embeddings) and `attention_k_eq_v: False`. Known issue: loss starts higher than expected (~12 vs ~0.5 for 26B). Root cause under investigation — may be related to the per-layer input mechanism or the `Gemma4ForConditionalGeneration` loss computation.
-
-## Gemma 3
-
-**Models**: `google/gemma-3-*`
-
-- `ddp_find_unused_parameters: true` needed (multimodal unused params)
-- `use_reentrant: false` recommended
-- Attention mask must be dropped for sample packing (handled automatically)
-- Multi-GPU test currently skipped (`tests/e2e/multigpu/test_gemma3.py`)
-
-## Qwen 3.5 MoE
-
-**Models**: `Qwen/Qwen3.5-35B-A3B`
-
-- Hybrid architecture: DeltaNet linear attention (30 layers) + full attention (10 layers)
-- 256 experts, 8 active per token
-- Known weight scale drift in late DeltaNet layers (36-38) due to AdamW + rare expert interaction
-- Fix: `normalize_weight_scales` config to detect and rescale outliers:
-  ```yaml
-  normalize_weight_scales:
-    - name_pattern: 'linear_attn\.conv1d\.weight'
-      threshold: 1.3
-  ```
-
-## General MoE Notes
-
-- `lora_target_linear: true` with multimodal MoE models will apply LoRA to ALL linear modules including vision/audio encoders — use regex `lora_target_modules` to restrict to language model only
-- Rare experts get larger effective learning rate from AdamW (small second-moment estimates) — can cause weight drift in recurrent/SSM components. Use `normalize_weight_scales` with `dry_run: true` to detect.
-- For ScatterMoE kernel support, set `experts_implementation: scattermoe` and add the KernelsPlugin
--- a/docs/agents/new_model_support.md
+++ b/docs/agents/new_model_support.md
@@ -1,181 +0,0 @@
-# New Model Support — Agent Reference
-
-Guide for debugging and adding support for new model architectures in axolotl. Based on lessons learned from Gemma4, Gemma3, Qwen2-VL, and other multimodal/MoE models.
-
-## Quick Validation Checklist
-
-When testing a new model, run through these checks in order:
-
-1. **Does the model load?** `axolotl preprocess config.yaml` — catches config schema errors
-2. **Does LoRA apply?** Check for "Unsupported layer type" warnings from PEFT
-3. **Is the initial loss sane?** First-step loss for a pretrained model should be 0.5–2.0 for SFT
-4. **Does sample packing work?** Compare loss with `sample_packing: true` vs `false` — should be similar
-5. **Is CCE active?** Check for "Applying Cut Cross Entropy" log and verify peak VRAM is lower
-
-## Loss Debugging
-
-### Expected initial loss
-A pretrained model doing SFT should start with loss roughly in the 0.5–2.0 range. If loss starts above 3.0, something is wrong. If it's near `log(vocab_size)` (≈ 12 for 262K vocab), the model is predicting at random — attention masking or model weights are broken.
-
-### Direct comparison technique
-The fastest way to isolate a loss issue — bypass the trainer entirely:
-
-```python
-# Load model via axolotl's pipeline (applies all patches)
-from axolotl.cli.config import load_cfg
-from axolotl.utils.config import normalize_config, prepare_plugins
-from axolotl.loaders.tokenizer import load_tokenizer
-from axolotl.loaders.model import ModelLoader
-
-cfg = load_cfg("your_config.yaml")
-normalize_config(cfg)
-prepare_plugins(cfg)
-tokenizer = load_tokenizer(cfg)
-model, _ = ModelLoader(cfg, tokenizer).load()
-
-# Forward pass on preprocessed data
-model.train()
-out = model(input_ids, labels=labels)
-print(f"Direct loss: {out.loss.item()}")  # Compare to trainer's reported loss
-```
-
-If direct loss is correct (~1.0) but trainer reports 3–4x higher, check `model_accepts_loss_kwargs` (see below).
-
-### `model_accepts_loss_kwargs` inflation
-HF Trainer checks if the model's `forward()` has `**kwargs` and sets `model_accepts_loss_kwargs=True`. This changes loss normalization: the trainer does NOT divide loss by `gradient_accumulation_steps` before logging. The gradient is correct — only the logged loss is inflated.
-
-**Symptom**: Logged loss ≈ actual_loss × gradient_accumulation_steps.
-
-**Which models are affected**: Any model with `**kwargs` in forward (common in multimodal models for extra inputs like `mm_token_type_ids`, `pixel_values`, etc.).
-
-**Fix location**: `src/axolotl/core/trainers/base.py` `__init__()` — after `super().__init__()`, check if the unwrapped model actually has `num_items_in_batch` in its forward signature. If not, set `self.model_accepts_loss_kwargs = False`.
-
-## Multimodal Models (ForConditionalGeneration)
-
-Many recent models use `ForConditionalGeneration` as the top-level class, not `ForCausalLM`:
-- Gemma3 → `Gemma3ForConditionalGeneration`
-- Gemma4 → `Gemma4ForConditionalGeneration`
-- Qwen2-VL → `Qwen2VLForConditionalGeneration`
-- LLaVA → `LlavaForConditionalGeneration`
-
-### Why this matters
-
-| Component | Targets `ForCausalLM` | Needs `ForConditionalGeneration` |
-|-----------|----------------------|--------------------------------|
-| CCE patches | ✅ (default) | ❌ silently inactive if not patched |
-| PEFT LoRA | ✅ | May fail on custom layer types |
-| HF Trainer label handling | ✅ | May need extra inputs |
-
-### Required extra inputs
-Multimodal models require special inputs during training even for text-only data:
-
-| Model | Required Input | Value for Text-Only |
-|-------|---------------|-------------------|
-| Gemma4 | `mm_token_type_ids` | `torch.zeros_like(input_ids)` |
-| Gemma3 | `token_type_ids` | `torch.zeros_like(input_ids)` |
-
-Auto-inject in `compute_loss()` when not provided by the data collator. See `core/trainers/base.py`.
-
-### Custom layer types and PEFT
-Vision towers often use custom module wrappers that PEFT doesn't support:
-
-| Model | Custom Layer | Wraps | Fix |
-|-------|-------------|-------|-----|
-| Gemma4 | `Gemma4ClippableLinear` | `nn.Linear` | Redirect to `.linear` child |
-
-Fix location: `src/axolotl/loaders/adapter.py` `_patch_peft_clippable_linear()`.
-
-## Sample Packing
-
-### How packed sequence detection works (transformers ≥ 5.x)
-`transformers.masking_utils._preprocess_mask_arguments()` detects packed sequences from `position_ids` resets. But **only when `attention_mask is None`**:
-
-```python
-# From masking_utils.py:
-if position_ids is not None and attention_mask is None and past_key_values is None:
-    packed_sequence_mask = find_packed_sequence_indices(position_ids)
-```
-
-If the collator provides an all-ones `attention_mask`, packing detection is **skipped** and the model builds a single causal mask spanning all packed sequences → cross-sequence attention leakage → very high loss.
-
-### Fix for models using `create_causal_mask_mapping`
-For Gemma3, Gemma4, and similar models that use the new transformers masking system, remove `attention_mask` from inputs when sample packing is active:
-
-```python
-# In compute_loss():
-if (
-    self.args.sample_packing
-    and model_type in ("gemma4", "gemma3")
-    and "attention_mask" in inputs
-    and "position_ids" in inputs
-):
-    del inputs["attention_mask"]
-```
-
-Fix location: `src/axolotl/core/trainers/base.py` `compute_loss()`.
-
-### Models that DON'T need this fix
-Older models that use `_prepare_4d_causal_attention_mask` (Llama, Mistral, Qwen2, etc.) handle sample packing via axolotl's multipack attention monkeypatch instead. Only models using the new `create_causal_mask_mapping` / `create_causal_mask` masking system need the `attention_mask` removal.
-
-## Attention Backend Selection
-
-| Backend | Config | head_dim limit | torch_compile | Notes |
-|---------|--------|---------------|---------------|-------|
-| FA2 | `attn_implementation: flash_attention_2` | 256 | ✅ | Fastest when supported |
-| FA4 | auto with `attn_implementation: flash_attention_2` | 256 (SM90+) | ✅ | Auto-detected on H100+ |
-| SDPA | `attn_implementation: sdpa` | None | ✅ | Universal fallback |
-| flex | `attn_implementation: flex_attention` | None | ⚠️ Triton OOM for large head_dim | Good for variable head dims |
-| eager | `attn_implementation: eager` | None | ✅ | Slowest, always works |
-
-**Check model support**: Look at `_supports_flash_attn_2`, `_supports_flex_attn`, `_supports_sdpa` attributes on the model class.
-
-**head_dim gotcha**: The 256 limit is specific to flash-attn CUDA kernels, NOT PyTorch-level. SDPA and flex_attention both handle arbitrary head_dim. Models with `global_head_dim > 256` (Gemma4: 512) must use SDPA or flex.
-
-**flex + compile gotcha**: `torch_compile` with flex_attention can hit Triton shared memory OOM for large head_dim. Falls back to eager per-function (not a crash, but slower). Unsloth disables flex for Gemma4 for this reason.
-
-## Cut Cross Entropy (CCE)
-
-### How CCE patches work
-CCE replaces the model's `forward()` with a fused version that computes loss from hidden states + lm_head weight without materializing the full logits tensor. This saves ~`batch × seq_len × vocab_size × dtype_bytes` of VRAM.
-
-### Adding CCE for a new model
-1. Check if the model type is in `cut_cross_entropy.transformers.patch.PATCH_FNS`
-2. If not, axolotl's generic fallback (`integrations/cut_cross_entropy/__init__.py` `patch_llama_like()`) patches `{Prefix}ForCausalLM.forward` with `cce_forward`
-3. For multimodal models (`ForConditionalGeneration`), a model-specific patch is needed in `ml-cross-entropy` repo
-4. The multimodal `cce_forward` must accept all extra kwargs (pixel_values, mm_token_type_ids, etc.) and pop any that would conflict before calling `self.model()`
-
-### Common CCE pitfall
-If CCE appears active (log says "Applying Cut Cross Entropy") but peak VRAM doesn't decrease, check which class was patched. If the model loads as `ForConditionalGeneration` but CCE patched `ForCausalLM`, the patch is silently inactive.
-
-## MoE Models
-
-### Dense MLP vs MoE experts
-Some MoE models (e.g., Gemma4) have BOTH dense MLP layers and MoE expert layers at every decoder layer:
-- `gate_proj/up_proj/down_proj` → targets the **dense MLP** (`Gemma4TextMLP`)
-- `experts.gate_up_proj/experts.down_proj` → targets the **MoE experts** (`Gemma4TextExperts`)
-
-LoRA on the dense MLP works normally. Expert LoRA via `lora_target_parameters` requires PEFT support for the specific expert module type (may warn "Unsupported layer type").
-
-### ScatterMoE kernels
-`use_scattermoe: true` with `experts_implementation: scattermoe` registers fused expert kernels via transformers' `ExpertsInterface`. Significant speedup for MoE models. Requires the kernels plugin:
-```yaml
-plugins:
-  - axolotl.integrations.kernels.KernelsPlugin
-use_kernels: true
-use_scattermoe: true
-experts_implementation: scattermoe
-```
-
-## Where to Add Model-Specific Fixes
-
-| What | Where | Example |
-|------|-------|---------|
-| Missing forward inputs | `core/trainers/base.py` `compute_loss()` | mm_token_type_ids injection |
-| Attention mask fixes | `core/trainers/base.py` `compute_loss()` | Sample packing mask removal |
-| Loss logging fixes | `core/trainers/base.py` `__init__()` | model_accepts_loss_kwargs override |
-| PEFT/LoRA patches | `loaders/adapter.py` | ClippableLinear redirect |
-| Attention patches | `monkeypatch/attention/` | FA4 tuple fix |
-| Model-specific patches | `loaders/patch_manager.py` `_apply_model_specific_patches()` | Llama4, Kimi, NemotronH |
-| CCE patches | `ml-cross-entropy` repo `transformers/` | Per-model cce_forward |
-| Example configs | `examples/<model>/` | Validated YAML |
-| Config validation | `utils/schemas/validation.py` | Compatibility checks |
--- a/docs/agents/preference_tuning.md
+++ b/docs/agents/preference_tuning.md
@@ -1,121 +0,0 @@
-# Preference Learning (RLHF) — Agent Reference
-
-Reference for DPO, IPO, KTO, ORPO, and SimPO. For config templates and dataset format examples, see [rlhf.qmd](../rlhf.qmd). For GRPO, see [grpo.qmd](../grpo.qmd). For EBFT, see [ebft.qmd](../ebft.qmd).
-
-## Method Overview
-
-| Method | Data Requirement | Key Idea | Best For |
-|--------|-----------------|----------|----------|
-| **DPO** | Paired (chosen + rejected) | Implicit reward via preference pairs | General alignment, most common |
-| **IPO** | Paired (chosen + rejected) | DPO with different loss (avoids overfitting) | When DPO overfits |
-| **KTO** | Unpaired (completion + binary label) | Kahneman-Tversky loss, no pairs needed | When you only have thumbs-up/down |
-| **ORPO** | Paired (chosen + rejected) | Combined SFT + preference, no ref model | Single-stage alignment, saves VRAM |
-| **SimPO** | Paired (chosen + rejected) | Length-normalized, no ref model | Simple setup, length-robust |
-
-Default: start with DPO. All methods require `sample_packing: false`.
-
-## Architecture
-
-```
-┌──────────────┐   ┌───────────────┐   ┌───────────────┐
-│ Policy Model │   │ Reference     │   │ Preference    │
-│ (trainable)  │   │ Model (frozen)│   │ Dataset       │
-└──────┬───────┘   └──────┬────────┘   └──────┬────────┘
-       └──────────┬───────┘                    │
-                  v                            │
-       Forward pass on chosen + rejected <─────┘
-                  │
-       Preference Loss (DPO/IPO/KTO/...)
-                  │
-       Backprop + Update
-
-Exception: ORPO and SimPO do NOT use a reference model (~50% less VRAM).
-```
-
-No vLLM server needed (unlike GRPO). Offline RL with pre-collected preference data.
-
-## Method Selection
-
-1. Paired preference data (chosen + rejected)?
-   - Default → `rl: dpo`
-   - Overfitting → `rl: dpo, dpo_loss_type: ["ipo"]`
-   - VRAM-limited → `rl: orpo` (no ref model)
-   - Length-sensitive → `rl: simpo` (no ref model)
-2. Only binary labels (good/bad)? → `rl: kto`
-3. Single-stage training (no separate SFT)? → `rl: orpo`
-
-| | DPO | IPO | KTO | ORPO | SimPO |
-|---|---|---|---|---|---|
-| **Reference model** | Yes | Yes | Yes | No | No |
-| **VRAM overhead** | ~2x model | ~2x model | ~2x model | ~1x model | ~1x model |
-| **TRL trainer class** | DPOTrainer | DPOTrainer | KTOTrainer | ORPOTrainer | CPOTrainer |
-
-## Prompt Strategy Resolution
-
-The `type` field resolves to a Python function:
-
-```
-type: "chatml.intel"
-  → axolotl.prompt_strategies.dpo.chatml.intel(cfg, **kwargs)
-  → returns transform_fn(sample) → {"prompt", "chosen", "rejected"}
-
-type: "chat_template.default"
-  → axolotl.prompt_strategies.dpo.chat_template.default(cfg, dataset_idx, **kwargs)
-
-type: {"field_prompt": "prompt", ...}   (dict)
-  → axolotl.prompt_strategies.dpo.user_defined.default(...)
-```
-
-Module base: `axolotl.prompt_strategies.{rl_method}` — replace `dpo` with `kto` or `orpo`.
-
-## Healthy Training Indicators
-
-| Metric | Healthy Range | Problem |
-|--------|--------------|---------|
-| `train/loss` | Decreasing, 0.3-0.7 | Flat or increasing = broken data or LR too high |
-| `rewards/chosen` | Increasing | Flat = model not learning preferences |
-| `rewards/rejected` | Decreasing | Increasing = model prefers wrong responses |
-| `rewards/margins` | Positive and increasing | Negative = prefers rejected over chosen |
-| `rewards/accuracies` | > 0.5, toward 0.7+ | < 0.5 = worse than random |
-| `logps/rejected` | Decreasing | Increasing = reward hacking |
-| `grad_norm` | 0.01 - 10.0 | > 100 = exploding gradients |
-
-Method-specific: DPO/IPO watch `rewards/margins`; KTO loss is noisier; ORPO monitor SFT + odds ratio components; SimPO check length-normalized reward separation.
-
-## Known Issues
-
-| Issue | Fix |
-|-------|-----|
-| Sample packing crash | Set `sample_packing: false` (required for all preference methods) |
-| KTO `KeyError: 'label'` | Ensure dataset has boolean `label` column |
-| ORPO/KTO `KeyError` during tokenization | Add `remove_unused_columns: false` |
-| ORPO template not applied | ORPO requires explicit `chat_template` setting |
-| OOM with ref model (DPO/IPO/KTO) | Use LoRA/QLoRA, or switch to ORPO/SimPO (no ref model) |
-| IPO + label_smoothing | Do not set `dpo_label_smoothing` when `rl: ipo` |
-
-Full troubleshooting: [training_stability.qmd](../training_stability.qmd)
-
-## File Map
-
-```
-src/axolotl/
-  core/trainers/dpo/              # DPO trainer, args, strategy
-  core/builders/rl.py             # HFRLTrainerBuilder — routes rl type → trainer class
-  core/training_args.py           # AxolotlKTOConfig, AxolotlORPOConfig, AxolotlCPOConfig
-  prompt_strategies/
-    dpo/                          # DPO/IPO/SimPO dataset strategies
-      chat_template.py            # chat_template.default, chat_template.argilla_chat
-      chatml.py                   # chatml.default/intel/icr/argilla_chat/prompt_pairs/ultra
-      llama3.py                   # llama3 variants (same subtypes as chatml)
-      user_defined.py             # Custom field mapping
-      passthrough.py              # No transform
-    kto/                          # KTO dataset strategies (chatml, llama3, user_defined)
-    orpo/                         # ORPO dataset strategies (chat_template.argilla)
-  utils/schemas/enums.py          # RLType enum (dpo, ipo, kto, orpo, simpo, grpo, gdpo, ebft)
-  utils/schemas/config.py         # All rl/dpo/kto/orpo/simpo config fields
-
-docs/rlhf.qmd                    # Full user docs: all dataset formats, config templates
-docs/choosing_method.qmd          # SFT vs DPO vs GRPO decision guide
-examples/qwen2/dpo.yaml           # DPO example
-examples/llama-3/qlora-1b-kto.yaml  # KTO example
-```
--- a/docs/agents/pretraining.md
+++ b/docs/agents/pretraining.md
@@ -1,75 +0,0 @@
-# Pretraining / Continual Pretraining — Agent Reference
-
-Train on raw text with no input masking. Two approaches depending on dataset size.
-
-## When to Use
-
-- Continual pretraining on domain-specific corpora
-- Adapting a base model to a new language or domain before fine-tuning
-- Pretraining-style data where the entire text is the training signal
-
-## Choosing an Approach
-
-| | Non-streaming (`type: completion`) | Streaming (`pretraining_dataset`) |
-|---|---|---|
-| **Dataset size** | Fits in memory | Too large to fit in memory |
-| **Tokenization** | Pre-tokenized before training | On-demand during training |
-| **Config key** | `datasets:` | `pretraining_dataset:` |
-| **Long text handling** | Splits texts exceeding `sequence_len` | Concatenates into fixed-length sequences |
-| **Benefit** | Can preprocess on CPU, transfer to GPU | Start training immediately, no preprocessing |
-
-## Non-Streaming: `type: completion`
-
-For smaller datasets that fit in memory. Pre-tokenizes the entire dataset.
-
-```yaml
-datasets:
-  - path: my_corpus
-    type: completion
-    # field: text              # Column name (default: "text")
-```
-
-## Streaming: `pretraining_dataset`
-
-For large corpora. Streams data on-demand without loading everything into memory.
-
-```yaml
-pretraining_dataset:
-  - path: HuggingFaceFW/fineweb-edu
-    type: pretrain
-    text_column: text
-    split: train
-
-max_steps: 1000                          # Required — axolotl can't infer dataset size
-streaming_multipack_buffer_size: 10000   # Buffer for sample packing
-pretrain_multipack_attn: true            # Prevent cross-attention between packed samples
-```
-
-`max_steps` is required for streaming — one step = `sequence_len * micro_batch_size * gradient_accumulation_steps * num_gpus` tokens.
-
-Full streaming docs: [streaming.qmd](../streaming.qmd)
-
-## Dataset Format
-
-```json
-{"text": "The complete document text goes here."}
-```
-
-## Key Settings
-
-- `sample_packing: true` + `pad_to_sequence_len: true` — pack documents into fixed-length sequences
-- `flash_attention: true` — required for sample packing
-- No adapter — typically full fine-tune for pretraining
-- `train_on_inputs: true` — default for completion (all tokens trained on)
-
-## File Map
-
-```
-src/axolotl/
-  prompt_strategies/completion.py    # Non-streaming: completion prompt strategy (no masking)
-  utils/data/sft.py                  # Non-streaming: dataset loading and processing
-  utils/data/streaming.py            # Streaming: encode_streaming(), wrap_streaming_dataset()
-  utils/schemas/config.py            # Config fields: pretraining_dataset, pretrain_multipack_attn, etc.
-
-examples/streaming/pretrain.yaml     # Full streaming pretraining example config
-```
--- a/docs/agents/reward_modelling.md
+++ b/docs/agents/reward_modelling.md
@@ -1,48 +0,0 @@
-# Reward Modelling — Agent Reference
-
-Train models to score responses for use as reward signals in RL. For full docs, see [reward_modelling.qmd](../reward_modelling.qmd).
-
-## Types
-
-### Outcome Reward Models (ORM)
-
-Train a classifier to predict preference over entire interactions. Uses `AutoModelForSequenceClassification`.
-
-```yaml
-base_model: google/gemma-2-2b
-model_type: AutoModelForSequenceClassification
-num_labels: 1
-reward_model: true
-chat_template: gemma
-datasets:
-  - path: argilla/distilabel-intel-orca-dpo-pairs
-    type: bradley_terry.chat_template
-```
-
-Dataset format: `{"system": "...", "input": "...", "chosen": "...", "rejected": "..."}`
-
-### Process Reward Models (PRM)
-
-Train a token classifier to score each reasoning step. Uses `AutoModelForTokenClassification`.
-
-```yaml
-base_model: Qwen/Qwen2.5-3B
-model_type: AutoModelForTokenClassification
-num_labels: 2
-process_reward_model: true
-datasets:
-  - path: trl-lib/math_shepherd
-    type: stepwise_supervised
-```
-
-Dataset format: see [stepwise_supervised.qmd](../dataset-formats/stepwise_supervised.qmd).
-
-## File Map
-
-```
-src/axolotl/
-  core/builders/causal.py                    # Handles reward_model flag in trainer builder
-  prompt_strategies/bradley_terry/           # Bradley-Terry prompt strategies
-  prompt_strategies/stepwise_supervised.py   # PRM dataset strategy
-  utils/schemas/config.py                    # reward_model, process_reward_model config fields
-```
--- a/docs/agents/sft.md
+++ b/docs/agents/sft.md
@@ -1,139 +0,0 @@
-# SFT — Agent Reference
-
-Supervised fine-tuning pipeline reference. For config templates and dataset format examples, see [getting-started.qmd](../getting-started.qmd) and [dataset-formats/](../dataset-formats/).
-
-## Architecture
-
-```
-YAML Config → axolotl train config.yaml
-
-  1. Load base model (+ quantization if QLoRA/8-bit)
-  2. Apply adapter layers (LoRA/QLoRA) if configured
-  3. Load + tokenize dataset(s)
-     - Apply prompt template (chat_template / alpaca / custom)
-     - Mask inputs (train_on_inputs: false)
-     - Pack samples into sequences (sample_packing: true)
-  4. Training loop (HuggingFace Trainer)
-     - forward → loss → backward → optimizer step → lr scheduler step
-  5. Save model / adapter weights + tokenizer
-
-Multi-GPU: FSDP or DeepSpeed shards model across GPUs automatically.
-```
-
-## Components Required
-
-1. A YAML config — model, dataset(s), adapter settings, hyperparameters
-2. A dataset — HuggingFace Hub, local JSONL/JSON/Parquet, or S3/GCS path
-3. (Optional) A custom prompt strategy — for non-standard dataset formats
-
-No external server processes needed (unlike GRPO which requires vLLM).
-
-## Dataset Format Decision Tree
-
-```
-Is your data in chat/message format?
-  ├─ YES: OpenAI message format (role/content)?
-  │   ├─ YES ──────────────────────> type: chat_template  (recommended)
-  │   └─ NO (custom field names) ──> type: chat_template + message_property_mappings
-  └─ NO: Instruction/response pairs?
-      ├─ YES ──> type: alpaca       (instruction, input, output)
-      └─ NO: Raw text?
-          ├─ YES with segments ─────> type: input_output  (template-free masking)
-          └─ YES continuous ────────> type: completion     (pretraining-style)
-```
-
-Full format specs: [dataset-formats/](../dataset-formats/)
-
-## Model Size to Adapter Choice
-
-| Model Size | LoRA | QLoRA (4-bit) | Full Fine-Tune | VRAM (approx) |
-|-----------|------|---------------|----------------|---------------|
-| 1-3B | Preferred | Low-budget option | Single GPU OK | 8-16 GB (LoRA) |
-| 7-8B | Preferred | Good balance | Needs multi-GPU | 16-24 GB (LoRA) |
-| 13-14B | Preferred | Good balance | Multi-GPU required | 24-40 GB (LoRA) |
-| 30-70B | LoRA or QLoRA | Preferred for single GPU | Multi-node | 40-80 GB (QLoRA) |
-
-## Hyperparameter Ranges
-
-| Parameter | LoRA | QLoRA | Full FT |
-|-----------|------|-------|---------|
-| `learning_rate` | 1e-4 to 3e-4 | 1e-4 to 3e-4 | 1e-5 to 5e-5 |
-| `lora_r` | 16-64 | 16-64 | N/A |
-| `lora_alpha` | 1-2x `lora_r` | 1-2x `lora_r` | N/A |
-| `micro_batch_size` | 2-8 | 2-4 | 1-2 |
-| `gradient_accumulation_steps` | 2-8 | 4-16 | 4-16 |
-| `num_epochs` | 1-3 | 1-3 | 1-3 |
-| `optimizer` | `adamw_8bit` | `adamw_bnb_8bit` | `adamw_torch_fused` |
-
-Effective batch = micro_batch * grad_accum * num_gpus. Lower LR for larger models.
-
-## Healthy Training Indicators
-
-| Metric | Healthy | Problem |
-|--------|---------|---------|
-| `train_loss` | Decreasing, starting ~2-4 for chat models | Flat or increasing from step 1 — data or LR issue |
-| `eval_loss` | Decreasing, tracks train_loss | Increasing while train_loss decreases — overfitting |
-| `grad_norm` | 0.1-10, relatively stable | Spikes >100 — instability. 0.0 — frozen weights |
-| `learning_rate` | Follows scheduler curve | Flat or NaN — config issue |
-
-Watch for: loss never decreasing (check `train_on_inputs`, dataset, LR), loss goes to 0 quickly (overfitting), eval_loss diverging (reduce epochs, add regularization). See [training_stability.qmd](../training_stability.qmd).
-
-## Known Issues
-
-| Issue | Fix |
-|-------|-----|
-| OOM during training | Reduce `micro_batch_size`, enable `gradient_checkpointing`, reduce `sequence_len` |
-| `sample_packing` + SDPA + bf16 = 0.0 loss | Use `attn_implementation: flash_attention_2` or disable `sample_packing` |
-| Missing chat template error | Set `chat_template: chatml` explicitly |
-| Label masking wrong | Run `axolotl preprocess config.yaml --debug` and inspect labels |
-| Loss NaN | Use `bf16: auto`, lower LR, check data for empty samples |
-| Tokenizer pad token / infinite loss | Set `special_tokens: pad_token: "<\|end_of_text\|>"` |
-| FSDP save hangs | Use `fsdp_state_dict_type: FULL_STATE_DICT` |
-| DeepSpeed CheckpointError | Set `use_reentrant: true` in `gradient_checkpointing_kwargs` |
-
-## Profiling
-
-To profile training and identify optimization opportunities:
-
-```yaml
-# Profile steps 3-7 (after warmup/autotuning settles)
-profiler_steps_start: 3
-profiler_steps: 5
-```
-
-This produces `profiler_trace.json` (Chrome trace) and `snapshot.pickle` (memory snapshot) in `output_dir`.
-View the Chrome trace at `chrome://tracing`.
-
-To programmatically inspect the trace:
-```bash
-python scripts/analyze_profile.py output_dir/
-```
-
-The trace shows per-kernel CUDA times, memory allocations, and operator-level breakdown. Look for:
-- **Large matmul kernels**: candidates for fusion or quantization
-- **Memory copies (H2D/D2H)**: unnecessary data movement
-- **Small frequent kernels**: candidates for kernel fusion
-- **Gaps between kernels**: pipeline bubbles from CPU overhead
-
-Full troubleshooting: [training_stability.qmd](../training_stability.qmd), [debugging.qmd](../debugging.qmd)
-
-## File Map
-
-```
-src/axolotl/
-  cli/train.py                     # Entry point for `axolotl train`
-  cli/preprocess.py                # Entry point for `axolotl preprocess`
-  core/builders/causal.py          # HFCausalTrainerBuilder — wires config → SFT trainer
-  core/trainers/base.py            # AxolotlTrainer — base trainer class
-  core/trainers/mixins/            # Packing, optimizer, scheduler, checkpoints
-  prompt_strategies/               # Format handlers: chat_template, alpaca, completion, input_output
-  utils/schemas/config.py          # AxolotlInputConfig — main config schema
-  utils/schemas/datasets.py        # SFTDataset, DatasetConfig
-  utils/schemas/peft.py            # LoraConfig — LoRA parameters
-  integrations/liger/              # Liger kernel plugin
-
-examples/llama-3/                  # LoRA, QLoRA, full FT example configs
-docs/getting-started.qmd           # Quickstart with config templates
-docs/optimizations.qmd             # Flash attention, gradient checkpointing, sample packing
-docs/multi-gpu.qmd                 # FSDP and DeepSpeed setup
-```
--- a/docs/amd_hpc.qmd
+++ b/docs/amd_hpc.qmd
@@ -1,108 +0,0 @@
---
-title: AMD GPUs on HPC Systems
-description: A comprehensive guide for using Axolotl on distributed systems with AMD GPUs
---
-
-This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.
-
-## Setup
-
-### 1. Install Python
-
-We recommend using Miniforge, a minimal conda-based Python distribution:
-
-```bash
-curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
-bash Miniforge3-$(uname)-$(uname -m).sh
-```
-
-### 2. Configure Python Environment
-Add Python to your PATH and ensure it's available at login:
-
-```bash
-echo 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc
-echo 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile
-```
-
-### 3. Load AMD GPU Software
-
-Load the ROCm module:
-
-```bash
-module load rocm/5.7.1
-```
-
-Note: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.
-
-### 4. Install PyTorch
-
-Install PyTorch with ROCm support:
-
-```bash
-pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall
-```
-
-### 5. Install Flash Attention
-
-Clone and install the Flash Attention repository:
-
-```bash
-git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git
-export GPU_ARCHS="gfx90a"
-cd flash-attention
-export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
-patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
-pip install --no-build-isolation .
-```
-
-### 6. Install Axolotl
-
-Clone and install Axolotl:
-
-```bash
-git clone https://github.com/axolotl-ai-cloud/axolotl
-cd axolotl
-pip install packaging ninja
-pip install --no-build-isolation -e .
-```
-
-### 7. Apply xformers Workaround
-
-xformers appears to be incompatible with ROCm. Apply the following workarounds:
- - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return `False` for SwiGLU availability from xformers.
- - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the "SwiGLU" function with a pass statement.
-
-### 8. Prepare Job Submission Script
-
-Create a script for job submission using your HPC's particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute nodes do not have internet access, it is recommended to include:
-
-```bash
-export TRANSFORMERS_OFFLINE=1
-export HF_DATASETS_OFFLINE=1
-```
-
-### 9. Download Base Model
-
-Download a base model using the Hugging Face CLI:
-
-```bash
-hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
-```
-
-### 10. Create Axolotl Configuration
-
-Create an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.
-
-Note: DeepSpeed did not work at the time of testing. If you manage to get it working, please let us know.
-
-### 11. Preprocess Data
-
-Run preprocessing on the login node:
-
-```bash
-CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.yaml
-```
-
-### 12. Train
-
-You are now ready to submit your previously prepared job script. 🚂
--- a/docs/attention.qmd
+++ b/docs/attention.qmd
@@ -1,242 +0,0 @@
---
-title: Attention
-description: Supported attention modules in Axolotl
---
-
-Axolotl routes attention via a single config field:
-
-```yaml
-attn_implementation: <backend>
-```
-
-`attn_implementation` is passed through to `transformers` verbatim (via
-`model.config._attn_implementation`). Accepted values are the HF-native
-backends, axolotl-registered backends, or a hub-kernel path.
-
-## Backends
-
-| `attn_implementation` | Description |
-|---|---|
-| `eager` | Plain PyTorch attention. No packing support. |
-| `sdpa` | PyTorch `scaled_dot_product_attention`. No packing support. |
-| `flash_attention_2` | Dao-AILab Flash Attention 2. |
-| `flash_attention_3` | Dao-AILab Flash Attention 3 (Hopper+). |
-| `flex_attention` | Torch Flex Attention (requires torch ≥ 2.6). |
-| `xformers` | xFormers memory-efficient attention. |
-| `sage` | SageAttention (QK int8 / PV fp16). |
-| `s2` | Shifted-Sparse Attention (LLaMA only, FA2 under the hood). |
-| `fp8` | torchao FP8 low-precision attention (requires SM90+, torch ≥ 2.11). Loaded as SDPA and patched post-load. |
-| `kernels-community/flash-attn3` | HF hub FA3 kernel. |
-| `kernels-community/sage-attention` | HF hub SageAttention kernel. |
-| Other `<org>/<name>` path | Any hub-kernel path supported by `transformers`. |
-
-Short-form aliases (`flash`, `fa2`, `flex`, `sdp`, etc.) are **not accepted** —
-set the canonical name above.
-
-### Capability flags
-
-Axolotl derives three boolean capability flags from `attn_implementation` and
-exposes them on the validated config:
-
-- `cfg.attn_supports_packing` — backend supports varlen sample packing via
-  `position_ids`. Gates multipack patches and `sample_packing_drop_attention_mask`.
-- `cfg.attn_uses_flash_lib` — backend needs the `flash_attn` (Dao-AILab)
-  monkeypatches (FA4 auto, LLaMA flash hijack, ring-FA).
-- `cfg.attn_needs_dtype_cast` — backend requires fp16/bf16 embeddings
-  (everything except `eager` and `sdpa`).
-
-These are **computed** — they cannot be overridden from YAML.
-
-## Per-backend notes
-
-### SDPA
-
-Default PyTorch attention. See
-[PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html).
-
-```yaml
-attn_implementation: sdpa
-```
-
-### Flash Attention
-
-Axolotl supports FA2, FA3, and FA4. The best available version is used
-automatically based on your installed packages and GPU.
-
-```yaml
-attn_implementation: flash_attention_2  # or flash_attention_3
-```
-
-#### Flash Attention 2
-
-Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)
-
-```bash
-pip install flash-attn --no-build-isolation
-```
-
-::: {.callout-tip}
-
-If you get `undefined symbol` while training, ensure you installed PyTorch prior to Axolotl.
-Alternatively, try reinstalling `flash-attn` or downgrading to an earlier version.
-
-:::
-
-#### Flash Attention 3
-
-Requirements: Hopper only and CUDA 12.8 (recommended)
-
-```bash
-git clone https://github.com/Dao-AILab/flash-attention.git
-cd flash-attention/hopper
-python setup.py install
-```
-
-#### Flash Attention 4
-
-Requirements: Hopper or Blackwell GPUs. Auto-applied when `attn_uses_flash_lib`
-is true and FA4 is importable.
-
-FA4 is still a pre-release on PyPI, so `--pre` is required:
-
-```bash
-pip install --pre flash-attn-4
-```
-
-Or from source:
-
-```bash
-git clone https://github.com/Dao-AILab/flash-attention.git
-cd flash-attention/flash_attn/cute
-pip install -e .
-
-# FA2's flash_attn package includes a cute/ stub that shadows FA4.
-# Remove it so Python can find the real FA4 module:
-rm -r $(python -c "import flash_attn; print(flash_attn.__path__[0])")/cute
-```
-
-::: {.callout-note}
-
-**Hopper (SM90) users**: The backward kernel is not yet included in the pip package. To use FA4
-for training on Hopper, install from source using the instructions above.
-
-:::
-
-::: {.callout-warning}
-
-FA4 only supports head dimensions up to 128 (`d ≤ 128`). The DeepSeek shape `(192, 128)` is
-also supported but only on Blackwell. Axolotl automatically detects incompatible head dimensions
-and falls back to FA2/3.
-
-:::
-
-### AMD
-
-Requirements: ROCm 6.0 and above. See
-[Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
-
-### Flex Attention
-
-```yaml
-attn_implementation: flex_attention
-torch_compile: true  # recommended
-```
-
-Requires torch ≥ 2.6. See [PyTorch docs](https://pytorch.org/blog/flexattention/).
-
-### SageAttention
-
-Requirements: Ampere, Ada, or Hopper GPUs.
-
-```yaml
-attn_implementation: sage
-```
-
-```bash
-pip install sageattention==2.2.0 --no-build-isolation
-```
-
-::: {.callout-warning}
-
-Only LoRA/QLoRA recommended. Full finetuning has been observed to drop loss to 0. See
-[GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
-
-:::
-
-For more details: [Sage Attention](https://github.com/thu-ml/SageAttention).
-
-### xFormers
-
-```yaml
-attn_implementation: xformers
-```
-
-::: {.callout-tip}
-
-Recommended for Turing GPUs or below (e.g. Colab T4).
-
-:::
-
-### Shifted Sparse Attention
-
-::: {.callout-warning}
-
-Planned for deprecation. Prefer one of the backends above.
-
-:::
-
-Requirements: LLaMA model architecture. Loaded as FA2 under the hood and
-patched to implement shifted-sparse attention. Does not support sample packing.
-
-```yaml
-attn_implementation: s2
-```
-
-### FP8
-
-torchao low-precision attention. Loaded as SDPA and patched post-load.
-
-Requirements: SM90+ (Hopper/Blackwell), PyTorch ≥ 2.11, torchao ≥ 0.17,
-flash-attn with FA3. KV caching must be disabled.
-
-```yaml
-attn_implementation: fp8
-```
-
-### Hub kernels
-
-```yaml
-attn_implementation: kernels-community/flash-attn3
-```
-
-Passed through to `transformers`; axolotl does not install the kernel itself.
-For recognized hub paths the capability flags are set automatically; for
-arbitrary paths axolotl uses conservative defaults (`attn_supports_packing=False`,
-`attn_uses_flash_lib=False`).
-
-## Migrating from legacy boolean flags
-
-The following legacy config fields are **deprecated** and will be removed in a
-future release. Each emits a `DeprecationWarning` when set and is stripped from
-the validated config.
-
-| Legacy | Canonical |
-|---|---|
-| `flash_attention: true` | `attn_implementation: flash_attention_2` |
-| `sdp_attention: true` | `attn_implementation: sdpa` |
-| `xformers_attention: true` | `attn_implementation: xformers` |
-| `flex_attention: true` | `attn_implementation: flex_attention` |
-| `sage_attention: true` | `attn_implementation: sage` |
-| `s2_attention: true` | `attn_implementation: s2` |
-| `eager_attention: true` | `attn_implementation: eager` |
-
-Combining `attn_implementation` with a legacy flag (e.g. `attn_implementation:
-flash_attention_2` **and** `flash_attention: true`) raises — pick one.
-
-::: {.callout-note}
-
-Existing example configs under `examples/` still use the legacy flags. They
-continue to work with a deprecation warning; they will be migrated in a
-follow-up pass.
-
-:::
--- a/docs/batch_vs_grad.qmd
+++ b/docs/batch_vs_grad.qmd
@@ -1,59 +0,0 @@
---
-title: Batch size vs Gradient accumulation
-description: Understanding of batch size and gradient accumulation steps
---
-
-Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.
-
-This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:
-
-1. **Memory Consumption with Batch Size**: Increasing the batch size impacts memory primarily because of the storage required for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.
-
-2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.
-
-**Example 1:**
-Micro batch size: 3
-Gradient accumulation steps: 2
-Number of GPUs: 3
-Total batch size = 3 * 2 * 3 = 18
-
-```
-| GPU 1          | GPU 2          | GPU 3          |
-|----------------|----------------|----------------|
-| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
-| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
-|----------------|----------------|----------------|
-| → (accumulate) | → (accumulate) | → (accumulate) |
-|----------------|----------------|----------------|
-| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
-| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
-|----------------|----------------|----------------|
-| → (apply)      | → (apply)      | → (apply)      |
-
-Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
-Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18
-
-Weight update for w1:
-w1_new = w1_old - learning rate × (Total gradient for w1 / 18)
-```
-
-**Example 2:**
-Micro batch size: 2
-Gradient accumulation steps: 1
-Number of GPUs: 3
-Total batch size = 2 * 1 * 3 = 6
-
-```
-| GPU 1     | GPU 2     | GPU 3     |
-|-----------|-----------|-----------|
-| S1, S2    | S3, S4    | S5, S6    |
-| e1, e2    | e3, e4    | e5, e6    |
-|-----------|-----------|-----------|
-| → (apply) | → (apply) | → (apply) |
-
-Accumulated gradient for the weight w1 (considering all GPUs):
-Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6
-
-Weight update for w1:
-w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
-```
--- a/docs/checkpoint_saving.qmd
+++ b/docs/checkpoint_saving.qmd
@@ -1,86 +0,0 @@
---
-title: "Checkpoint Saving"
-format:
-  html:
-    toc: true
-    toc-depth: 2
-    number-sections: true
-execute:
-  enabled: false
---
-
-## Overview
-
-Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use).
-
-## File-Based Checkpoint Trigger
-
-### Configuration
-
-Enable in your config:
-
-```yaml
-dynamic_checkpoint:
-  enabled: true
-  check_interval: 100  # Optional: check every N steps (default: 100)
-  trigger_file_path: "axolotl_checkpoint.save"  # Optional: custom filename
-```
-
-**Options:**
- `enabled`: `true` to enable (required)
- `check_interval`: Steps between file checks. Default: 100. Lower = faster response, higher I/O overhead.
- `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save`
-
-### How It Works
-
-1. Rank 0 checks for the trigger file in `output_dir` every `check_interval` steps
-2. When the file is detected, it is deleted and a checkpoint is saved
-3. In distributed training, rank 0 broadcasts to synchronize all ranks
-
-### Usage
-
-**Command line:**
-```bash
-touch /path/to/output_dir/axolotl_checkpoint.save
-```
-
-**Programmatic:**
-```python
-from pathlib import Path
-Path("/path/to/output_dir/axolotl_checkpoint.save").touch()
-```
-
-The checkpoint is saved within the next `check_interval` steps. The trigger file is auto-deleted after detection, so you can create it multiple times.
-
-**Custom filename:**
-```yaml
-dynamic_checkpoint:
-  enabled: true
-  trigger_file_path: "my_trigger.save"
-```
-```bash
-touch /path/to/output_dir/my_trigger.save
-```
-
-## Control+C (SIGINT) Checkpoint
-
-Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not optimizer state. For resumable checkpoints, use the file-based trigger.
-
-## Best Practices
-
- **Check interval**: Lower values (10-50) for fast training, default 100 for slower training
- **Distributed training**: Create trigger file once; rank 0 handles synchronization
- **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint`
-
-## Example
-
-```yaml
-output_dir: ./outputs/lora-out
-save_steps: 500  # Scheduled checkpoints
-
-dynamic_checkpoint:
-  enabled: true
-  check_interval: 50
-```
-
-This enables scheduled checkpoints every 500 steps plus on-demand saves via file trigger (checked every 50 steps).
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Mads Henrichsen	272bced137	cpu offloading	2023-12-31 22:17:43 +01:00
Mads Henrichsen	c371d6b546	cpu offloading	2023-12-31 12:02:29 +01:00
Mads Henrichsen	d6273188f0	fft	2023-12-31 07:42:46 +01:00
Mads Henrichsen	72797b04a5	fix modules	2023-12-31 07:40:33 +01:00
Mads Henrichsen	de47bb5eb0	better lr	2023-12-30 22:36:50 +01:00
Mads Henrichsen	c04df54b4b	new lr	2023-12-30 21:36:01 +01:00
Mads Henrichsen	e3716db386	small batch size	2023-12-30 13:20:45 +01:00
Mads Henrichsen	97943d8fc4	model revision	2023-12-30 12:55:17 +01:00
Mads Henrichsen	9d3f80cd40	disable packing	2023-12-30 12:51:03 +01:00
Mads Henrichsen	bfae79a634	trust	2023-12-30 12:47:50 +01:00
Mads Henrichsen	5a85ee16eb	yayi2	2023-12-30 12:43:46 +01:00
				`@@ -1 +0,0 @@`
				`See [docs/debugging.md](../docs/debugging.md) for guidance on how to modify these files to debug axolotl with VSCode.`
				`@@ -1 +0,0 @@`
				`This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.`