chore: update title

restore dockerfile
fix: trim allowed cuda versions
2025-04-26 16:21:31 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00
111 changed files with 553 additions and 5212 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -22,6 +22,12 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.4.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
          - cuda: "124"
            cuda_version: 12.4.1
            cudnn_version: ""
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,8 +18,13 @@ jobs:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras: vllm
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -30,7 +35,7 @@ jobs:
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
-            axolotl_extras:
+            axolotl_extras: vllm
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
@@ -62,7 +67,6 @@ jobs:
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
@@ -78,6 +82,11 @@ jobs:
    strategy:
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -9,7 +9,6 @@ on:
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
@@ -33,11 +32,18 @@ jobs:
            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:  # no vllm support for 2.4.1
            num_gpus: 2
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.1
-            axolotl_extras:
+            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 126
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -12,6 +12,11 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -65,6 +70,11 @@ jobs:
    strategy:
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            axolotl_extras:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -1,61 +0,0 @@
 name: Preview
 on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, reopened]
    # Run the workflow only when one of these files changes
    paths:
      - '**/*.md'      # any Markdown file
      - '**/*.qmd'     # any Quarto file
      - '_quarto.yaml'
 permissions:
  checks: write
  contents: write
  deployments: write
  issues: write
  discussions: write
  pages: write
  pull-requests: write
  statuses: write
 jobs:
  preview:
    runs-on: ubuntu-latest
    steps:
      - name: Check out repository
        uses: actions/checkout@v4
      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2
      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
      - name: Install dependencies
        run: |
          python3 -m pip install jupyter quartodoc
          python3 -m pip install -e . --no-deps
      - name: Build autodoc
        run: quartodoc build
      - name: Quarto render
        run: quarto render
      - name: Netlify Publish
        uses: nwtgck/actions-netlify@v3.0
        with:
          publish-dir: './_site'
          enable-pull-request-comment: true
          enable-github-deployment: true
          github-token: ${{ secrets.GITHUB_TOKEN }}
          deploy-message: "Deployed On Netlify"
          github-deployment-environment: 'preview'
          github-deployment-description: 'Preview Deployment'
        env:
          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -26,7 +26,7 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20
    steps:
@@ -106,6 +106,13 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.4.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -27,9 +27,6 @@ concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
 env:
  TRANSFORMERS_IS_CI: "yes"
 jobs:
  pre-commit:
    name: pre-commit
@@ -52,7 +49,7 @@ jobs:
      max-parallel: 2
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20
    steps:
@@ -138,7 +135,7 @@ jobs:
      max-parallel: 1
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
    timeout-minutes: 20
    steps:
@@ -261,12 +258,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.6.0
            num_gpus: 1
            axolotl_extras: llmcompressor
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -278,7 +269,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            num_gpus: 1
-            axolotl_extras:
+            axolotl_extras: vllm
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
--- a/.runpod/Dockerfile
+++ b/.runpod/Dockerfile
@@ -1,10 +1,11 @@
-FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0
+FROM runpod/pytorch:3.10-2.0.0-117
 COPY .runpod/requirements.txt /requirements.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip && \
    python3 -m pip install --upgrade -r /requirements.txt
 # Environment settings
 ARG BASE_VOLUME="/runpod-volume"
 ENV BASE_VOLUME=$BASE_VOLUME
@@ -14,5 +15,4 @@ ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
 COPY .runpod/src /src
 WORKDIR /src
 CMD ["python3", "/src/handler.py"]
--- a/.runpod/requirements.txt
+++ b/.runpod/requirements.txt
@@ -5,3 +5,11 @@
 # git+https://github.com/runpod/runpod-python.git
 # To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
 runpod~=1.7.0
 huggingface_hub
 typing-extensions
 pydantic
 pydantic-settings
 hf-transfer
 setuptools
 numpy==2.0.0
 axolotl[flash-attn,deepspeed]
--- a/.runpod/test-input.json
+++ b/.runpod/test-input.json
@@ -1,86 +0,0 @@
 {
  "input": {
    "name": "quick_smoke_test_sft",
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "HuggingFaceTB/SmolLM2-135M",
      "model_type": "AutoModelForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_4bit": true,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca",
          "split": "train[:10%]"
        }
      ],
      "val_set_size": 0.02,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "qlora",
      "lora_r": 32,
      "lora_alpha": 64,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 2,
      "micro_batch_size": 1,
      "num_epochs": 1,
      "optimizer": "adamw_torch_fused",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": true,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|endoftext|>"
      },
      "max_steps": 20
    },
    "timeout": 100000
  },
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
 }
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -11,43 +11,43 @@
          "hf_token": ""
        },
        "args": {
-          "base_model": "HuggingFaceTB/SmolLM2-135M",
+          "base_model": "NousResearch/Meta-Llama-3-8B",
-          "model_type": "AutoModelForCausalLM",
+          "model_type": "LlamaForCausalLM",
          "tokenizer_type": "AutoTokenizer",
-          "load_in_4bit": true,
+          "load_in_8bit": true,
          "load_in_4bit": false,
          "strict": false,
          "datasets": [
            {
              "path": "mhenrichsen/alpaca_2k_test",
-              "type": "alpaca",
+              "type": "alpaca"
              "split": "train[:10%]"
            }
          ],
-          "val_set_size": 0.02,
+          "val_set_size": 0.05,
          "output_dir": "./outputs/lora-out",
          "sequence_len": 4096,
          "sample_packing": true,
          "eval_sample_packing": false,
          "pad_to_sequence_len": true,
-          "adapter": "qlora",
+          "adapter": "lora",
          "lora_r": 32,
-          "lora_alpha": 64,
+          "lora_alpha": 16,
          "lora_dropout": 0.05,
          "lora_target_linear": true,
          "lora_modules_to_save": [
            "embed_tokens",
            "lm_head"
          ],
-          "gradient_accumulation_steps": 2,
+          "gradient_accumulation_steps": 4,
-          "micro_batch_size": 1,
+          "micro_batch_size": 2,
          "num_epochs": 1,
-          "optimizer": "adamw_torch_fused",
+          "optimizer": "adamw_bnb_8bit",
          "lr_scheduler": "cosine",
          "learning_rate": 0.0002,
          "train_on_inputs": false,
          "group_by_length": false,
          "bf16": "auto",
-          "tf32": true,
+          "tf32": false,
          "gradient_checkpointing": true,
          "logging_steps": 1,
          "flash_attention": true,
@@ -57,9 +57,8 @@
          "saves_per_epoch": 1,
          "weight_decay": 0.0,
          "special_tokens": {
-            "pad_token": "<|endoftext|>"
+            "pad_token": "<|end_of_text|>"
-          },
+          }
          "max_steps": 20
        }
      },
      "timeout": 100000
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -20,4 +20,4 @@ pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov-report=xml:multigpu-coverage.xml
 # Upload coverage to Codecov
-codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
+codecov upload-process -t $CODECOV_TOKEN -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -154,10 +154,6 @@ datasets:
    # Key containing the messages (default: "messages")
    field_messages: messages
    # Key containing the system message (default: "system")
    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.
    field_system: system
    # Mapping of properties from the input dataset to the chat template.
    # (default: message_property_mappings={'role':'role', 'content':'content'})
    # If a property exists in the template but not in this mapping, the system will attempt
@@ -184,14 +180,10 @@ datasets:
    # adding a system turn with empty content.
    drop_system_message:
    # Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
    # defaults to False
    split_thinking:
    # IMPORTANT: The following fields determine which parts of the conversation to train on.
    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
    # See examples at `docs/dataset-formats/conversation.qmd`
-    # Note: If the below 5 fields are empty, defaults to training only on the last message.
+    # Note: If the below 4 fields are set to empty, defaults to training only on the last message.
    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
    roles_to_train: ["assistant"]  # default
@@ -200,13 +192,7 @@ datasets:
    # - turn (default): train on the EOS token at the end of each trainable turn
    # - last: train on the last EOS token in the conversation
    # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
-    train_on_eos: turn
+    train_on_eos: last
    # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:
    # - all: train on all EOT tokens
    # - turn: train on the EOT token at the end of each trainable turn
    # - last: train on the last EOT token in the conversation
    # If not specified, defaults to the value of train_on_eos for backward compatibility.
    train_on_eot:
    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
    message_field_training: training
    # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
@@ -289,17 +275,8 @@ process_reward_model:
 chat_template: tokenizer_default
 # custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
 chat_template_jinja: null
-# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.
+# Changes the default system message. Currently only supports chatml.
-# These tokens mark the boundaries between conversation turns.
+default_system_message: You are a helpful assistant. Please give a long and detailed answer.
 # For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"]
 # If not specified, defaults to just the model's eos_token.
 # This is useful for templates that use multiple delimiter tokens.
 eot_tokens:
  # - "</s>"
  # - "[/INST]"
  # - "[/SYSTEM_PROMPT]"
 # Changes the default system message
 default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
@@ -684,10 +661,8 @@ special_tokens:
  # unk_token: "<unk>"
  # pad_token: "[PAD]"
-# Optional[list[str]]. Add extra tokens to the tokenizer.
+# Add extra tokens.
 tokens:
  # - "<|startoftext|>"
  # - "<|endoftext|>"
 # Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
 # Only works for tokens that are not part of the base vocab (aka are added_tokens).
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -49,8 +49,7 @@ sections = [
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum"),
+    ("Spectrum", "spectrum")
    ("LLMCompressor", "llm_compressor")
 ]
 for section_name, folder_name in sections:
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -4,6 +4,18 @@ description: Conversation format for supervised fine-tuning.
 order: 3
 ---
 ## sharegpt
 ::: {.callout-important}
 ShareGPT is deprecated! Please see the [chat_template](#chat_template) section below.
 :::
 ## pygmalion
 ```{.json filename="data.jsonl"}
 {"conversations": [{"role": "...", "value": "..."}]}
 ```
 ## chat_template
 The Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. It supports using the tokenizer's template, a supported template, or a custom jinja2 template.
@@ -52,7 +64,7 @@ We recommend checking the below examples for other usecases.
 ### Examples
-1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
 ```yaml
 datasets:
@@ -97,55 +109,10 @@ datasets:
 ```
 ::: {.callout-important}
-Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
+Please make sure that your `tokenizer.eos_token` is the same as the EOS/EOT token in the template. Otherwise, set `eos_token` under `special_tokens`.
 :::
-5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
+5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
 ```yaml
 eot_tokens:
  - "[/INST]"
  # - "[/SYSTEM_PROMPT]"
 datasets:
  - path: ...
    type: chat_template
    # optional
    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)
 ```
 ::: {.callout-tip}
 See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
 :::
 ::: {.callout-note}
 Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
 You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
 :::
 6. Continuing from the previous example, if you want to train on the EOT tokens of all trainable turns but only on the last EOS token, set `train_on_eos: last`.
 ```yaml
 eot_tokens:
  - "[/INST]"
  # ...
 datasets:
  - path: ...
    type: chat_template
    train_on_eos: last
    train_on_eot: turn
 ```
 ::: {.callout-tip}
 If EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, generally, you can leave them to their defaults and omit them.
 :::
 7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
 For a data sample that looks like:
@@ -195,15 +162,3 @@ datasets:
 ::: {.callout-tip}
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::
 ## sharegpt
 ::: {.callout-important}
 ShareGPT is deprecated! Please see the [chat_template](#chat_template) section.
 :::
 ## pygmalion
 ```{.json filename="data.jsonl"}
 {"conversations": [{"role": "...", "value": "..."}]}
 ```
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -73,40 +73,10 @@ description: Frequently asked questions
 > A: This is likely an empty turn.
-**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.**
+**Q: The EOS/EOT token is incorrectly being masked or not being masked.**
-> A: There can be two reasons:
+> A: This is because of the mismatch between `tokenizer.eos_token` and EOS/EOT token in template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in template.
 > 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template.
 > 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.
 **Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**
 > A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
 **Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**
 > A: There can be two reasons:
 > 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.
 > 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.
 **Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**
 > A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
 **Q: `EOT token __ is encoded as multiple tokens.`**
 > A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.
 **Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**
 > A: This is because the EOS token is in `eot_tokens: ` while `train_on_eos: ` and `train_on_eot: ` are set to different values. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same or remove the EOS token from `eot_tokens: `.
 **Q: If `eot_tokens: ` is not provided, what happens?**
 > A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
 > Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
        {
            "role": "user",
            "content": [
-                {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+                {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
                {"type": "text", "text": "Describe this image in detail."}
            ]
        },
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -502,7 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
 Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
 :::
-In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
+If you have multiple GPUs available, we recommend using `vLLM` with the `GRPOTrainer` to significantly speed up trajectory generation during training.
 First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
 using 4 GPUs - 2 for training, and 2 for vLLM:
 ::: {.callout-important}
 Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
@@ -537,10 +539,6 @@ Your `vLLM` instance will now attempt to spin up, and it's time to kick off trai
 CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
 ```
 ::: {.callout-note}
 Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
 :::
 #### Reward functions
 GRPO uses custom reward functions and transformations. Please have them ready locally.
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -1,77 +0,0 @@
 base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
 plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.05
 output_dir: ./outputs/out
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 eval_sample_packing: false
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 num_epochs: 1
 optimizer: paged_adamw_8bit
 lr_scheduler: cosine
 learning_rate: 2e-5
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 early_stopping_patience:
 resume_from_checkpoint:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 warmup_steps: 100
 evals_per_epoch: 2
 eval_table_size:
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
  pad_token: <|end_of_text|>
 llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
--- a/examples/qwen3/32b-qlora.yaml
+++ b/examples/qwen3/32b-qlora.yaml
@@ -1,69 +0,0 @@
 base_model: Qwen/Qwen3-32B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 strict: false
 chat_template: qwen3
 datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
 val_set_size: 0.0
 output_dir: ./outputs/out
 dataset_prepared_path: last_run_prepared
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: true
 pad_to_sequence_len: true
 load_in_4bit: true
 adapter: qlora
 lora_r: 16
 lora_alpha: 32
 lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
 lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 2
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_torch_4bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: offload
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/examples/qwen3/qlora-fsdp.yaml
+++ b/examples/qwen3/qlora-fsdp.yaml
@@ -1,68 +0,0 @@
 base_model: Qwen/Qwen3-8B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: true
 strict: false
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./outputs/out
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: true
 pad_to_sequence_len: true
 adapter: qlora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 64
 lora_dropout: 0.05
 lora_target_linear: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_steps: 10
 evals_per_epoch: 4
 saves_per_epoch: 1
 weight_decay: 0.0
 fsdp:
  - full_shard
  - auto_wrap
 fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,14 +11,14 @@ liger-kernel==0.5.8
 packaging==23.2
-peft==0.15.2
+peft==0.15.1
 transformers==4.51.3
 tokenizers>=0.21.1
 accelerate==1.6.0
 datasets==3.5.0
 deepspeed>=0.15.4
-trl==0.17.0
+trl==0.16.1
-hf_xet==1.1.0
+hf_xet==1.0.0
 hqq==0.2.5
 optimum==1.16.2
--- a/setup.py
+++ b/setup.py
@@ -67,13 +67,13 @@ def parse_requirements(extras_require_map):
            if (major, minor) >= (2, 7):
                _install_requires.pop(_install_requires.index(xformers_version))
                # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
-                extras_require_map["vllm"] = ["vllm==0.8.5"]
+                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append(
                    "xformers==0.0.29.post2"
                )  # vllm needs post2 w torch 2.6
-                extras_require_map["vllm"] = ["vllm==0.8.5"]
+                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
@@ -149,9 +149,6 @@ extras_require = {
    "vllm": [
        "vllm==0.7.2",
    ],
    "llmcompressor": [
        "llmcompressor==0.5.1",
    ],
 }
 install_requires, dependency_links, extras_require_build = parse_requirements(
--- a/src/axolotl/init.py
+++ b/src/axolotl/init.py
@@ -4,4 +4,4 @@ import pkgutil
 __path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package
-__version__ = "0.10.0.dev0"
+__version__ = "0.8.0"
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -2,7 +2,4 @@
 import os
 from axolotl.logging_config import configure_logging
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 configure_logging()
--- a/src/axolotl/cli/art.py
+++ b/src/axolotl/cli/art.py
@@ -16,15 +16,8 @@ AXOLOTL_LOGO = """
    @@@@  @@@@@@@@@@@@@@@@
 """
 HAS_PRINTED_LOGO = False
 def print_axolotl_text_art():
    """Prints axolotl ASCII art."""
    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
    if HAS_PRINTED_LOGO:
        return
    if is_main_process():
        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -8,6 +8,9 @@ from accelerate.commands.config import config_args
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
 from axolotl.logging_config import configure_logging
 configure_logging()
 LOG = logging.getLogger(__name__)
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -5,7 +5,6 @@ import logging
 import os
 import tempfile
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from typing import Union
 from urllib.parse import urlparse
@@ -153,15 +152,7 @@ def prepare_plugins(cfg: DictDefault):
            plugin_manager.register(plugin_name)
-def plugin_set_cfg(cfg: DictDefault):
+def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefault:
    if cfg.get("plugins"):
        plugin_manager = PluginManager.get_instance()
        plugin_manager.cfg = cfg
 def load_cfg(
    config: str | Path | DictDefault = Path("examples/"), **kwargs
 ) -> DictDefault:
    """
    Loads the `axolotl` configuration stored at `config`, validates it, and performs
    various setup.
@@ -173,24 +164,13 @@ def load_cfg(
    Returns:
        `DictDefault` mapping configuration keys to values.
    """
-    if isinstance(config, (str, Path)):
+    config = check_remote_config(config)
-        config = check_remote_config(config)
+    if Path(config).is_dir():
-        if Path(config).is_dir():
+        config = choose_config(Path(config))
            config = choose_config(Path(config))
-        # Load the config from the yaml file
+    # Load the config from the yaml file
-        with open(config, encoding="utf-8") as file:
+    with open(config, encoding="utf-8") as file:
-            cfg: DictDefault = DictDefault(yaml.safe_load(file))
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
        cfg.axolotl_config_path = config
    else:
        cfg = config
        with NamedTemporaryFile(
            mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
        ) as temp_file:
            temp_file.write(yaml.dump(config.to_dict()))
            temp_file.close()
        cfg.axolotl_config_path = temp_file.name
    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
@@ -204,6 +184,8 @@ def load_cfg(
            else:
                cfg[k] = kwargs[k]
    cfg.axolotl_config_path = config
    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
@@ -231,6 +213,5 @@ def load_cfg(
    setup_wandb_env_vars(cfg)
    setup_mlflow_env_vars(cfg)
    setup_comet_env_vars(cfg)
    plugin_set_cfg(cfg)
    return cfg
--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -1,7 +1,6 @@
 """CLI to run evaluation on a model."""
 import logging
 import os
 from pathlib import Path
 from typing import Union
@@ -15,7 +14,6 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
 from axolotl.utils import patch_optimized_env
 from axolotl.utils.dict import DictDefault
 LOG = logging.getLogger(__name__)
@@ -31,14 +29,10 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
    patch_optimized_env()
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    check_accelerate_default_config()
-    if int(os.getenv("LOCAL_RANK", "0")) == 0:
+    check_user_token()
        check_user_token()
    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -28,8 +28,9 @@ from axolotl.cli.utils import (
    fetch_from_github,
    filter_none_kwargs,
 )
 from axolotl.cli.vllm_serve import do_vllm_serve
 from axolotl.integrations.lm_eval.cli import lm_eval
-from axolotl.utils import patch_optimized_env
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.schemas.config import AxolotlInputConfig
@@ -55,8 +56,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    patch_optimized_env()
    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess
@@ -102,7 +101,7 @@ def train(
            config options.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()
    if "use_ray" in kwargs and kwargs["use_ray"]:
        accelerate = False
@@ -328,8 +327,6 @@ def fetch(directory: str, dest: Optional[str]) -> None:
@add_options_from_dataclass(VllmServeCliArgs)
@filter_none_kwargs
 def vllm_serve(config: str, **cli_args: VllmServeCliArgs):
    from axolotl.cli.vllm_serve import do_vllm_serve
    do_vllm_serve(config, cli_args)
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -1,6 +1,5 @@
 """CLI to run training on a model."""
 import gc
 import logging
 import os
 from pathlib import Path
@@ -18,7 +17,7 @@ from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.train import train
-from axolotl.utils import patch_optimized_env
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault
@@ -36,7 +35,7 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        cli_args: Training-specific CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
-    patch_optimized_env()
+    set_pytorch_cuda_alloc_conf()
    print_axolotl_text_art()
    check_accelerate_default_config()
@@ -49,11 +48,8 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
    model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
    del model, tokenizer, trainer
    gc.collect()
    plugin_manager = PluginManager.get_instance()
    plugin_manager.post_train_unload(cfg)
--- a/src/axolotl/cli/utils.py
+++ b/src/axolotl/cli/utils.py
@@ -20,9 +20,11 @@ from transformers import (
    ProcessorMixin,
 )
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
 configure_logging()
 LOG = logging.getLogger(__name__)
--- a/src/axolotl/common/architectures.py
+++ b/src/axolotl/common/architectures.py
@@ -11,6 +11,5 @@ MOE_ARCH_BLOCK = {
    ],
    "mixtral": "MixtralSparseMoeBlock",
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
    "deepseek_v2": "DeepseekV2MoE",
 }
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -47,8 +47,7 @@ def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
 def load_datasets(
    *,
    cfg: DictDefault,
-    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
+    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
    debug: bool = False,
 ) -> TrainDatasetMeta:
    """
    Loads one or more training or evaluation datasets, calling
@@ -57,7 +56,6 @@ def load_datasets(
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.
        debug: Whether to print out tokenization of sample
    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
@@ -66,8 +64,7 @@ def load_datasets(
    tokenizer = load_tokenizer(cfg)
    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
    preprocess_iterable = (
-        cli_args
+        hasattr(cli_args, "iterable")
        and hasattr(cli_args, "iterable")
        and cli_args.iterable is not None
        and cli_args.iterable
    )
@@ -79,25 +76,20 @@ def load_datasets(
        preprocess_iterable=preprocess_iterable,
    )
-    if (  # pylint: disable=too-many-boolean-expressions
+    if (
-        cli_args
+        cli_args.debug
-        and (
+        or cfg.debug
-            cli_args.debug
+        or cli_args.debug_text_only
-            or cfg.debug
+        or int(cli_args.debug_num_examples) > 0
-            or cli_args.debug_text_only
+    ):
            or int(cli_args.debug_num_examples) > 0
        )
    ) or debug:
        LOG.info("check_dataset_labels...")
-        num_examples = cli_args.debug_num_examples if cli_args else 1
+        train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
        text_only = cli_args.debug_text_only if cli_args else False
        train_samples = sample_dataset(train_dataset, num_examples)
        check_dataset_labels(
            train_samples,
            tokenizer,
-            num_examples=num_examples,
+            num_examples=cli_args.debug_num_examples,
-            text_only=text_only,
+            text_only=cli_args.debug_text_only,
        )
        LOG.info("printing prompters...")
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -21,7 +21,6 @@ import importlib.util
 import inspect
 import logging
 import math
 import os
 import sys
 from abc import abstractmethod
 from pathlib import Path
@@ -61,7 +60,6 @@ from axolotl.core.training_args import (
 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback
 from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
 from axolotl.processing_strategies import get_processing_strategy
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
@@ -73,7 +71,6 @@ from axolotl.utils.callbacks import (
    SaveBetterTransformerModelCallback,
    bench_eval_callback_factory,
    causal_lm_bench_eval_callback_factory,
    colab_inference_post_train_callback,
    log_prediction_callback_factory,
 )
 from axolotl.utils.callbacks.lisa import lisa_callback_factory
@@ -117,8 +114,6 @@ class TrainerBuilderBase(abc.ABC):
        if hasattr(model, "add_model_tags"):
            model.add_model_tags(["axolotl"])
        patch_trainer_get_lr()
    @property
    def model_ref(self):
        return self._model_ref
@@ -295,10 +290,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            callbacks.append(lisa_callback_factory(trainer))
        if any("COLAB_" in key for key in os.environ):
            ColabCallback = colab_inference_post_train_callback(trainer)
            callbacks.append(ColabCallback(self.cfg))
        callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
        return callbacks
@@ -494,7 +485,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        # these are all the "standard" kwargs that are def used
        training_arguments_kwargs["max_steps"] = (
-            self.cfg.max_steps if self.cfg.max_steps else -1
+            total_num_steps if self.cfg.max_steps else -1
        )
        training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len
        training_arguments_kwargs["per_device_train_batch_size"] = (
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -114,8 +114,6 @@ class AxolotlTrainer(
            packing_efficiency_estimate=self.args.sample_packing_efficiency,
            batch_max_len=batch_max_len,
            batch_size=batch_size,
            group_size=self.args.sample_packing_group_size,
            bin_size=self.args.sample_packing_bin_size,
            sequential=self.args.sample_packing_sequentially,
            drop_last=True,
        )
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -3,29 +3,15 @@ DPO trainer for axolotl
 """
 import gc
 import random
 from functools import wraps
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Union
 import pandas as pd
 import torch
 import wandb
 from accelerate import PartialState
 from datasets import Dataset, IterableDataset
 from peft.optimizers import create_loraplus_optimizer
 from torch import nn
-from torch.utils.data import DataLoader
+from transformers import Trainer
 from transformers import (
    BaseImageProcessor,
    FeatureExtractionMixin,
    PreTrainedTokenizerBase,
    ProcessorMixin,
    Trainer,
 )
 from transformers.trainer_utils import EvalLoopOutput
 from transformers.utils import is_sagemaker_mp_enabled
-from trl import DPOConfig, DPOTrainer, maybe_apply_chat_template, maybe_extract_prompt
+from trl import DPOTrainer
 from trl.trainer.utils import log_table_to_comet_experiment
 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
 from axolotl.core.trainers.utils import (
@@ -95,64 +81,6 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
        return super().push_to_hub(*args, **kwargs)
    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
    def _prepare_dataset(
        self,
        dataset: Union[Dataset, IterableDataset],
        processing_class: Union[
            PreTrainedTokenizerBase,
            BaseImageProcessor,
            FeatureExtractionMixin,
            ProcessorMixin,
        ],
        args: DPOConfig,
        dataset_name: str,
    ) -> Union[Dataset, IterableDataset]:
        # Build the kwargs for the `map` function
        map_kwargs: Dict[str, Any] = {"writer_batch_size": 10}
        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
            map_kwargs["num_proc"] = args.dataset_num_proc
        with PartialState().main_process_first():
            # Extract prompt if needed
            if isinstance(
                dataset, Dataset
            ):  # `IterableDataset.map` does not support `desc`
                map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
            dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
            # Apply the chat template if needed
            if isinstance(
                dataset, Dataset
            ):  # `IterableDataset.map` does not support `desc`
                map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
            dataset = dataset.map(
                maybe_apply_chat_template,
                fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
                **map_kwargs,
            )
            # Tokenize the dataset
            if isinstance(
                dataset, Dataset
            ):  # `IterableDataset.map` does not support `desc`
                map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
            dataset = dataset.map(
                self.tokenize_row if not self.is_vision_model else self.process_row,
                remove_columns=["chosen", "rejected"],
                fn_kwargs={
                    "processing_class": processing_class,
                    "max_prompt_length": args.max_prompt_length,
                    "max_completion_length": args.max_completion_length,
                    # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
                    "add_special_tokens": False,
                },
                **map_kwargs,
            )
        return dataset
    @staticmethod
    def tokenize_row(
        features,
@@ -177,8 +105,12 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
            if res["chosen_input_ids"][0] == processing_class.bos_token_id:
                res["chosen_input_ids"] = res["chosen_input_ids"][1:]
                res["chosen_labels"] = res["chosen_labels"][1:]
                res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
            if res["rejected_input_ids"][0] == processing_class.bos_token_id:
                res["rejected_input_ids"] = res["rejected_input_ids"][1:]
                res["rejected_labels"] = res["rejected_labels"][1:]
                res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]
        return res
@@ -192,67 +124,3 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
        gc.collect()
        torch.cuda.empty_cache()
        return loss
    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
    def evaluation_loop(
        self,
        dataloader: DataLoader,
        description: str,
        prediction_loss_only: Optional[bool] = None,
        ignore_keys: Optional[list[str]] = None,
        metric_key_prefix: str = "eval",
    ) -> EvalLoopOutput:
        """
        Overriding built-in evaluation loop to store metrics for each batch.
        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
        Works both with or without labels.
        """
        # Sample and save to game log if requested (for one batch to save time)
        if self.generate_during_eval:
            # Generate random indices within the range of the total number of samples
            num_samples = len(dataloader.dataset)
            random_indices = random.sample(
                range(num_samples), k=self.args.eval_batch_size
            )
            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
            random_batch_dataset = dataloader.dataset.select(random_indices)
            random_batch = self.data_collator(random_batch_dataset)
            random_batch = self._prepare_inputs(random_batch)
            policy_output_decoded, ref_output_decoded = (
                self.generate_from_model_and_ref(self.model, random_batch)
            )
            table = pd.DataFrame(
                columns=["Prompt", "Policy", "Ref Model"],
                data=[
                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
                    for prompt, pol, ref in zip(
                        random_batch_dataset["prompt"],
                        policy_output_decoded,
                        ref_output_decoded,
                    )
                ],
            )
            if "wandb" in self.args.report_to and self.accelerator.is_main_process:
                wandb.log({"game_log": wandb.Table(data=table)})
            if "comet_ml" in self.args.report_to:
                log_table_to_comet_experiment(
                    name="game_log.csv",
                    table=table,
                )
        # Base evaluation
        initial_output = super().evaluation_loop(
            dataloader,
            description,
            prediction_loss_only,
            ignore_keys,
            metric_key_prefix,
        )
        return initial_output
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -63,7 +63,6 @@ class GRPOStrategy:
        grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
        grpo_args_kwargs["log_completions"] = trl.log_completions
        grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print
        if trl.reward_weights:
            grpo_args_kwargs["reward_weights"] = trl.reward_weights
@@ -71,13 +70,6 @@ class GRPOStrategy:
        if trl.scale_rewards is not None:
            grpo_args_kwargs["scale_rewards"] = trl.scale_rewards
        if trl.loss_type is not None:
            grpo_args_kwargs["loss_type"] = trl.loss_type
        if trl.mask_truncated_completions is not None:
            grpo_args_kwargs["mask_truncated_completions"] = (
                trl.mask_truncated_completions
            )
        if trl.temperature is not None:
            grpo_args_kwargs["temperature"] = trl.temperature
        if trl.top_p is not None:
@@ -93,11 +85,6 @@ class GRPOStrategy:
            grpo_args_kwargs["num_iterations"] = trl.num_iterations
        if trl.epsilon is not None:
            grpo_args_kwargs["epsilon"] = trl.epsilon
        if trl.epsilon_high is not None:
            grpo_args_kwargs["epsilon_high"] = trl.epsilon_high
        if trl.use_liger_loss is not None:
            grpo_args_kwargs["use_liger_loss"] = trl.use_liger_loss
        return grpo_args_kwargs
@@ -148,9 +135,7 @@ class GRPOStrategy:
        try:
            # use importlib to dynamically load the reward function from the module
            reward_func_module_name = reward_func_fqn.split(".")[-1]
-            reward_func_module = importlib.import_module(
+            reward_func_module = importlib.import_module(reward_func_fqn.split(".")[-2])
                ".".join(reward_func_fqn.split(".")[:-1])
            )
            reward_func = getattr(reward_func_module, reward_func_module_name)
            if not len(inspect.signature(reward_func).parameters) >= 2:
                raise ValueError(
--- a/src/axolotl/core/trainers/mixins/scheduler.py
+++ b/src/axolotl/core/trainers/mixins/scheduler.py
@@ -3,10 +3,9 @@
 import logging
 import torch
-from torch.optim.lr_scheduler import LRScheduler, OneCycleLR
+from torch.optim.lr_scheduler import OneCycleLR
 from transformers.trainer import Trainer
 from axolotl.integrations.base import PluginManager
 from axolotl.utils.schedulers import (
    RexLR,
    get_cosine_schedule_with_min_lr,
@@ -26,9 +25,9 @@ class SchedulerMixin(Trainer):
    def create_scheduler(
        self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
-    ) -> LRScheduler:
+    ):
        """
-        Set up the scheduler. The optimizer of the trainer must have been set up either before this method is called or
+        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
        passed as an argument.
        Args:
@@ -48,16 +47,7 @@ class SchedulerMixin(Trainer):
        # fmt: off
        if self.lr_scheduler is None:  # type: ignore  # pylint: disable=access-member-before-definition
            # fmt: on
-            plugin_manager = PluginManager.get_instance()
+            if self.args.alternate_lr_scheduler_type == "one_cycle":
            lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler(
                trainer=self,
                optimizer=optimizer,
                num_training_steps=num_training_steps
            )
            if lr_scheduler is not None:
                LOG.info(f"Using plugin-created lr_scheduler: {lr_scheduler}")
                self.lr_scheduler = lr_scheduler
            elif self.args.alternate_lr_scheduler_type == "one_cycle":
                num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
                pct_start = num_warmup_steps / num_training_steps
                extra_lr_kwargs = {}
@@ -120,4 +110,4 @@ class SchedulerMixin(Trainer):
            if use_cosine_min_lr:
                LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
-        return self.lr_scheduler  # type: ignore
+        return self.lr_scheduler
--- a/src/axolotl/core/trainers/relora.py
+++ b/src/axolotl/core/trainers/relora.py
@@ -1,7 +1,6 @@
 """Module for ReLoRA trainer"""
 import torch
 from torch.optim.lr_scheduler import LRScheduler
 from axolotl.core.trainers.base import AxolotlTrainer
 from axolotl.monkeypatch.relora import ReLoRAScheduler
@@ -20,11 +19,9 @@ class ReLoRATrainer(AxolotlTrainer):
        self,
        num_training_steps: int,
        optimizer: torch.optim.Optimizer | None = None,
-    ) -> LRScheduler:
+    ):
        optimizer = self.optimizer if optimizer is None else optimizer
-        lr_scheduler: LRScheduler = super().create_scheduler(
+        lr_scheduler = super().create_scheduler(num_training_steps, optimizer)
            num_training_steps, optimizer
        )
        if self.args.relora_steps:
            warmup_steps = (
@@ -33,7 +30,7 @@ class ReLoRATrainer(AxolotlTrainer):
            anneal_steps = (
                self.args.relora_anneal_steps if self.args.relora_anneal_steps else 1
            )
-            self.lr_scheduler = ReLoRAScheduler(  # type: ignore
+            self.lr_scheduler = ReLoRAScheduler(
                optimizer,
                lr_scheduler,
                self.args.relora_steps,
@@ -41,6 +38,6 @@ class ReLoRATrainer(AxolotlTrainer):
                warmup_steps,
            )
        else:
-            self.lr_scheduler = lr_scheduler  # type: ignore
+            self.lr_scheduler = lr_scheduler
-        return self.lr_scheduler  # type: ignore
+        return self.lr_scheduler
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -11,19 +11,20 @@ from accelerate.logging import get_logger
 from datasets import Dataset
 from transformers.trainer import Trainer
-from axolotl.train import (
+from axolotl.logging_config import configure_logging
-    TrainDatasetMeta,
+from axolotl.train import TrainDatasetMeta
-    setup_model_and_tokenizer,
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 )
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
-LOG = get_logger(__name__)
+configure_logging()
 LOG = get_logger("axolotl.evaluate")
 def evaluate_dataset(
@@ -74,22 +75,37 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
    Returns:
        Dictionary mapping metric names to their values.
    """
-    # Load tokenizer, processor and model
+    # pylint: disable=duplicate-code
-    LOG.debug("loading model for evaluation...")
+    # Enable expandable segments for cuda allocation to improve VRAM usage
-    model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)
+    set_pytorch_cuda_alloc_conf()
    # Load tokenizer
    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
        main_process_only=True,
    )
    tokenizer = load_tokenizer(cfg)
    # Load processor for multimodal models if needed
    processor = None
    if cfg.is_multimodal:
        processor = load_processor(cfg, tokenizer)
    # Get datasets
    # pylint: disable=duplicate-code
    train_dataset = dataset_meta.train_dataset
    eval_dataset = dataset_meta.eval_dataset
    total_num_steps = dataset_meta.total_num_steps
    # Load model
    LOG.debug("loading model for evaluation...")
    model, _ = load_model(cfg, tokenizer, processor=processor)
    # Set up trainer
    trainer = setup_trainer(
-        cfg=cfg,
+        cfg,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
-        model=model,
+        model=(model, None, None),  # No need for model_ref or peft_config
        tokenizer=tokenizer,
        processor=processor,
        total_num_steps=total_num_steps,
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -24,7 +24,6 @@ import logging
 from typing import OrderedDict
 import torch
 from torch.optim.lr_scheduler import LRScheduler
 class BasePlugin:
@@ -37,12 +36,11 @@ class BasePlugin:
    Methods:
    register(cfg): Registers the plugin with the given configuration.
    pre_model_load(cfg): Performs actions before the model is loaded.
-    post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.
+    post_model_load(cfg, model): Performs actions after the model is loaded.
    pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
    post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
    post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.
    create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
-    create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.
+    create_lr_scheduler(cfg, trainer, optimizer): Creates and returns a learning rate scheduler.
    add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
    add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
    """
@@ -79,14 +77,6 @@ class BasePlugin:
        None
        """
    def post_model_build(self, cfg, model):  # pylint: disable=unused-argument
        """
        Performs actions after the model is built/loaded, but before any adapters are applied.
        Args:
            cfg (dict): The configuration for the plugin.
        """
    def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
        """
        Performs actions after the model is loaded.
@@ -147,8 +137,8 @@ class BasePlugin:
        """
    def create_lr_scheduler(
-        self, cfg, trainer, optimizer, num_training_steps
+        self, cfg, trainer, optimizer
-    ) -> LRScheduler | None:  # pylint: disable=unused-argument
+    ):  # pylint: disable=unused-argument
        """
        Creates and returns a learning rate scheduler.
@@ -156,10 +146,9 @@ class BasePlugin:
        cfg (dict): The configuration for the plugin.
        trainer (object): The trainer object for training.
        optimizer (object): The optimizer for training.
        num_training_steps (int): Total number of training steps
        Returns:
-        object (LRScheduler): The created learning rate scheduler.
+        object: The created learning rate scheduler.
        """
    def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
@@ -272,7 +261,6 @@ class PluginManager:
    plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
    _instance = None
    _cfg = None
    def __new__(cls):
        """
@@ -280,9 +268,7 @@ class PluginManager:
        """
        if cls._instance is None:
            cls._instance = super(PluginManager, cls).__new__(cls)
-            cls._instance.plugins: OrderedDict[str, BasePlugin] = (
+            cls._instance.plugins = collections.OrderedDict()
                collections.OrderedDict()
            )
        return cls._instance
    @staticmethod
@@ -295,14 +281,6 @@ class PluginManager:
            PluginManager()
        return PluginManager._instance  # type: ignore
    @property
    def cfg(self):
        return self._cfg
    @cfg.setter
    def cfg(self, cfg):
        self._cfg = cfg
    def register(self, plugin_name: str):
        """
        Registers a new plugin by its name.
@@ -351,22 +329,9 @@ class PluginManager:
        for plugin in self.plugins.values():
            plugin.pre_model_load(cfg)
    def post_model_build(self, cfg, model):
        """
        Calls the post_model_build method of all registered plugins after the model has been built/loaded,
        but before any adapters have been applied.
        Args:
            cfg (dict): The configuration for the plugins.
            model (object): The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.post_model_build(cfg, model)
    def post_model_load(self, cfg, model):
        """
-        Calls the post_model_load method of all registered plugins after the model has been loaded
+        Calls the post_model_load method of all registered plugins.
        inclusive of any adapters
        Parameters:
        cfg (dict): The configuration for the plugins.
@@ -422,29 +387,29 @@ class PluginManager:
                return trainer_cls
        return None
-    def create_optimizer(self, trainer):
+    def create_optimizer(self, cfg, trainer):
        """
        Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
        Parameters:
        cfg (dict): The configuration for the plugins.
        trainer (object): The trainer object for training.
        Returns:
        object: The created optimizer, or None if none was found.
        """
        for plugin in self.plugins.values():
-            optimizer = plugin.create_optimizer(self.cfg, trainer)
+            optimizer = plugin.create_optimizer(cfg, trainer)
            if optimizer is not None:
                return optimizer
        return None
-    def create_lr_scheduler(
+    def create_lr_scheduler(self, cfg, trainer, optimizer):
        self, trainer, optimizer, num_training_steps
    ) -> LRScheduler | None:
        """
        Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
        Parameters:
        cfg (dict): The configuration for the plugins.
        trainer (object): The trainer object for training.
        optimizer (object): The optimizer for training.
@@ -452,12 +417,7 @@ class PluginManager:
        object: The created learning rate scheduler, or None if none was found.
        """
        for plugin in self.plugins.values():
-            scheduler: LRScheduler | None = plugin.create_lr_scheduler(
+            scheduler = plugin.create_lr_scheduler(cfg, trainer, optimizer)
                self.cfg,
                trainer=trainer,
                optimizer=optimizer,
                num_training_steps=num_training_steps,
            )
            if scheduler is not None:
                return scheduler
        return None
@@ -498,20 +458,6 @@ class PluginManager:
                callbacks.extend(plugin_callbacks)
        return callbacks
    def post_train(self, cfg, model):
        """
        Calls the post_train method of all registered plugins.
        Parameters:
        cfg (dict): The configuration for the plugins.
        model (object): The loaded model.
        Returns:
        None
        """
        for plugin in self.plugins.values():
            plugin.post_train(cfg, model)
    def post_train_unload(self, cfg):
        """
        Calls the post_train_unload method of all registered plugins.
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -32,8 +32,8 @@ plugins:
 ## Supported Models
 - llama
 - llama4
 - llama4_text
 - llama4
 - mllama
 - phi3
 - gemma
@@ -43,11 +43,6 @@ plugins:
 - mistral
 - mistral3
 - qwen2
 - qwen2_moe
 - qwen2_vl
 - qwen2_5_vl
 - qwen3
 - qwen3_moe
 - cohere
 - cohere2
 - glm
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -25,7 +25,7 @@ import torch
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils import get_pytorch_version
-from axolotl.utils.distributed import is_main_process
+from axolotl.utils.distributed import zero_only
 from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401
@@ -72,11 +72,11 @@ class CutCrossEntropyPlugin(BasePlugin):
        if cfg.cut_cross_entropy:
            self._check_requirements()
-            from .monkeypatch.patch import (
+            from axolotl.integrations.cut_cross_entropy.monkeypatch.patch import (
                cce_patch,
            )
-            if is_main_process(use_environ=True):
+            with zero_only():
                LOG.info(
                    f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}"
                )
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/init.py
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/llama.py
@@ -1,174 +0,0 @@
 """Llama CCE patch. Adapted from transformers v4.51.2"""
 # pylint: disable=duplicate-code
 from types import MethodType
 from typing import Optional, Union
 import torch
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
    apply_lce,
 )
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
 )
 from transformers.models.llama.modeling_llama import (
    _CONFIG_FOR_DOC,
    LLAMA_INPUTS_DOCSTRING,
    KwargsForCausalLM,
 )
 from transformers.processing_utils import Unpack
 from transformers.utils import (
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
 )
 from transformers.utils.deprecation import deprecate_kwarg
 from transformers.utils.generic import can_return_tuple
 _PATCH_OPTS: PatchOptions | None = None
@can_return_tuple
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
@replace_return_docstrings(
    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
 )
 def cce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs: Unpack[KwargsForCausalLM],
 ) -> CausalLMOutputWithPast:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
    Returns:
    Example:
    ```python
    >>> from transformers import AutoTokenizer, LlamaForCausalLM
    >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
    >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")
    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""
    # Causal-LM forward with two loss paths: when the module-level _PATCH_OPTS
    # enables it, the loss comes from apply_lce on the hidden states and lm_head
    # weight (logits stay None); otherwise logits are built with lm_head and the
    # model's own loss_function is used.
    # Flags left as None by the caller fall back to the model config.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs: BaseModelOutputWithPast = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )
    hidden_states = outputs.last_hidden_state
    if hidden_states is None:
        raise ValueError("hidden_states is None")
    loss = None
    logits = None
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    # An int becomes slice(-n, None); n == 0 gives slice(0, None), i.e. every
    # position. A tensor is used directly as the index.
    slice_indices = (
        slice(-logits_to_keep, None)
        if isinstance(logits_to_keep, int)
        else logits_to_keep
    )
    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
        # CCE path: loss is computed from hidden states and the lm_head weight.
        assert labels is not None
        loss = apply_lce(
            hidden_states[:, slice_indices, :],
            self.lm_head.weight,
            labels,
            _PATCH_OPTS,
            **kwargs,
        )
    else:
        # Fallback path: materialize logits; the loss is only computed when
        # labels were supplied.
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
 def patch_llama(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """Install the CCE forward on Llama.
    Stores `patch_options` in the module-level `_PATCH_OPTS` read by `cce_forward`.
    If `maybe_model` is a `transformers.PreTrainedModel` it must be a
    `LlamaForCausalLM`; only that instance is patched and it is returned.
    Otherwise `LlamaForCausalLM.forward` is replaced on the class and `None` is returned.
    """
    global _PATCH_OPTS  # pylint: disable=global-statement
    from transformers.models.llama import modeling_llama
    _PATCH_OPTS = patch_options
    if not isinstance(maybe_model, transformers.PreTrainedModel):
        # No model instance was given: patch at class level.
        modeling_llama.LlamaForCausalLM.forward = cce_forward
        return None
    # Instance given: verify the architecture, then bind the forward to it.
    assert isinstance(
        maybe_model, modeling_llama.LlamaForCausalLM
    ), f"Expected a LlamaForCausalLM model. Got {type(maybe_model)}."
    maybe_model.forward = MethodType(cce_forward, maybe_model)
    return maybe_model
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
@@ -5,7 +5,9 @@
 import transformers
 from cut_cross_entropy.cce_utils import LinearCrossEntropyImpl
 from cut_cross_entropy.linear_cross_entropy import LCE_IMPL_DEFAULT
 from cut_cross_entropy.transformers.llama import patch_llama
 from cut_cross_entropy.transformers.phi3 import patch_phi3
 from cut_cross_entropy.transformers.qwen2 import patch_qwen2
 from cut_cross_entropy.transformers.utils import PatchOptions, TransformersModelT
 from axolotl.integrations.cut_cross_entropy.monkeypatch.cohere import (
@@ -22,9 +24,6 @@ from axolotl.integrations.cut_cross_entropy.monkeypatch.glm4 import (
    patch_glm,
    patch_glm4,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import (
    patch_llama,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.llama4 import (
    patch_llama4,
    patch_llama4_text,
@@ -34,22 +33,6 @@ from axolotl.integrations.cut_cross_entropy.monkeypatch.mistral3 import (
    patch_mistral3,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.mllama import patch_mllama
 from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2 import (
    patch_qwen2,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_5_vl import (
    patch_qwen2_5_vl,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_moe import (
    patch_qwen2_moe,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_vl import (
    patch_qwen2_vl,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen3 import patch_qwen3
 from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen3_moe import (
    patch_qwen3_moe,
 )
 CUT_CROSS_ENTROPY_MODEL_MAPPING = {
    "llama": patch_llama,
@@ -64,11 +47,6 @@ CUT_CROSS_ENTROPY_MODEL_MAPPING = {
    "mistral": patch_mistral,
    "mistral3": patch_mistral3,
    "qwen2": patch_qwen2,
    "qwen2_moe": patch_qwen2_moe,
    "qwen2_vl": patch_qwen2_vl,
    "qwen2_5_vl": patch_qwen2_5_vl,
    "qwen3": patch_qwen3,
    "qwen3_moe": patch_qwen3_moe,
    "cohere": patch_cohere,
    "cohere2": patch_cohere2,
    "glm": patch_glm,
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2.py
@@ -1,37 +0,0 @@
 """Qwen2 CCE patch. The model inherits Llama's modeling code and uses the same forward method."""
 # pylint: disable=duplicate-code
 from types import MethodType
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
 )
 def patch_qwen2(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """Install the Llama CCE forward on Qwen2.
    Qwen2 reuses the forward defined in the llama patch module, so the patch
    options are stored on that module rather than on this one. If `maybe_model`
    is a `transformers.PreTrainedModel` it must be a `Qwen2ForCausalLM`; only that
    instance is patched and it is returned. Otherwise `Qwen2ForCausalLM.forward`
    is replaced on the class and `None` is returned.
    """
    from transformers.models.qwen2 import modeling_qwen2
    import axolotl.integrations.cut_cross_entropy.monkeypatch.llama as llama_patch
    # The shared forward reads its options from the llama patch module.
    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
    shared_forward = llama_patch.cce_forward
    if not isinstance(maybe_model, transformers.PreTrainedModel):
        # No model instance was given: patch at class level.
        modeling_qwen2.Qwen2ForCausalLM.forward = shared_forward
        return None
    assert isinstance(
        maybe_model, modeling_qwen2.Qwen2ForCausalLM
    ), f"Expected a Qwen2ForCausalLM model. Got {type(maybe_model)}."
    maybe_model.forward = MethodType(shared_forward, maybe_model)
    return maybe_model
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_5_vl.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_5_vl.py
@@ -1,246 +0,0 @@
 """Qwen2.5 VL CCE patch. Adapted from transformers v4.51.2"""
 # pylint: disable=duplicate-code
 from types import MethodType
 from typing import Optional, Tuple, Union
 import torch
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
    apply_lce,
 )
 from torch.nn import CrossEntropyLoss
 from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
    Qwen2_5_VLCausalLMOutputWithPast,
 )
 _PATCH_OPTS: PatchOptions | None = None
 def cce_forward_multimodal(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[list[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    pixel_values: Optional[torch.Tensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    rope_deltas: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
    second_per_grid_ts: Optional[torch.Tensor] = None,
 ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
    Returns:
    Example:
    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
    >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
    >>> messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        },
    ]
    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
    ```"""
    # Overall flow: build input embeddings (merging image/video features into
    # the placeholder positions), derive position ids, run the decoder, then
    # compute the loss either through apply_lce (when _PATCH_OPTS enables it) or
    # through lm_head + CrossEntropyLoss.
    # NOTE(review): the `rope_deltas` argument is never read; the value stored on
    # `self.rope_deltas` is used instead.
    # Flags left as None by the caller fall back to the model config.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )
    if inputs_embeds is None:
        # Embed the token ids, then overwrite image/video placeholder positions
        # with the features produced by self.visual.
        inputs_embeds = self.model.embed_tokens(input_ids)
        if pixel_values is not None:
            pixel_values = pixel_values.type(self.visual.dtype)
            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
            # The number of image placeholder tokens must equal the number of
            # feature rows, otherwise the scatter below could not line up.
            n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
            n_image_features = image_embeds.shape[0]
            if n_image_tokens != n_image_features:
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            # Broadcast the per-token mask across the embedding dimension.
            mask = input_ids == self.config.image_token_id
            mask_unsqueezed = mask.unsqueeze(-1)
            mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
            image_mask = mask_expanded.to(inputs_embeds.device)
            image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # type: ignore
        if pixel_values_videos is not None:
            # Same procedure as for images, using the video placeholder token.
            pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
            video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
            n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
            n_video_features = video_embeds.shape[0]
            if n_video_tokens != n_video_features:
                raise ValueError(
                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                )
            mask = input_ids == self.config.video_token_id
            mask_unsqueezed = mask.unsqueeze(-1)
            mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
            video_mask = mask_expanded.to(inputs_embeds.device)
            video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # type: ignore
        if attention_mask is not None:
            attention_mask = attention_mask.to(inputs_embeds.device)
    # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
    if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
        # calculate RoPE index once per generation in the pre-fill stage only
        if (
            (cache_position is not None and cache_position[0] == 0)
            or self.rope_deltas is None
            or (past_key_values is None or past_key_values.get_seq_length() == 0)  # type: ignore
        ):
            position_ids, rope_deltas = self.get_rope_index(
                input_ids,
                image_grid_thw,
                video_grid_thw,
                second_per_grid_ts,
                attention_mask,
            )
            # Cache the deltas on the module so later calls can reuse them.
            self.rope_deltas = rope_deltas
        # then use the prev pre-calculated rope-deltas to get the correct position ids
        else:
            batch_size, seq_length, _ = inputs_embeds.shape
            delta = (
                (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
                if cache_position is not None
                else 0
            )
            position_ids = torch.arange(seq_length, device=inputs_embeds.device)  # type: ignore
            position_ids = position_ids.view(1, -1).expand(batch_size, -1)  # type: ignore
            if cache_position is not None:  # otherwise `deltas` is an int `0`
                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)  # type: ignore
            position_ids = position_ids.add(delta)  # type: ignore
            # Replicate along a new leading axis of size 3.
            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)  # type: ignore
    # input_ids is passed as None because the embeddings were already built above
    # (or supplied by the caller).
    outputs = self.model(
        input_ids=None,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )
    hidden_states = outputs[0]
    logits = None
    loss = None
    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
        # CCE path: loss from hidden states and lm_head weight; logits stay None.
        # NOTE(review): no explicit label shift here, unlike the fallback below;
        # presumably apply_lce handles it according to _PATCH_OPTS. Confirm.
        assert labels is not None
        loss = apply_lce(
            hidden_states,
            self.lm_head.weight,
            labels,
            _PATCH_OPTS,
        )
    else:
        logits = self.lm_head(hidden_states)
        if labels is not None:
            # Upcast to float if we need to compute the loss to avoid potential precision issues
            logits = logits.float()
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)
    if not return_dict:
        # Tuple form: logits followed by the remaining decoder outputs, with the
        # loss prepended only when one was computed.
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output
    return Qwen2_5_VLCausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        rope_deltas=self.rope_deltas,
    )
 def patch_qwen2_5_vl(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """Install the CCE multimodal forward on Qwen2.5-VL.
    Stores `patch_options` in the module-level `_PATCH_OPTS` read by
    `cce_forward_multimodal`. If `maybe_model` is a `transformers.PreTrainedModel`
    it must be a `Qwen2_5_VLForConditionalGeneration`; only that instance is
    patched and it is returned. Otherwise the class-level `forward` is replaced
    and `None` is returned.
    """
    global _PATCH_OPTS  # pylint: disable=global-statement
    from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl
    _PATCH_OPTS = patch_options
    target_cls = modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration
    if not isinstance(maybe_model, transformers.PreTrainedModel):
        # No model instance was given: patch at class level.
        target_cls.forward = cce_forward_multimodal
        return None
    assert isinstance(
        maybe_model, target_cls
    ), f"Expected a Qwen2_5_VLForConditionalGeneration model. Got {type(maybe_model)}."
    maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
    return maybe_model
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_moe.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_moe.py
@@ -1,188 +0,0 @@
 """Qwen2 MoE CCE patch. Adapted from transformers v4.51.2"""
 # pylint: disable=duplicate-code
 from types import MethodType
 from typing import Optional, Union
 import torch
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
    apply_lce,
 )
 from transformers.models.qwen2_moe.modeling_qwen2_moe import (
    _CONFIG_FOR_DOC,
    QWEN2MOE_INPUTS_DOCSTRING,
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    load_balancing_loss_func,
 )
 from transformers.utils import (
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
 )
 from transformers.utils.deprecation import deprecate_kwarg
 from transformers.utils.generic import can_return_tuple
 _PATCH_OPTS: PatchOptions | None = None
@can_return_tuple
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(
    output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
 )
 def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[list[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **loss_kwargs,
 ) -> MoeCausalLMOutputWithPast:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
    Returns:
    Example:
    ```python
    >>> from transformers import AutoTokenizer, Qwen2MoeForCausalLM
    >>> model = Qwen2MoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")
    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""
    # MoE causal-LM forward with two loss paths: apply_lce on hidden states when
    # the module-level _PATCH_OPTS enables it (logits stay None), otherwise
    # lm_head + the model's loss_function. The router auxiliary loss is handled
    # separately further down.
    # Flags left as None by the caller fall back to the model config.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    # Note: **loss_kwargs is not forwarded to the decoder, only to the loss call.
    outputs: MoeModelOutputWithPast = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
    )
    hidden_states = outputs.last_hidden_state
    loss = None
    logits = None
    if hidden_states is None:
        raise ValueError("hidden_states is None")
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    # An int becomes slice(-n, None); n == 0 gives slice(0, None), i.e. every
    # position. A tensor is used directly as the index.
    slice_indices = (
        slice(-logits_to_keep, None)
        if isinstance(logits_to_keep, int)
        else logits_to_keep
    )
    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
        # CCE path: loss is computed from hidden states and the lm_head weight.
        assert labels is not None
        loss = apply_lce(
            hidden_states[:, slice_indices, :],
            self.lm_head.weight,
            labels,
            _PATCH_OPTS,
            **loss_kwargs,
        )
    else:
        # Fallback path: materialize logits; the loss is only computed when
        # labels were supplied.
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
    aux_loss = None
    if output_router_logits:
        # Load-balancing loss over the router logits; it is always returned as
        # aux_loss but only added to the main loss when labels were given.
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(  # type: ignore
                loss.device  # type: ignore
            )  # make sure to reside in the same device
    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,  # type: ignore
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        router_logits=outputs.router_logits,
    )
 def patch_qwen2_moe(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """Install the CCE forward on Qwen2-MoE.
    Stores `patch_options` in the module-level `_PATCH_OPTS` read by `forward`.
    If `maybe_model` is a `transformers.PreTrainedModel` it must be a
    `Qwen2MoeForCausalLM` (an `AssertionError` is raised otherwise); only that
    instance is patched and it is returned. For any other argument,
    `Qwen2MoeForCausalLM.forward` is replaced on the class and `None` is returned.
    """
    global _PATCH_OPTS  # pylint: disable=global-statement
    from transformers.models.qwen2_moe import modeling_qwen2_moe
    _PATCH_OPTS = patch_options
    if isinstance(maybe_model, transformers.PreTrainedModel):
        # The message names the class that is actually checked; it previously
        # said Qwen3MoeForCausalLM, a copy-paste error.
        assert isinstance(
            maybe_model, modeling_qwen2_moe.Qwen2MoeForCausalLM
        ), f"Expected a Qwen2MoeForCausalLM model. Got {type(maybe_model)}."
        maybe_model.forward = MethodType(forward, maybe_model)
        return maybe_model
    # No model instance was given: patch at class level.
    modeling_qwen2_moe.Qwen2MoeForCausalLM.forward = forward
    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_vl.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen2_vl.py
@@ -1,249 +0,0 @@
 """Qwen2 VL CCE patch. Adapted from transformers v4.51.2"""
 # pylint: disable=duplicate-code
 from types import MethodType
 from typing import Optional, Tuple, Union
 import torch
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
    apply_lce,
 )
 from torch.nn import CrossEntropyLoss
 from transformers.models.qwen2_vl.modeling_qwen2_vl import (
    _CONFIG_FOR_DOC,
    QWEN2_VL_INPUTS_DOCSTRING,
    Qwen2VLCausalLMOutputWithPast,
 )
 from transformers.utils import (
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
 )
 _PATCH_OPTS: PatchOptions | None = None
@add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
@replace_return_docstrings(
    output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
 )
 def cce_forward_multimodal(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[list[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    pixel_values: Optional[torch.Tensor] = None,
    pixel_values_videos: Optional[torch.FloatTensor] = None,
    image_grid_thw: Optional[torch.LongTensor] = None,
    video_grid_thw: Optional[torch.LongTensor] = None,
    rope_deltas: Optional[torch.LongTensor] = None,
    cache_position: Optional[torch.LongTensor] = None,
 ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
    Returns:
    Example:
    ```python
    >>> from PIL import Image
    >>> import requests
    >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
    >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
    >>> messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": "What is shown in this image?"},
            ],
        },
    ]
    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
    >>> image = Image.open(requests.get(url, stream=True).raw)
    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
    ```"""
    # Qwen2-VL forward variant: when `_PATCH_OPTS.use_lce(...)` allows it, the loss is
    # computed by `apply_lce` from the hidden states and `logits` is left as None.
    # Per-call output flags fall back to the model config when they are not given.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )
    # Embed the token ids, then overwrite the image/video placeholder positions with
    # the features produced by the vision tower.
    if inputs_embeds is None:
        inputs_embeds = self.model.embed_tokens(input_ids)
        if pixel_values is not None:
            pixel_values = pixel_values.type(self.visual.get_dtype())
            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
            # One vision feature row is expected per image placeholder token.
            n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
            n_image_features = image_embeds.shape[0]
            if n_image_tokens != n_image_features:
                raise ValueError(
                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
                )
            image_mask = (
                (input_ids == self.config.image_token_id)
                .unsqueeze(-1)
                .expand_as(inputs_embeds)
                .to(inputs_embeds.device)
            )
            image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # type: ignore
        if pixel_values_videos is not None:
            pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
            video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
            # Same consistency check as for images, using the video placeholder token.
            n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
            n_video_features = video_embeds.shape[0]
            if n_video_tokens != n_video_features:
                raise ValueError(
                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
                )
            video_mask = (
                (input_ids == self.config.video_token_id)
                .unsqueeze(-1)
                .expand_as(inputs_embeds)
                .to(inputs_embeds.device)
            )
            video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # type: ignore
        if attention_mask is not None:
            attention_mask = attention_mask.to(inputs_embeds.device)
    # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
    if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
        # calculate RoPE index once per generation in the pre-fill stage only
        if (
            (cache_position is not None and cache_position[0] == 0)
            or self.rope_deltas is None
            or (past_key_values is None or past_key_values.get_seq_length() == 0)  # type: ignore
        ):
            position_ids, rope_deltas = self.get_rope_index(
                input_ids, image_grid_thw, video_grid_thw, attention_mask
            )
            # Cached on the module so later calls can take the `else` branch below.
            self.rope_deltas = rope_deltas
        # then use the prev pre-calculated rope-deltas to get the correct position ids
        else:
            batch_size, seq_length, _ = inputs_embeds.shape
            delta = (
                cache_position[0] + self.rope_deltas
                if cache_position is not None
                else 0
            )
            position_ids = torch.arange(seq_length, device=inputs_embeds.device)  # type: ignore
            position_ids = position_ids.view(1, -1).expand(batch_size, -1)  # type: ignore
            if cache_position is not None:  # otherwise `deltas` is an int `0`
                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)  # type: ignore
                delta = delta.to(position_ids.device)  # type: ignore
            position_ids = position_ids.add(delta)  # type: ignore
            # Replicate the positions along a new leading axis of size 3.
            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)  # type: ignore
    # The decoder always receives `inputs_embeds`; `input_ids` is deliberately None.
    outputs = self.model(
        input_ids=None,
        position_ids=position_ids,
        attention_mask=attention_mask,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )
    hidden_states = outputs[0]
    logits = None
    loss = None
    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
        # CCE path: loss comes from hidden states + lm_head weight; `logits` stays None.
        assert labels is not None
        loss = apply_lce(
            hidden_states,
            self.lm_head.weight,
            labels,
            _PATCH_OPTS,
        )
    else:
        logits = self.lm_head(hidden_states)
        if labels is not None:
            # Upcast to float if we need to compute the loss to avoid potential precision issues
            logits = logits.float()
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)
    # Tuple form: logits followed by the remaining model outputs, with the loss
    # prepended only when one was computed.
    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output
    return Qwen2VLCausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        rope_deltas=self.rope_deltas,
    )
 def patch_qwen2_vl(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """
    Route Qwen2-VL conditional generation through `cce_forward_multimodal`.
    `patch_options` is recorded in the module-level `_PATCH_OPTS`. A
    `transformers.PreTrainedModel` instance is patched in place and returned; for any
    other input the `Qwen2VLForConditionalGeneration` class is patched and `None` is
    returned.
    """
    global _PATCH_OPTS  # pylint: disable=global-statement
    from transformers.models.qwen2_vl import modeling_qwen2_vl
    _PATCH_OPTS = patch_options
    vl_cls = modeling_qwen2_vl.Qwen2VLForConditionalGeneration
    if not isinstance(maybe_model, transformers.PreTrainedModel):
        # Nothing to bind to: swap the method on the class itself.
        vl_cls.forward = cce_forward_multimodal
        return None
    assert isinstance(
        maybe_model, vl_cls
    ), f"Expected a Qwen2VLForConditionalGeneration model. Got {type(maybe_model)}."
    maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
    return maybe_model
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3.py
@@ -1,35 +0,0 @@
 """Qwen3 CCE patch. The model inherits Llama's modeling code and uses the same forward method."""
 # pylint: disable=duplicate-code
 from types import MethodType
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
 )
 def patch_qwen3(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """
    Patch Qwen3 causal LM models with the `cce_forward` from the llama patch module.
    The options are written to `_PATCH_OPTS` of the llama patch module, since that is
    where the reused forward lives. A `transformers.PreTrainedModel` instance is
    patched in place and returned; otherwise `Qwen3ForCausalLM.forward` is replaced on
    the class and `None` is returned.
    """
    from transformers.models.qwen3 import modeling_qwen3
    # The forward is borrowed from the llama patch, so its module holds the options.
    import axolotl.integrations.cut_cross_entropy.monkeypatch.llama as llama_patch
    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
    shared_forward = llama_patch.cce_forward
    qwen3_cls = modeling_qwen3.Qwen3ForCausalLM
    if not isinstance(maybe_model, transformers.PreTrainedModel):
        qwen3_cls.forward = shared_forward
        return None
    assert isinstance(
        maybe_model, qwen3_cls
    ), f"Expected a Qwen3ForCausalLM model. Got {type(maybe_model)}."
    maybe_model.forward = MethodType(shared_forward, maybe_model)
    return maybe_model
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3_moe.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/qwen3_moe.py
@@ -1,194 +0,0 @@
 """Qwen3 MoE CCE patch. Adapted from transformers v4.51.2"""
 # pylint: disable=duplicate-code
 from types import MethodType
 from typing import Optional, Union
 import torch
 import transformers
 from cut_cross_entropy.transformers.utils import (
    PatchOptions,
    TransformersModelT,
    apply_lce,
 )
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.models.qwen3_moe.modeling_qwen3_moe import (
    _CONFIG_FOR_DOC,
    QWEN3_MOE_INPUTS_DOCSTRING,
    KwargsForCausalLM,
    MoeCausalLMOutputWithPast,
    MoeModelOutputWithPast,
    load_balancing_loss_func,
 )
 from transformers.processing_utils import Unpack
 from transformers.utils import (
    add_start_docstrings_to_model_forward,
    replace_return_docstrings,
 )
 from transformers.utils.deprecation import deprecate_kwarg
 from transformers.utils.generic import can_return_tuple
 _PATCH_OPTS: PatchOptions | None = None
@can_return_tuple
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
@add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
@replace_return_docstrings(
    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
 )
 def forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[list[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs: Unpack[KwargsForCausalLM],
 ) -> MoeCausalLMOutputWithPast:
    r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
    Returns:
    Example:
    ```python
    >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM
    >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")
    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")
    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")
    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs: MoeModelOutputWithPast = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        **kwargs,
    )
    hidden_states = outputs.last_hidden_state
    if hidden_states is None:
        raise ValueError("hidden_states is None")
    loss = None
    logits = None
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    slice_indices = (
        slice(-logits_to_keep, None)
        if isinstance(logits_to_keep, int)
        else logits_to_keep
    )
    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
        assert labels is not None
        loss = apply_lce(
            hidden_states[:, slice_indices, :],
            self.lm_head.weight,
            labels,
            _PATCH_OPTS,
            **kwargs,
        )
    else:
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(  # type: ignore
                loss.device  # type: ignore
            )  # make sure to reside in the same device
    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,  # type: ignore
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        router_logits=outputs.router_logits,
    )
 def patch_qwen3_moe(
    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
    patch_options: PatchOptions,
 ) -> TransformersModelT | None:
    """
    Install this module's CCE-aware `forward` on Qwen3 MoE causal LM models.
    `patch_options` is recorded in the module-level `_PATCH_OPTS`. A
    `transformers.PreTrainedModel` instance is patched in place and returned; for any
    other input `Qwen3MoeForCausalLM.forward` is replaced on the class and `None` is
    returned.
    """
    global _PATCH_OPTS  # pylint: disable=global-statement
    from transformers.models.qwen3_moe import modeling_qwen3_moe
    _PATCH_OPTS = patch_options
    causal_lm_cls = modeling_qwen3_moe.Qwen3MoeForCausalLM
    if not isinstance(maybe_model, transformers.PreTrainedModel):
        # No live model supplied: patch the class so every instance is affected.
        causal_lm_cls.forward = forward
        return None
    assert isinstance(
        maybe_model, causal_lm_cls
    ), f"Expected a Qwen3MoeForCausalLM model. Got {type(maybe_model)}."
    maybe_model.forward = MethodType(forward, maybe_model)
    return maybe_model
--- a/src/axolotl/integrations/kd/chat_template.py
+++ b/src/axolotl/integrations/kd/chat_template.py
@@ -35,9 +35,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
        sequence_len,
        roles_to_train=None,
        train_on_eos=None,
        train_on_eot=None,
        eot_tokens=None,
        split_thinking: bool | None = False,
        logprobs_field="logprobs",
        gen_temperature=1.0,
        kd_temperature=1.0,
@@ -53,9 +50,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
            sequence_len,
            roles_to_train=roles_to_train,
            train_on_eos=train_on_eos,
            train_on_eot=train_on_eot,
            eot_tokens=eot_tokens,
            split_thinking=split_thinking,
        )
    @property
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -23,8 +23,8 @@ import logging
 import sys
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.distributed import is_main_process
 from ...utils.distributed import zero_only
 from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401
 from .utils import patch_with_compile_disable
@@ -85,7 +85,7 @@ class LigerPlugin(BasePlugin):
                kwargs["geglu"] = cfg.liger_glu_activation
            elif "swiglu" in liger_fn_sig.parameters:
                kwargs["swiglu"] = cfg.liger_glu_activation
-            if is_main_process(use_environ=True):
+            with zero_only():
                LOG.info(
                    f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
                )
@@ -151,30 +151,6 @@ class LigerPlugin(BasePlugin):
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3":
            from axolotl.integrations.liger.models.qwen3 import (
                apply_liger_kernel_to_qwen3,
            )
            apply_liger_kernel_to_qwen3(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3_moe":
            from axolotl.integrations.liger.models.qwen3_moe import (
                apply_liger_kernel_to_qwen3_moe,
            )
            apply_liger_kernel_to_qwen3_moe(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        else:
            logging.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
--- a/src/axolotl/integrations/liger/models/qwen3.py
+++ b/src/axolotl/integrations/liger/models/qwen3.py
@@ -1,160 +0,0 @@
 """
 Liger FLCE for Qwen3. Based on transformers v4.51.3.
 """
 import sys
 from typing import Optional, Tuple, Union
 import torch
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Qwen3 causal-LM forward that computes the loss with `LigerForCausalLMLoss` while training.
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Targets for the language modeling loss. Values are either token indices in
            `[0, ..., config.vocab_size]` or `-100`; positions set to `-100` are ignored (masked).
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            Only used when logits are materialized. An `int` keeps the logits of the last
            `logits_to_keep` positions (`0` keeps all of them); a 1D `torch.Tensor` is used as the
            indices to keep along the sequence dimension.
    Returns:
        `CausalLMOutputWithPast`. In training mode with `labels`, `logits` is `None` and only the
        loss is computed; otherwise `logits` is populated and `loss` is set only if `labels` is given.
    """
    # pylint: disable=duplicate-code
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    decoder_out = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )
    last_hidden = decoder_out[0]
    if self.training and (labels is not None):
        # Training: hand hidden states and the lm_head weight to the Liger loss; no logits.
        logits = None
        loss = LigerForCausalLMLoss(
            hidden_states=last_hidden,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )
    else:
        # Otherwise materialize logits, restricted to the requested positions.
        if isinstance(logits_to_keep, int):
            keep = slice(-logits_to_keep, None)
        else:
            keep = logits_to_keep
        logits = self.lm_head(last_hidden[:, keep, :])
        loss = None
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=decoder_out.past_key_values,
        hidden_states=decoder_out.hidden_states,
        attentions=decoder_out.attentions,
    )
 def apply_liger_kernel_to_qwen3(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
    # pylint: disable=duplicate-code
    """
    Apply Liger kernels to replace original implementation in HuggingFace Qwen3 models
    Each enabled flag rebinds an attribute of `transformers.models.qwen3.modeling_qwen3`
    (or, for `cross_entropy`, of the `nn` module imported from `transformers.loss.loss_utils`).
    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
        **kwargs: Accepted for call compatibility and not used.
    Raises:
        AssertionError: If both `cross_entropy` and `fused_linear_cross_entropy` are True.
    """
    import transformers.models.qwen3.modeling_qwen3  # noqa: F401  # pylint: disable=unused-import
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
    # The import above guarantees the module is registered in `sys.modules`.
    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
    if rms_norm:
        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
    if glu_activation:
        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
    if layer_norm:
        # NOTE(review): this rebinds `LayerNorm` on the `nn` module object referenced by
        # modeling_qwen3, which is presumably the shared `torch.nn` — confirm the scope.
        modeling_qwen3.nn.LayerNorm = LigerLayerNorm
    if cross_entropy:
        from transformers.loss.loss_utils import nn
        nn.functional.cross_entropy = liger_cross_entropy
    if fused_linear_cross_entropy:
        # Class-level patch: replaces the forward with this module's `lce_forward`.
        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
--- a/src/axolotl/integrations/liger/models/qwen3_moe.py
+++ b/src/axolotl/integrations/liger/models/qwen3_moe.py
@@ -1,191 +0,0 @@
 """
 Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
 """
 import sys
 from copy import deepcopy
 from typing import List, Optional, Union
 import torch
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 from transformers.modeling_outputs import MoeCausalLMOutputWithPast
 from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func
 def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
 ) -> MoeCausalLMOutputWithPast:
    r"""
    Qwen3 MoE causal-LM forward that computes the loss with `LigerForCausalLMLoss` while training.
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
            Only used when logits are materialized (i.e. not on the fused training path).
    Returns:
        `MoeCausalLMOutputWithPast`. In training mode with `labels`, `logits` is `None` and only the loss is
        computed. When router logits are requested, `aux_loss` holds the load-balancing loss (also added to
        `loss`, scaled by `router_aux_loss_coef`, when `labels` is given) and `router_logits` carries the
        decoder's router logits.
    """
    # pylint: disable=duplicate-code
    # Per-call output flags fall back to the model config when they are not given.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        **kwargs,
    )
    hidden_states = outputs[0]
    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )
    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device
    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        # Forward the router logits so callers that requested them actually receive them.
        router_logits=outputs.router_logits,
    )
 def apply_liger_kernel_to_qwen3_moe(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
    # pylint: disable=duplicate-code
    """
    Apply Liger kernels to replace original implementation in HuggingFace Qwen3 MoE models
    Each enabled flag rebinds an attribute of `transformers.models.qwen3_moe.modeling_qwen3_moe`
    (or, for `cross_entropy`, of the `nn` module imported from `transformers.loss.loss_utils`).
    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
        **kwargs: Accepted for call compatibility and not used.
    Raises:
        AssertionError: If both `cross_entropy` and `fused_linear_cross_entropy` are True.
    """
    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401  # pylint: disable=unused-import
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
    # The import above guarantees the module is registered in `sys.modules`.
    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
    if rms_norm:
        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
    if glu_activation:
        # Factory standing in for `Qwen3MoeMLP`: it takes an optional `intermediate_size`
        # and forwards it to `LigerSwiGLUMLP` through a copied config.
        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            if intermediate_size:
                setattr(config, "intermediate_size", intermediate_size)
            return LigerSwiGLUMLP(config, **kwargs)
        # NOTE(review): `Qwen3MoeMLP` becomes a plain function after this assignment, so
        # `isinstance(..., Qwen3MoeMLP)` checks elsewhere would no longer work — confirm none exist.
        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
    if layer_norm:
        # NOTE(review): this rebinds `LayerNorm` on the `nn` module object referenced by
        # modeling_qwen3_moe, which is presumably the shared `torch.nn` — confirm the scope.
        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm
    if cross_entropy:
        from transformers.loss.loss_utils import nn
        nn.functional.cross_entropy = liger_cross_entropy
    if fused_linear_cross_entropy:
        # Class-level patch: replaces the forward with this module's `lce_forward`.
        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward
--- a/src/axolotl/integrations/llm_compressor/README.md
+++ b/src/axolotl/integrations/llm_compressor/README.md
@@ -1,108 +0,0 @@
 # LLMCompressor Integration
 Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).
 This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.
 It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
 ---
 ## Requirements
 - Axolotl with `llmcompressor` extras:
  ```bash
  pip install "axolotl[llmcompressor]"
  ```
 - Requires `llmcompressor >= 0.5.1`
 This will install all necessary dependencies to fine-tune sparsified models using the integration.
 ---
 ## Usage
 To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:
 ```yaml
 plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
 llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
 # ... (other training arguments)
 ```
 This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.
 Pre-sparsified checkpoints can be:
 - Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
 - Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
 - Any custom LLM with compatible sparsity patterns that you've created yourself
 To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
 [https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
 ### Storage Optimization with save_compressed
 Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
 - Reduces disk space usage by approximately 40%
 - Maintains compatibility with vLLM for accelerated inference
 - Maintains compatibility with llmcompressor for further optimization (example: quantization)
 This option is highly recommended when working with sparse models to maximize the benefits of model compression.
 ### Example Config
 See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
 ---
 ## Inference with vLLM
 After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
 You can also use LLMCompressor to apply additional quantization to your fine-tuned
 sparse model before inference for even greater performance benefits.
 The following example runs inference on a fine-tuned sparse model with vLLM:
 ```python
 from vllm import LLM, SamplingParams
 prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
 ]
 sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
 llm = LLM("path/to/your/sparse/model")
 outputs = llm.generate(prompts, sampling_params)
 for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 ```
 For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).
 ## Learn More
 For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
 [https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
--- a/src/axolotl/integrations/llm_compressor/init.py
+++ b/src/axolotl/integrations/llm_compressor/init.py
@@ -1,5 +0,0 @@
 """Integration entry point for the LLMCompressor plugin."""
 from .plugin import LLMCompressorPlugin
 __all__ = ["LLMCompressorPlugin"]
--- a/src/axolotl/integrations/llm_compressor/args.py
+++ b/src/axolotl/integrations/llm_compressor/args.py
@@ -1,40 +0,0 @@
 """
 LLMCompressor and Sparse Finetuning config models.
 """
 from typing import Any
 from pydantic import BaseModel, Field
 from typing_extensions import Annotated
 class CompressionArgs(BaseModel):
    """
    Sparse Finetuning config for LLMCompressor.
    Declares the two options nested under the `llmcompressor` config key:
    the compression `recipe` and the `save_compressed` flag.
    """
    # Typing for recipe is set to Any due to:
    # https://github.com/vllm-project/llm-compressor/issues/1319
    # No default is declared for `recipe` here.
    recipe: Annotated[
        Any,
        Field(
            description="The recipe containing the compression algorithms and hyperparameters to apply."
        ),
    ]
    # Defaults to False when omitted from the config.
    save_compressed: Annotated[
        bool,
        Field(
            default=False,
            description="Whether to save the compressed model after training.",
        ),
    ]
 class LLMCompressorArgs(BaseModel):
    """
    LLMCompressor configuration BaseModel.
    Wraps a single `llmcompressor` field holding a CompressionArgs instance.
    """
    # No default is declared for `llmcompressor` here.
    llmcompressor: Annotated[
        CompressionArgs,
        Field(
            description="Arguments enabling compression pathways through the LLM Compressor plugins"
        ),
    ]
--- a/src/axolotl/integrations/llm_compressor/plugin.py
+++ b/src/axolotl/integrations/llm_compressor/plugin.py
@@ -1,171 +0,0 @@
 """
 Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
 by maintaining masks for zero weights during training.
 """
 import logging
 from functools import wraps
 from typing import Any, Callable, Concatenate, ParamSpec, TypeVar
 from llmcompressor import active_session, create_session
 from llmcompressor.core import callbacks as session_callbacks
 from llmcompressor.recipe import Recipe
 from torch.nn import Module
 from transformers.trainer import Trainer
 from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
 from transformers.training_args import TrainingArguments
 from axolotl.integrations.base import BasePlugin
 P = ParamSpec("P")  # Params for generic function signatures
 R = TypeVar("R")  # Return type for generic function signatures
 LOG = logging.getLogger("axolotl.integrations.llm_compressor")
 class LLMCompressorCallbackHandler(TrainerCallback):
    """
    Trainer callback for Sparse Finetuning.
    Maintains sparsity patterns during training by applying masks after optimization steps,
    ensuring zero-weight updates are canceled out.
    The handler wraps `trainer.compute_loss` for the duration of training so the
    LLMCompressor session is notified of every computed loss, and puts the original
    method back when training ends.
    """
    def __init__(self, trainer: Trainer, recipe: Any):
        """
        Initialize the Sparse Finetuning callback handler.
        Args:
            trainer (Trainer): Huggingface Trainer instance.
            recipe (Recipe | dict): Sparse finetuning recipe to apply. Anything that
                is not already a Recipe is validated into one.
        """
        super().__init__()
        self.trainer = trainer
        self.recipe = (
            Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
        )
        # Keep a handle on the unwrapped method so on_train_end can restore it.
        self.original_compute_loss = trainer.compute_loss
        self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
        create_session()
    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of training. Initializes the compression session.
        Args:
            args (TrainingArguments): Training arguments.
            state (TrainerState): Trainer state.
            control (TrainerControl): Trainer control.
        """
        super().on_train_begin(args, state, control, **kwargs)
        # Synchronize all processes before and after session initialization.
        self.trainer.accelerator.wait_for_everyone()
        active_session().initialize(
            model=self.trainer.model,
            optimizer=self.trainer.optimizer,
            start=state.epoch,
            recipe=self.recipe,
        )
        self.trainer.accelerator.wait_for_everyone()
    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of a training step. Triggers batch_start callback.
        """
        super().on_step_begin(args, state, control, **kwargs)
        session_callbacks.batch_start()
    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of a training step. Triggers optimizer and batch_end callbacks.
        """
        super().on_step_end(args, state, control, **kwargs)
        session_callbacks.optim_pre_step()
        session_callbacks.optim_post_step()
        session_callbacks.batch_end()
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of training. Finalizes the compression session and
        restores the trainer's original `compute_loss` method.
        """
        super().on_train_end(args, state, control, **kwargs)
        active_session().finalize()
        # Undo the wrapping done in __init__. The attribute that was replaced is
        # `compute_loss`; assigning to `compute_loss_func` instead would leave the
        # wrapper in place and install a bound method as the custom loss function.
        self.trainer.compute_loss = self.original_compute_loss
 class LLMCompressorPlugin(BasePlugin):
    """
    Axolotl plugin that hooks LLMCompressor sparse fine-tuning into the trainer.
    """
    def get_input_args(self) -> str:
        """
        Tell Axolotl where this plugin's config model lives.
        Returns:
            str: Dotted import path of the LLMCompressorArgs class.
        """
        args_class_path = "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"
        return args_class_path
    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Build the Sparse Finetuning callback once the Trainer exists.
        Args:
            cfg (Any): Configuration object; the recipe is read from
                `cfg.llmcompressor.recipe`.
            trainer (Trainer): Huggingface Trainer instance.
        Returns:
            list: Single-element list holding the configured callback.
        """
        LOG.info("Adding Sparse Finetuning callback to the trainer")
        sparse_recipe = cfg.llmcompressor.recipe
        handler = LLMCompressorCallbackHandler(trainer=trainer, recipe=sparse_recipe)
        return [handler]
 def compute_loss_wrapper(
    compute_loss_func: Callable[Concatenate[Module, P], R],
 ) -> Callable[Concatenate[Module, P], R]:
    """
    Decorate a loss function so the LLMCompressor session hears about each loss.
    Args:
        compute_loss_func (Callable): Original loss computation function; its
            first positional argument is the model.
    Returns:
        Callable: Function with the same signature that computes the loss,
        fires `loss_calculated` when appropriate, and returns the loss unchanged.
    """
    @wraps(compute_loss_func)
    def _notifying_compute_loss(
        model: Module, *args: P.args, **kwargs: P.kwargs
    ) -> R:
        computed = compute_loss_func(model, *args, **kwargs)
        # Only notify once the session is initialized and the model is in training mode.
        session_ready = active_session().lifecycle.initialized_
        if session_ready and model.training:
            session_callbacks.loss_calculated(loss=computed)
        return computed
    return _notifying_compute_loss
--- a/src/axolotl/integrations/llm_compressor/utils.py
+++ b/src/axolotl/integrations/llm_compressor/utils.py
@@ -1,40 +0,0 @@
 """Utilities for llmcompressor integration with axolotl."""
 from typing import Union
 from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    modify_save_pretrained,
 )
 from transformers import PreTrainedModel, Trainer
 def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    safe_serialization: bool = False,
    save_compressed: bool = False,
 ) -> None:
    """
    Wait for every process, then have the main process save the model through
    LLMCompressor's patched `save_pretrained`.
    Args:
        model (PreTrainedModel): The model to be saved.
        output_dir (str or bytes): Path where the model files will be written.
        trainer (Trainer): Hugging Face Trainer whose accelerator is used for
            synchronization and main-process detection.
        safe_serialization (bool): Forwarded to `save_pretrained`.
        save_compressed (bool): Forwarded to `save_pretrained`; when False,
            sparsity compression stats are skipped as well.
    """
    trainer.accelerator.wait_for_everyone()
    # Non-main processes have nothing to write.
    if trainer.accelerator.is_main_process:
        modify_save_pretrained(model)
        save_options = {
            "safe_serialization": safe_serialization,
            "save_compressed": save_compressed,
            "skip_sparsity_compression_stats": not save_compressed,
        }
        model.save_pretrained(output_dir, **save_options)
--- a/src/axolotl/monkeypatch/attention/init.py
+++ b/src/axolotl/monkeypatch/attention/init.py
@@ -1,19 +0,0 @@
 """
 attention module for attention monkeypatches
 """
 from transformers.integrations.flash_attention import flash_attention_forward
 def patch_xformers_attn_over_fa2():
    """
    Register the xformers attention forward under the "flash_attention_2" key
    of transformers' attention function registry.
    """
    from transformers.modeling_utils import (
        ALL_ATTENTION_FUNCTIONS as attention_registry,
    )
    from .xformers import xformers_attention_forward as xformers_forward
    attention_registry["flash_attention_2"] = xformers_forward
 def unpatch_xformers_attn_over_fa2():
    """
    Undo `patch_xformers_attn_over_fa2` by registering transformers' own
    `flash_attention_forward` under the "flash_attention_2" key again.
    """
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
    # Store the function object itself. Calling it here with no arguments would
    # raise a TypeError instead of restoring the registry entry.
    ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward
--- a/src/axolotl/monkeypatch/attention/ring_attn/patch.py
+++ b/src/axolotl/monkeypatch/attention/ring_attn/patch.py
@@ -12,8 +12,10 @@ import torch
 import torch.distributed as dist
 from accelerate.logging import get_logger
 from axolotl.logging_config import configure_logging
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
 configure_logging()
 LOG = get_logger(__name__)
--- a/src/axolotl/monkeypatch/attention/xformers.py
+++ b/src/axolotl/monkeypatch/attention/xformers.py
@@ -1,160 +0,0 @@
 """
 xformers attention implementation for packing
 """
 from typing import Optional
 import torch
 import xformers
 import xformers.ops.fmha
 from transformers.modeling_flash_attention_utils import (
    _upad_input,
 )
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
 xformers_attention = xformers.ops.fmha.memory_efficient_attention
 def xformers_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    dropout: float = 0.0,  # pylint: disable=unused-argument
    scaling: Optional[float] = None,  # pylint: disable=unused-argument
    sliding_window: Optional[int] = None,  # pylint: disable=unused-argument
    softcap: Optional[float] = None,  # pylint: disable=unused-argument
    cu_seq_lens_q: Optional[torch.LongTensor] = None,
    cu_seq_lens_k: Optional[torch.LongTensor] = None,
    max_length_q: Optional[int] = None,
    max_length_k: Optional[int] = None,  # pylint: disable=unused-argument
    **kwargs,  # pylint: disable=unused-argument
 ):
    """
    Attention forward implemented with xformers' memory_efficient_attention.
    Three input layouts are handled, chosen in this order:
      1. `position_ids` given and the batch looks packed (or `max_length_q` is set):
         a block-diagonal causal mask is derived from `position_ids` unless
         `cu_seq_lens_q`/`cu_seq_lens_k` were both passed in.
      2. `attention_mask` given: inputs are unpadded with `_upad_input` and a
         block-diagonal causal mask is built from the resulting sequence lengths.
      3. Otherwise: a plain lower-triangular (causal) mask is used and K/V are
         expanded for grouped-query attention.
    Args:
        module: Attention module; `module.config.num_attention_heads`,
            `module.config.num_key_value_heads` and `module.training` are read.
        query, key, value: Expected as [batch, heads, seq_len, hidden_dim]
            (per the comment below); transposed to [batch, seq_len, heads, dim].
        attention_mask: Padding mask used by the unpad branch.
        position_ids: Position ids used to detect packed sequences.
        dropout, scaling, softcap, max_length_k, kwargs: Accepted but not used.
        sliding_window: Only compared against the query length to decide whether
            tensors are flattened; NOTE(review): no windowed mask is built here.
        cu_seq_lens_q, cu_seq_lens_k: Optional precomputed cumulative sequence lengths.
        max_length_q: When not None (together with `position_ids`), forces the
            packed branch.
    Returns:
        Tuple of (attention output viewed as
        `(batch_size, -1, attn_output.size(-2), attn_output.size(-1))`, None).
    """
    # Get dimensions
    # query: [batch, heads, seq_len, hidden_dim]
    batch_size = query.size(0)
    query_length = query.shape[2]
    key_length = key.shape[2]
    # Default causal mask
    attn_bias = xformers.ops.LowerTriangularMask()
    # Check if we have sliding window attention
    has_sliding_window = sliding_window is not None and sliding_window < query_length
    # Transpose dimensions for xformers (Q: [b, h, s, d] -> [b, s, h, d])
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)
    # Get GQA parameters
    num_attention_heads = module.config.num_attention_heads
    num_key_value_heads = module.config.num_key_value_heads
    head_dim = query.size(-1)
    is_gqa = num_attention_heads != num_key_value_heads
    n_groups = num_attention_heads // num_key_value_heads if is_gqa else 1
    # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
    # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
    # Use a block-diagonal causal mask to prevent cross-example attention and also allow padding free approach
    if position_ids is not None and (
        max_length_q is not None
        or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
    ):
        if cu_seq_lens_q is None or cu_seq_lens_k is None:
            # Derive per-sequence lengths from position_ids and mask each packed sequence separately
            cu_seq_lens_q = get_cu_seqlens_from_pos_ids(position_ids)[0]
            cu_seq_lens_q = cu_seq_lens_q.squeeze()
            seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
            attn_bias = (
                xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
                    q_seqlen=seq_lengths.tolist(),
                )
            )
        else:
            # NOTE(review): with caller-supplied cu_seq_lens the default lower-triangular
            # mask is kept and the tensors are only flattened - confirm this is intended
            query = query.reshape(-1, query.size(-2), query.size(-1))
            key = key.reshape(-1, key.size(-2), key.size(-1))
            value = value.reshape(-1, value.size(-2), value.size(-1))
        # Handle GQA
        if is_gqa:
            key = key.repeat_interleave(n_groups, dim=2)
            value = value.repeat_interleave(n_groups, dim=2)
    elif attention_mask is not None:
        # Strip padding, then mask each unpadded sequence as its own causal block
        query, key, value, _, cu_seq_lens, _ = _upad_input(
            query, key, value, attention_mask, query_length
        )
        cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens
        seq_lengths = []
        for i in range(len(cu_seq_lens_q) - 1):
            seq_lengths.append(cu_seq_lens_q[i + 1] - cu_seq_lens_q[i])
        attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
            q_seqlen=seq_lengths,
            kv_seqlen=seq_lengths,
        )
        # Handle GQA
        if is_gqa:
            key = key.repeat_interleave(n_groups, dim=2)
            value = value.repeat_interleave(n_groups, dim=2)
    else:
        # Handle Group Query Attention (GQA) using view/expand approach from reference
        key = key.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
        value = value.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
        key = key.expand(
            batch_size, key_length, num_key_value_heads, n_groups, head_dim
        )
        value = value.expand(
            batch_size, key_length, num_key_value_heads, n_groups, head_dim
        )
        if module.training:
            # Training: collapse the (kv_heads, groups) axes back into a single heads axis
            key = key.reshape(batch_size, key_length, num_attention_heads, head_dim)
            value = value.reshape(batch_size, key_length, num_attention_heads, head_dim)
            if has_sliding_window:
                query = query.view(
                    1, batch_size * query_length, num_attention_heads, head_dim
                )
                key = key.view(
                    1, batch_size * key_length, num_attention_heads, head_dim
                )
                value = value.view(
                    1, batch_size * key_length, num_attention_heads, head_dim
                )
        else:
            # Not training: keep K/V in 5-D grouped form and reshape the query to match
            query = query.view(
                batch_size, query_length, num_key_value_heads, n_groups, head_dim
            )
            # If we need a sliding window attention
            if has_sliding_window:
                query = query.view(
                    1,
                    batch_size * query_length,
                    num_key_value_heads,
                    n_groups,
                    head_dim,
                )
                key = key.view(
                    1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
                )
                value = value.view(
                    1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
                )
    # Run the xformers attention
    attn_output = xformers_attention(
        query,
        key,
        value,
        attn_bias=attn_bias,
    )
    # Restore a leading batch dimension; the second element of the returned tuple is always None
    attn_output = attn_output.view(
        batch_size, -1, attn_output.size(-2), attn_output.size(-1)
    )
    return attn_output, None
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -23,42 +23,22 @@ from axolotl.utils.dict import DictDefault
 LOG = get_logger(__name__)
-QKV_PATCHES = [
+ORIGINAL_QKV_CODE = """
    (
        """
    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
 """.lstrip(
-            "\n"
+    "\n"
-        ),
+)
-        """
+
 PATCHED_QKV_CODE = """
    query_states, key_states, value_states = self.apply_qkv(hidden_states)
    query_states = query_states.view(hidden_shape).transpose(1, 2)
    key_states = key_states.view(hidden_shape).transpose(1, 2)
    value_states = value_states.view(hidden_shape).transpose(1, 2)
 """.lstrip(
-            "\n"
+    "\n"
-        ),
+)
    ),
    (
        """
    query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
    key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
 """.lstrip(
            "\n"
        ),
        """
    query_states, key_states, value_states = self.apply_qkv(hidden_states)
    query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
    key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2)
    value_states = value_states.view(hidden_shape).transpose(1, 2)
 """.lstrip(
            "\n"
        ),
    ),
 ]
 ORIGINAL_O_CODE = """
    attn_output = self.o_proj(attn_output)
@@ -148,11 +128,10 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
    try:
        # Dynamically import the module and attention class
        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-        model_cls_prefix = "".join(
+        module = __import__(
-            [part.capitalize() for part in model_type.split("_")]
+            module_path, fromlist=[f"{model_type.capitalize()}Attention"]
        )
-        module = __import__(module_path, fromlist=[f"{model_cls_prefix}Attention"])
+        attention_cls = getattr(module, f"{model_type.capitalize()}Attention")
        attention_cls = getattr(module, f"{model_cls_prefix}Attention")
        return attention_cls
    except (ImportError, AttributeError) as e:
@@ -189,18 +168,10 @@ def patch_self_attn_lora(cfg: DictDefault):
    attention_cls._original_forward = self_attn_forward
    self_attn_forward, _ = detab_code(self_attn_forward)
-    assert any(
+    assert ORIGINAL_QKV_CODE in self_attn_forward, "Original QKV code not found"
        qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES
    ), "Original QKV code not found"
    assert ORIGINAL_O_CODE in self_attn_forward, "Original O code not found"
-    for qkv_orig, qkv_patched in QKV_PATCHES:
+    self_attn_forward = self_attn_forward.replace(ORIGINAL_QKV_CODE, PATCHED_QKV_CODE)
        if qkv_orig in self_attn_forward:
            self_attn_forward = self_attn_forward.replace(
                qkv_orig,
                qkv_patched,
            )
            break
    self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
    self_attn_forward = self_attn_forward.replace(
        "def forward(",
--- a/src/axolotl/monkeypatch/loss/init.py
+++ b/src/axolotl/monkeypatch/loss/init.py
--- a/src/axolotl/monkeypatch/loss/chunked.py
+++ b/src/axolotl/monkeypatch/loss/chunked.py
@@ -1,134 +0,0 @@
 """
 chunked ce loss
 """
 from typing import List, Optional
 import torch
 import torch.nn.functional as F
 # copied and modified from torchtune.modules.loss.CEWithChunkedOutputLoss
 class CEWithChunkedOutputLoss(torch.nn.Module):
    """
    Cross-entropy over pre-chunked logits. Only one chunk is upcast to fp32 at a
    time, which keeps peak memory lower than upcasting the full logits tensor.
    For more details, please refer to: https://github.com/pytorch/torchtune/pull/1390
    """
    def __init__(self, num_output_chunks: int = 8, ignore_index: int = -100):
        super().__init__()
        self.ignore_index = ignore_index
        self.num_output_chunks = num_output_chunks
    def compute_cross_entropy(
        self,
        logits: torch.Tensor,
        labels: torch.Tensor,
        normalize: bool = True,  # pylint: disable=unused-argument
    ) -> torch.Tensor:
        """
        Summed cross entropy for a single chunk, computed on fp32 logits.
        """
        fp32_logits = logits.float()
        return F.cross_entropy(
            fp32_logits,
            labels,
            reduction="sum",
            ignore_index=self.ignore_index,
        )
    def forward(
        self, logits: List[torch.Tensor], labels: torch.Tensor, reduction="sum"
    ) -> torch.Tensor:
        """
        Args:
            logits (List[torch.Tensor]): Chunked logits, ``self.num_output_chunks``
                entries, each of shape
                ``(batch_size, num_tokens / num_output_chunks, vocab_size)``.
            labels (torch.Tensor): Ground truth labels of shape ``(batch_size, num_tokens)``.
            reduction (str): ``"sum"`` returns the summed loss; any other value
                divides the sum by the number of non-ignored labels.
        Returns:
            torch.Tensor: The reduced cross entropy loss.
        """
        valid_label_count = (labels != self.ignore_index).sum()
        # split labels along the token axis, flattening each piece to 1-D
        label_pieces = labels.chunk(self.num_output_chunks, dim=1)
        flat_labels = [piece.reshape(-1) for piece in label_pieces]
        # flatten each logits chunk to (bsz * tokens_in_chunk, vocab)
        flat_logits = [piece.reshape(-1, piece.size(-1)) for piece in logits]
        # accumulate chunk by chunk so only one chunk is upcast at a time
        running_loss = 0.0
        for chunk_logits, chunk_labels in zip(flat_logits, flat_labels):
            running_loss += self.compute_cross_entropy(chunk_logits, chunk_labels)
        return running_loss if reduction == "sum" else running_loss / valid_label_count
 def _build_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100):
    """
    Create a CEWithChunkedOutputLoss whose per-chunk cross entropy is wrapped
    with torch.compile (inductor backend).
    """
    chunked_loss = CEWithChunkedOutputLoss(num_output_chunks, ignore_index)
    compiled_chunk_ce = torch.compile(
        chunked_loss.compute_cross_entropy, backend="inductor"
    )
    chunked_loss.compute_cross_entropy = compiled_chunk_ce
    return chunked_loss
 def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100):
    """
    Build a causal-LM loss function that computes cross entropy chunk by chunk.
    Returns:
        Callable taking ``(logits, labels, vocab_size, num_items_in_batch,
        ignore_index, shift_labels, **kwargs)`` and returning the loss tensor.
    """
    chunked_ce = _build_chunked_ce_loss_fn(num_output_chunks, ignore_index)
    def chunked_fix_cross_entropy(
        source,
        target,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):  # pylint: disable=unused-argument
        # With a known item count the loss is summed and divided by that count;
        # otherwise the loss module averages over non-ignored labels itself.
        normalize_by_items = num_items_in_batch is not None
        source_chunks = list(source.chunk(chunked_ce.num_output_chunks, dim=1))
        loss = chunked_ce(
            source_chunks,
            target,
            reduction="sum" if normalize_by_items else "mean",
        )
        return loss / num_items_in_batch if normalize_by_items else loss
    def for_causal_lm_chunked_loss(
        logits,
        labels,
        vocab_size: int = None,  # pylint: disable=unused-argument
        num_items_in_batch: Optional[int] = None,
        ignore_index: int = -100,
        shift_labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        # logits are not upcast here; the chunked loss upcasts one chunk at a time
        if shift_labels is None:
            # Shift so that tokens < n predict n
            padded_labels = F.pad(labels, (0, 1), value=ignore_index)
            shift_labels = padded_labels[..., 1:].contiguous()
        # tokens are deliberately left unflattened; move labels to the logits
        # device to support model parallelism
        device_labels = shift_labels.to(logits.device)
        return chunked_fix_cross_entropy(
            logits, device_labels, num_items_in_batch, ignore_index, **kwargs
        )
    return for_causal_lm_chunked_loss
 def patch_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100):
    """
    Replace transformers' causal-LM loss with the chunked cross-entropy variant,
    both as `ForCausalLMLoss` and as the "ForCausalLM" entry of `LOSS_MAPPING`.
    """
    import transformers.loss.loss_utils
    loss_utils = transformers.loss.loss_utils
    chunked_causal_lm_loss = get_causal_lm_loss(num_output_chunks, ignore_index)
    loss_utils.ForCausalLMLoss = chunked_causal_lm_loss
    loss_utils.LOSS_MAPPING["ForCausalLM"] = chunked_causal_lm_loss
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -18,8 +18,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "mixtral",
    "qwen2",
    "qwen2_moe",
    "qwen3",
    "qwen3_moe",
    "falcon",
    "phi",
    "phi3",
--- a/src/axolotl/monkeypatch/peft/init.py
+++ b/src/axolotl/monkeypatch/peft/init.py
--- a/src/axolotl/monkeypatch/peft/utils.py
+++ b/src/axolotl/monkeypatch/peft/utils.py
@@ -1,78 +0,0 @@
 """
 Patch prepare_model_for_kbit_training to not upcast everything
 """
 import inspect
 import logging
 import peft
 import axolotl
 from axolotl.monkeypatch.utils import detab_code
 LOG = logging.getLogger(__name__)
 # Source fragment searched for in peft's prepare_model_for_kbit_training; it
 # upcasts every fp16/bf16 parameter that is not a Params4bit to float32.
 # The text must match the library source exactly for the patch to apply.
 ORIGINAL_PREPARE_CODE = """
        for param in model.parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit":
                param.data = param.data.to(torch.float32)
 """
 # Replacement fragment: same loop over named parameters, additionally
 # requiring "norm" in the parameter name before upcasting.
 PATCHED_PREPARE_CODE = """
        for name, param in model.named_parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit" and "norm" in name:
                param.data = param.data.to(torch.float32)
 """
 def get_peft_prep_code() -> str:
    """
    Return the source text of peft's prepare_model_for_kbit_training.
    """
    return inspect.getsource(peft.utils.other.prepare_model_for_kbit_training)
 def check_peft_prep_code_is_patchable() -> bool:
    """
    Report whether the installed peft source still contains the fragment that
    the patch replaces.
    """
    detabbed_source, _ = detab_code(get_peft_prep_code())
    return ORIGINAL_PREPARE_CODE in detabbed_source
 def patch_peft_prep_code():
    """
    monkeypatch peft's prepare_model_for_kbit_training so it does not upcast everything
    The function source is fetched, ORIGINAL_PREPARE_CODE is swapped for
    PATCHED_PREPARE_CODE, the result is exec'd under a new name and installed on
    both `peft.utils.other` and `axolotl.utils.models`. Returns silently without
    patching when the source cannot be read or the expected fragment is absent.
    """
    try:
        prep_code = get_peft_prep_code()
    except OSError:
        # source could not be retrieved; leave peft untouched
        return
    # NOTE(review): the attribute name mentions create_accelerator_and_postprocess, but
    # the value stored is the source text of prepare_model_for_kbit_training
    peft.utils.other._original_create_accelerator_and_postprocess = (  # pylint: disable=protected-access
        prep_code
    )
    prep_code, _ = detab_code(prep_code)
    if ORIGINAL_PREPARE_CODE not in prep_code:
        # the installed peft no longer contains the expected fragment; nothing to patch
        return
    prep_code = prep_code.replace(ORIGINAL_PREPARE_CODE, PATCHED_PREPARE_CODE)
    # rename only the first occurrence, i.e. the def line, so the exec below defines a new function
    prep_code = prep_code.replace(
        "def prepare_model_for_kbit_training(",
        "def fixed_prepare_model_for_kbit_training(",
        1,
    )
    # collect every name from peft.utils.other that appears (as a substring) in the
    # patched source, so the exec'd function can resolve them from this module's globals
    items_to_import = []
    for item in dir(peft.utils.other):
        if item in prep_code:
            items_to_import.append(item)
    exec(  # pylint: disable=exec-used  # nosec B102
        "from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")",
        globals(),
    )
    # defines fixed_prepare_model_for_kbit_training in this module's globals
    exec(prep_code, globals())  # pylint: disable=exec-used  # nosec B102
    LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
    peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access  # pylint: disable=undefined-variable  # noqa: F821
    axolotl.utils.models.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access  # pylint: disable=undefined-variable  # noqa: F821
--- a/src/axolotl/monkeypatch/trainer/init.py
+++ b/src/axolotl/monkeypatch/trainer/init.py
--- a/src/axolotl/monkeypatch/trainer/lr.py
+++ b/src/axolotl/monkeypatch/trainer/lr.py
@@ -1,42 +0,0 @@
 """
 monkeypatch for Trainer _get_learning_rate method
 """
 import logging
 import torch
 LOG = logging.getLogger(__name__)
 # TODO remove this patch once https://github.com/huggingface/transformers/pull/37881 is included in a release
 def _get_learning_rate(self):
    """
    Return the current learning rate as a plain Python number.

    Without deepspeed the value comes from the first optimizer param group when the
    scheduler is a ReduceLROnPlateau, otherwise from the scheduler's get_last_lr().
    With deepspeed, a "need to call step" AssertionError from the scheduler is turned
    into lr=0 (with a warning); any other AssertionError propagates.
    """
    if not self.is_deepspeed_enabled:
        if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            last_lr = self.optimizer.param_groups[0]["lr"]
        else:
            last_lr = self.lr_scheduler.get_last_lr()[0]
    else:
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
        # not run for the first few dozen steps while loss scale is too large, so `get_last_lr`
        # can fail when called during that warm up stage
        try:
            last_lr = self.lr_scheduler.get_last_lr()[0]
        except AssertionError as err:
            if "need to call step" not in str(err):
                raise
            LOG.warning(
                "tried to get lr value before scheduler/optimizer started stepping, returning lr=0"
            )
            last_lr = 0
    # unwrap tensor-valued learning rates
    return last_lr.item() if torch.is_tensor(last_lr) else last_lr
 def patch_trainer_get_lr():
    """Replace ``Trainer._get_learning_rate`` with this module's ``_get_learning_rate``."""
    import transformers.trainer as hf_trainer

    hf_trainer.Trainer._get_learning_rate = (  # pylint: disable=protected-access
        _get_learning_rate
    )
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -4,7 +4,7 @@ HF Chat Templates prompt strategy
 import logging
 from collections import defaultdict
-from typing import Any, Dict, List, Set, Union
+from typing import Any, Dict, List, Optional, Set, Union
 from pydantic import BaseModel
 from transformers import ProcessorMixin
@@ -29,12 +29,11 @@ class ChatTemplatePrompter(Prompter):
        chat_template: str,
        processor=None,
        max_length=2048,
-        message_property_mappings: Dict[str, str] | None = None,
+        message_property_mappings: Optional[Dict[str, str]] = None,
-        message_field_training: str | None = None,
+        message_field_training: Optional[str] = None,
-        message_field_training_detail: str | None = None,
+        message_field_training_detail: Optional[str] = None,
        field_messages: str = "messages",
-        field_system: str = "system",
+        roles: Optional[Dict[str, List[str]]] = None,
        roles: Dict[str, List[str]] | None = None,
        drop_system_message: bool = False,
    ):
        # check if message_property_mappings is None or empty dict
@@ -42,7 +41,6 @@ class ChatTemplatePrompter(Prompter):
            message_property_mappings = {
                "role": "role",
                "content": "content",
                "reasoning_content": "reasoning_content",
            }
        if roles:
@@ -64,9 +62,8 @@ class ChatTemplatePrompter(Prompter):
        self.message_field_training = message_field_training
        self.message_field_training_detail = message_field_training_detail
        self.field_messages = field_messages
        self.field_system = field_system
        self.tokenizer = tokenizer
-        self.processor: ProcessorMixin | None = processor
+        self.processor: Optional[ProcessorMixin] = processor
        self.chat_template = chat_template
        self.max_length = max_length
        self.drop_system_message = drop_system_message
@@ -223,13 +220,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        self,
        prompter: "ChatTemplatePrompter",
        tokenizer,
-        train_on_inputs: bool,
+        train_on_inputs,
-        sequence_len: int,
+        sequence_len,
-        roles_to_train: list[str] | None = None,
+        roles_to_train=None,
-        train_on_eos: str | None = None,
+        train_on_eos=None,
        train_on_eot: str | None = None,
        eot_tokens: list[str] | None = None,
        split_thinking: bool | None = False,
    ):
        super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
        self.prompter: ChatTemplatePrompter = prompter
@@ -242,88 +236,12 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
            ]
        self.train_on_eos = train_on_eos
        # Backward compatibility, load from train_on_eos
        self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
        # Default to eos_token if eot_tokens not provided
        self.eot_tokens = (
            eot_tokens if eot_tokens is not None else [self.tokenizer.eos_token]
        )
        self.split_thinking = split_thinking
        self.images = "images"
        LOG.debug(
            f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}"
        )
        self._validate_eot_and_eos_tokens()
    def _validate_eot_and_eos_tokens(self):
        """
        - Validates that EOT tokens (or eos_token) are in the chat_template
        - Checks if EOT tokens are encoded as multiple tokens in the tokenizer.
        - Checks for potential conflicts between train_on_eos and train_on_eot.

        Side effect: ``self.eot_tokens`` is replaced by the subset of tokens that
        appear in the chat template (missing ones are logged and dropped).

        Raises:
            ValueError: if a kept EOT token encodes to zero or to several token ids,
                or if eos_token is among the EOT tokens while train_on_eos and
                train_on_eot differ.
        """
        if self.prompter.chat_template is None:
            # Usually this should not happen
            LOG.warning(
                "No chat template provided, skipping EOT and EOS token validation"
            )
            return
        # If the EOT token is the same as the EOS token, we need to check differently
        if len(self.eot_tokens) == 1 and self.eot_tokens[0] == self.tokenizer.eos_token:
            # Check if the eos_token is in the chat_template or as a variable `eos_token`
            # Note: we check for `eos_token` in the string, but it could possibly not be a variable
            if (
                self.tokenizer.eos_token not in self.prompter.chat_template
                and "eos_token" not in self.prompter.chat_template
            ):
                LOG.warning(
                    f"EOS token '{self.tokenizer.eos_token}' not found in chat_template. Please check if your template/EOS token is correct."
                )
            return
        # Keep only the tokens that actually occur in the chat template
        valid_eot_tokens = []
        for token in self.eot_tokens:
            if token not in self.prompter.chat_template:
                LOG.warning(f"EOT token '{token}' not found in chat_template.")
                continue
            valid_eot_tokens.append(token)
        # Replace the original list with the filtered one
        self.eot_tokens = valid_eot_tokens
        for token in self.eot_tokens:
            # Each remaining EOT token must encode to exactly one token id
            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
            if not token_ids:
                raise ValueError(
                    "EOT token encoding failed. Please check if the token is valid and can be encoded."
                )
            if len(token_ids) > 1:
                raise ValueError(
                    f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config "
                    "or (recommended) override unused added_tokens via `added_tokens_overrides: `."
                )
        # If eos_token is in eot_tokens and conflict between train_on_eos and train_on_eot, raise an error
        if (
            self.tokenizer.eos_token in self.eot_tokens
            and self.train_on_eos != self.train_on_eot
        ):
            # the message pieces are joined with explicit separators so the values stay readable
            raise ValueError(
                "Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot. "
                f"train_on_eos: {self.train_on_eos}, train_on_eot: {self.train_on_eot}, "
                f"eot_tokens: {self.eot_tokens}, "
                f"eos_token: {self.tokenizer.eos_token}"
            )
    @property
    def supports_batched(self) -> bool:
        # Let calling code know we can handle lists of examples
@@ -367,7 +285,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        if (
            not self.roles_to_train
            and not self.train_on_eos
            and not self.train_on_eot
            and not self.prompter.message_field_training  # type: ignore
            and not self.prompter.message_field_training_detail  # type: ignore
        ):
@@ -403,7 +320,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        labels = [IGNORE_TOKEN_ID] * len(input_ids)
        last_eos_idx = -1
        last_eot_idx = -1
        for index, turn in enumerate(turns):
            role = turn.get("role")
            content = turn.get("content")
@@ -452,46 +368,25 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                LOG.debug(f"Labels after processing turn {index}: {labels}")
-            # Handle special tokens (EOT and EOS)
+            # Handle EOS token
-            for token_type, find_func, train_option in [
+            eos_idx = self.find_first_eos_token(input_ids, start_idx=turn_end_idx)
-                ("EOT", self.find_first_eot_token, self.train_on_eot),
+            if abs(eos_idx - turn_end_idx) <= 3:  # Allow for some template padding
-                ("EOS", self.find_first_eos_token, self.train_on_eos),
+                last_eos_idx = eos_idx
-            ]:
+                if self.train_on_eos == "all" or (
-                token_idx = find_func(input_ids, start_idx=turn_end_idx)
+                    self.train_on_eos == "turn" and should_train
-
+                ):
-                if (
+                    labels[eos_idx] = input_ids[eos_idx]
-                    token_idx != -1 and abs(token_idx - turn_end_idx) <= 3
+                    LOG.debug(f"EOS token set for training at index {eos_idx}")
-                ):  # Allow for some template padding
+            else:
                    # Update the last token index
                    if token_type == "EOT":  # nosec B105
                        last_eot_idx = token_idx
                    else:
                        last_eos_idx = token_idx
                    # Set labels if needed for this turn
                    if train_option == "all" or (
                        train_option == "turn" and should_train
                    ):
                        labels[token_idx] = input_ids[token_idx]
                        LOG.debug(
                            f"{token_type} token set for training at index {token_idx}"
                        )
                else:
                    LOG.debug(
                        f"{token_type} token missing after turn {turn}. {token_type.lower()}_idx: {token_idx}, turn_end_idx: {turn_end_idx}"
                    )
        # Handle 'last' option for special tokens
        for token_type, last_idx, train_option in [
            ("EOT", last_eot_idx, self.train_on_eot),
            ("EOS", last_eos_idx, self.train_on_eos),
        ]:
            if train_option == "last" and last_idx != -1:
                labels[last_idx] = input_ids[last_idx]
                LOG.debug(
-                    f"Last {token_type} token set for training at index {last_idx}"
+                    f"EOS token missing after turn {turn}. eos_idx: {eos_idx}, turn_end_idx: {turn_end_idx}"
                )
        # Handle 'last' option for train_on_eos
        if self.train_on_eos == "last" and last_eos_idx != -1:
            labels[last_eos_idx] = input_ids[last_eos_idx]
            LOG.debug(f"Last EOS token set for training at index {last_eos_idx}")
        LOG.debug(f"Final labels: {labels}")
        return {
@@ -507,25 +402,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                return i
        return -1
    def find_first_eot_token(self, input_ids, start_idx):
        """Find the first EOT token in the input_ids starting from start_idx.

        Returns:
            The first index ``i >= start_idx`` whose id equals the single-token
            encoding of one of ``self.eot_tokens``, or -1 when there is none.

        Raises:
            ValueError: if an EOT token does not encode to exactly one token id.
        """
        # Get token IDs for all EOT tokens
        eot_token_ids = []
        for token in self.eot_tokens:
            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
            if len(token_ids) == 0:
                # report an empty encoding as such rather than as "multiple tokens"
                raise ValueError(
                    f"EOT token '{token}' could not be encoded to any token id. Please check if the token is valid."
                )
            if len(token_ids) > 1:
                raise ValueError(
                    f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config."
                )
            eot_token_ids.append(token_ids[0])  # exactly one id at this point
        # Search for any of the EOT token IDs
        for i in range(start_idx, len(input_ids)):
            if input_ids[i] in eot_token_ids:
                return i
        return -1
    def find_turn(self, turns: list[dict], turn_idx: int):
        """
        Locate the starting and ending indices of the specified turn in a conversation.
@@ -612,17 +488,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
    def get_conversation_thread(self, prompt):
        turns = []
        possible_sys_turn = self.transform_message(
            prompt[self.prompter.field_messages][0]
        )
        if (
            possible_sys_turn["role"] != "system"
            and self.prompter.field_system in prompt
        ):
            turn = {"role": "system", "content": prompt[self.prompter.field_system]}
            turns.append(turn)
        for message in prompt[self.prompter.field_messages]:
            transformed_message = self.transform_message(message)
@@ -658,52 +523,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                transformed_message["role"], transformed_message["role"]
            )
        # TODO handle reasoning_content with split_thinking
        # if the role is assistant that we want to use reasoning_content
        if self.split_thinking and transformed_message["role"] == "assistant":
            content = transformed_message["content"]
            thinking_pairs = [
                ("<think>", "</think>"),
                ("<reasoning>", "</reasoning>"),
                ("<|begin_of_thought|>", "<|end_of_thought|>"),
            ]
            content_pairs = [("<|begin_of_solution|>", "<|end_of_solution|>")]
            for tpair in thinking_pairs:
                # check if the thinking pair is in the content
                if tpair[0] in content and tpair[1] in content:
                    # find the start and end index of the thinking pair
                    t_start_idx = content.find(tpair[0])
                    t_end_idx = content.find(tpair[1])
                    # get the thinking content
                    thinking_content = content[t_start_idx + len(tpair[0]) : t_end_idx]
                    transformed_message["reasoning_content"] = thinking_content.strip()
                    # take remainder of the content
                    # strip whitespace from beginning of the remainder (thinking tokens)
                    remainder = content[t_end_idx + len(tpair[1]) :].lstrip()
                    # check if the content pair is in the remainder
                    cpair_found = False
                    for cpair in content_pairs:
                        if cpair[0] in remainder and cpair[1] in remainder:
                            # find the start and end index of the content pair
                            c_start_idx = remainder.find(cpair[0])
                            c_end_idx = remainder.find(cpair[1])
                            # get the content content
                            content_content = remainder[
                                c_start_idx + len(cpair[0]) : c_end_idx
                            ]
                            transformed_message["content"] = content_content.strip()
                            cpair_found = True
                            break
                    # else, the content is the remainder
                    if not cpair_found:
                        transformed_message["content"] = remainder
                    break
        # Determine which keys in the original message were not mapped
        mapped_values = set(self.prompter.message_property_mappings.values())
        remaining_keys = set(message) - mapped_values
@@ -736,16 +555,13 @@ class StrategyLoader:
            "sequence_len": cfg.sequence_len,
            "roles_to_train": ds_cfg.get("roles_to_train", ["assistant"]),
            "train_on_eos": ds_cfg.get("train_on_eos", "turn"),
            "train_on_eot": ds_cfg.get("train_on_eot", None),
            "eot_tokens": cfg.get("eot_tokens", None),  # loads from cfg, not ds_cfg
            "split_thinking": ds_cfg.get("split_thinking", False),
        }
    def __call__(
        self,
        tokenizer,
        cfg,
-        ds_cfg: Union[Dict[str, Any], DatasetConfig] | None = None,
+        ds_cfg: Optional[Union[Dict[str, Any], DatasetConfig]] = None,
        processor=None,
    ):
        if ds_cfg is None:
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -21,7 +21,6 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
@@ -30,7 +29,7 @@ from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuil
 from axolotl.core.trainers.mixins.sequence_parallel import (
    SequenceParallelContextManager,
 )
-from axolotl.integrations.base import PluginManager
+from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.freeze import freeze_layers_except
@@ -42,6 +41,7 @@ try:
 except ImportError:
    BetterTransformer = None
 configure_logging()
 LOG = get_logger(__name__)
@@ -295,23 +295,8 @@ def save_trained_model(
            trainer.model.save_pretrained(
                cfg.output_dir, safe_serialization=safe_serialization
            )
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
        # TODO: add integration support so this can be implemented completely within the plugin
        from axolotl.integrations.llm_compressor.utils import (
            save_compressed_model,
        )
        save_compressed_model(
            model=model,
            output_dir=cfg.output_dir,
            trainer=trainer,
            safe_serialization=safe_serialization,
            save_compressed=cfg.llmcompressor.save_compressed,
        )
 def create_model_card(cfg: DictDefault, trainer: Trainer):
    """
@@ -517,8 +502,6 @@ def train(
    Returns:
        Tuple of (model, tokenizer) after training
    """
    print_axolotl_text_art()
    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
    (
        trainer,
@@ -550,7 +533,4 @@ def train(
    if not cfg.use_ray:
        cleanup_distributed()
    plugin_manager = PluginManager.get_instance()
    plugin_manager.post_train(cfg, model)
    return model, tokenizer, trainer
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -43,12 +43,3 @@ def set_pytorch_cuda_alloc_conf():
            os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
                "expandable_segments:True,roundup_power2_divisions:16"
            )
 def patch_optimized_env():
    """
    Set environment defaults meant to improve VRAM usage and download speed:
    HF_HUB_ENABLE_HF_TRANSFER becomes "1" unless it is already set, then the
    PyTorch CUDA allocator config is applied via set_pytorch_cuda_alloc_conf
    """
    os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
    set_pytorch_cuda_alloc_conf()
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -3,7 +3,6 @@
 from __future__ import annotations
 import gc
 import json
 import logging
 import os
 import traceback
@@ -809,44 +808,11 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
                    artifact.add_file(temp_file.name)
                    wandb.log_artifact(artifact)
                    wandb.save(temp_file.name)
-                    LOG.info(
+                LOG.info(
-                        "The Axolotl config has been saved to the WandB run under files."
+                    "The Axolotl config has been saved to the WandB run under files."
-                    )
+                )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
            if args.deepspeed:
                try:
                    # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
                    with NamedTemporaryFile(
                        mode="w",
                        delete=False,
                        suffix=".json",
                        prefix="deepspeed_config_",
                    ) as temp_file:
                        skip_upload = False
                        if isinstance(args.deepspeed, dict):
                            json.dump(args.deepspeed, temp_file, indent=4)
                        elif isinstance(args.deepspeed, str) and os.path.exists(
                            args.deepspeed
                        ):
                            copyfile(args.deepspeed, temp_file.name)
                        else:
                            skip_upload = True
                        if not skip_upload:
                            artifact = wandb.Artifact(
                                f"deepspeed-config-{wandb.run.id}",
                                type="deepspeed-config",
                            )
                            artifact.add_file(temp_file.name)
                            wandb.log_artifact(artifact)
                            wandb.save(temp_file.name)
                            LOG.info(
                                "The DeepSpeed config has been saved to the WandB run under files."
                            )
                except (FileNotFoundError, ConnectionError) as err:
                    LOG.warning(f"Error while saving DeepSpeed config to WandB: {err}")
        return control
@@ -868,29 +834,3 @@ class GCCallback(TrainerCallback):
    ):
        torch.cuda.empty_cache()
        gc.collect()
 def colab_inference_post_train_callback(trainer: Trainer):
    """
    Build and return a TrainerCallback class (not an instance) whose on_train_end
    hook adjusts ``trainer.model`` for inference; the class closes over ``trainer``
    """
    class ColabCallback(TrainerCallback):
        """Callback to prep model for inference on Google Colab"""
        def __init__(self, cfg):
            self.gpu_name = torch.cuda.get_device_name(0)
            self.cfg = cfg
        def on_train_end(
            self, args, state, control, **kwargs
        ):  # pylint: disable=unused-argument
            """
            handle T4 gpu, we need to convert attention to eager for inference
            """
            # every adjustment is scoped to the T4 + xformers case described above
            if "Tesla T4" in self.gpu_name and self.cfg.xformers_attention:
                trainer.model.config._attn_implementation = (  # pylint: disable=protected-access
                    "eager"
                )
                trainer.model.gradient_checkpointing_disable()
                trainer.model.config.use_cache = True
                trainer.model.eval()
    return ColabCallback
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -59,7 +59,7 @@ def choose_device(cfg):
 def resolve_dtype(cfg):
    if (
-        not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
+        cfg.bf16 == "auto" and not cfg.use_ray
    ):  # if we use ray we want to defer this check to the worker node
        if is_torch_bf16_gpu_available():
            LOG.debug("bf16 support detected, enabling for this configuration.")
@@ -67,12 +67,9 @@ def resolve_dtype(cfg):
        else:
            LOG.debug("bf16 support not detected, disabling for this configuration.")
            cfg.bf16 = False
-            if cfg.fp16 is None and not cfg.float16:
+            if cfg.fp16 is None:
                cfg.fp16 = True
    if cfg.fp16 and cfg.bf16 == "auto":
        cfg.bf16 = False
    if cfg.device == "mps":
        cfg.load_in_8bit = False
        cfg.tf32 = False
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -204,37 +204,7 @@ def load_prepare_preference_datasets(cfg):
            else:
                eval_dataset = load_split(cfg.test_datasets, cfg)
        if not eval_dataset:
-            if cfg.val_set_size:
+            eval_dataset = None
                # ensure we end up with the same fingerprint by doing rank0 first and being able to cache
                to_hash_train = (
                    train_dataset._fingerprint  # pylint: disable=protected-access
                    + "|"
                    + str(cfg.val_set_size)
                    + "|"
                    + "train"
                    + "|"
                    + str(cfg.seed or 42)
                )
                to_hash_test = (
                    train_dataset._fingerprint  # pylint: disable=protected-access
                    + "|"
                    + str(cfg.val_set_size)
                    + "|"
                    + "test"
                    + "|"
                    + str(cfg.seed or 42)
                )
                train_fingerprint = md5(to_hash_train)
                test_fingerprint = md5(to_hash_test)
                ds_w_test_split = train_dataset.train_test_split(
                    test_size=cfg.val_set_size,
                    seed=cfg.seed,
                    shuffle=False,
                    train_new_fingerprint=train_fingerprint,
                    test_new_fingerprint=test_fingerprint,
                )
                eval_dataset = ds_w_test_split["test"]
                train_dataset = ds_w_test_split["train"]
        if not train_is_preprocessed:
            _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -69,27 +69,17 @@ def barrier():
        dist.barrier()
-def is_main_process(use_environ=False):
+def is_main_process():
    """
    Check if the current process is the main process. If not in distributed mode,
    always return `True`.
    Args:
    - use_environ (bool, optional): Use environment variable to determine main process.
    Returns:
    - bool: `True` if the current process is the main process, `False` otherwise.
    """
    if use_environ:
        return os.environ.get("LOCAL_RANK", "0") == "0"
    if not is_distributed():
        return True
    return dist.get_rank() == 0
-def is_local_main_process(use_environ=False):
+def is_local_main_process():
    if use_environ:
        return os.environ.get("LOCAL_RANK", "0") == "0"
    return PartialState().is_local_main_process
@@ -109,6 +99,17 @@ def cleanup_distributed():
        torch.distributed.destroy_process_group()
@contextmanager
 def zero_only():
    """
    Context manager intended to run the enclosed block on the main rank only.

    NOTE(review): both branches below yield (a bare ``yield`` and ``yield None``
    are equivalent), so the ``with`` body is entered on every rank; this context
    manager does not by itself skip the block on non-main ranks. Callers that
    need rank-0-only behavior should confirm they guard the body themselves.
    """
    if is_main_process():
        yield
    else:
        yield None
@contextmanager
 def zero_first(is_main):
    """
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -53,7 +53,6 @@ from transformers.integrations.deepspeed import (
 )
 from axolotl.common.architectures import MOE_ARCH_BLOCK
 from axolotl.integrations.base import PluginManager
 from axolotl.models.mamba import fix_mamba_attn_for_loss
 from axolotl.monkeypatch.multipack import (
    SUPPORTED_MULTIPACK_MODEL_TYPES,
@@ -68,14 +67,13 @@ from axolotl.utils.distributed import (
    get_device_count,
    get_device_type,
    is_local_main_process,
-    is_main_process,
+    zero_only,
 )
 from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrapper
 from axolotl.utils.lora_embeddings import get_linear_embedding_layers
 from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant
 LOG = logging.getLogger(__name__)
 PLUGIN_MANAGER = PluginManager.get_instance()
 MULTIMODAL_AUTO_MODEL_MAPPING = {
    "mllama": MllamaForConditionalGeneration,
@@ -141,22 +139,6 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
        hasattr(model_config, "quantization_config")
        and model_config.quantization_config
    )
    # Detect compressed-tensors config
    is_compressed_tensors_config = (
        quant_config_exists
        and model_config.quantization_config.get("quant_method") == "compressed-tensors"
    )
    if is_compressed_tensors_config:
        if model_config.quantization_config.get("config_groups"):
            LOG.warning(
                "Found `config_groups` in a compressed-tensors config. "
                "QAT integration with llmcompressor is not tested."
            )
        # Skip further quant checks for compressed-tensors
        return
    quant_config_method_is_gptq = (
        quant_config_exists
        and "quant_method" in model_config.quantization_config
@@ -453,7 +435,7 @@ def load_tokenizer(cfg):
            {"additional_special_tokens": additional_special_tokens}
        )
-    if is_main_process(use_environ=True):
+    with zero_only():
        LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
        LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
        LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
@@ -556,30 +538,11 @@ class ModelLoader:
        self.auto_model_loader = AutoModelForCausalLM  # pylint: disable=invalid-name
    def apply_patches(self) -> None:
        if self.cfg.xformers_attention and self.cfg.sample_packing:
            from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2
            patch_xformers_attn_over_fa2()
            self.cfg.flash_attention = True
        if self.cfg.chunked_cross_entropy:
            from axolotl.monkeypatch.loss.chunked import patch_chunked_ce_loss_fn
            if self.cfg.chunked_cross_entropy_num_chunks:
                patch_chunked_ce_loss_fn(self.cfg.chunked_cross_entropy_num_chunks)
            else:
                patch_chunked_ce_loss_fn()
        if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2":
            from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils
            patch_accelerate_fsdp_utils()
        if self.cfg.adapter:
            from axolotl.monkeypatch.peft.utils import patch_peft_prep_code
            patch_peft_prep_code()
        if self.cfg.flex_attention:
            from axolotl.monkeypatch.attention.flex_attn import (
                patch_flex_make_mask,
@@ -608,8 +571,10 @@ class ModelLoader:
            patch_gemma3conditionalgeneration_forward()
        # load any patches from plugins
        from axolotl.integrations.base import PluginManager
-        PLUGIN_MANAGER.pre_model_load(self.cfg)
+        plugin_manager = PluginManager.get_instance()
        plugin_manager.pre_model_load(self.cfg)
        # monkey patch to allow additional Accelerator init kwargs
        if self.cfg.fp8:
@@ -1199,7 +1164,7 @@ class ModelLoader:
                ],
            )
-    def prepare_model(self, qlora_fsdp: bool) -> None:
+    def prepare_model(self, qlora_fsdp) -> None:
        skip_prepare_model_for_kbit_training = False
        if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
            # Qwen doesn't play nicely with LoRA if this is enabled
@@ -1287,7 +1252,6 @@ class ModelLoader:
        try:
            skip_move_to_device = self.build_model(qlora_fsdp)
            PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
        except Exception as err:  # pylint: disable=broad-exception-caught
            LOG.exception(err)
            raise err
@@ -1328,7 +1292,7 @@ class ModelLoader:
        # make sure these are fp32 per Ramesh et al. (2021)
        embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)
-        if self.cfg.fsdp:
+        if not self.cfg.fsdp:
            # FSDP doesn't like mixed Float and BFloat16
            self.convert_embedding_modules_dtype(
                embedding_modules,
@@ -1367,8 +1331,6 @@ class ModelLoader:
                before_kbit_train_or_finetune=False,
            )
        PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
        # ---------------------------------------------------------
        #  load lora or adapter
        # ---------------------------------------------------------
@@ -1430,7 +1392,7 @@ class ModelLoader:
            gc.collect()
            torch.cuda.empty_cache()
-        PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
+        # TODO resume_from_checkpoint handling
        return self.model, lora_config
@@ -1465,13 +1427,9 @@ def load_adapter(model, cfg, adapter, inference=False):
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    if adapter in ["lora", "qlora"]:
-        model, lora_config = load_lora(model, cfg, inference=inference)
+        return load_lora(model, cfg, inference=inference)
        PLUGIN_MANAGER.post_lora_load(cfg, model)
        return model, lora_config
    if adapter == "llama-adapter":
-        model, lora_config = load_llama_adapter(model, cfg)
+        return load_llama_adapter(model, cfg)
        PLUGIN_MANAGER.post_lora_load(cfg, model)
        return model, lora_config
    raise NotImplementedError(f"{adapter} peft adapter not available")
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -1,13 +1,10 @@
 # pylint: skip-file
 """
-Multipack Batch Sampler - An efficient batch sampler for packing variable-length sequences
+Multipack Batch Sampler
 into fixed-capacity batches to optimize memory usage and training throughput.
 """
 import logging
 import math
-from concurrent.futures import ProcessPoolExecutor
+from typing import Any, Iterable, List, Union
 from multiprocessing import cpu_count
 from typing import Iterable, List, Union
 import numba
 import numpy as np
@@ -16,39 +13,26 @@ from torch.utils.data import BatchSampler, Sampler, SequentialSampler
 from axolotl.utils.distributed import reduce_and_broadcast
 LOG = logging.getLogger(__name__)
 LOG.setLevel(logging.INFO)
@numba.njit
-def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int):
+def ffd_check(a: np.ndarray, c: int, n: int):
-    """
+    # First-fit-decreasing bin packing
-    First-fit-decreasing bin packing algorithm check
+    # Check if a[] could fit in n bins with capacity c
    # https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
-    Checks if sequences with the given lengths could fit in the specified number of bins
+    a = np.sort(a)[::-1]
-
+    bins = np.full((n,), c, dtype=a.dtype)
-    Args:
+    for size in a:
        sequence_lengths: Array of sequence lengths
        bin_capacity: Maximum capacity of each bin
        num_bins: Number of bins available
    Returns:
        True if all sequences can be packed, False otherwise
    """
    # Sort sequence lengths in descending order for optimal packing
    sequence_lengths = np.sort(sequence_lengths)[::-1]
    # Initialize all bins with full capacity
    bins = np.full((num_bins,), bin_capacity, dtype=sequence_lengths.dtype)
    # Try to place each sequence in the first bin it fits
    for size in sequence_lengths:
        not_found = True
-        for idx in range(num_bins):
+        for idx in range(n):
            if bins[idx] >= size:
                bins[idx] -= size
                not_found = False
                break
        # If no bin could fit this sequence, packing failed
        if not_found:
            return False
@@ -56,380 +40,240 @@ def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int):
@numba.njit
-def pack_group(
+def ffd_with_result(a: np.ndarray, c: int, start_index: int):
-    sequence_lengths: np.ndarray,
+    # First-fit-decreasing bin packing (with result return)
    group_offset: int,
    bin_capacity: int,
    max_bins: int,
    bin_size: int,
    safe_mode: bool = True,
 ):
    """
    Pack a group of sequences into bins using First-Fit Decreasing algorithm
-    Args:
+    indices = np.argsort(a)[::-1]
-        sequence_lengths: Array of sequence lengths
+    a = a[indices]
        group_offset: Offset to apply to indices when returning results
        bin_capacity: Maximum capacity of each bin
        max_bins: Maximum number of bins to use
        bin_size: Maximum number of sequences per bin
        safe_mode: If True, use a more conservative packing approach
-    Returns:
+    bins: List[Any] = []
-        List of bins, where each bin contains indices of sequences assigned to it
+    bins_result: List[Any] = []
-    """
+    for a_id, size in enumerate(a):
-    # Get sorting indices and sort lengths in descending order
+        add_new = True
-    indices = np.argsort(sequence_lengths)[::-1]
+        for idx in range(len(bins)):
-    sorted_lengths = sequence_lengths[indices]
+            if bins[idx] >= size:
-
+                bins[idx] -= size
-    bins_remaining_space: list = []  # Tracks remaining capacity in each bin
+                bins_result[idx].append(indices[a_id] + start_index)
-    bins_assigned_sequences: list = []  # Tracks sequence indices assigned to each bin
+                add_new = False
    for seq_id, size in enumerate(sorted_lengths):
        global_idx = indices[seq_id] + group_offset
        # Try to place sequence in existing bins
        add_new_bin = True
        for bin_idx, _ in enumerate(bins_remaining_space):
            if (
                bins_remaining_space[bin_idx] >= size
                and len(bins_assigned_sequences[bin_idx]) < bin_size
            ):
                bins_remaining_space[bin_idx] -= size
                bins_assigned_sequences[bin_idx].append(global_idx)
                add_new_bin = False
                break
-        # Create a new bin if needed and if we haven't reached the limit
+        if add_new:
-        if add_new_bin:
+            bins.append(c - size)
-            if len(bins_remaining_space) >= max_bins and safe_mode:
+            bins_result.append([indices[a_id] + start_index])
                # In safe mode, skip items that would exceed max_bins
                continue
            bins_remaining_space.append(bin_capacity - size)
            bins_assigned_sequences.append([global_idx])
-            # Safety check to avoid infinite bins
+    return bins_result
            if len(bins_remaining_space) > len(sequence_lengths):
                break
    return bins_assigned_sequences
 # Define a standalone function for multiprocessing
 def _process_group(args):
    """
    Unpack one six-field task tuple and forward its fields to ``pack_group``

    Takes a single tuple argument so that it can be mapped over a list of
    tasks by an executor.

    Args:
        args: Tuple of (group lengths, start index, bin capacity, max bins,
            bin size, safe mode), in that order

    Returns:
        Whatever ``pack_group`` returns for those arguments
    """
    (
        lengths,
        offset,
        capacity,
        bin_limit,
        per_bin_limit,
        conservative,
    ) = args
    packed_bins = pack_group(
        lengths, offset, capacity, bin_limit, per_bin_limit, conservative
    )
    return packed_bins
 def pack_parallel(
    sequence_lengths: np.ndarray,
    bin_capacity: int,
    group_size: int,
    bin_size: int,
    num_processes: int | None = None,
    safe_mode: bool = True,
 ):
    """
    Pack sequences into bins using parallel processing
    Args:
        sequence_lengths: Array of sequence lengths
        bin_capacity: Maximum capacity of each bin as total number of tokens
        group_size: Number of sequences to process in each group
        bin_size: Maximum number of bins to use
        num_processes: Number of parallel processes to use
        safe_mode: If True, use a more conservative packing approach
    Returns:
        List of bins, where each bin contains indices of sequences assigned to it
    """
    num_items = len(sequence_lengths)
    if num_processes is None:
        num_processes = max(1, min(num_items // group_size, cpu_count()))
    # Create tasks for parallel processing
    tasks = []
    for i in range(0, num_items, group_size):
        group_lengths = sequence_lengths[i : i + group_size]
        max_bins = len(group_lengths)  # Allow as many bins as items in the group
        tasks.append((group_lengths, i, bin_capacity, max_bins, bin_size, safe_mode))
    # Process groups in parallel
    all_bins = []
    with ProcessPoolExecutor(max_workers=num_processes) as executor:
        for group_bins in executor.map(_process_group, tasks):
            all_bins.extend(group_bins)
    return all_bins
@numba.njit
-def allocate_sequentially(
+def allocate(
-    sequence_lengths: np.ndarray, rank: int, bin_capacity: int, num_ranks: int
+    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
 ):
    # Dynamic batch allocator, similar to Multifit
    # https://en.wikipedia.org/wiki/Multifit_algorithm
    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
    s = 0
    start_index = 0
    result = []
    while True:
        # binary search [l, r)
        left = 1
        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
        while right - left > 1:
            mid = (left + right) // 2
            if ffd_check(lengths[start_index : start_index + mid], c, n):
                left = mid
            else:
                right = mid
        # use length l
        batch = ffd_with_result(
            lengths[start_index : start_index + left], c, start_index
        )
        assert len(batch) <= n
        if len(batch) < n:
            break
        start_index += left
        s = lengths_cumsum[start_index - 1]
        # add local rank
        result.append(batch[rank])
    return result, s, len(result) * c * n
@numba.njit
 def allocate_sequentially(lengths: np.ndarray, rank: int, c: int, n: int):
    """
    Sequential allocator that preserves example order
    Parameters:
-        sequence_lengths: The lengths of all examples
+    - lengths: The lengths of all examples
-        rank: The current rank (for distributed training)
+    - rank: The current rank (for distributed training)
-        bin_capacity: The capacity of each bin (maximum sequence length)
+    - c: The capacity of each bin (maximum sequence length)
-        num_ranks: Number of ranks (processes/GPUs)
+    - n: Number of ranks
    Returns:
-        rank_batches: List of batches for the current rank
+    - result: List of batches for the current rank
-        total_tokens_used: Number of actual example tokens
+    - total_used: Number of actual example tokens
-        total_token_slots: Maximum theoretical number of example tokens (number of bins * bin capacity)
+    - total_slots: Maximum theoretical number of example tokens (number of bins * bin capacity)
    """
-    rank_batches = []
+    result = []
-    total_tokens_used = 0
+    total_used = 0
    # First, do sequential packing into bins
    all_bins = []
-    current_bin = []
+    current_bin = [0 for i in range(0)]  # numba hint
-    remaining_capacity = bin_capacity
+    remaining_capacity = c
-    # Process each sequence in order
+    for idx, size in enumerate(lengths):
    for idx, size in enumerate(sequence_lengths):
        if size <= remaining_capacity:
            # Example fits in current bin
            current_bin.append(idx)
            remaining_capacity -= size
-            total_tokens_used += size
+            total_used += size
        else:
            # Example doesn't fit, start a new bin
            if current_bin:  # Add non-empty bin to all_bins
                all_bins.append(current_bin)
            current_bin = [idx]
-            remaining_capacity = bin_capacity - size
+            remaining_capacity = c - size
-            total_tokens_used += size
+            total_used += size
    # Add the last bin if not empty
    if current_bin:
        all_bins.append(current_bin)
-    # Assign bins to ranks - each rank gets every num_ranks-th bin
+    # Assign bins to ranks - each rank gets every n-th bin
-    for bin_idx in range(rank, len(all_bins), num_ranks):
+    for bin_idx in range(rank, len(all_bins), n):
-        rank_batches.append(all_bins[bin_idx])
+        result.append(all_bins[bin_idx])
-    return rank_batches, total_tokens_used, len(all_bins) * bin_capacity
+    return result, total_used, len(all_bins) * c
 class MultipackBatchSampler(BatchSampler):
-    """
+    """Batch sampler class for multipack"""
    Batch sampler class for efficient packing of variable-length sequences
    This sampler packs sequences into fixed-capacity bins (batches) to maximize
    GPU memory utilization and training throughput by reducing padding.
    It supports both parallel packing (using FFD algorithm) and
    sequential packing (preserving original sequence order).
    """
    def __init__(
        self,
        sampler: Union[Sampler[int], Iterable[int]],
-        batch_size: int,  # Number of bins per batch
+        batch_size: int,
-        batch_max_len: int,  # Maximum sequence length (bin capacity)
+        batch_max_len: int,
-        lengths: np.ndarray,  # Sequence lengths
+        lengths: np.ndarray,
-        packing_efficiency_estimate: float = 1.0,  # Initial efficiency estimate
+        packing_efficiency_estimate: float = 1.0,
-        drop_last: bool = False,  # Whether to drop incomplete batches
+        drop_last: bool = False,
-        num_count_samples: int = 16,  # Number of samples to estimate batch count
+        num_count_samples: int = 16,
-        sequential: bool = False,  # Whether to use sequential packing
+        sequential: bool = False,
-        group_size: int = 100_000,  # Size of groups for parallel packing
+        **kwargs,
        bin_size: int = 200,  # The max number of samples that can be packed in a single bin
        num_processes: int | None = None,  # Number of processes for parallel packing
        safe_mode: bool = True,  # Conservative packing to prevent training instability
        **kwargs,  # pylint: disable=unused-argument
    ):
        super().__init__(sampler, batch_size, drop_last)
        self.batch_size = batch_size
        self.batch_max_len = batch_max_len
-        self.lengths = np.array(lengths, dtype=np.int32)
+        self.lengths: np.ndarray = lengths
        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
        self.sequential = sequential
        self.group_size = group_size
        self.bin_size = bin_size
        self.num_processes = num_processes
        self.safe_mode = safe_mode
        assert isinstance(self.lengths, np.ndarray)
        self.epoch = 0
-        # Efficiency statistics tracking
+        # statistics
-        self.total_tokens_used = 0
+        self.eff_total_used = 0
-        self.total_token_slots = 0
+        self.eff_total_slots = 0
-        # The number of times to calculate batches to determine minimum packed dataset length
+        # The number of times to calculate the batches to determine the minimum packed dataset length for the local rank
        self.num_count_samples = num_count_samples
-        # Minimum packed dataset length across all ranks (determined by gather/broadcast)
+        # the minimum packed dataset length across all ranks determined by a gather/broadcast
        self.len_across_ranks = None
        # Cache for batches
        self._batches = None
        if self.sequential and not isinstance(sampler, SequentialSampler):
-            LOG.warning(
+            LOG.warn(
                "using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
            )
    def set_epoch(self, epoch: int):
        """
        Record the current epoch number and drop any cached batches

        Args:
            epoch: Epoch number to store on the sampler
        """
        # Clear the cached batches so they are not reused for the new epoch
        self._batches = None
        self.epoch = epoch
    def generate_batches(self, set_stats=False):
-        """
+        indices = [idx for idx in self.sampler]
        Generate packed batches for training
        Args:
            set_stats: Whether to update efficiency statistics
        Returns:
            List of batches, where each batch contains multiple bins,
            and each bin contains multiple sequence indices
        """
        if self._batches is not None:
            return self._batches
        # Get indices from the sampler
        indices = [  # pylint: disable=unnecessary-comprehension
            idx for idx in self.sampler
        ]
        # Get lengths of the selected sequences
        lengths = self.lengths[indices]
        lengths_cumsum = np.cumsum(lengths)
        # Pack sequences into bins using either sequential or parallel packing
        if self.sequential:
-            bins, total_used, total_slots = allocate_sequentially(
+            batches, total_used, total_slots = allocate_sequentially(
-                lengths,
+                lengths=lengths,
                rank=0,
-                bin_capacity=self.batch_max_len,
+                c=self.batch_max_len,
-                num_ranks=1,
+                n=1,
            )
        else:
-            # Use parallel packing
+            batches, total_used, total_slots = allocate(
-            all_bins = pack_parallel(
+                lengths=lengths,
-                lengths,
+                lengths_cumsum=lengths_cumsum,
-                bin_capacity=self.batch_max_len,
+                rank=0,
-                group_size=self.group_size,
+                c=self.batch_max_len,
-                bin_size=self.bin_size,
+                n=1,
                num_processes=self.num_processes,
                safe_mode=self.safe_mode,
            )
            # Map bin indices back to original indices
            bins = [
                [indices[b_idx] for b_idx in bin_indices] for bin_indices in all_bins
            ]
            # Calculate efficiency statistics
            total_used = lengths.sum()
            total_slots = len(all_bins) * self.batch_max_len
        # Group bins into batches (each batch contains batch_size bins)
        batches = [
-            bins[i : i + self.batch_size] for i in range(0, len(bins), self.batch_size)
+            [
                [indices[b_idx] for b_idx in batch]
                for batch in batches[i : i + self.batch_size]
            ]
            for i in range(0, len(batches), self.batch_size)
        ]
-        # Drop last batch if requested and it's incomplete
+        # statistics
        if self.drop_last and len(batches[-1]) < self.batch_size:
            batches = batches[:-1]
            # Adjust total_slots if we dropped a batch
            if not self.sequential:
                total_slots -= (self.batch_size - len(batches[-1])) * self.batch_max_len
        # Update statistics if requested
        if set_stats:
-            self.total_tokens_used += total_used
+            self.eff_total_used += total_used
-            self.total_token_slots += total_slots
+            self.eff_total_slots += total_slots
        self._batches = batches
        return batches
    def __iter__(self):
        """
        Return an iterator over batches
        The batches are truncated to match the minimum number of batches across all ranks
        to ensure distributed training balance
        """
        batches = self.generate_batches(set_stats=True)
        if self.len_across_ranks:
-            # Truncate batches to ensure all ranks have the same number of batches
+            # make sure the batches we iterate over is truncated to the same min length across all ranks
            batches = batches[: self.len_across_ranks]
        return iter(batches)
    def num_batches(self):
        """
        Count the batches produced by ``generate_batches``

        The batches are requested with ``set_stats=True``.

        Returns:
            Number of batches in the generated list
        """
        return len(self.generate_batches(set_stats=True))
    def efficiency(self):
-        """
+        return self.eff_total_used / self.eff_total_slots
        Calculate the packing efficiency (ratio of tokens used to total token slots)
        Higher is better - 1.0 would mean perfect packing with no wasted space
        """
        if self.total_token_slots == 0:
            self.generate_batches(set_stats=True)
        if self.total_token_slots == 0:
            return 0.0
        # Return a Python float instead of potentially a numpy float
        return float(self.total_tokens_used / self.total_token_slots)
    def gather_efficiency(self):
        """
        Gather and synchronize packing efficiency estimates across all distributed ranks
        Returns a conservative efficiency estimate based on the measurements
        """
        def calc_sample_packing_eff_est(estimates: List[float]):
            LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}")
-            # Use 99.7% of max observed efficiency as a safe estimate
+            return math.floor(0.997 * max(estimates))
            max_eff = max(float(eff) for eff in estimates)
            return math.floor(0.997 * max_eff)
        # Gather efficiency from all ranks and apply the calculation function
        sample_packing_actual_eff_all = reduce_and_broadcast(
-            lambda: float(self.efficiency()),  # pylint: disable=unnecessary-lambda
+            lambda: self.efficiency(),  # pylint: disable=unnecessary-lambda
            calc_sample_packing_eff_est,
        )
        # Quantize to 0.5% intervals for stability
        sample_packing_eff_est = (
            math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0
        )
        return sample_packing_eff_est
    def gather_len_batches(self, num):
        """
        Gather and synchronize batch counts across all distributed ranks

        Args:
            num: Batch count for the local rank

        Returns:
            The floored minimum of the values handed to the reducer
            (presumably one value per rank; confirm against
            ``reduce_and_broadcast``)
        """

        def calc_min_len(estimates: List[Union[int, float]]):
            LOG.info(f"gather_len_batches: {repr(estimates)}")
            return math.floor(min(estimates))

        # Find minimum batch count across ranks to ensure balance
        return reduce_and_broadcast(lambda: num, calc_min_len)
    def __len__(self):
-        """
+        if not self.len_across_ranks:
-        Return the total number of batches that will be yielded by this sampler
+            len_batches = min(
-
+                [self.num_batches() for _ in range(self.num_count_samples)]
        This is calculated as the minimum number of batches available on any rank
        to ensure balanced distributed training
        """
        if self._batches is None:
            self._batches = self.generate_batches(set_stats=True)
        if self.len_across_ranks is None:
            # Sample multiple times to get stable estimate
            len_batches = min(  # pylint: disable=consider-using-generator
                [len(self._batches) for _ in range(self.num_count_samples)]
            )
            # Gather minimum across all ranks
            self.len_across_ranks = self.gather_len_batches(len_batches)
        return self.len_across_ranks
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -242,9 +242,6 @@ class AxolotlInputConfig(
    unsloth_rms_norm: bool | None = None
    unsloth_rope: bool | None = None
    chunked_cross_entropy: bool | None = None
    chunked_cross_entropy_num_chunks: int | None = None
    lora_mlp_kernel: bool | None = None
    lora_qkv_kernel: bool | None = None
    lora_o_kernel: bool | None = None
@@ -312,7 +309,6 @@ class AxolotlInputConfig(
        | Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")]
    ) | None = None
    chat_template_jinja: str | None = None
    eot_tokens: list[str] | None = None
    default_system_message: str | None = None
    fix_untrained_tokens: int | list[int] | None = None
@@ -438,6 +434,16 @@ class AxolotlInputConfig(
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_xformers(cls, data):
        """
        Reject configs that enable both sample packing and xformers attention

        Args:
            data: Raw config mapping, read via ``.get``

        Returns:
            ``data`` unchanged when the combination is not present

        Raises:
            ValueError: If both ``sample_packing`` and ``xformers_attention``
                are truthy
        """
        if not data.get("sample_packing"):
            return data
        if not data.get("xformers_attention"):
            return data
        raise ValueError(
            "sample_packing not compatible with xformers_attention. Use flash_attention"
        )
    @model_validator(mode="before")
    @classmethod
    # pylint: disable=duplicate-code
@@ -505,17 +511,10 @@ class AxolotlInputConfig(
    @model_validator(mode="before")
    @classmethod
    def hint_sample_packing_padding(cls, data):
-        if data.get("sample_packing"):
+        if data.get("sample_packing") and not data.get("pad_to_sequence_len"):
-            pad_to_sequence_len = data.get("pad_to_sequence_len")
+            LOG.warning(
-            if pad_to_sequence_len is False:
+                "`pad_to_sequence_len: true` is recommended when using sample_packing"
-                LOG.warning(
+            )
                    "`pad_to_sequence_len: true` is recommended when using sample_packing"
                )
            elif pad_to_sequence_len is None:
                LOG.info(
                    "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
                )
                data["pad_to_sequence_len"] = True
        return data
    @model_validator(mode="before")
@@ -1150,18 +1149,6 @@ class AxolotlInputConfig(
        return data
    @model_validator(mode="before")
    @classmethod
    def check_grpo_peft_liger(cls, data):
        """
        Reject GRPO configs that combine the Liger loss with a PEFT adapter

        Args:
            data: Raw config mapping, read via ``.get``

        Returns:
            ``data`` unchanged when the combination is not present

        Raises:
            ValueError: If ``rl`` is ``"grpo"``, ``trl.use_liger_loss`` is
                truthy and ``adapter`` is truthy
        """
        if data.get("rl") != "grpo":
            return data

        trl_settings = data.get("trl", {})
        if not trl_settings:
            return data
        if not trl_settings.get("use_liger_loss"):
            return data

        if not data.get("adapter"):
            return data
        raise ValueError("PEFT + GRPO + Liger is not yet supported")
    @model_validator(mode="after")
    def check_sequence_parallel_degree(self):
        if not self.sequence_parallel_degree:
@@ -1327,57 +1314,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
                    )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_auto_enable_lora_kernels(cls, data):
        """
        Turn on the three ``lora_*_kernel`` options when the config leaves them unset

        Nothing is changed when ``rl`` is set, when the adapter is neither
        ``lora`` nor ``qlora``, when any kernel option is already set, when any
        unsloth LoRA option is truthy, when a ``lora`` adapter is loaded in
        8-bit, or when running multi-GPU with ``fsdp`` set and an
        ``fsdp_version`` other than "2".

        Args:
            data: Raw config mapping, read via ``.get`` and updated in place

        Returns:
            ``data``, possibly with the kernel options set to True
        """
        # RL trainers not tested so don't enable kernels by default
        if data.get("rl"):
            return data

        # Only proceed if using LoRA or QLoRA adapter
        adapter = data.get("adapter")
        if adapter not in ("lora", "qlora"):
            return data

        kernel_fields = ("lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel")
        unsloth_fields = ("unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o")

        # Skip if already set, using unsloth optimizations, or using 8-bit
        if any(data.get(name) is not None for name in kernel_fields):
            return data
        if any(data.get(name) for name in unsloth_fields):
            return data
        if adapter == "lora" and data.get("load_in_8bit"):
            return data

        # Check multi-GPU compatibility
        capabilities = data.get("capabilities")
        multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
        uses_fsdp = data.get("fsdp") is not None
        fsdp_config = data.get("fsdp_config")
        uses_fsdp2 = (
            fsdp_config is not None and str(fsdp_config.get("fsdp_version")) == "2"
        )

        # Multi-GPU with fsdp set and a version other than "2" is left alone
        if multi_gpu and uses_fsdp and not uses_fsdp2:
            return data

        # Auto-enable kernels if not explicitly set by user
        for name in kernel_fields:
            if data.get(name) is None:
                data[name] = True
        LOG.warning(
            "Auto-enabling LoRA kernel optimizations for faster training. "
            + "Please explicitly set `lora_*_kernel` config values to `false` to disable. "
            + "See https://docs.axolotl.ai/docs/lora_optims.html for more info."
        )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_adopt_torch_version(cls, data):
--- a/src/axolotl/utils/schemas/datasets.py
+++ b/src/axolotl/utils/schemas/datasets.py
@@ -50,7 +50,6 @@ class SFTDataset(BaseModel):
    message_property_mappings: dict[str, str] | None = None
    message_field_training: str | None = None
    message_field_training_detail: str | None = None
    split_thinking: bool | None = None
    logprobs_field: str | None = None
    temperature: float | None = None
    roles_to_train: list[str] | None = None
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -35,7 +35,6 @@ class ChatTemplate(str, Enum):
    jamba = "jamba"  # pylint: disable=invalid-name
    jinja = "jinja"  # pylint: disable=invalid-name
    qwen_25 = "qwen_25"  # pylint: disable=invalid-name
    qwen3 = "qwen3"  # pylint: disable=invalid-name
    tokenizer_default = "tokenizer_default"  # pylint: disable=invalid-name
    exaone = "exaone"  # pylint: disable=invalid-name
    metharme = "metharme"  # pylint: disable=invalid-name
--- a/src/axolotl/utils/schemas/trl.py
+++ b/src/axolotl/utils/schemas/trl.py
@@ -67,12 +67,6 @@ class TRLConfig(BaseModel):
        default=False,
        json_schema_extra={"description": "Whether to log completions"},
    )
    num_completions_to_print: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of completions to print. If `log_completions` is `True`, this will be the number of completions logged."
        },
    )
    sync_ref_model: bool | None = Field(
        default=False,
        json_schema_extra={
@@ -139,25 +133,3 @@ class TRLConfig(BaseModel):
            "description": "Epsilon value for clipping in the GRPO algorithm."
        },
    )
    epsilon_high: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Upper-bound epsilon value for clipping in the GRPO algorithm."
        },
    )
    use_liger_loss: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Whether to use Liger loss for GRPO."},
    )
    loss_type: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Specifies the loss formulation to use. Supported values are `grpo`, `bnpo`, and `dr_grpo`."
        },
    )
    mask_truncated_completions: bool = Field(
        default=False,
        json_schema_extra={
            "description": "When enabled, truncated completions are excluded from the loss calculation."
        },
    )
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -597,8 +597,6 @@ def prepare_optim_env(cfg):
        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
    elif cfg.fp16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
    else:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
 def prepare_opinionated_env(cfg):
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -79,9 +79,9 @@ def download_smollm2_135m_model():
@pytest.fixture(scope="session", autouse=True)
-def download_smollm2_135m_gptq_model():
+def download_llama_68m_random_model():
    # download the model
-    snapshot_download_w_retry("lilmeaty/SmolLM2-135M-Instruct-GPTQ", repo_type="model")
+    snapshot_download_w_retry("JackFram/llama-68m", repo_type="model")
@pytest.fixture(scope="session", autouse=True)
@@ -90,12 +90,6 @@ def download_qwen_2_5_half_billion_model():
    snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")
@pytest.fixture(scope="session", autouse=True)
 def download_qwen3_half_billion_model():
    # download the model
    snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model")
@pytest.fixture(scope="session", autouse=True)
 def download_tatsu_lab_alpaca_dataset():
    # download the dataset
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -1,184 +0,0 @@
 """
 e2e tests to make sure all the hooks are fired on the plugin
 """
 import os
 from pathlib import Path
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.integrations.base import BasePlugin
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 from ..utils import check_model_output_exists
 class LogHooksPlugin(BasePlugin):
    """
    fixture to capture in a log file each hook that was fired

    Every hook appends its own name, as a single line, to
    ``base_dir / "plugin_hooks.log"`` so a test can read the file afterwards
    and check which hooks ran. The two ``add_callbacks_*`` hooks additionally
    return an empty list; every other hook returns ``None``.
    """

    base_dir = Path("/tmp/axolotl-log-hooks")

    def __init__(self):
        self.base_dir.mkdir(parents=True, exist_ok=True)
        # start from an empty log so entries left by an earlier run cannot
        # satisfy the assertions of the current one
        try:
            os.remove(self.base_dir.joinpath("plugin_hooks.log"))
        except FileNotFoundError:
            pass

    def _log_hook(self, hook_name):
        """Append ``hook_name`` followed by a newline to the hook log file."""
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write(f"{hook_name}\n")

    def pre_model_load(self, cfg):  # pylint: disable=unused-argument
        self._log_hook("pre_model_load")

    def post_model_build(self, cfg, model):  # pylint: disable=unused-argument
        self._log_hook("post_model_build")

    def pre_lora_load(self, cfg, model):  # pylint: disable=unused-argument
        self._log_hook("pre_lora_load")

    def post_lora_load(self, cfg, model):  # pylint: disable=unused-argument
        self._log_hook("post_lora_load")

    def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
        self._log_hook("post_model_load")

    def create_optimizer(self, cfg, trainer):  # pylint: disable=unused-argument
        self._log_hook("create_optimizer")

    def get_trainer_cls(self, cfg):  # pylint: disable=unused-argument
        self._log_hook("get_trainer_cls")

    def create_lr_scheduler(
        self, cfg, trainer, optimizer, num_training_steps
    ):  # pylint: disable=unused-argument
        self._log_hook("create_lr_scheduler")

    def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
        self._log_hook("add_callbacks_pre_trainer")
        # no extra callbacks to register; the hook only needs to be recorded
        return []

    def add_callbacks_post_trainer(
        self, cfg, trainer
    ):  # pylint: disable=unused-argument
        self._log_hook("add_callbacks_post_trainer")
        # no extra callbacks to register; the hook only needs to be recorded
        return []

    def post_train(self, cfg, model):  # pylint: disable=unused-argument
        self._log_hook("post_train")

    def post_train_unload(self, cfg):  # pylint: disable=unused-argument
        self._log_hook("post_train_unload")
 class TestPluginHooks:
    """
    e2e tests to make sure all the hooks are fired during the training
    """

    def test_plugin_hooks(self, temp_dir):
        """
        Run a short LoRA training with `LogHooksPlugin` registered and check
        that each expected hook wrote its name to the plugin's log file.
        """
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "plugins": [
                    "tests.e2e.integrations.test_hooks.LogHooksPlugin",
                ],
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "flash_attention": True,
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        # use the plugin's own directory so the path cannot drift from the writer
        log_path = LogHooksPlugin.base_dir / "plugin_hooks.log"
        try:
            with open(log_path, "r", encoding="utf-8") as f:
                # the plugin writes one hook name per line; compare whole lines
                # so that e.g. "post_train_unload" cannot stand in for "post_train"
                fired_hooks = set(f.read().splitlines())
            expected_hooks = [
                "pre_model_load",
                "post_model_build",
                "pre_lora_load",
                "post_lora_load",
                "post_model_load",
                # "create_optimizer",  # not implemented yet
                "get_trainer_cls",
                "create_lr_scheduler",
                "add_callbacks_pre_trainer",
                "add_callbacks_post_trainer",
                "post_train",
                # "post_train_unload",  # not called from test train call
            ]
            for hook_name in expected_hooks:
                assert hook_name in fired_hooks, f"hook {hook_name} was not fired"
        finally:
            # remove the log even when an assertion fails
            log_path.unlink(missing_ok=True)
--- a/tests/e2e/integrations/test_llm_compressor.py
+++ b/tests/e2e/integrations/test_llm_compressor.py
@@ -1,111 +0,0 @@
 """
 E2E smoke tests for LLMCompressorPlugin integration
 """
 from pathlib import Path
 import pytest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import (
    check_model_output_exists,
    require_llmcompressor,
    require_torch_2_4_1,
 )
 MODELS = [
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
    "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
 ]
@pytest.mark.parametrize(
    "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
)
@pytest.mark.parametrize(
    "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
)
class TestLLMCompressorIntegration:
    """
    End-to-end smoke test of axolotl.integrations.llm_compressor.LLMCompressorPlugin,
    run for every combination of base model and `save_compressed` setting.
    """

    @require_llmcompressor
    @require_torch_2_4_1
    def test_llmcompressor_plugin(
        self, temp_dir, base_model: str, save_compressed: bool
    ):
        """Train for a few steps with the plugin enabled, then check the saved outputs."""
        from llmcompressor import active_session

        # recipe: constant pruning applied to the projection weights from step 0
        pruning_modifier = {
            "targets": [
                "re:.*q_proj.weight",
                "re:.*k_proj.weight",
                "re:.*v_proj.weight",
                "re:.*o_proj.weight",
                "re:.*gate_proj.weight",
                "re:.*up_proj.weight",
                "re:.*down_proj.weight",
            ],
            "start": 0,
        }
        recipe = {
            "finetuning_stage": {
                "finetuning_modifiers": {
                    "ConstantPruningModifier": pruning_modifier,
                },
            },
        }
        llmcompressor_cfg = {
            "recipe": recipe,
            "save_compressed": save_compressed,
        }

        # core cfg
        cfg = DictDefault(
            {
                "base_model": base_model,
                "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 1e-5,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
                "llmcompressor": llmcompressor_cfg,
            }
        )

        # plugins are prepared before validation here, in that order
        prepare_plugins(cfg)
        cfg = validate_config(cfg)
        normalize_config(cfg)

        trainer_cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=trainer_cli_args)

        try:
            train(cfg=cfg, dataset_meta=dataset_meta)
            check_model_output_exists(temp_dir, cfg)
            _check_llmcompressor_model_outputs(temp_dir, save_compressed)
        finally:
            # always reset the llmcompressor session, whether or not the run passed
            active_session().reset()
 def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
    if save_compressed:
        assert (Path(temp_dir) / "recipe.yaml").exists()
        from compressed_tensors import ModelCompressor
        from compressed_tensors.config import Sparse24BitMaskConfig
        compressor = ModelCompressor.from_pretrained(temp_dir)
        assert compressor is not None
        assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -4,14 +4,11 @@ GRPO test suite
 import os
 import random
 import shutil
 import subprocess  # nosec B404
 import sys
 import tempfile
 import time
 from pathlib import Path
 import psutil
 import pytest
 import requests
 import yaml
@@ -24,8 +21,8 @@ from tests.e2e.utils import require_vllm
 def start_vllm(
-    model: str, env: dict, wait: int | None = None, quiet=False, **kwargs
+    model: str, env: dict | None = None, wait: int | None = None, quiet=False, **kwargs
-) -> subprocess.Popen:
+) -> int:
    """
    helper function to start the VLLM server in the background, mostly for testing purposes
    """
@@ -49,41 +46,10 @@ def start_vllm(
    # print out the command to be executed
    print(" ".join(cmd))
    vllm_logging_json = Path(tempfile.mkdtemp()) / "vllm_logging.json"
    with open(vllm_logging_json, "w", encoding="utf-8") as temp_file:
        temp_file.write(
            """{
  "formatters": {
    "json": {
      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
    }
  },
  "handlers": {
    "file": {
      "class": "logging.FileHandler",
      "formatter": "json",
      "level": "DEBUG",
      "filename": "/tmp/vllm.log",
      "mode": "a"
    }
  },
  "loggers": {
    "vllm": {
      "handlers": ["file"],
      "level": "DEBUG",
      "propagate": false
    }
  },
  "version": 1
 }"""
        )
    cmd_env = env.copy()
    cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json})
    # start `trl vllm-serve` command in the background and capture the process id
    process = subprocess.Popen(  # pylint: disable=consider-using-with
        cmd,
-        env=cmd_env,
+        env=env,
        stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
        stderr=subprocess.DEVNULL if quiet else subprocess.PIPE,
    )  # nosec B603
@@ -92,51 +58,32 @@ def start_vllm(
    print(f"VLLM server process started (PID: {process.pid})")
    # wait until the http server is ready (a 404 response also counts as ready), but time out after `wait` seconds
    period_seconds = 5
    started = False
    if wait and host and port:
-        for i in range(0, int(wait), period_seconds):
+        for _ in range(int(wait)):
            try:
                response = requests.get(f"http://{host}:{port}", timeout=1)
                print(f"{i}: VLLM server (status: {response.status_code})")
                if int(response.status_code) in [200, 404]:
                    started = True
                    break
-            except requests.exceptions.RequestException as exc:
+            except requests.exceptions.RequestException:
-                print(f"{i}: VLLM server failed to start: {str(exc)}")
+                pass
            # also check if the process.pid is still running
            if not process.poll() is None:
                break
-            time.sleep(period_seconds)
+            time.sleep(1)
    if wait and not started:
        print(
            f"VLLM server process did not start within {wait} seconds. Please check your server logs."
        )
-        recursive_kill(process)
+        process.kill()
        with open("/tmp/vllm.log", "r", encoding="utf-8") as log_file:
            print(log_file.read())
        shutil.rmtree("/tmp/vllm.log")
        raise RuntimeError(f"VLLM server process did not start within {wait} seconds.")
-    # return the process
+    # return the process id
-    return process
+    return process.pid
 def recursive_kill(process: subprocess.Popen):
    """
    Recursively kill a process and its children

    Descendants are killed before the parent. A process that has already
    exited is skipped instead of raising, so this is safe to call from a
    `finally` block or from an error path where the server may have died.

    :param process: handle of the parent process to kill
    """
    try:
        parent = psutil.Process(process.pid)
    except psutil.NoSuchProcess:
        # the parent is already gone, so there is no tree left to walk
        return

    try:
        targets = parent.children(recursive=True)
    except psutil.NoSuchProcess:
        targets = []
    # children first, parent last
    targets.append(parent)

    for target in targets:
        try:
            # go through the psutil handle rather than os.kill on the raw pid,
            # which could hit an unrelated process if the pid was recycled
            target.kill()
        except psutil.NoSuchProcess:
            # exited between enumeration and the kill; nothing left to do
            pass
 class TestGRPO:
@@ -227,17 +174,16 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
        current_env = os.environ.copy()
        env = {
-            "NCCL_P2P_LEVEL": "NVL",
+            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
-            "VLLM_DISABLE_COMPILE_CACHE": "1",
+            "VLLM_USE_V1": "0",
            # "VLLM_USE_V1": "0",
        }
-        vllm_process = start_vllm(
+        vllm_process_id = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
-            wait=300,
+            wait=120,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
@@ -256,14 +202,10 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
-                env={
+                env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
                    "NCCL_P2P_LEVEL": "NVL",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
-            recursive_kill(vllm_process)
+            os.kill(vllm_process_id, 9)
    @pytest.mark.parametrize(
        "num_gpus",
@@ -320,17 +262,16 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
        current_env = os.environ.copy()
        env = {
-            "NCCL_P2P_LEVEL": "NVL",  # nccl can be brittle, assume P2P isn't reliable
+            "NCCL_P2P_LEVEL": "LOC",  # nccl can be brittle, assume P2P isn't reliable
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
-            "VLLM_DISABLE_COMPILE_CACHE": "1",
+            "VLLM_USE_V1": "0",
            # "VLLM_USE_V1": "0",
        }
-        vllm_process = start_vllm(
+        vllm_process_id = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
-            wait=300,
+            wait=120,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
@@ -349,11 +290,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
-                env={
+                env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
                    "NCCL_P2P_LEVEL": "NVL",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
-            recursive_kill(vllm_process)
+            os.kill(vllm_process_id, 9)
--- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
+++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
@@ -2,19 +2,14 @@
 # pylint: disable=redefined-outer-name
 from pathlib import Path
 import pytest
 import torch
 import yaml
 from accelerate.state import PartialState
 from peft import PeftModelForCausalLM, get_peft_config
 from transformers import AutoModelForCausalLM, LlamaForCausalLM
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaAttention
 from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeAttention
 from axolotl.cli.config import load_cfg
 from axolotl.kernels.lora import (
    apply_lora_mlp_geglu,
    apply_lora_mlp_swiglu,
@@ -71,36 +66,29 @@ def small_llama_model():
    return LlamaForCausalLM(LlamaConfig(**config))
-@pytest.mark.parametrize(
+def test_attention_patching_integration():
    "model_name,attention_cls",
    [
        ("HuggingFaceTB/SmolLM2-135M", LlamaAttention),
        ("Qwen/Qwen3-30B-A3B", Qwen3MoeAttention),
    ],
 )
 def test_attention_patching_integration(model_name, attention_cls):
    """Test attention patching in integration context."""
-    cfg = {"base_model": model_name}
+    cfg = {"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
    # Store the original implementation
-    original_forward = getattr(attention_cls, "forward")
+    original_forward = getattr(LlamaAttention, "forward")
    # Apply patch
    patch_self_attn_lora(cfg)
    # Get the new forward method
-    patched_forward = attention_cls.forward
+    patched_forward = LlamaAttention.forward
    # Check the forward method was replaced
    assert original_forward is not patched_forward
    assert patched_forward.__name__ == "axolotl_attn_forward"
    # Check original implementation was stored
-    assert hasattr(attention_cls, "_original_forward")
+    assert hasattr(LlamaAttention, "_original_forward")
    # Clean up
-    setattr(attention_cls, "forward", original_forward)
+    setattr(LlamaAttention, "forward", original_forward)
-    delattr(attention_cls, "_original_forward")
+    delattr(LlamaAttention, "_original_forward")
 def test_swiglu_mlp_integration(small_llama_model):
@@ -425,42 +413,3 @@ def test_kernel_training_integration():
    # Verify correct activation function
    layer = model.model.model.layers[0]
    assert layer.mlp.forward.__func__ is apply_lora_mlp_swiglu
 def test_kernel_training_integration_auto_enable(temp_dir):
    """
    Check that `load_cfg` switches on the LoRA kernel options by itself.

    The config written to disk sets none of the `lora_*_kernel` keys; after
    loading, all three are expected to be `True`.
    """
    # Minimal LoRA config that leaves every kernel option unset
    minimal_cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.0,
            "lora_target_linear": True,
            "sequence_len": 1024,
        }
    )

    # Serialize the config to a yaml file inside the temp dir
    config_path = Path(temp_dir) / "config.yaml"
    serialized = yaml.dump(minimal_cfg.to_dict(), Dumper=yaml.Dumper)
    config_path.write_text(serialized, encoding="utf-8")

    # Round-trip through the regular config loader
    cfg = load_cfg(str(config_path))

    # The loader should have enabled every kernel option
    assert cfg.lora_mlp_kernel is True
    assert cfg.lora_qkv_kernel is True
    assert cfg.lora_o_kernel is True
--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -28,7 +28,7 @@ class Test4dMultipackLlama(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                "flash_attention": False,
                "sdp_attention": True,
                "sample_packing": True,
@@ -41,9 +41,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                "lora_target_linear": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
@@ -76,7 +73,7 @@ class Test4dMultipackLlama(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                "flash_attention": False,
                "sdp_attention": False,
                "sample_packing": True,
@@ -89,9 +86,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
--- a/tests/e2e/patched/test_fused_llama.py
+++ b/tests/e2e/patched/test_fused_llama.py
@@ -32,7 +32,7 @@ class TestFusedLlama(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                "flash_attention": True,
                "pad_to_sequence_len": True,
                "flash_attn_fuse_qkv": True,
@@ -41,7 +41,9 @@ class TestFusedLlama(unittest.TestCase):
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
--- a/tests/e2e/patched/test_llama_s2_attention.py
+++ b/tests/e2e/patched/test_llama_s2_attention.py
@@ -31,8 +31,8 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 16384,
                "sample_packing": False,
                "flash_attention": True,
@@ -44,9 +44,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "Yukang/LongAlpaca-12k",
@@ -80,16 +78,14 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 16384,
                "sample_packing": False,
                "flash_attention": True,
                "s2_attention": True,
                "val_set_size": 0.02,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "Yukang/LongAlpaca-12k",
--- a/tests/e2e/patched/test_lora_llama_multipack.py
+++ b/tests/e2e/patched/test_lora_llama_multipack.py
@@ -31,8 +31,8 @@ class TestLoraLlama(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
@@ -44,7 +44,9 @@ class TestLoraLlama(unittest.TestCase):
                "lora_target_linear": True,
                "val_set_size": 0.2,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
@@ -82,9 +84,9 @@ class TestLoraLlama(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "lilmeaty/SmolLM2-135M-Instruct-GPTQ",
+                "base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
                "model_type": "AutoModelForCausalLM",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
@@ -98,7 +100,9 @@ class TestLoraLlama(unittest.TestCase):
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
--- a/tests/e2e/test_dpo.py
+++ b/tests/e2e/test_dpo.py
@@ -31,8 +31,8 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -40,9 +40,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "datasets": [
                    {
@@ -79,8 +77,8 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -88,9 +86,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "rpo_alpha": 0.5,
                "datasets": [
@@ -128,8 +124,8 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -137,9 +133,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "dpo_use_weighting": True,
                "datasets": [
@@ -178,8 +172,8 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -187,9 +181,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "rl": "kto_pair",
                "datasets": [
                    {
@@ -226,8 +218,8 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -235,9 +227,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
                    "pad_token": "<|endoftext|>",
                },
                "rl": "ipo",
                "datasets": [
                    {
@@ -274,8 +264,8 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
@@ -283,9 +273,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
-                    "pad_token": "<|endoftext|>",
-                },
                "rl": "orpo",
                "orpo_alpha": 0.1,
                "remove_unused_columns": False,
@@ -326,7 +314,7 @@ class TestDPOLlamaLora(unittest.TestCase):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
@@ -335,9 +323,7 @@ class TestDPOLlamaLora(unittest.TestCase):
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
-                "special_tokens": {
+                "special_tokens": {},
-                    "pad_token": "<|endoftext|>",
-                },
                "rl": "kto",
                "rl_beta": 0.5,
                "kto_desirable_weight": 1.0,
--- a/tests/e2e/test_evaluate.py
+++ b/tests/e2e/test_evaluate.py
@@ -1,65 +0,0 @@
 """E2E smoke test for evaluate CLI command"""
 import os
 from pathlib import Path
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from transformers.testing_utils import get_torch_dist_unique_port
 from axolotl.utils.dict import DictDefault
 os.environ["WANDB_DISABLED"] = "true"
 class TestE2eEvaluate:
    """End-to-end smoke test for the ``axolotl.cli.evaluate`` command."""
    def test_evaluate(self, temp_dir):
        """Write a small config into ``temp_dir`` and run the evaluate CLI on it.

        The test builds a config for the ``JackFram/llama-68m`` model and the
        ``mhenrichsen/alpaca_2k_test`` dataset, dumps it to
        ``<temp_dir>/config.yaml``, and launches ``axolotl.cli.evaluate``
        through ``accelerate launch`` with two processes.

        There are no assertions on the command's output; the test only checks
        that the launch call returns.
        NOTE(review): presumably ``execute_subprocess_async`` raises when the
        subprocess fails -- confirm against ``accelerate.test_utils``.

        Args:
            temp_dir: directory path used both as the run's ``output_dir`` and
                as the location of the generated config file (presumably
                supplied by a pytest fixture -- TODO confirm).
        """
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
                "base_model": "JackFram/llama-68m",
                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
            }
        )
        # Serialize the config to <temp_dir>/config.yaml, creating the
        # directory first if it does not exist; the CLI receives this path.
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
        # Run `accelerate launch` with two processes; the main process port
        # comes from get_torch_dist_unique_port(), and the module to run is
        # axolotl.cli.evaluate with the config path as its only argument.
        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.evaluate",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -26,13 +26,15 @@ class TestLlama:
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                "trust_remote_code": True,
                "sequence_len": 512,
                "val_set_size": 0.02,
                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                },
                "datasets": [
                    {
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
NanoCode012	2b9a2dde4b	chore: update title	2025-04-26 16:21:31 -04:00
Wing Lian	388e950016	restore dockerfile	2025-04-26 16:21:30 -04:00
NanoCode012	fb4adbb311	fix: trim allowed cuda versions	2025-04-26 16:21:30 -04:00
Wing Lian	5e8abca54f	use axolotl cloud image as base and various fixes	2025-04-26 16:21:30 -04:00
Wing Lian	168ec339e5	chore: lint	2025-04-26 16:21:30 -04:00
zeke	cb7185998b	remove LICENSE and fix README	2025-04-26 16:21:30 -04:00
zeke	c2fc35f520	Add runpod sls handler	2025-04-26 16:21:30 -04:00
`@@ -4,4 +4,4 @@ import pkgutil`

	`__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package`	`__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package`

	`__version__ = "0.10.0.dev0"`	`__version__ = "0.8.0"`