Compare commits

fix/vllm-v...llmcompres

22 Commits

| Author | SHA1 | Date |
| --- | --- | --- |
|  | b708a1cc45 |  |
|  | daa9a58f83 |  |
|  | ae7069e15b |  |
|  | 20d48cd617 |  |
|  | e766a730ba |  |
|  | 7dc797860e |  |
|  | ff4904c8c4 |  |
|  | 45b7293793 |  |
|  | 279c7178bc |  |
|  | e73c3709f9 |  |
|  | 33562189f8 |  |
|  | c057a2268f |  |
|  | 9d7a3809b5 |  |
|  | b7b24d6a64 |  |
|  | 8b82b8f7a1 |  |
|  | 81da58c0a1 |  |
|  | 2cd5a234a7 |  |
|  | 8c1af0747d |  |
|  | a06b360d99 |  |
|  | 0f6456a14f |  |
|  | 47a333ce49 |  |
|  | f9d6776c28 |  |
**.github/workflows/base.yml** (6 lines changed)

```diff
@@ -22,6 +22,12 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.4.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
           - cuda: "124"
             cuda_version: 12.4.1
             cudnn_version: ""
```
**.github/workflows/main.yml** (15 lines changed)

```diff
@@ -18,8 +18,13 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.5.1
+            pytorch: 2.4.1
             axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.5.1
+            axolotl_extras: vllm
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -30,7 +35,7 @@ jobs:
             cuda_version: 12.6.3
             python_version: "3.11"
             pytorch: 2.7.0
-            axolotl_extras:
+            axolotl_extras: vllm
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
@@ -62,7 +67,6 @@ jobs:
             CUDA=${{ matrix.cuda }}
             PYTORCH_VERSION=${{ matrix.pytorch }}
             AXOLOTL_ARGS=${{ matrix.axolotl_args }}
-            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
           file: ./docker/Dockerfile
           push: ${{ github.event_name != 'pull_request' }}
           tags: |
@@ -78,6 +82,11 @@ jobs:
     strategy:
       matrix:
         include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
```
**.github/workflows/multi-gpu-e2e.yml** (11 lines changed)

```diff
@@ -8,8 +8,6 @@ on:
       - 'setup.py'
       - 'pyproject.toml'
       - '.github/workflows/multi-gpu-e2e.yml'
-      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
-      - 'src/axolotl/utils/distributed.py'
   workflow_dispatch:
   schedule:
     - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday
@@ -33,11 +31,18 @@ jobs:
             axolotl_extras: vllm
             num_gpus: 2
             nightly_build: "true"
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:  # no vllm support for 2.4.1
+            num_gpus: 2
+            nightly_build: "true"
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
             pytorch: 2.5.1
-            axolotl_extras:
+            axolotl_extras: vllm
             num_gpus: 2
             nightly_build: "true"
           - cuda: 126
```
**.github/workflows/nightlies.yml** (10 lines changed)

```diff
@@ -12,6 +12,11 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -65,6 +70,11 @@ jobs:
     strategy:
       matrix:
         include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
```
**.github/workflows/preview-docs.yml** (deleted, 61 lines)

```diff
@@ -1,61 +0,0 @@
-name: Preview
-on:
-  workflow_dispatch:
-  pull_request:
-    types: [opened, synchronize, reopened]
-
-    # Run the workflow only when one of these files changes
-    paths:
-      - '**/*.md'   # any Markdown file
-      - '**/*.qmd'  # any Quarto file
-      - '_quarto.yaml'
-
-permissions:
-  checks: write
-  contents: write
-  deployments: write
-  issues: write
-  discussions: write
-  pages: write
-  pull-requests: write
-  statuses: write
-
-jobs:
-  preview:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Check out repository
-        uses: actions/checkout@v4
-
-      - name: Set up Quarto
-        uses: quarto-dev/quarto-actions/setup@v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.11'
-
-      - name: Install dependencies
-        run: |
-          python3 -m pip install jupyter quartodoc
-          python3 -m pip install -e . --no-deps
-
-      - name: Build autodoc
-        run: quartodoc build
-
-      - name: Quarto render
-        run: quarto render
-
-      - name: Netlify Publish
-        uses: nwtgck/actions-netlify@v3.0
-        with:
-          publish-dir: './_site'
-          enable-pull-request-comment: true
-          enable-github-deployment: true
-          github-token: ${{ secrets.GITHUB_TOKEN }}
-          deploy-message: "Deployed On Netlify"
-          github-deployment-environment: 'preview'
-          github-deployment-description: 'Preview Deployment'
-        env:
-          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
-          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}
```
**.github/workflows/tests-nightly.yml** (9 lines changed)

```diff
@@ -26,7 +26,7 @@ jobs:
       max-parallel: 2
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
     timeout-minutes: 20
 
     steps:
@@ -106,6 +106,13 @@ jobs:
       fail-fast: false
       matrix:
         include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.4.1
+            num_gpus: 1
+            axolotl_extras:
+            nightly_build: "true"
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
```
**.github/workflows/tests.yml** (15 lines changed)

```diff
@@ -27,9 +27,6 @@ concurrency:
   group: ${{ github.workflow }}-${{ github.ref }}
   cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}
-
-env:
-  TRANSFORMERS_IS_CI: "yes"
 
 jobs:
   pre-commit:
     name: pre-commit
@@ -52,7 +49,7 @@ jobs:
       max-parallel: 2
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0", "2.7.0"]
     timeout-minutes: 20
 
     steps:
@@ -138,7 +135,7 @@ jobs:
       max-parallel: 1
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
+        pytorch_version: ["2.4.1", "2.5.1", "2.6.0"]
     timeout-minutes: 20
 
     steps:
@@ -261,12 +258,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: llmcompressor
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -278,7 +269,7 @@ jobs:
             python_version: "3.11"
             pytorch: 2.5.1
             num_gpus: 1
-            axolotl_extras:
+            axolotl_extras: vllm
           - cuda: 126
             cuda_version: 12.6.3
             python_version: "3.11"
```
**.runpod/.gitignore** (deleted, @@ -1,161 +0,0 @@)

```gitignore
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

pod/scripts/config.yaml
```
**Deleted file (@@ -1,18 +0,0 @@):**

```dockerfile
FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0

COPY .runpod/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip && \
    python3 -m pip install --upgrade -r /requirements.txt

# Environment settings
ARG BASE_VOLUME="/runpod-volume"
ENV BASE_VOLUME=$BASE_VOLUME
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"

COPY .runpod/src /src

WORKDIR /src
CMD ["python3", "/src/handler.py"]
```
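The image's `CMD` launches `/src/handler.py` from the copied `.runpod/src` directory. That handler is not shown in this compare, but a RunPod serverless handler generally follows the skeleton below; this is only a sketch, and the function body is hypothetical.

```python
import runpod


def handler(job):
    # job["input"] carries the JSON request body documented in the README below.
    args = job["input"].get("args", {})
    # ... hypothetical: validate credentials, write an axolotl config from
    # `args`, and launch the training run ...
    return {"status": "ok"}


# Registers the handler with the RunPod serverless worker loop.
runpod.serverless.start({"handler": handler})
```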
**Deleted file (@@ -1,335 +0,0 @@):**

<h1>LLM Post-Training: Full fine-tune, LoRA, QLoRA, etc. for Llama/Mistral/Gemma and more</h1>

# Configuration Options

This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.

## Usage

You can use these configuration options:

1. As a JSON request body:

```json
{
  "input": {
    "user_id": "user",
    "model_id": "model-name",
    "run_id": "run-id",
    "credentials": {
      "wandb_api_key": "", // add your Weights & Biases key. TODO: you will be able to set this in environment variables.
      "hf_token": "" // add your HF token. TODO: you will be able to set this in environment variables.
    },
    "args": {
      "base_model": "NousResearch/Llama-3.2-1B"
      // ... other options
    }
  }
}
```
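For reference, the same request body can be submitted programmatically. Below is a minimal sketch using the `runpod` Python SDK (this template pins `runpod~=1.7.0` in its requirements); the API key and endpoint ID are placeholders.

```python
import runpod

runpod.api_key = "YOUR_RUNPOD_API_KEY"          # placeholder
endpoint = runpod.Endpoint("YOUR_ENDPOINT_ID")  # placeholder

# Mirrors the JSON request body shown above.
job = endpoint.run({
    "input": {
        "user_id": "user",
        "model_id": "model-name",
        "run_id": "run-id",
        "credentials": {"wandb_api_key": "", "hf_token": ""},
        "args": {"base_model": "NousResearch/Llama-3.2-1B"},
    }
})
print(job.status())  # e.g. IN_QUEUE, then IN_PROGRESS / COMPLETED
```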
## Configuration Options

### Model Configuration

| Option | Description | Default |
| --- | --- | --- |
| `base_model` | Path to the base model (local or HuggingFace) | Required |
| `base_model_config` | Configuration path for the base model | Same as `base_model` |
| `revision_of_model` | Specific model revision from HuggingFace hub | Latest |
| `tokenizer_config` | Custom tokenizer configuration path | Optional |
| `model_type` | Type of model to load | AutoModelForCausalLM |
| `tokenizer_type` | Type of tokenizer to use | AutoTokenizer |
| `hub_model_id` | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional |

### Model Family Identification

| Option | Default | Description |
| --- | --- | --- |
| `is_falcon_derived_model` | `false` | Whether model is Falcon-based |
| `is_llama_derived_model` | `false` | Whether model is LLaMA-based |
| `is_qwen_derived_model` | `false` | Whether model is Qwen-based |
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |

### Model Configuration Overrides

| Option | Default | Description |
| --- | --- | --- |
| `overrides_of_model_config.rope_scaling.type` | `"linear"` | RoPE scaling type (linear/dynamic) |
| `overrides_of_model_config.rope_scaling.factor` | `1.0` | RoPE scaling factor |
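For intuition, this override corresponds to the `rope_scaling` entry of a Hugging Face model config. A sketch, assuming a transformers LLaMA-style config; the factor value is illustrative:

```python
from transformers import AutoConfig

# Linear RoPE scaling with factor 2.0 roughly doubles the usable context
# window relative to the base rotary embedding the model was trained with.
config = AutoConfig.from_pretrained(
    "NousResearch/Llama-3.2-1B",
    rope_scaling={"type": "linear", "factor": 2.0},
)
print(config.rope_scaling)
```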
### Model Loading Options

| Option | Description | Default |
| --- | --- | --- |
| `load_in_8bit` | Load model in 8-bit precision | false |
| `load_in_4bit` | Load model in 4-bit precision | false |
| `bf16` | Use bfloat16 precision | false |
| `fp16` | Use float16 precision | false |
| `tf32` | Use TensorFloat-32 precision | false |

### Memory and Device Settings

| Option | Default | Description |
| --- | --- | --- |
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit |
| `lora_on_cpu` | `false` | Load LoRA on CPU |
| `device_map` | `"auto"` | Device mapping strategy |
| `max_memory` | `null` | Max memory per device |

### Training Hyperparameters

| Option | Default | Description |
| --- | --- | --- |
| `gradient_accumulation_steps` | `1` | Gradient accumulation steps |
| `micro_batch_size` | `2` | Batch size per GPU |
| `eval_batch_size` | `null` | Evaluation batch size |
| `num_epochs` | `4` | Number of training epochs |
| `warmup_steps` | `100` | Warmup steps |
| `warmup_ratio` | `0.05` | Warmup ratio |
| `learning_rate` | `0.00003` | Learning rate |
| `lr_quadratic_warmup` | `false` | Quadratic warmup |
| `logging_steps` | `null` | Logging frequency |
| `eval_steps` | `null` | Evaluation frequency |
| `evals_per_epoch` | `null` | Evaluations per epoch |
| `save_strategy` | `"epoch"` | Checkpoint saving strategy |
| `save_steps` | `null` | Saving frequency |
| `saves_per_epoch` | `null` | Saves per epoch |
| `save_total_limit` | `null` | Maximum checkpoints to keep |
| `max_steps` | `null` | Maximum training steps |
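Note that `micro_batch_size` and `gradient_accumulation_steps` combine multiplicatively. A quick sanity check of the effective global batch size (generic arithmetic, not specific to this template):

```python
micro_batch_size = 2             # samples per GPU per forward/backward pass
gradient_accumulation_steps = 4  # passes accumulated before each optimizer step
num_gpus = 1

# Samples contributing to each optimizer update:
effective_batch_size = micro_batch_size * gradient_accumulation_steps * num_gpus
print(effective_batch_size)  # 8
```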
### Dataset Configuration

```yaml
datasets:
  - path: vicgalle/alpaca-gpt4   # HuggingFace dataset. TODO: you will be able to use a local path.
    type: alpaca                 # Format type (alpaca, gpteacher, oasst, etc.)
    ds_type: json                # Dataset type
    data_files: path/to/data     # Source data files
    train_on_split: train        # Dataset split to use
```
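For context, an `alpaca`-type dataset is a collection of records with `instruction`, `input`, and `output` fields. A generic illustration of one record (not an actual row from the referenced dataset):

```python
# One training record in the alpaca format selected by `type: alpaca`.
example_record = {
    "instruction": "Summarize the following paragraph.",
    "input": "Axolotl is a tool that streamlines fine-tuning of language models...",
    "output": "Axolotl simplifies LLM fine-tuning.",
}
```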
### Chat Template Settings

| Option | Default | Description |
| --- | --- | --- |
| `chat_template` | `"tokenizer_default"` | Chat template type |
| `chat_template_jinja` | `null` | Custom Jinja template |
| `default_system_message` | `"You are a helpful assistant."` | Default system message |

### Dataset Processing

| Option | Default | Description |
| --- | --- | --- |
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
| `dataset_processes` | `4` | Number of preprocessing processes |
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
### LoRA Configuration

| Option | Default | Description |
| --- | --- | --- |
| `adapter` | `"lora"` | Adapter type (lora/qlora) |
| `lora_model_dir` | `""` | Directory with pretrained LoRA |
| `lora_r` | `8` | LoRA attention dimension |
| `lora_alpha` | `16` | LoRA alpha parameter |
| `lora_dropout` | `0.05` | LoRA dropout |
| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA |
| `lora_target_linear` | `false` | Target all linear modules |
| `peft_layers_to_transform` | `[]` | Layers to transform |
| `lora_modules_to_save` | `[]` | Modules to save |
| `lora_fan_in_fan_out` | `false` | Fan in/out structure |
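These options map closely onto a PEFT `LoraConfig`, which Axolotl builds internally. A minimal sketch using the defaults from the table above (assumes the `peft` library):

```python
from peft import LoraConfig

# Mirrors the defaults listed in the LoRA Configuration table.
lora_config = LoraConfig(
    r=8,                                  # lora_r
    lora_alpha=16,                        # lora_alpha
    lora_dropout=0.05,                    # lora_dropout
    target_modules=["q_proj", "v_proj"],  # lora_target_modules
    fan_in_fan_out=False,                 # lora_fan_in_fan_out
    task_type="CAUSAL_LM",
)
```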
### Optimization Settings

| Option | Default | Description |
| --- | --- | --- |
| `train_on_inputs` | `false` | Train on input prompts |
| `group_by_length` | `false` | Group by sequence length |
| `gradient_checkpointing` | `false` | Use gradient checkpointing |
| `early_stopping_patience` | `3` | Early stopping patience |

### Learning Rate Scheduling

| Option | Default | Description |
| --- | --- | --- |
| `lr_scheduler` | `"cosine"` | Scheduler type |
| `lr_scheduler_kwargs` | `{}` | Scheduler parameters |
| `cosine_min_lr_ratio` | `null` | Minimum LR ratio |
| `cosine_constant_lr_ratio` | `null` | Constant LR ratio |
| `lr_div_factor` | `null` | LR division factor |

### Optimizer Settings

| Option | Default | Description |
| --- | --- | --- |
| `optimizer` | `"adamw_hf"` | Optimizer choice |
| `optim_args` | `{}` | Optimizer arguments |
| `optim_target_modules` | `[]` | Target modules |
| `weight_decay` | `null` | Weight decay |
| `adam_beta1` | `null` | Adam beta1 |
| `adam_beta2` | `null` | Adam beta2 |
| `adam_epsilon` | `null` | Adam epsilon |
| `max_grad_norm` | `null` | Gradient clipping |

### Attention Implementations

| Option | Default | Description |
| --- | --- | --- |
| `flash_optimum` | `false` | Use BetterTransformers |
| `xformers_attention` | `false` | Use xFormers |
| `flash_attention` | `false` | Use Flash Attention |
| `flash_attn_cross_entropy` | `false` | Flash Attention cross entropy |
| `flash_attn_rms_norm` | `false` | Flash Attention RMS norm |
| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations |
| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations |
| `sdp_attention` | `false` | Use scaled dot-product attention |
| `s2_attention` | `false` | Use shifted sparse attention |

### Tokenizer Modifications

| Option | Default | Description |
| --- | --- | --- |
| `special_tokens` | - | Special tokens to add/modify |
| `tokens` | `[]` | Additional tokens |

### Distributed Training

| Option | Default | Description |
| --- | --- | --- |
| `fsdp` | `null` | FSDP configuration |
| `fsdp_config` | `null` | FSDP config options |
| `deepspeed` | `null` | Deepspeed config path |
| `ddp_timeout` | `null` | DDP timeout |
| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity |
| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers |

<details>
<summary><h3>Example Configuration Request</h3></summary>

Here's a complete example for fine-tuning a LLaMA model using LoRA:

```json
{
  "input": {
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "test-run",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "NousResearch/Llama-3.2-1B",
      "load_in_8bit": false,
      "load_in_4bit": false,
      "strict": false,
      "datasets": [
        {
          "path": "teknium/GPT4-LLM-Cleaned",
          "type": "alpaca"
        }
      ],
      "dataset_prepared_path": "last_run_prepared",
      "val_set_size": 0.1,
      "output_dir": "./outputs/lora-out",
      "adapter": "lora",
      "sequence_len": 2048,
      "sample_packing": true,
      "eval_sample_packing": true,
      "pad_to_sequence_len": true,
      "lora_r": 16,
      "lora_alpha": 32,
      "lora_dropout": 0.05,
      "lora_target_modules": [
        "gate_proj",
        "down_proj",
        "up_proj",
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj"
      ],
      "gradient_accumulation_steps": 2,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_8bit",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": false,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "loss_watchdog_threshold": 5,
      "loss_watchdog_patience": 3,
      "warmup_steps": 10,
      "evals_per_epoch": 4,
      "saves_per_epoch": 1,
      "weight_decay": 0,
      "hub_model_id": "runpod/llama-fr-lora",
      "wandb_name": "test-run-1",
      "wandb_project": "test-run-1",
      "wandb_entity": "axo-test",
      "special_tokens": {
        "pad_token": "<|end_of_text|>"
      }
    }
  }
}
```

</details>
### Advanced Features

#### Wandb Integration

- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run

#### Performance Optimization

- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use the Flash Attention implementation
- `xformers_attention`: Use the xFormers attention implementation

### Available Optimizers

The following optimizers are supported:

- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer

## Notes

- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory, as sketched below
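Putting those tips together, a hypothetical `args` fragment for a memory-constrained GPU; the key names follow the tables in this document, and the values are illustrative:

```python
# Hypothetical request fragment combining the memory-saving tips above.
memory_efficient_args = {
    "load_in_4bit": True,              # 4-bit base weights (QLoRA-style)
    "flash_attention": True,           # faster attention on modern GPUs
    "gradient_checkpointing": True,    # recompute activations to save memory
    "micro_batch_size": 1,
    "gradient_accumulation_steps": 8,  # keeps the effective batch size at 8
}
```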
For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).

### Errors

- If you face any issues with Flash Attention 2, delete your worker and restart.
**Deleted file (@@ -1,93 +0,0 @@):**

```json
{
  "title": "Axolotl Fine-Tuning",
  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
  "type": "serverless",
  "category": "language",
  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
  "config": {
    "runsOn": "GPU",
    "containerDiskInGb": 200,
    "gpuCount": 1,
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ],
    "presets": [],
    "env": [
      {
        "key": "TOKENIZER",
        "input": {
          "name": "Tokenizer",
          "type": "string",
          "description": "Name or path of the Hugging Face tokenizer to use.",
          "default": "",
          "advanced": true
        }
      },
      {
        "key": "MAX_NUM_SEQS",
        "input": {
          "name": "Max Num Seqs",
          "type": "number",
          "description": "Maximum number of sequences per iteration.",
          "default": 256,
          "advanced": true
        }
      },
      {
        "key": "DISABLE_LOG_STATS",
        "input": {
          "name": "Disable Log Stats",
          "type": "boolean",
          "description": "Disable logging statistics.",
          "default": false,
          "trueValue": "true",
          "falseValue": "false"
        }
      },
      {
        "key": "LOAD_FORMAT",
        "input": {
          "name": "Load Format",
          "type": "string",
          "description": "The format of the model weights to load.",
          "default": "auto",
          "options": [
            { "label": "auto", "value": "auto" },
            { "label": "pt", "value": "pt" },
            { "label": "safetensors", "value": "safetensors" },
            { "label": "npcache", "value": "npcache" },
            { "label": "dummy", "value": "dummy" },
            { "label": "tensorizer", "value": "tensorizer" },
            { "label": "bitsandbytes", "value": "bitsandbytes" }
          ],
          "advanced": true
        }
      }
    ]
  }
}
```
**Deleted file (@@ -1,7 +0,0 @@):**

```text
# Required Python packages get listed here, one per line.
# Recommended to lock the version number to avoid unexpected changes.

# You can also install packages from a git repository, e.g.:
#    git+https://github.com/runpod/runpod-python.git
# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
runpod~=1.7.0
```
**Deleted file (@@ -1,577 +0,0 @@):**

```yaml
# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# # This can also be a relative path to a model on disk
# base_model: ./llama-7b-hf
# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
# base_model_ignore_patterns:
# # If the base_model repo on hf hub doesn't include configuration .json files,
# # You can set that here, or leave this empty to default to base_model
# base_model_config: ./llama-7b-hf
# # You can specify to choose a specific model revision from huggingface hub
# model_revision:
# # Optional tokenizer configuration override in case you want to use a different tokenizer
# # than the one defined in the base model
# tokenizer_config:
# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
# model_type: AutoModelForCausalLM
# # Corresponding tokenizer for the model AutoTokenizer is a good choice
# tokenizer_type: AutoTokenizer
# # Trust remote code for untrusted source
# trust_remote_code:
# # use_fast option for tokenizer loading from_pretrained, default to True
# tokenizer_use_fast:
# # Whether to use the legacy tokenizer setting, defaults to True
# tokenizer_legacy:
# # Resize the model embeddings when new tokens are added to multiples of 32
# # This is reported to improve training speed on some models
# resize_token_embeddings_to_32x:

# # Used to identify which family the model is based on
# is_falcon_derived_model:
# is_llama_derived_model:
# # Please note that if you set this to true, `padding_side` will be set to "left" by default
# is_mistral_derived_model:
# is_qwen_derived_model:

# # optional overrides to the base model configuration
# model_config:
#   # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
#   rope_scaling:
#     type: # linear | dynamic
#     factor: # float

# # Whether you are training a 4-bit GPTQ quantized model
# gptq: true
# gptq_groupsize: 128 # group size
# gptq_model_v1: false # v1 or v2

# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
# load_in_8bit: true
# # Use bitsandbytes 4 bit
# load_in_4bit:

# # Use CUDA bf16
# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# # Use CUDA fp16
# fp16: true
# # Use CUDA tf32
# tf32: true # require >=ampere

# # No AMP (automatic mixed precision)
# bfloat16: true # require >=ampere
# float16: true

# # A list of one or more datasets to finetune the model with
# datasets:
#   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
#   - path: vicgalle/alpaca-gpt4
#     # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
#     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
#     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
#     data_files: # Optional[str] path to source data files
#     shards: # Optional[int] number of shards to split data into
#     name: # Optional[str] name of dataset configuration to load
#     train_on_split: train # Optional[str] name of dataset split to load from

#     # Optional[str] fastchat conversation type, only used with type: sharegpt
#     conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
#     field_human: # Optional[str]. Human key to use for conversation.
#     field_model: # Optional[str]. Assistant key to use for conversation.

#   # Custom user prompt
#   - path: repo
#     type:
#       # The below are defaults. only set what's needed.
#       system_prompt: ""
#       system_format: "{system}"
#       field_system: system
#       field_instruction: instruction
#       field_input: input
#       field_output: output

#       # Customizable to be single line or multi-line
#       # 'format' can include {input}
#       format: |-
#         User: {instruction} {input}
#         Assistant:
#       # 'no_input_format' cannot include {input}
#       no_input_format: "{instruction} "

#     # For `completion` datasets only, uses the provided field instead of `text` column
#     field:

# # Axolotl attempts to save the dataset as an arrow after packing the data together so
# # subsequent training attempts load faster, relative path
# dataset_prepared_path: data/last_run_prepared
# # Push prepared dataset to hub
# push_dataset_to_hub: # repo path
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# # if not set.
# dataset_processes: # defaults to os.cpu_count() if not set
# # push checkpoints to hub
# hub_model_id: # repo path to push finetuned model
# # how to push checkpoints to hub
# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
# hub_strategy:
# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# # Required to be true when used in combination with `push_dataset_to_hub`
# hf_use_auth_token: # boolean
# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
# val_set_size: 0.04
# # Num shards for whole dataset
# dataset_shard_num:
# # Index of shard to use for whole dataset
# dataset_shard_idx:

# # The maximum length of an input to train with, this should typically be less than 2048
# # as most models have a token/context limit of 2048
# sequence_len: 2048
# # Pad inputs so each step uses constant sized buffers
# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
# pad_to_sequence_len:
# # Max sequence length to concatenate training samples together up to
# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# # FutureWarning: This will soon be DEPRECATED
# max_packed_sequence_len: 1024
# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
# sample_packing:
# # Set to 'false' if getting errors during eval with sample_packing on.
# eval_sample_packing:
# # You can set these packing optimizations AFTER starting a training at least once.
# # The trainer will provide recommended values for these values.
# sample_packing_eff_est:
# total_num_tokens:

# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
# adapter: lora
# # If you already have a lora model trained that you want to load, put that here.
# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
# lora_model_dir:

# # LoRA hyperparameters
# # For more details about the following options, see:
# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
# lora_r: 8
# lora_alpha: 16
# lora_dropout: 0.05
# lora_target_modules:
#   - q_proj
#   - v_proj
# #  - k_proj
# #  - o_proj
# #  - gate_proj
# #  - down_proj
# #  - up_proj
# lora_target_linear: # If true, will target all linear layers

# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
# lora_modules_to_save:
# #  - embed_tokens
# #  - lm_head

# # Once you complete training, the model will be saved to the following directory.
# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
# lora_out_dir:
# lora_fan_in_fan_out: false

# # ReLoRA configuration
# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
# relora_steps: # Number of steps per ReLoRA restart
# relora_warmup_steps: # Number of per-restart warmup steps
# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings

# # wandb configuration if you're using it
# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
# wandb_project: # Your wandb project name
# wandb_entity: # A wandb Team name if using a Team
# wandb_watch:
# wandb_run_id: # Set the name of your wandb run
# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training

# # Where to save the full-finetuned model to
# output_dir: ./completed-model

# # Whether to use torch.compile and which backend to use
# torch_compile: # bool
# torch_compile_backend: # Optional[str]

# # Training hyperparameters

# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
# gradient_accumulation_steps: 1
# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
# micro_batch_size: 2
# eval_batch_size:
# num_epochs: 4
# warmup_steps: 100 # cannot use with warmup_ratio
# warmup_ratio: 0.05 # cannot use with warmup_steps
# learning_rate: 0.00003
# lr_quadratic_warmup:
# logging_steps:
# save_strategy: # Set to `no` to skip checkpoint saves
# save_steps: # Leave empty to save at each epoch
# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
# save_total_limit: # Checkpoints saved at a time
# # Maximum number of iterations to train for. It precedes num_epochs which means that
# # if both are set, num_epochs will not be guaranteed.
# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
# max_steps:

# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128

# # Save model as safetensors (require safetensors package)
# save_safetensors:

# # Whether to mask out or include the human's prompt from the training labels
# train_on_inputs: false
# # Group similarly sized data to minimize padding.
# # May be slower to start, as it must download and sort the entire dataset.
# # Note that training loss may have an oscillating pattern with this enabled.
# group_by_length: false

# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
# gradient_checkpointing: false

# # Stop training after this many evaluation losses have increased in a row
# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
# early_stopping_patience: 3

# # Specify a scheduler and kwargs to use with the optimizer
# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
# lr_scheduler_kwargs:

# # For one_cycle optim
# lr_div_factor: # Learning rate div factor

# # For log_sweep optim
# log_sweep_min_lr:
# log_sweep_max_lr:

# # Specify optimizer
# # Valid values are driven by the Transformers OptimizerNames class, see:
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
# #
# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# # in the examples/ for your model and fine-tuning use case.
# #
# # Valid values for 'optimizer' include:
# # - adamw_hf
# # - adamw_torch
# # - adamw_torch_fused
# # - adamw_torch_xla
# # - adamw_apex_fused
# # - adafactor
# # - adamw_anyprecision
# # - sgd
# # - adagrad
# # - adamw_bnb_8bit
# # - lion_8bit
# # - lion_32bit
# # - paged_adamw_32bit
# # - paged_adamw_8bit
# # - paged_lion_32bit
# # - paged_lion_8bit
# optimizer:
# # Specify weight decay
# weight_decay:
# # adamw hyperparams
# adam_beta1:
# adam_beta2:
# adam_epsilon:
# # Gradient clipping max norm
# max_grad_norm:

# # Augmentation techniques
# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
# # currently only supported on Llama and Mistral
# noisy_embedding_alpha:

# # Whether to use BetterTransformers
# flash_optimum:
# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
# xformers_attention:
# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
# flash_attention:
# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# # Whether to use scaled-dot-product attention
# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
# sdp_attention:
# # Landmark attention (only llama)
# landmark_attention:
# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
# # LLaMA only
# xpos_rope:

# # Resume from a specific checkpoint dir
# resume_from_checkpoint:
# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# # Be careful with this being turned on between different models.
# auto_resume_from_checkpoints: false

# # Don't mess with this, it's here for accelerate and torchrun
# local_rank:

# # Add or change special tokens.
# # If you add tokens here, you don't need to add them to the `tokens` list.
# special_tokens:
#   # bos_token: "<s>"
#   # eos_token: "</s>"
#   # unk_token: "<unk>"

# # Add extra tokens.
# tokens:

# # FSDP
# fsdp:
# fsdp_config:

# # Deepspeed config path. e.g., deepspeed/zero3.json
# deepspeed:

# # Advanced DDP Arguments
# ddp_timeout:
# ddp_bucket_cap_mb:
# ddp_broadcast_buffers:

# # Path to torch distx for optim 'adamw_anyprecision'
# torchdistx_path:

# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
# pretraining_dataset:

# # Debug mode
# debug:

# # Seed
# seed:

# # Allow overwrite yml config using from cli
# strict:


base_model: ${BASE_MODEL}
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
base_model_config: ${BASE_MODEL_CONFIG}
revision_of_model: ${REVISION_OF_MODEL}
tokenizer_config: ${TOKENIZER_CONFIG}
model_type: ${MODEL_TYPE}
tokenizer_type: ${TOKENIZER_TYPE}
trust_remote_code: ${TRUST_REMOTE_CODE}
tokenizer_use_fast: ${TOKENIZER_USE_FAST}
tokenizer_legacy: ${TOKENIZER_LEGACY}
resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}

is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}

overrides_of_model_config:
  rope_scaling:
    type: ${ROPE_SCALING_TYPE}
    factor: ${ROPE_SCALING_FACTOR}

bnb_config_kwargs:
  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}

gptq: ${GPTQ}
load_in_8bit: ${LOAD_IN_8BIT}
load_in_4bit: ${LOAD_IN_4BIT}
bf16: ${BF16}
fp16: ${FP16}
tf32: ${TF32}
bfloat16: ${BFLOAT16}
float16: ${FLOAT16}

gpu_memory_limit: ${GPU_MEMORY_LIMIT}
lora_on_cpu: ${LORA_ON_CPU}

datasets:
  - path: ${DATASET_PATH}
    type: ${DATASET_TYPE}
    ds_type: ${DATASET_DS_TYPE}
    data_files: ${DATASET_DATA_FILES}
    shards: ${DATASET_SHARDS}
    name: ${DATASET_NAME}
    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
    revision: ${DATASET_REVISION}
    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}

rl: ${RL}
dpo_use_weighting: ${DPO_USE_WEIGHTING}

chat_template: ${CHAT_TEMPLATE}
chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
dataset_prepared_path: ${DATASET_PREPARED_PATH}
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
dataset_processes: ${DATASET_PROCESSES}
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
hub_model_id: ${HUB_MODEL_ID}
hub_strategy: ${HUB_STRATEGY}
hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
val_set_size: ${VAL_SET_SIZE}
dataset_shard_num: ${DATASET_SHARD_NUM}
dataset_shard_idx: ${DATASET_SHARD_IDX}

sequence_len: ${SEQUENCE_LEN}
pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
sample_packing: ${SAMPLE_PACKING}
eval_sample_packing: ${EVAL_SAMPLE_PACKING}
sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
total_num_tokens: ${TOTAL_NUM_TOKENS}
sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}

batch_flattening: ${BATCH_FLATTENING}
device_map: ${DEVICE_MAP}
max_memory: ${MAX_MEMORY}

adapter: ${ADAPTER}
lora_model_dir: ${LORA_MODEL_DIR}

lora_r: ${LORA_R}
lora_alpha: ${LORA_ALPHA}
lora_dropout: ${LORA_DROPOUT}
lora_target_modules:
  - ${LORA_TARGET_MODULES}
lora_target_linear: ${LORA_TARGET_LINEAR}
peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}

loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}

peft:
  loftq_config:
    loftq_bits: ${LOFTQ_BITS}

relora_steps: ${RELORA_STEPS}
relora_warmup_steps: ${RELORA_WARMUP_STEPS}
relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
relora_prune_ratio: ${RELORA_PRUNE_RATIO}
relora_cpu_offload: ${RELORA_CPU_OFFLOAD}

wandb_mode: ${WANDB_MODE}
wandb_project: ${WANDB_PROJECT}
wandb_entity: ${WANDB_ENTITY}
wandb_watch: ${WANDB_WATCH}
wandb_name: ${WANDB_NAME}
wandb_run_id: ${WANDB_RUN_ID}
wandb_log_model: ${WANDB_LOG_MODEL}

mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
mlflow_run_name: ${MLFLOW_RUN_NAME}
hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}

use_comet: ${USE_COMET}
comet_api_key: ${COMET_API_KEY}
comet_workspace: ${COMET_WORKSPACE}
comet_project_name: ${COMET_PROJECT_NAME}
comet_experiment_key: ${COMET_EXPERIMENT_KEY}
comet_mode: ${COMET_MODE}
comet_online: ${COMET_ONLINE}
comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}

output_dir: ${OUTPUT_DIR}

torch_compile: ${TORCH_COMPILE}
torch_compile_backend: ${TORCH_COMPILE_BACKEND}

gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
micro_batch_size: ${MICRO_BATCH_SIZE}
eval_batch_size: ${EVAL_BATCH_SIZE}
num_epochs: ${NUM_EPOCHS}
warmup_steps: ${WARMUP_STEPS}
warmup_ratio: ${WARMUP_RATIO}
learning_rate: ${LEARNING_RATE}
lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
logging_steps: ${LOGGING_STEPS}
eval_steps: ${EVAL_STEPS}
evals_per_epoch: ${EVALS_PER_EPOCH}
save_strategy: ${SAVE_STRATEGY}
save_steps: ${SAVE_STEPS}
saves_per_epoch: ${SAVES_PER_EPOCH}
save_total_limit: ${SAVE_TOTAL_LIMIT}
max_steps: ${MAX_STEPS}

eval_table_size: ${EVAL_TABLE_SIZE}
eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}

profiler_steps: ${PROFILER_STEPS}
loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}

save_safetensors: ${SAVE_SAFETENSORS}
train_on_inputs: ${TRAIN_ON_INPUTS}
group_by_length: ${GROUP_BY_LENGTH}
gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
early_stopping_patience: ${EARLY_STOPPING_PATIENCE}

lr_scheduler: ${LR_SCHEDULER}
lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
lr_div_factor: ${LR_DIV_FACTOR}

optimizer: ${OPTIMIZER}
optim_args: ${OPTIM_ARGS}
optim_target_modules: ${OPTIM_TARGET_MODULES}
weight_decay: ${WEIGHT_DECAY}
adam_beta1: ${ADAM_BETA1}
adam_beta2: ${ADAM_BETA2}
adam_epsilon: ${ADAM_EPSILON}
max_grad_norm: ${MAX_GRAD_NORM}

neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}

flash_optimum: ${FLASH_OPTIMUM}
xformers_attention: ${XFORMERS_ATTENTION}
```
|
|
||||||
flash_attention: ${FLASH_ATTENTION}
|
|
||||||
flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
|
|
||||||
flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
|
|
||||||
flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
|
|
||||||
flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
|
|
||||||
sdp_attention: ${SDP_ATTENTION}
|
|
||||||
s2_attention: ${S2_ATTENTION}
|
|
||||||
resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
|
|
||||||
auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
|
|
||||||
|
|
||||||
local_rank: ${LOCAL_RANK}
|
|
||||||
|
|
||||||
special_tokens:
|
|
||||||
bos_token: ${SPECIAL_TOKEN_BOS}
|
|
||||||
eos_token: ${SPECIAL_TOKEN_EOS}
|
|
||||||
unk_token: ${SPECIAL_TOKEN_UNK}
|
|
||||||
pad_token: ${SPECIAL_TOKEN_PAD}
|
|
||||||
|
|
||||||
tokens: ${TOKENS}
|
|
||||||
|
|
||||||
fsdp: ${FSDP}
|
|
||||||
fsdp_config: ${FSDP_CONFIG}
|
|
||||||
deepspeed: ${DEEPSPEED}
|
|
||||||
|
|
||||||
ddp_timeout: ${DDP_TIMEOUT}
|
|
||||||
ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
|
|
||||||
ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
|
|
||||||
|
|
||||||
torchdistx_path: ${TORCHDISTX_PATH}
|
|
||||||
pretraining_dataset: ${PRETRAINING_DATASET}
|
|
||||||
debug: ${DEBUG}
|
|
||||||
seed: ${SEED}
|
|
||||||
strict: ${STRICT}
|
|
||||||
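For reference, a minimal sketch (not code from this changeset) of how the `${VAR}` placeholders in the template above can be rendered from environment variables, such as the ones populated by `set_config_env_vars` later in this diff. The file path is the one `make_valid_config` reads.

```python
# Illustrative only: expand ${VAR} placeholders from the environment and
# parse the result as YAML. safe_substitute leaves unset variables as-is
# instead of raising KeyError.
import os
from string import Template

import yaml

with open("config/config.yaml", encoding="utf-8") as fin:
    raw = fin.read()

rendered = Template(raw).safe_substitute(os.environ)
cfg = yaml.safe_load(rendered)
print(cfg.get("learning_rate"))
```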
@@ -1,64 +0,0 @@
"""
Runpod serverless entrypoint handler
"""

import os

import runpod
import yaml
from huggingface_hub._login import login
from train import train
from utils import get_output_dir

BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
if not os.path.exists(BASE_VOLUME):
    os.makedirs(BASE_VOLUME)

logger = runpod.RunPodLogger()


async def handler(job):
    runpod_job_id = job["id"]
    inputs = job["input"]
    run_id = inputs.get("run_id", "default_run_id")
    args = inputs.get("args", {})

    # Set output directory
    output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
    args["output_dir"] = output_dir

    # First save args to a temporary config file
    config_path = "/workspace/test_config.yaml"

    # Add run_name and job_id to args before saving
    args["run_name"] = run_id
    args["runpod_job_id"] = runpod_job_id

    yaml_data = yaml.dump(args, default_flow_style=False)
    with open(config_path, "w", encoding="utf-8") as file:
        file.write(yaml_data)

    # Handle credentials
    credentials = inputs.get("credentials", {})

    if "wandb_api_key" in credentials:
        os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
    if "hf_token" in credentials:
        os.environ["HF_TOKEN"] = credentials["hf_token"]

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        logger.info("No HF_TOKEN provided. Skipping login.")

    logger.info("Starting Training.")
    async for result in train(config_path):  # Pass the config path instead of args
        logger.info(result)
    logger.info("Training Complete.")

    # Cleanup: pop with a default so a credential that was never provided
    # does not raise KeyError here.
    os.environ.pop("WANDB_API_KEY", None)
    os.environ.pop("HF_TOKEN", None)


runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
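A hypothetical local smoke test for the handler above; the job payload mirrors the test inputs that follow, and all values are placeholders. Note that running it really kicks off preprocessing and training via the `train` generator.

```python
# Hypothetical local invocation of handler() from the same module.
import asyncio

job = {
    "id": "local-job-1",
    "input": {
        "run_id": "llama-test",
        "credentials": {},  # no tokens provided, so the HF login is skipped
        "args": {"base_model": "HuggingFaceTB/SmolLM2-135M", "max_steps": 20},
    },
}

asyncio.run(handler(job))
```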
@@ -1,61 +0,0 @@
{
  "input": {
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "NousResearch/Meta-Llama-3-8B",
      "model_type": "LlamaForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_8bit": true,
      "load_in_4bit": false,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca"
        }
      ],
      "val_set_size": 0.05,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "lora",
      "lora_r": 32,
      "lora_alpha": 16,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 4,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_bnb_8bit",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": false,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|end_of_text|>"
      }
    }
  }
}
@@ -1,45 +0,0 @@
"""
Runpod train entrypoint
"""

import asyncio


async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
    """
    Run preprocessing (if enabled) and training with the given config file

    :param config_path: Path to the YAML config file
    :param gpu_id: GPU ID to use (default: "0")
    :param preprocess: Whether to run preprocessing (default: True)
    """
    # First check if preprocessing is needed
    if preprocess:
        # Preprocess command
        preprocess_cmd = (
            f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
        )
        process = await asyncio.create_subprocess_shell(
            preprocess_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )

        if process.stdout is not None:
            async for line in process.stdout:
                yield f"Preprocessing: {line.decode().strip()}"
        await process.wait()
        yield "Preprocessing completed."
    else:
        yield "Skipping preprocessing step."

    # Training command
    train_cmd = f"axolotl train {config_path}"
    process = await asyncio.create_subprocess_shell(
        train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
    )

    if process.stdout is not None:
        async for line in process.stdout:
            yield f"Training: {line.decode().strip()}"
    await process.wait()
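Since `train` above is an async generator, a caller consumes it with `async for`. A sketch, with a hypothetical config path:

```python
# Sketch: stream preprocessing/training log lines to stdout.
import asyncio


async def main():
    async for line in train("config/updated_config.yaml", gpu_id="0"):
        print(line)


asyncio.run(main())
```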
@@ -1,89 +0,0 @@
"""
Runpod launcher utils
"""

import os

import yaml


def get_output_dir(run_id):
    path = f"fine-tuning/{run_id}"
    return path


def make_valid_config(input_args):
    """
    Creates and saves updated config file, returns the path to the new config

    :param input_args: dict of input args
    :return: str, path to the updated config file
    """
    # Load default config
    with open("config/config.yaml", "r", encoding="utf-8") as fin:
        all_args = yaml.safe_load(fin)

    if not input_args:
        print("No args provided, using defaults")
    else:
        all_args.update(input_args)

    # Create updated config path
    updated_config_path = "config/updated_config.yaml"

    # Save updated config to new file
    with open(updated_config_path, "w", encoding="utf-8") as f:
        yaml.dump(all_args, f)

    return updated_config_path


def set_config_env_vars(args: dict):
    """
    Convert API arguments into environment variables.
    Handles nested dictionaries, lists, and special values.

    Args:
        args (dict): The arguments dictionary from the API request
    """

    def process_value(value):
        """Convert Python values to string format for environment variables"""
        if value is None:
            return ""
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, (list, dict)):
            return str(value)
        return str(value)

    def set_env_vars(data, prefix=""):
        """Recursively set environment variables from nested dictionary"""
        for key, value in data.items():
            env_key = prefix + key.upper()

            # Handle special cases
            if isinstance(value, dict):
                # For nested dictionaries (like special_tokens)
                set_env_vars(value, f"{env_key}_")
            elif isinstance(value, list):
                # Handle list of dictionaries (like datasets)
                if value and isinstance(value[0], dict):
                    for i, item in enumerate(value):
                        set_env_vars(item, f"{env_key}_{i}_")
                else:
                    # For simple lists (like lora_target_modules)
                    os.environ[env_key] = process_value(value)
            else:
                # Handle all other cases
                os.environ[env_key] = process_value(value)

    # Clear any existing related environment variables
    # This prevents old values from persisting
    for key in list(os.environ.keys()):
        if key.startswith(
            ("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
        ):
            del os.environ[key]

    # Set new environment variables
    set_env_vars(args)
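A worked example of the flattening `set_config_env_vars` performs: each nesting level adds an upper-cased prefix, list-of-dict entries add an index, and `process_value` lowercases booleans.

```python
set_config_env_vars(
    {
        "base_model": "HuggingFaceTB/SmolLM2-135M",
        "load_in_4bit": True,
        "special_tokens": {"pad_token": "<|endoftext|>"},
        "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
    }
)
# Resulting environment variables:
#   BASE_MODEL=HuggingFaceTB/SmolLM2-135M
#   LOAD_IN_4BIT=true
#   SPECIAL_TOKENS_PAD_TOKEN=<|endoftext|>
#   DATASETS_0_PATH=mhenrichsen/alpaca_2k_test
#   DATASETS_0_TYPE=alpaca
```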
@@ -1,86 +0,0 @@
{
  "input": {
    "name": "quick_smoke_test_sft",
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "HuggingFaceTB/SmolLM2-135M",
      "model_type": "AutoModelForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_4bit": true,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca",
          "split": "train[:10%]"
        }
      ],
      "val_set_size": 0.02,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "qlora",
      "lora_r": 32,
      "lora_alpha": 64,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 2,
      "micro_batch_size": 1,
      "num_epochs": 1,
      "optimizer": "adamw_torch_fused",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": true,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|endoftext|>"
      },
      "max_steps": 20
    },
    "timeout": 100000
  },
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
}
@@ -1,90 +0,0 @@
{
  "tests": [
    {
      "name": "quick_smoke_test_sft",
      "input": {
        "user_id": "user",
        "model_id": "llama-test",
        "run_id": "llama-test",
        "credentials": {
          "wandb_api_key": "",
          "hf_token": ""
        },
        "args": {
          "base_model": "HuggingFaceTB/SmolLM2-135M",
          "model_type": "AutoModelForCausalLM",
          "tokenizer_type": "AutoTokenizer",
          "load_in_4bit": true,
          "strict": false,
          "datasets": [
            {
              "path": "mhenrichsen/alpaca_2k_test",
              "type": "alpaca",
              "split": "train[:10%]"
            }
          ],
          "val_set_size": 0.02,
          "output_dir": "./outputs/lora-out",
          "sequence_len": 4096,
          "sample_packing": true,
          "eval_sample_packing": false,
          "pad_to_sequence_len": true,
          "adapter": "qlora",
          "lora_r": 32,
          "lora_alpha": 64,
          "lora_dropout": 0.05,
          "lora_target_linear": true,
          "lora_modules_to_save": [
            "embed_tokens",
            "lm_head"
          ],
          "gradient_accumulation_steps": 2,
          "micro_batch_size": 1,
          "num_epochs": 1,
          "optimizer": "adamw_torch_fused",
          "lr_scheduler": "cosine",
          "learning_rate": 0.0002,
          "train_on_inputs": false,
          "group_by_length": false,
          "bf16": "auto",
          "tf32": true,
          "gradient_checkpointing": true,
          "logging_steps": 1,
          "flash_attention": true,
          "warmup_steps": 1,
          "evals_per_epoch": 1,
          "eval_max_new_tokens": 128,
          "saves_per_epoch": 1,
          "weight_decay": 0.0,
          "special_tokens": {
            "pad_token": "<|endoftext|>"
          },
          "max_steps": 20
        }
      },
      "timeout": 100000
    }
  ],
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
}
@@ -51,7 +51,7 @@ Features:

- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.11
-- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1

### Installation
@@ -52,4 +52,4 @@ pytest -v --durations=10 \
  --cov-append \
  --cov-report=xml:e2e-coverage.xml

-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
+codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
@@ -20,4 +20,4 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov-report=xml:multigpu-coverage.xml

# Upload coverage to Codecov
-codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
+codecov upload-process -t $CODECOV_TOKEN -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
@@ -1,7 +1,5 @@
codecov:
  require_ci_to_pass: yes
-  notify:
-    wait_for_ci: true

coverage:
  precision: 2
@@ -154,10 +154,6 @@ datasets:
  # Key containing the messages (default: "messages")
  field_messages: messages

-  # Key containing the system message (default: "system")
-  # If the system message is not present in the dataset sample, it will be loaded from the field_system property.
-  field_system: system
-
  # Mapping of properties from the input dataset to the chat template.
  # (default: message_property_mappings={'role':'role', 'content':'content'})
  # If a property exists in the template but not in this mapping, the system will attempt
@@ -184,14 +180,10 @@ datasets:
  # adding a system turn with empty content.
  drop_system_message:

-  # Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
-  # defaults to False
-  split_thinking:
-
  # IMPORTANT: The following fields determine which parts of the conversation to train on.
  # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
  # See examples at `docs/dataset-formats/conversation.qmd`
-  # Note: If the below 5 fields are empty, defaults to training only on the last message.
+  # Note: If the below 4 fields are set to empty, defaults to training only on the last message.

  # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
  roles_to_train: ["assistant"] # default
@@ -200,13 +192,7 @@ datasets:
  # - turn (default): train on the EOS token at the end of each trainable turn
  # - last: train on the last EOS token in the conversation
  # TIP: Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
-  train_on_eos: turn
-  # Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:
-  # - all: train on all EOT tokens
-  # - turn: train on the EOT token at the end of each trainable turn
-  # - last: train on the last EOT token in the conversation
-  # If not specified, defaults to the value of train_on_eos for backward compatibility.
-  train_on_eot:
+  train_on_eos: last
  # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
  message_field_training: training
  # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
@@ -289,17 +275,8 @@ process_reward_model:
chat_template: tokenizer_default
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
chat_template_jinja: null
-# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.
-# These tokens mark the boundaries between conversation turns.
-# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"]
-# If not specified, defaults to just the model's eos_token.
-# This is useful for templates that use multiple delimiter tokens.
-eot_tokens:
-# - "</s>"
-# - "[/INST]"
-# - "[/SYSTEM_PROMPT]"
-# Changes the default system message
-default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
+# Changes the default system message. Currently only supports chatml.
+default_system_message: You are a helpful assistant. Please give a long and detailed answer.
# Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared
@@ -684,10 +661,8 @@ special_tokens:
  # unk_token: "<unk>"
  # pad_token: "[PAD]"

-# Optional[list[str]]. Add extra tokens to the tokenizer.
+# Add extra tokens.
tokens:
-# - "<|startoftext|>"
-# - "<|endoftext|>"

# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
# Only works for tokens that are not part of the base vocab (aka are added_tokens).
@@ -4,6 +4,18 @@ description: Conversation format for supervised fine-tuning.
order: 3
---

+## sharegpt
+
+::: {.callout-important}
+ShareGPT is deprecated! Please see the [chat_template](#chat_template) section below.
+:::
+
+## pygmalion
+
+```{.json filename="data.jsonl"}
+{"conversations": [{"role": "...", "value": "..."}]}
+```
+
## chat_template

Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. It supports using the tokenizer's template, a supported template, or custom jinja2.
@@ -52,7 +64,7 @@ We recommend checking the below examples for other usecases.

### Examples

-1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
+1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.

```yaml
datasets:
@@ -97,55 +109,10 @@ datasets:
```

::: {.callout-important}
-Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
+Please make sure that your `tokenizer.eos_token` is same as EOS/EOT token in template. Otherwise, set `eos_token` under `special_tokens`.
:::

-5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
-
-```yaml
-eot_tokens:
-  - "[/INST]"
-  # - "[/SYSTEM_PROMPT]"
-
-datasets:
-  - path: ...
-    type: chat_template
-
-    # optional
-    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)
-```
-
-::: {.callout-tip}
-See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
-:::
-
-::: {.callout-note}
-Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
-
-You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
-:::
-
-6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only the last EOS token, set `train_on_eos: last`.
-
-```yaml
-eot_tokens:
-  - "[/INST]"
-  # ...
-
-datasets:
-  - path: ...
-    type: chat_template
-
-    train_on_eos: last
-    train_on_eot: turn
-```
-
-::: {.callout-tip}
-If EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, generally, you can leave them to their defaults and omit them.
-:::
-
-7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
+5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation

For a data sample that looks like:
@@ -195,15 +162,3 @@ datasets:
::: {.callout-tip}
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
:::
-
-## sharegpt
-
-::: {.callout-important}
-ShareGPT is deprecated!. Please see [chat_template](#chat_template) section.
-:::
-
-## pygmalion
-
-```{.json filename="data.jsonl"}
-{"conversations": [{"role": "...", "value": "..."}]}
-```
@@ -28,8 +28,6 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}

Tags examples:

-- `main-base-py3.11-cu128-2.7.0`
-- `main-base-py3.11-cu126-2.7.0`
- `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1`
@@ -52,7 +50,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
# on push to main
main-py{python_version}-cu{cuda_version}-{pytorch_version}

-# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
+# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
main-latest

# nightly build
@@ -70,7 +68,6 @@ There may be some extra tags appended to the image, like `-vllm` which installs

Tags examples:

-- `main-py3.11-cu126-2.7.0`
- `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1`
34
docs/faq.qmd
@@ -73,40 +73,10 @@ description: Frequently asked questions

> A: This is likely an empty turn.

-**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.**
+**Q: The EOS/EOT token is incorrectly being masked or not being masked.**

-> A: There can be two reasons:
-
-> 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template.
-
-> 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.
+> A: This is because of the mismatch between `tokenizer.eos_token` and the EOS/EOT token in the template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in the template.

**Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**

> A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.

-**Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**
-
-> A: There can be two reasons:
-
-> 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.
-
-> 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.
-
-**Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**
-
-> A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.
-
-**Q: `EOT token __ is encoded as multiple tokens.`**
-
-> A: This is because the EOT token is encoded as multiple tokens, which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.
-
-**Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**
-
-> A: This is because the EOS token is in `eot_tokens: ` while there is a mismatch between `train_on_eos: ` and `train_on_eot: `. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same, or remove the EOS token from `eot_tokens: `.
-
-**Q: If `eot_tokens: ` is not provided, what happens?**
-
-> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.
-
-> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.
@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
{
  "role": "user",
  "content": [
-    {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
+    {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
    {"type": "text", "text": "Describe this image in detail."}
  ]
},
@@ -502,7 +502,9 @@ The input format is a simple JSON input with customizable fields based on the ab
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
:::

-In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
+If you have multiple GPUs available, we recommend using `vLLM` with the `GRPOTrainer` to significantly speed up trajectory generation during training.
+First, launch a `vLLM` server using `trl vllm-serve` - you may use a config file or CLI overrides to configure your vLLM server. In this example, we're
+using 4 GPUs - 2 for training, and 2 for vLLM:

::: {.callout-important}
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
@@ -537,10 +539,6 @@ Your `vLLM` instance will now attempt to spin up, and it's time to kick off trai
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
```

-::: {.callout-note}
-Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
-:::

#### Reward functions

GRPO uses custom reward functions and transformations. Please have them ready locally.
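One way to script the two-process layout described above, as a sketch: it assumes the `axolotl vllm-serve` and `axolotl train` entrypoints shown elsewhere in this changeset are on PATH, and a real launcher would wait for the vLLM server to report ready before starting training.

```python
import os
import subprocess

# vLLM server on the last two GPUs...
server = subprocess.Popen(
    ["axolotl", "vllm-serve", "grpo.yaml"],
    env={**os.environ, "CUDA_VISIBLE_DEVICES": "2,3"},
)

# ...training on the first two. (A real script should poll for server
# readiness here instead of starting immediately.)
subprocess.run(
    ["axolotl", "train", "grpo.yaml", "--num-processes", "2"],
    env={**os.environ, "CUDA_VISIBLE_DEVICES": "0,1"},
    check=True,
)
server.terminate()
```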
@@ -10,6 +10,7 @@ plugins:
liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true
+cut_cross_entropy: true

llama4_linearized_experts: true # needed with custom linearized experts model
load_in_4bit: true
@@ -1,69 +0,0 @@
base_model: Qwen/Qwen3-32B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

load_in_4bit: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
@@ -1,68 +0,0 @@
base_model: Qwen/Qwen3-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
@@ -11,14 +11,14 @@ liger-kernel==0.5.8

packaging==23.2

-peft==0.15.2
+peft==0.15.1
transformers==4.51.3
tokenizers>=0.21.1
accelerate==1.6.0
datasets==3.5.0
deepspeed>=0.15.4
-trl==0.17.0
+trl==0.16.1
-hf_xet==1.1.0
+hf_xet==1.0.0
hqq==0.2.5

optimum==1.16.2
4
setup.py
@@ -67,11 +67,13 @@ def parse_requirements(extras_require_map):
    if (major, minor) >= (2, 7):
        _install_requires.pop(_install_requires.index(xformers_version))
        # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
+        extras_require_map["vllm"] = ["vllm==0.8.3"]
    elif (major, minor) >= (2, 6):
        _install_requires.pop(_install_requires.index(xformers_version))
        _install_requires.append(
            "xformers==0.0.29.post2"
        )  # vllm needs post2 w torch 2.6
+        extras_require_map["vllm"] = ["vllm==0.8.3"]
    elif (major, minor) >= (2, 5):
        _install_requires.pop(_install_requires.index(xformers_version))
        if patch == 0:
@@ -145,7 +147,7 @@ extras_require = {
        "ray[train]",
    ],
    "vllm": [
-        "vllm==0.8.5",
+        "vllm==0.7.2",
    ],
    "llmcompressor": [
        "llmcompressor==0.5.1",
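The gate above follows a simple pattern: detect the installed torch version, then pick the matching vllm pin. A standalone sketch, with pins that mirror this diff rather than a general recommendation:

```python
import torch

# "2.6.0+cu124" -> (2, 6)
major, minor = (int(p) for p in torch.__version__.split(".")[:2])
# torch >= 2.6 gets the newer pin set in parse_requirements;
# older versions fall back to the base extras_require pin.
vllm_pin = "vllm==0.8.3" if (major, minor) >= (2, 6) else "vllm==0.7.2"
print(vllm_pin)
```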
@@ -4,4 +4,4 @@ import pkgutil

__path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package

-__version__ = "0.10.0.dev0"
+__version__ = "0.8.0"
@@ -2,7 +2,4 @@

import os

-from axolotl.logging_config import configure_logging
-
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
-configure_logging()
@@ -8,6 +8,9 @@ from accelerate.commands.config import config_args
from huggingface_hub import HfApi
from huggingface_hub.utils import LocalTokenNotFoundError

+from axolotl.logging_config import configure_logging
+
+configure_logging()
LOG = logging.getLogger(__name__)
@@ -5,7 +5,6 @@ import logging
import os
import tempfile
from pathlib import Path
-from tempfile import NamedTemporaryFile
from typing import Union
from urllib.parse import urlparse

@@ -153,15 +152,7 @@ def prepare_plugins(cfg: DictDefault):
    plugin_manager.register(plugin_name)


-def plugin_set_cfg(cfg: DictDefault):
-    if cfg.get("plugins"):
-        plugin_manager = PluginManager.get_instance()
-        plugin_manager.cfg = cfg
-
-
-def load_cfg(
-    config: str | Path | DictDefault = Path("examples/"), **kwargs
-) -> DictDefault:
+def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefault:
    """
    Loads the `axolotl` configuration stored at `config`, validates it, and performs
    various setup.
@@ -173,24 +164,13 @@ def load_cfg(
    Returns:
        `DictDefault` mapping configuration keys to values.
    """
-    if isinstance(config, (str, Path)):
-        config = check_remote_config(config)
-        if Path(config).is_dir():
-            config = choose_config(Path(config))
+    config = check_remote_config(config)
+    if Path(config).is_dir():
+        config = choose_config(Path(config))

    # Load the config from the yaml file
    with open(config, encoding="utf-8") as file:
        cfg: DictDefault = DictDefault(yaml.safe_load(file))

-        cfg.axolotl_config_path = config
-    else:
-        cfg = config
-        with NamedTemporaryFile(
-            mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
-        ) as temp_file:
-            temp_file.write(yaml.dump(config.to_dict()))
-            temp_file.close()
-            cfg.axolotl_config_path = temp_file.name
-
    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
@@ -204,6 +184,8 @@ def load_cfg(
    else:
        cfg[k] = kwargs[k]

+    cfg.axolotl_config_path = config
+
    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
@@ -231,6 +213,5 @@ def load_cfg(
    setup_wandb_env_vars(cfg)
    setup_mlflow_env_vars(cfg)
    setup_comet_env_vars(cfg)
-    plugin_set_cfg(cfg)

    return cfg
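Usage sketch for the simplified `load_cfg`: keyword arguments that match keys in the YAML override the file's values, and `axolotl_config_path` records the resolved path (the example path below is hypothetical).

```python
from axolotl.cli.config import load_cfg

cfg = load_cfg("examples/llama-3/qlora.yml", learning_rate=1e-4)
print(cfg.learning_rate)        # 0.0001, the kwarg overrides the YAML value
print(cfg.axolotl_config_path)  # the config path recorded by load_cfg
```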
@@ -1,7 +1,6 @@
"""CLI to run evaluation on a model."""

import logging
-import os
from pathlib import Path
from typing import Union

@@ -15,7 +14,6 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.evaluate import evaluate
-from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)
@@ -31,14 +29,10 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: CLI arguments.
    """
-    # Enable expandable segments for cuda allocation to improve VRAM usage
-    set_pytorch_cuda_alloc_conf()
-
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    check_accelerate_default_config()
-    if int(os.getenv("LOCAL_RANK", "0")) == 0:
-        check_user_token()
+    check_user_token()

    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
@@ -28,6 +28,7 @@ from axolotl.cli.utils import (
    fetch_from_github,
    filter_none_kwargs,
)
+from axolotl.cli.vllm_serve import do_vllm_serve
from axolotl.integrations.lm_eval.cli import lm_eval
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.schemas.config import AxolotlInputConfig
@@ -326,8 +327,6 @@ def fetch(directory: str, dest: Optional[str]) -> None:
@add_options_from_dataclass(VllmServeCliArgs)
@filter_none_kwargs
def vllm_serve(config: str, **cli_args: VllmServeCliArgs):
-    from axolotl.cli.vllm_serve import do_vllm_serve
-
    do_vllm_serve(config, cli_args)
@@ -1,6 +1,5 @@
"""CLI to run training on a model."""

-import gc
import logging
import os
from pathlib import Path
@@ -49,11 +48,8 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)

    del model, tokenizer, trainer

-    gc.collect()
-
    plugin_manager = PluginManager.get_instance()
    plugin_manager.post_train_unload(cfg)
@@ -20,9 +20,11 @@ from transformers import (
    ProcessorMixin,
)

+from axolotl.logging_config import configure_logging
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_processor, load_tokenizer

+configure_logging()
LOG = logging.getLogger(__name__)
@@ -11,6 +11,5 @@ MOE_ARCH_BLOCK = {
    ],
    "mixtral": "MixtralSparseMoeBlock",
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
-    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
    "deepseek_v2": "DeepseekV2MoE",
}
@@ -47,7 +47,7 @@ def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
 def load_datasets(
     *,
     cfg: DictDefault,
-    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
+    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
 ) -> TrainDatasetMeta:
     """
     Loads one or more training or evaluation datasets, calling

@@ -64,8 +64,7 @@ def load_datasets(
     tokenizer = load_tokenizer(cfg)
     processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
     preprocess_iterable = (
-        cli_args
-        and hasattr(cli_args, "iterable")
+        hasattr(cli_args, "iterable")
         and cli_args.iterable is not None
         and cli_args.iterable
     )

@@ -77,7 +76,7 @@ def load_datasets(
         preprocess_iterable=preprocess_iterable,
     )

-    if cli_args and (
+    if (
         cli_args.debug
         or cfg.debug
         or cli_args.debug_text_only
@@ -60,7 +60,6 @@ from axolotl.core.training_args import (
 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback
-from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
 from axolotl.processing_strategies import get_processing_strategy
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (

@@ -115,8 +114,6 @@ class TrainerBuilderBase(abc.ABC):
         if hasattr(model, "add_model_tags"):
             model.add_model_tags(["axolotl"])

-        patch_trainer_get_lr()
-
     @property
     def model_ref(self):
         return self._model_ref

@@ -488,7 +485,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

         # these are all the "standard" kwargs that are def used
         training_arguments_kwargs["max_steps"] = (
-            self.cfg.max_steps if self.cfg.max_steps else -1
+            total_num_steps if self.cfg.max_steps else -1
         )
         training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len
         training_arguments_kwargs["per_device_train_batch_size"] = (
@@ -935,6 +932,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             collator = DataCollatorForSeq2Seq

         kwargs["return_tensors"] = "pt"
+        if issubclass(collator, DataCollatorForSeq2Seq):
+            kwargs["sequence_parallel_degree"] = training_args.sequence_parallel_degree
+            kwargs["ring_attn_func"] = training_args.ring_attn_func

         return collator(
             *collator_args,
@@ -1051,9 +1051,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         if self.cfg.rpo_alpha is not None:
             training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha

-        if self.cfg.use_wandb:
-            training_args_kwargs["run_name"] = self.cfg.wandb_name
-
         training_args_cls = None
         blocklist_args_kwargs = []
         if self.cfg.rl == "simpo":

@@ -1124,12 +1121,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             **training_args_kwargs,
         )

-        # unset run_name so wandb sets up experiment names
-        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
-                None
-            )
-
         return training_args

     def build(self, total_num_steps):
@@ -371,15 +371,13 @@ class AxolotlTrainer(
                 num_items_in_batch=num_items_in_batch,
             )

-        loss = super().compute_loss(
+        return super().compute_loss(
             model,
             inputs,
             return_outputs=return_outputs,
             num_items_in_batch=num_items_in_batch,
         )
-
-        return loss

     @staticmethod
     def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
         concatenated_batch = {}
@@ -3,29 +3,15 @@ DPO trainer for axolotl
 """

 import gc
-import random
 from functools import wraps
-from typing import Any, Dict, Optional, Union
+from typing import Any, Dict, Union

-import pandas as pd
 import torch
-import wandb
-from accelerate import PartialState
-from datasets import Dataset, IterableDataset
 from peft.optimizers import create_loraplus_optimizer
 from torch import nn
-from torch.utils.data import DataLoader
-from transformers import (
-    BaseImageProcessor,
-    FeatureExtractionMixin,
-    PreTrainedTokenizerBase,
-    ProcessorMixin,
-    Trainer,
-)
-from transformers.trainer_utils import EvalLoopOutput
+from transformers import Trainer
 from transformers.utils import is_sagemaker_mp_enabled
-from trl import DPOConfig, DPOTrainer, maybe_apply_chat_template, maybe_extract_prompt
-from trl.trainer.utils import log_table_to_comet_experiment
+from trl import DPOTrainer

 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
 from axolotl.core.trainers.utils import (

@@ -95,64 +81,6 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):

         return super().push_to_hub(*args, **kwargs)

-    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
-    def _prepare_dataset(
-        self,
-        dataset: Union[Dataset, IterableDataset],
-        processing_class: Union[
-            PreTrainedTokenizerBase,
-            BaseImageProcessor,
-            FeatureExtractionMixin,
-            ProcessorMixin,
-        ],
-        args: DPOConfig,
-        dataset_name: str,
-    ) -> Union[Dataset, IterableDataset]:
-        # Build the kwargs for the `map` function
-        map_kwargs: Dict[str, Any] = {"writer_batch_size": 10}
-        if isinstance(dataset, Dataset):  # IterableDataset does not support num_proc
-            map_kwargs["num_proc"] = args.dataset_num_proc
-
-        with PartialState().main_process_first():
-            # Extract prompt if needed
-            if isinstance(
-                dataset, Dataset
-            ):  # `IterableDataset.map` does not support `desc`
-                map_kwargs["desc"] = f"Extracting prompt in {dataset_name} dataset"
-            dataset = dataset.map(maybe_extract_prompt, **map_kwargs)
-
-            # Apply the chat template if needed
-            if isinstance(
-                dataset, Dataset
-            ):  # `IterableDataset.map` does not support `desc`
-                map_kwargs["desc"] = f"Applying chat template to {dataset_name} dataset"
-            dataset = dataset.map(
-                maybe_apply_chat_template,
-                fn_kwargs={"tokenizer": processing_class, "tools": args.tools},
-                **map_kwargs,
-            )
-
-            # Tokenize the dataset
-            if isinstance(
-                dataset, Dataset
-            ):  # `IterableDataset.map` does not support `desc`
-                map_kwargs["desc"] = f"Tokenizing {dataset_name} dataset"
-
-            dataset = dataset.map(
-                self.tokenize_row if not self.is_vision_model else self.process_row,
-                remove_columns=["chosen", "rejected"],
-                fn_kwargs={
-                    "processing_class": processing_class,
-                    "max_prompt_length": args.max_prompt_length,
-                    "max_completion_length": args.max_completion_length,
-                    # for enc-dec, we add the special tokens ([bos_token] + prompt + [eos_token]; completion + [eos_token])
-                    "add_special_tokens": False,
-                },
-                **map_kwargs,
-            )
-
-        return dataset
-
     @staticmethod
     def tokenize_row(
         features,
@@ -177,8 +105,12 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
         # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
         if res["chosen_input_ids"][0] == processing_class.bos_token_id:
             res["chosen_input_ids"] = res["chosen_input_ids"][1:]
+            res["chosen_labels"] = res["chosen_labels"][1:]
+            res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
         if res["rejected_input_ids"][0] == processing_class.bos_token_id:
             res["rejected_input_ids"] = res["rejected_input_ids"][1:]
+            res["rejected_labels"] = res["rejected_labels"][1:]
+            res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]

         return res

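The hunk above keeps the parallel arrays in sync: when a duplicated BOS token is stripped from `input_ids`, the matching `labels` and `attention_mask` entries must be dropped too, or every downstream position is off by one. A minimal sketch of the invariant, using plain Python lists and an arbitrary illustrative `bos_token_id`:

```python
# Sketch: strip a duplicated BOS token while keeping aligned fields in sync.
# Assumes list-like fields; bos_token_id=1 is an arbitrary illustrative value.
def strip_duplicate_bos(res: dict, prefix: str, bos_token_id: int = 1) -> dict:
    if res[f"{prefix}_input_ids"][0] == bos_token_id:
        for field in ("input_ids", "labels", "attention_mask"):
            key = f"{prefix}_{field}"
            res[key] = res[key][1:]  # drop position 0 from every aligned array
    return res

res = {
    "chosen_input_ids": [1, 1, 42, 7],
    "chosen_labels": [-100, -100, 42, 7],
    "chosen_attention_mask": [1, 1, 1, 1],
}
res = strip_duplicate_bos(res, "chosen")
assert len(res["chosen_input_ids"]) == len(res["chosen_labels"])
```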
@@ -192,67 +124,3 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
         gc.collect()
         torch.cuda.empty_cache()
         return loss
-
-    # TODO: remove this once https://github.com/huggingface/trl/pull/3377 is in a release
-    def evaluation_loop(
-        self,
-        dataloader: DataLoader,
-        description: str,
-        prediction_loss_only: Optional[bool] = None,
-        ignore_keys: Optional[list[str]] = None,
-        metric_key_prefix: str = "eval",
-    ) -> EvalLoopOutput:
-        """
-        Overriding built-in evaluation loop to store metrics for each batch.
-        Prediction/evaluation loop, shared by `Trainer.evaluate()` and `Trainer.predict()`.
-
-        Works both with or without labels.
-        """
-
-        # Sample and save to game log if requested (for one batch to save time)
-        if self.generate_during_eval:
-            # Generate random indices within the range of the total number of samples
-            num_samples = len(dataloader.dataset)
-            random_indices = random.sample(
-                range(num_samples), k=self.args.eval_batch_size
-            )
-
-            # Use dataloader.dataset.select to get the random batch without iterating over the DataLoader
-            random_batch_dataset = dataloader.dataset.select(random_indices)
-            random_batch = self.data_collator(random_batch_dataset)
-            random_batch = self._prepare_inputs(random_batch)
-
-            policy_output_decoded, ref_output_decoded = (
-                self.generate_from_model_and_ref(self.model, random_batch)
-            )
-
-            table = pd.DataFrame(
-                columns=["Prompt", "Policy", "Ref Model"],
-                data=[
-                    [prompt, pol[len(prompt) :], ref[len(prompt) :]]
-                    for prompt, pol, ref in zip(
-                        random_batch_dataset["prompt"],
-                        policy_output_decoded,
-                        ref_output_decoded,
-                    )
-                ],
-            )
-            if "wandb" in self.args.report_to and self.accelerator.is_main_process:
-                wandb.log({"game_log": wandb.Table(data=table)})
-
-            if "comet_ml" in self.args.report_to:
-                log_table_to_comet_experiment(
-                    name="game_log.csv",
-                    table=table,
-                )
-
-        # Base evaluation
-        initial_output = super().evaluation_loop(
-            dataloader,
-            description,
-            prediction_loss_only,
-            ignore_keys,
-            metric_key_prefix,
-        )
-
-        return initial_output
@@ -63,7 +63,6 @@ class GRPOStrategy:

         grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
         grpo_args_kwargs["log_completions"] = trl.log_completions
-        grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print

         if trl.reward_weights:
             grpo_args_kwargs["reward_weights"] = trl.reward_weights

@@ -71,13 +70,6 @@ class GRPOStrategy:
         if trl.scale_rewards is not None:
             grpo_args_kwargs["scale_rewards"] = trl.scale_rewards

-        if trl.loss_type is not None:
-            grpo_args_kwargs["loss_type"] = trl.loss_type
-        if trl.mask_truncated_completions is not None:
-            grpo_args_kwargs["mask_truncated_completions"] = (
-                trl.mask_truncated_completions
-            )
-
         if trl.temperature is not None:
             grpo_args_kwargs["temperature"] = trl.temperature
         if trl.top_p is not None:

@@ -93,11 +85,6 @@ class GRPOStrategy:
             grpo_args_kwargs["num_iterations"] = trl.num_iterations
         if trl.epsilon is not None:
             grpo_args_kwargs["epsilon"] = trl.epsilon
-        if trl.epsilon_high is not None:
-            grpo_args_kwargs["epsilon_high"] = trl.epsilon_high
-
-        if trl.use_liger_loss is not None:
-            grpo_args_kwargs["use_liger_loss"] = trl.use_liger_loss

         return grpo_args_kwargs

@@ -148,9 +135,7 @@ class GRPOStrategy:
         try:
             # use importlib to dynamically load the reward function from the module
             reward_func_module_name = reward_func_fqn.split(".")[-1]
-            reward_func_module = importlib.import_module(
-                ".".join(reward_func_fqn.split(".")[:-1])
-            )
+            reward_func_module = importlib.import_module(reward_func_fqn.split(".")[-2])
             reward_func = getattr(reward_func_module, reward_func_module_name)
             if not len(inspect.signature(reward_func).parameters) >= 2:
                 raise ValueError(
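The two sides of this hunk resolve the module differently: joining all leading components of the fully qualified name handles nested packages, while taking only `split(".")[-2]` assumes the immediate parent module is importable on its own. A self-contained sketch of the difference, using a stdlib name so it runs anywhere:

```python
import importlib

# Sketch: resolving a fully qualified name to a callable, using a stdlib
# name ("os.path.join") so the example is self-contained.
fqn = "os.path.join"
func_name = fqn.split(".")[-1]  # "join"

# Variant A (left side of the hunk): import the full dotted module path.
module = importlib.import_module(".".join(fqn.split(".")[:-1]))  # "os.path"
func = getattr(module, func_name)
assert callable(func)

# Variant B (right side): importlib.import_module(fqn.split(".")[-2]) would
# try to import the bare name "path", which only succeeds when that module is
# importable on its own (e.g. a standalone rewards.py on sys.path), not when
# it lives inside a package.
```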
@@ -6,4 +6,4 @@
 from .optimizer import OptimizerMixin
 from .rng_state_loader import RngLoaderMixin
 from .scheduler import SchedulerMixin
-from .sequence_parallel import SequenceParallelContextManager, SequenceParallelMixin
+from .sequence_parallel import SequenceParallelMixin
@@ -3,10 +3,9 @@
 import logging

 import torch
-from torch.optim.lr_scheduler import LRScheduler, OneCycleLR
+from torch.optim.lr_scheduler import OneCycleLR
 from transformers.trainer import Trainer

-from axolotl.integrations.base import PluginManager
 from axolotl.utils.schedulers import (
     RexLR,
     get_cosine_schedule_with_min_lr,

@@ -26,9 +25,9 @@ class SchedulerMixin(Trainer):

     def create_scheduler(
         self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
-    ) -> LRScheduler:
+    ):
         """
-        Set up the scheduler. The optimizer of the trainer must have been set up either before this method is called or
+        Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or
         passed as an argument.

         Args:
@@ -48,16 +47,7 @@ class SchedulerMixin(Trainer):
         # fmt: off
         if self.lr_scheduler is None:  # type: ignore # pylint: disable=access-member-before-definition
             # fmt: on
-            plugin_manager = PluginManager.get_instance()
-            lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler(
-                trainer=self,
-                optimizer=optimizer,
-                num_training_steps=num_training_steps
-            )
-            if lr_scheduler is not None:
-                LOG.info(f"Using plugin-created lr_scheduler: {lr_scheduler}")
-                self.lr_scheduler = lr_scheduler
-            elif self.args.alternate_lr_scheduler_type == "one_cycle":
+            if self.args.alternate_lr_scheduler_type == "one_cycle":
                 num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
                 pct_start = num_warmup_steps / num_training_steps
                 extra_lr_kwargs = {}
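The `one_cycle` branch converts the trainer's warmup step count into OneCycleLR's fractional `pct_start`. A runnable sketch of that conversion with illustrative numbers (the optimizer and step counts are hypothetical):

```python
import torch
from torch.optim.lr_scheduler import OneCycleLR

# Sketch: OneCycleLR expresses warmup as a fraction of the whole run, so the
# warmup step count is converted to pct_start. Values here are illustrative.
num_training_steps = 1000
num_warmup_steps = 100
pct_start = num_warmup_steps / num_training_steps  # 0.1

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.SGD(params, lr=1e-3)
scheduler = OneCycleLR(
    optimizer,
    max_lr=1e-3,
    total_steps=num_training_steps,
    pct_start=pct_start,  # first 10% of steps ramp the LR up
)
```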
@@ -120,4 +110,4 @@ class SchedulerMixin(Trainer):
             if use_cosine_min_lr:
                 LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")

-        return self.lr_scheduler  # type: ignore
+        return self.lr_scheduler
@@ -1,86 +1,16 @@
-"""
-Module for Axolotl trainer sequence parallelism mixin and training context manager
-"""
-
-import functools
+"""Module for Axolotl trainer sequence parallelism mixin"""
+
 import logging

-import torch
 import torch.distributed as dist
 from datasets import Dataset
-from torch import nn
 from torch.utils.data import DistributedSampler, Sampler
-from torch.utils.hooks import RemovableHandle

-from axolotl.monkeypatch.attention.ring_attn import (
-    RingAttnFunc,
-    get_ring_attn_group,
-    update_ring_attn_params,
-)
+from axolotl.monkeypatch.attention.ring_attn import get_ring_attn_group

 LOG = logging.getLogger(__name__)

-
-def apply_sequence_parallelism(
-    batch: dict[str, torch.Tensor],
-    local_rank: int,
-    local_world_size: int,
-    ring_attn_func: RingAttnFunc,
-) -> dict[str, torch.Tensor]:
-    """
-    Apply sequence parallelism slicing to a batch.
-
-    Args:
-        batch: Batch dictionary (e.g., input_ids, attention_mask, etc.)
-        local_rank: Local rank in the sequence parallel group
-        local_world_size: World size of the sequence parallel group
-        ring_attn_func: The ring attention function to use
-
-    Returns:
-        Sliced batch dictionary.
-    """
-    # Update ring attention params if needed
-    if batch.get("position_ids") is not None:
-        update_ring_attn_params(position_ids=batch["position_ids"])
-
-    # Slice batch for sequence parallel processing
-    total_seq_len = batch["input_ids"].size(1)
-    for key in batch:
-        if (
-            key in batch
-            and isinstance(batch[key], torch.Tensor)
-            and batch[key].dim() > 1
-            and batch[key].size(1) == total_seq_len
-        ):
-
-            if ring_attn_func in [
-                RingAttnFunc.VARLEN_LLAMA3,
-                RingAttnFunc.BATCH_RING,
-            ]:
-                # Split in sequential fashion and grab this rank's chunk
-                batch[key] = (
-                    batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
-                )
-            elif ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
-                chunks = batch[key].chunk(2 * local_world_size, dim=1)
-
-                # Take rank's chunk and opposing chunk for zigzag pattern
-                selected_chunks = [
-                    chunks[local_rank],
-                    chunks[2 * local_world_size - local_rank - 1],
-                ]
-                batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
-            elif ring_attn_func is RingAttnFunc.BATCH_STRIPE:
-                # Split into striped data and stack
-                tensor = torch.stack(
-                    batch[key].split(local_world_size, dim=1),
-                    dim=1,
-                ).transpose(1, 2)
-                batch[key] = tensor[:, local_rank].contiguous()
-
-    return batch
-
-
 class SequenceParallelMixin:
     """
     Mixin class for sequence parallelism support in trainers.
@@ -157,157 +87,3 @@ class SequenceParallelMixin:
         return self._create_sequence_parallel_sampler(
             eval_dataset, shuffle=False, is_eval=True
         )
-
-
-class SequenceParallelContextManager:
-    """
-    Context manager for sequence parallelism operations.
-
-    This class provides a context that will automatically apply sequence parallelism
-    during model forward passes using a pre-forward hook, and gather outputs from
-    across the sequence parallelism group using a post-forward hook.
-    """
-
-    def __init__(
-        self,
-        model: nn.Module,
-        sequence_parallel_degree: int,
-        ring_attn_func: RingAttnFunc,
-    ):
-        self.model = model
-        self.sequence_parallel_degree = sequence_parallel_degree
-        self.ring_attn_func = ring_attn_func
-        self.process_group = get_ring_attn_group()
-
-        # Initialize sequence parallel group details
-        self.local_rank = dist.get_rank(self.process_group)
-        self.local_world_size = dist.get_world_size(self.process_group)
-
-        # Will store hook handles for removal
-        self.hook_handles: list[RemovableHandle] = []
-
-        # Create a partially applied version of the apply_sequence_parallelism function
-        # with pre-configured params
-        self.apply_sequence_parallelism = functools.partial(
-            apply_sequence_parallelism,
-            local_rank=self.local_rank,
-            local_world_size=self.local_world_size,
-            ring_attn_func=self.ring_attn_func,
-        )
-
-    def __enter__(self):
-        # Forward pre-hook to apply sequence parallelism
-        def sequence_parallel_pre_hook(_, args, kwargs):
-            # Apply sequence parallelism to kwargs
-            kwargs = self.apply_sequence_parallelism(batch=kwargs)
-            return args, kwargs
-
-        # Forward post-hook to gather outputs
-        def sequence_parallel_post_hook(_, __, output):
-            # Gather the sharded outputs
-            return self.gather_outputs(output)
-
-        # Register both hooks
-        self.hook_handles.append(
-            self.model.register_forward_pre_hook(
-                sequence_parallel_pre_hook, with_kwargs=True
-            )
-        )
-        self.hook_handles.append(
-            self.model.register_forward_hook(sequence_parallel_post_hook)
-        )
-
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        # Remove all hooks
-        for handle in self.hook_handles:
-            handle.remove()
-        self.hook_handles = []
-
-    def gather_outputs(self, output):
-        """Gather sharded outputs from all ranks and reconstruct the full tensor."""
-        # Handle different output formats (dict, tensor, etc.)
-        if isinstance(output, dict):
-            gathered_output = {}
-            for key, value in output.items():
-                if isinstance(value, torch.Tensor) and value.dim() > 1:
-                    # Gather logits or other sequence-sharded tensors
-                    gathered_value = self.gather_tensor(value)
-                    gathered_output[key] = gathered_value
-                else:
-                    gathered_value = value.clone()
-                    dist.all_reduce(
-                        gathered_value, op=dist.ReduceOp.SUM, group=self.process_group
-                    )
-                    gathered_output[key] = gathered_value
-            return gathered_output
-        if isinstance(output, torch.Tensor):
-            return self.gather_tensor(output)
-
-        return output
-
-    def gather_tensor(self, tensor):
-        """Gather a sharded tensor from all ranks."""
-        # Prepare tensors for all_gather
-        world_size = self.local_world_size
-
-        # Create list to store tensors from all ranks
-        gathered_tensors = [torch.zeros_like(tensor) for _ in range(world_size)]
-
-        # All-gather operation
-        dist.all_gather(gathered_tensors, tensor, group=self.process_group)
-
-        # Concatenate along sequence dimension (typically dim=1)
-        if self.ring_attn_func in [RingAttnFunc.VARLEN_LLAMA3, RingAttnFunc.BATCH_RING]:
-            # Simple concatenation for standard sharding
-            return torch.cat(gathered_tensors, dim=1)
-
-        if self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
-            # Each rank has a pattern of (rank, world_size*2-rank-1)
-            reconstituted_tensors = [None] * (world_size * 2)
-
-            # First, split each gathered tensor into its two chunks
-            for rank, gathered_tensor in enumerate(gathered_tensors):
-                # Each tensor contains two chunks in the sequence dimension
-                chunk_size = gathered_tensor.size(1) // 2
-                chunk1, chunk2 = gathered_tensor.split(chunk_size, dim=1)
-
-                # Place chunks in their original positions
-                reconstituted_tensors[rank] = chunk1
-                reconstituted_tensors[world_size * 2 - rank - 1] = chunk2
-
-            # Concatenate the reconstituted tensors in the correct order
-            return torch.cat(reconstituted_tensors, dim=1)
-
-        # Otherwise, RingAttnFunc.BATCH_STRIPE
-        # In striping, each rank has every world_size-th slice
-        batch_size = tensor.size(0)
-        hidden_dim = tensor.size(-1)
-
-        # First, determine the full sequence length
-        total_seq_len = 0
-        for t in gathered_tensors:
-            total_seq_len += t.size(1)
-
-        # Create a tensor to hold the unstriped result
-        result = torch.zeros(
-            batch_size,
-            total_seq_len,
-            hidden_dim,
-            dtype=tensor.dtype,
-            device=tensor.device,
-        )
-
-        # For each rank's tensor, distribute its slices to the correct positions
-        for rank, gathered_tensor in enumerate(gathered_tensors):
-            # The rank's tensor contains every world_size-th slice
-            # starting from its rank position
-            seq_len = gathered_tensor.size(1)
-            for i in range(seq_len):
-                # Calculate the position in the full tensor
-                pos = i * world_size + rank
-                if pos < total_seq_len:
-                    result[:, pos] = gathered_tensor[:, i]
-
-        return result
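The removed context manager shards each sequence in a zigzag pair (chunk `rank` plus chunk `2*world_size-rank-1`) so attention load is balanced across ranks, then inverts that layout when gathering. A single-process sketch of the slice/regather round trip, mirroring the chunk arithmetic above (no distributed setup, plain tensors):

```python
import torch

# Sketch: zigzag sharding round trip on one process, mirroring the chunk
# arithmetic from the removed context manager. No dist group is used here.
world_size = 4
seq = torch.arange(16).reshape(1, 16)  # [batch=1, seq_len=16]

# Shard: each rank takes chunk[rank] and chunk[2 * world_size - rank - 1].
chunks = seq.chunk(2 * world_size, dim=1)
shards = [
    torch.cat([chunks[r], chunks[2 * world_size - r - 1]], dim=1)
    for r in range(world_size)
]

# Gather: split each shard back into its two chunks and re-place them.
reconstituted = [None] * (2 * world_size)
for r, shard in enumerate(shards):
    first, second = shard.split(shard.size(1) // 2, dim=1)
    reconstituted[r] = first
    reconstituted[2 * world_size - r - 1] = second

assert torch.equal(torch.cat(reconstituted, dim=1), seq)
```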
@@ -1,7 +1,6 @@
 """Module for ReLoRA trainer"""

 import torch
-from torch.optim.lr_scheduler import LRScheduler

 from axolotl.core.trainers.base import AxolotlTrainer
 from axolotl.monkeypatch.relora import ReLoRAScheduler

@@ -20,11 +19,9 @@ class ReLoRATrainer(AxolotlTrainer):
         self,
         num_training_steps: int,
         optimizer: torch.optim.Optimizer | None = None,
-    ) -> LRScheduler:
+    ):
         optimizer = self.optimizer if optimizer is None else optimizer
-        lr_scheduler: LRScheduler = super().create_scheduler(
-            num_training_steps, optimizer
-        )
+        lr_scheduler = super().create_scheduler(num_training_steps, optimizer)

         if self.args.relora_steps:
             warmup_steps = (

@@ -33,7 +30,7 @@ class ReLoRATrainer(AxolotlTrainer):
             anneal_steps = (
                 self.args.relora_anneal_steps if self.args.relora_anneal_steps else 1
             )
-            self.lr_scheduler = ReLoRAScheduler(  # type: ignore
+            self.lr_scheduler = ReLoRAScheduler(
                 optimizer,
                 lr_scheduler,
                 self.args.relora_steps,

@@ -41,6 +38,6 @@ class ReLoRATrainer(AxolotlTrainer):
                 warmup_steps,
             )
         else:
-            self.lr_scheduler = lr_scheduler  # type: ignore
+            self.lr_scheduler = lr_scheduler

-        return self.lr_scheduler  # type: ignore
+        return self.lr_scheduler
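ReLoRAScheduler wraps the base schedule so the learning rate re-warms after each LoRA merge-and-reset interval. A rough sketch of that shape (not the actual ReLoRAScheduler, whose full signature is only partially visible in this diff; `relora_steps` and `warmup_steps` values are illustrative):

```python
import torch
from torch.optim.lr_scheduler import LambdaLR

# Sketch: periodic re-warmup in the spirit of a ReLoRA restart schedule.
# Not the real ReLoRAScheduler; values below are illustrative only.
relora_steps = 100   # interval between LoRA resets
warmup_steps = 10    # re-warmup length after each reset

def relora_lambda(step: int) -> float:
    step_in_cycle = step % relora_steps
    if step_in_cycle < warmup_steps:
        return step_in_cycle / warmup_steps  # linear re-warmup after a reset
    return 1.0

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=1e-4)
scheduler = LambdaLR(optimizer, lr_lambda=relora_lambda)
```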
@@ -11,19 +11,20 @@ from accelerate.logging import get_logger
 from datasets import Dataset
 from transformers.trainer import Trainer

-from axolotl.train import (
-    TrainDatasetMeta,
-    setup_model_and_tokenizer,
-)
+from axolotl.logging_config import configure_logging
+from axolotl.train import TrainDatasetMeta
+from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
+from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer

 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)

-LOG = get_logger(__name__)
+configure_logging()
+LOG = get_logger("axolotl.evaluate")


 def evaluate_dataset(
@@ -74,22 +75,37 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
     Returns:
         Dictionary mapping metric names to their values.
     """
-    # Load tokenizer, processor and model
-    LOG.debug("loading model for evaluation...")
-    model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)
+    # pylint: disable=duplicate-code
+    # Enable expandable segments for cuda allocation to improve VRAM usage
+    set_pytorch_cuda_alloc_conf()

+    # Load tokenizer
+    LOG.debug(
+        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
+        main_process_only=True,
+    )
+    tokenizer = load_tokenizer(cfg)
+
+    # Load processor for multimodal models if needed
+    processor = None
+    if cfg.is_multimodal:
+        processor = load_processor(cfg, tokenizer)
+
     # Get datasets
-    # pylint: disable=duplicate-code
     train_dataset = dataset_meta.train_dataset
     eval_dataset = dataset_meta.eval_dataset
     total_num_steps = dataset_meta.total_num_steps

+    # Load model
+    LOG.debug("loading model for evaluation...")
+    model, _ = load_model(cfg, tokenizer, processor=processor)
+
     # Set up trainer
     trainer = setup_trainer(
-        cfg=cfg,
+        cfg,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-        model=model,
+        model=(model, None, None),  # No need for model_ref or peft_config
         tokenizer=tokenizer,
         processor=processor,
         total_num_steps=total_num_steps,
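The `set_pytorch_cuda_alloc_conf()` call enables expandable segments before any CUDA allocation happens. A hedged sketch of what such a helper amounts to (the real implementation in `axolotl.utils` may differ; the environment variable itself is standard PyTorch):

```python
import os
import torch

# Sketch of an expandable-segments helper; the actual
# set_pytorch_cuda_alloc_conf() in axolotl.utils may differ. The env var must
# be set before the first CUDA allocation to take effect.
def set_alloc_conf_sketch() -> None:
    if torch.cuda.is_available():
        os.environ.setdefault(
            "PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True"
        )
```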
@@ -24,7 +24,6 @@ import logging
 from typing import OrderedDict

 import torch
-from torch.optim.lr_scheduler import LRScheduler


 class BasePlugin:

@@ -37,12 +36,11 @@ class BasePlugin:
     Methods:
         register(cfg): Registers the plugin with the given configuration.
         pre_model_load(cfg): Performs actions before the model is loaded.
-        post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.
+        post_model_load(cfg, model): Performs actions after the model is loaded.
         pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
         post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
-        post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.
         create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
-        create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.
+        create_lr_scheduler(cfg, trainer, optimizer): Creates and returns a learning rate scheduler.
         add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
         add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
     """

@@ -79,14 +77,6 @@ class BasePlugin:
             None
         """

-    def post_model_build(self, cfg, model):  # pylint: disable=unused-argument
-        """
-        Performs actions after the model is built/loaded, but before any adapters are applied.
-
-        Args:
-            cfg (dict): The configuration for the plugin.
-        """
-
     def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
         """
         Performs actions after the model is loaded.
@@ -147,8 +137,8 @@ class BasePlugin:
         """

     def create_lr_scheduler(
-        self, cfg, trainer, optimizer, num_training_steps
-    ) -> LRScheduler | None:  # pylint: disable=unused-argument
+        self, cfg, trainer, optimizer
+    ):  # pylint: disable=unused-argument
         """
         Creates and returns a learning rate scheduler.

@@ -156,10 +146,9 @@ class BasePlugin:
             cfg (dict): The configuration for the plugin.
             trainer (object): The trainer object for training.
             optimizer (object): The optimizer for training.
-            num_training_steps (int): Total number of training steps

         Returns:
-            object (LRScheduler): The created learning rate scheduler.
+            object: The created learning rate scheduler.
         """

     def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
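Against the narrower hook signature on the right-hand side, a custom plugin only needs to override `create_lr_scheduler` and return a scheduler, or `None` to fall through to the default. A minimal sketch of such a plugin (the constant-LR choice is purely illustrative):

```python
from torch.optim.lr_scheduler import LambdaLR

from axolotl.integrations.base import BasePlugin

# Sketch: a plugin supplying its own scheduler via the (cfg, trainer,
# optimizer) hook documented above. The constant schedule is illustrative.
class ConstantLRPlugin(BasePlugin):
    def create_lr_scheduler(self, cfg, trainer, optimizer):
        if optimizer is None:
            return None  # let the trainer fall back to its default scheduler
        return LambdaLR(optimizer, lr_lambda=lambda step: 1.0)
```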
@@ -272,7 +261,6 @@ class PluginManager:
     plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()

     _instance = None
-    _cfg = None

     def __new__(cls):
         """

@@ -280,9 +268,7 @@ class PluginManager:
         """
         if cls._instance is None:
             cls._instance = super(PluginManager, cls).__new__(cls)
-            cls._instance.plugins: OrderedDict[str, BasePlugin] = (
-                collections.OrderedDict()
-            )
+            cls._instance.plugins = collections.OrderedDict()
         return cls._instance

     @staticmethod

@@ -295,14 +281,6 @@ class PluginManager:
             PluginManager()
         return PluginManager._instance  # type: ignore

-    @property
-    def cfg(self):
-        return self._cfg
-
-    @cfg.setter
-    def cfg(self, cfg):
-        self._cfg = cfg
-
     def register(self, plugin_name: str):
         """
         Registers a new plugin by its name.

@@ -351,22 +329,9 @@ class PluginManager:
         for plugin in self.plugins.values():
             plugin.pre_model_load(cfg)

-    def post_model_build(self, cfg, model):
-        """
-        Calls the post_model_build method of all registered plugins after the model has been built/loaded,
-        but before any adapters have been applied.
-
-        Args:
-            cfg (dict): The configuration for the plugins.
-            model (object): The loaded model.
-        """
-        for plugin in self.plugins.values():
-            plugin.post_model_build(cfg, model)
-
     def post_model_load(self, cfg, model):
         """
-        Calls the post_model_load method of all registered plugins after the model has been loaded
-        inclusive of any adapters
+        Calls the post_model_load method of all registered plugins.

         Parameters:
             cfg (dict): The configuration for the plugins.
@@ -422,29 +387,29 @@ class PluginManager:
                 return trainer_cls
         return None

-    def create_optimizer(self, trainer):
+    def create_optimizer(self, cfg, trainer):
         """
         Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.

         Parameters:
+            cfg (dict): The configuration for the plugins.
             trainer (object): The trainer object for training.

         Returns:
             object: The created optimizer, or None if none was found.
         """
         for plugin in self.plugins.values():
-            optimizer = plugin.create_optimizer(self.cfg, trainer)
+            optimizer = plugin.create_optimizer(cfg, trainer)
             if optimizer is not None:
                 return optimizer
         return None

-    def create_lr_scheduler(
-        self, trainer, optimizer, num_training_steps
-    ) -> LRScheduler | None:
+    def create_lr_scheduler(self, cfg, trainer, optimizer):
         """
         Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.

         Parameters:
+            cfg (dict): The configuration for the plugins.
             trainer (object): The trainer object for training.
             optimizer (object): The optimizer for training.

@@ -452,12 +417,7 @@ class PluginManager:
             object: The created learning rate scheduler, or None if none was found.
         """
         for plugin in self.plugins.values():
-            scheduler: LRScheduler | None = plugin.create_lr_scheduler(
-                self.cfg,
-                trainer=trainer,
-                optimizer=optimizer,
-                num_training_steps=num_training_steps,
-            )
+            scheduler = plugin.create_lr_scheduler(cfg, trainer, optimizer)
             if scheduler is not None:
                 return scheduler
         return None

@@ -498,20 +458,6 @@ class PluginManager:
             callbacks.extend(plugin_callbacks)
         return callbacks

-    def post_train(self, cfg, model):
-        """
-        Calls the post_train method of all registered plugins.
-
-        Parameters:
-            cfg (dict): The configuration for the plugins.
-            model (object): The loaded model.
-
-        Returns:
-            None
-        """
-        for plugin in self.plugins.values():
-            plugin.post_train(cfg, model)
-
     def post_train_unload(self, cfg):
         """
         Calls the post_train_unload method of all registered plugins.
@@ -27,13 +27,15 @@ pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transform
 ```yaml
 plugins:
   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+cut_cross_entropy: true
 ```

 ## Supported Models

 - llama
-- llama4
 - llama4_text
+- llama4
 - mllama
 - phi3
 - gemma

@@ -43,11 +45,6 @@ plugins:
 - mistral
 - mistral3
 - qwen2
-- qwen2_moe
-- qwen2_vl
-- qwen2_5_vl
-- qwen3
-- qwen3_moe
 - cohere
 - cohere2
 - glm
@@ -25,7 +25,7 @@ import torch

 from axolotl.integrations.base import BasePlugin
 from axolotl.utils import get_pytorch_version
-from axolotl.utils.distributed import is_main_process
+from axolotl.utils.distributed import zero_only

 from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401

@@ -76,7 +76,7 @@ class CutCrossEntropyPlugin(BasePlugin):
             cce_patch,
         )

-        if is_main_process(use_environ=True):
+        with zero_only():
             LOG.info(
                 f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}"
             )

@@ -28,7 +28,7 @@ class CutCrossEntropyArgs(BaseModel):
     Input args for Cut Cross Entropy.
     """

-    cut_cross_entropy: Optional[bool] = True
+    cut_cross_entropy: Optional[bool] = None

     @model_validator(mode="before")
     @classmethod
@@ -1,174 +0,0 @@
-"""Llama CCE patch. Adapted from transformers v4.51.2"""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import (
-    BaseModelOutputWithPast,
-    CausalLMOutputWithPast,
-)
-from transformers.models.llama.modeling_llama import (
-    _CONFIG_FOR_DOC,
-    LLAMA_INPUTS_DOCSTRING,
-    KwargsForCausalLM,
-)
-from transformers.processing_utils import Unpack
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-from transformers.utils.generic import can_return_tuple
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@can_return_tuple
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Cache] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs: Unpack[KwargsForCausalLM],
-) -> CausalLMOutputWithPast:
-    r"""
-    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-    logits_to_keep (`int` or `torch.Tensor`, *optional*):
-        If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-        `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-        token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-        If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-        This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, LlamaForCausalLM
-
-    >>> model = LlamaForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
-    >>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-
-    >>> prompt = "Hey, are you conscious? Can you talk to me?"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-    ```"""
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs: BaseModelOutputWithPast = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs.last_hidden_state
-    if hidden_states is None:
-        raise ValueError("hidden_states is None")
-
-    loss = None
-    logits = None
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **kwargs,
-        )
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def patch_llama(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    """Patch Llama for CCE."""
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.llama import modeling_llama
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_llama.LlamaForCausalLM
-        ), f"Expected a LlamaForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_llama.LlamaForCausalLM.forward = cce_forward
-    return None
@@ -5,7 +5,9 @@
 import transformers
 from cut_cross_entropy.cce_utils import LinearCrossEntropyImpl
 from cut_cross_entropy.linear_cross_entropy import LCE_IMPL_DEFAULT
+from cut_cross_entropy.transformers.llama import patch_llama
 from cut_cross_entropy.transformers.phi3 import patch_phi3
+from cut_cross_entropy.transformers.qwen2 import patch_qwen2
 from cut_cross_entropy.transformers.utils import PatchOptions, TransformersModelT

 from axolotl.integrations.cut_cross_entropy.monkeypatch.cohere import (
@@ -22,9 +24,6 @@ from axolotl.integrations.cut_cross_entropy.monkeypatch.glm4 import (
     patch_glm,
     patch_glm4,
 )
-from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import (
-    patch_llama,
-)
 from axolotl.integrations.cut_cross_entropy.monkeypatch.llama4 import (
     patch_llama4,
     patch_llama4_text,
@@ -34,22 +33,6 @@ from axolotl.integrations.cut_cross_entropy.monkeypatch.mistral3 import (
     patch_mistral3,
 )
 from axolotl.integrations.cut_cross_entropy.monkeypatch.mllama import patch_mllama
-from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2 import (
-    patch_qwen2,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_5_vl import (
-    patch_qwen2_5_vl,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_moe import (
-    patch_qwen2_moe,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen2_vl import (
-    patch_qwen2_vl,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen3 import patch_qwen3
-from axolotl.integrations.cut_cross_entropy.monkeypatch.qwen3_moe import (
-    patch_qwen3_moe,
-)

 CUT_CROSS_ENTROPY_MODEL_MAPPING = {
     "llama": patch_llama,
@@ -64,11 +47,6 @@ CUT_CROSS_ENTROPY_MODEL_MAPPING = {
     "mistral": patch_mistral,
     "mistral3": patch_mistral3,
     "qwen2": patch_qwen2,
-    "qwen2_moe": patch_qwen2_moe,
-    "qwen2_vl": patch_qwen2_vl,
-    "qwen2_5_vl": patch_qwen2_5_vl,
-    "qwen3": patch_qwen3,
-    "qwen3_moe": patch_qwen3_moe,
     "cohere": patch_cohere,
     "cohere2": patch_cohere2,
     "glm": patch_glm,
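The mapping above is what keeps the integration model-agnostic: the plugin looks up the model's config type and calls the matching patch function. A simplified sketch of that dispatch (the real plugin threads through a fully-populated `PatchOptions`):

```python
def apply_cce_patch(model_config_type: str, maybe_model, patch_options):
    """Simplified dispatch sketch over CUT_CROSS_ENTROPY_MODEL_MAPPING."""
    patch_fn = CUT_CROSS_ENTROPY_MODEL_MAPPING.get(model_config_type)
    if patch_fn is None:
        raise ValueError(f"No CCE patch registered for {model_config_type!r}")
    # Each patch function either patches a model instance in place (and
    # returns it) or, when given a config/string, patches the model class
    # globally and returns None.
    return patch_fn(maybe_model, patch_options)
```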
@@ -1,37 +0,0 @@
-"""Qwen2 CCE patch. The model inherits Llama's modeling code and uses the same forward method."""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-)
-
-
-def patch_qwen2(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    from transformers.models.qwen2 import modeling_qwen2
-
-    # Set the _PATCH_OPTS in the llama patch file
-    import axolotl.integrations.cut_cross_entropy.monkeypatch.llama as llama_patch
-
-    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
-
-    from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import (
-        cce_forward,
-    )
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_qwen2.Qwen2ForCausalLM
-        ), f"Expected a Qwen2ForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_qwen2.Qwen2ForCausalLM.forward = cce_forward
-    return None
@@ -1,246 +0,0 @@
-"""Qwen2.5 VL CCE patch. Adapted from transformers v4.51.2"""
-
-# pylint: disable=duplicate-code
-
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from torch.nn import CrossEntropyLoss
-from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
-    Qwen2_5_VLCausalLMOutputWithPast,
-)
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-def cce_forward_multimodal(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[list[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    pixel_values: Optional[torch.Tensor] = None,
-    pixel_values_videos: Optional[torch.FloatTensor] = None,
-    image_grid_thw: Optional[torch.LongTensor] = None,
-    video_grid_thw: Optional[torch.LongTensor] = None,
-    rope_deltas: Optional[torch.LongTensor] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    second_per_grid_ts: Optional[torch.Tensor] = None,
-) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
-    r"""
-    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from PIL import Image
-    >>> import requests
-    >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
-
-    >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
-
-    >>> messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": "What is shown in this image?"},
-                ],
-            },
-        ]
-    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-    >>> image = Image.open(requests.get(url, stream=True).raw)
-
-    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
-    ```"""
-
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    if inputs_embeds is None:
-        inputs_embeds = self.model.embed_tokens(input_ids)
-        if pixel_values is not None:
-            pixel_values = pixel_values.type(self.visual.dtype)
-            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
-            n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
-            n_image_features = image_embeds.shape[0]
-            if n_image_tokens != n_image_features:
-                raise ValueError(
-                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-                )
-
-            mask = input_ids == self.config.image_token_id
-            mask_unsqueezed = mask.unsqueeze(-1)
-            mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
-            image_mask = mask_expanded.to(inputs_embeds.device)
-
-            image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # type: ignore
-
-        if pixel_values_videos is not None:
-            pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
-            video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
-            n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
-            n_video_features = video_embeds.shape[0]
-            if n_video_tokens != n_video_features:
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                )
-
-            mask = input_ids == self.config.video_token_id
-            mask_unsqueezed = mask.unsqueeze(-1)
-            mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
-            video_mask = mask_expanded.to(inputs_embeds.device)
-
-            video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # type: ignore
-
-        if attention_mask is not None:
-            attention_mask = attention_mask.to(inputs_embeds.device)
-
-    # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
-    if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
-        # calculate RoPE index once per generation in the pre-fill stage only
-        if (
-            (cache_position is not None and cache_position[0] == 0)
-            or self.rope_deltas is None
-            or (past_key_values is None or past_key_values.get_seq_length() == 0)  # type: ignore
-        ):
-            position_ids, rope_deltas = self.get_rope_index(
-                input_ids,
-                image_grid_thw,
-                video_grid_thw,
-                second_per_grid_ts,
-                attention_mask,
-            )
-            self.rope_deltas = rope_deltas
-        # then use the prev pre-calculated rope-deltas to get the correct position ids
-        else:
-            batch_size, seq_length, _ = inputs_embeds.shape
-            delta = (
-                (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
-                if cache_position is not None
-                else 0
-            )
-            position_ids = torch.arange(seq_length, device=inputs_embeds.device)  # type: ignore
-            position_ids = position_ids.view(1, -1).expand(batch_size, -1)  # type: ignore
-            if cache_position is not None:  # otherwise `deltas` is an int `0`
-                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)  # type: ignore
-            position_ids = position_ids.add(delta)  # type: ignore
-            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)  # type: ignore
-
-    outputs = self.model(
-        input_ids=None,
-        position_ids=position_ids,
-        attention_mask=attention_mask,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-    )
-
-    hidden_states = outputs[0]
-    logits = None
-    loss = None
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states,
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-        )
-    else:
-        logits = self.lm_head(hidden_states)
-
-        if labels is not None:
-            # Upcast to float if we need to compute the loss to avoid potential precision issues
-            logits = logits.float()
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return Qwen2_5_VLCausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-        rope_deltas=self.rope_deltas,
-    )
-
-
-def patch_qwen2_5_vl(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-
-    from transformers.models.qwen2_5_vl import modeling_qwen2_5_vl
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration
-        ), f"Expected a Qwen2_5_VLForConditionalGeneration model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
-
-        return maybe_model
-
-    modeling_qwen2_5_vl.Qwen2_5_VLForConditionalGeneration.forward = (
-        cce_forward_multimodal
-    )
-    return None
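The decode-step branch of the removed forward rebuilds the multimodal position ids from the cached `rope_deltas` rather than calling `get_rope_index` again. A standalone sketch of that arithmetic with made-up values:

```python
import torch

# Dummy decode-step values: batch of 2, generating 1 new token each.
batch_size, seq_length = 2, 1
cache_position = torch.tensor([37])       # index of the token being decoded
rope_deltas = torch.tensor([[-4], [-9]])  # cached per-sequence offsets

delta = cache_position[0] + rope_deltas   # shape (batch, 1)
position_ids = torch.arange(seq_length).view(1, -1).expand(batch_size, -1)
position_ids = position_ids.add(delta)    # per-sequence absolute positions
# One copy per rope axis (temporal, height, width) -> shape (3, batch, seq)
position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
assert position_ids.shape == (3, 2, 1)
```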
@@ -1,188 +0,0 @@
-"""Qwen2 MoE CCE patch. Adapted from transformers v4.51.2"""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from transformers.models.qwen2_moe.modeling_qwen2_moe import (
-    _CONFIG_FOR_DOC,
-    QWEN2MOE_INPUTS_DOCSTRING,
-    MoeCausalLMOutputWithPast,
-    MoeModelOutputWithPast,
-    load_balancing_loss_func,
-)
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-from transformers.utils.generic import can_return_tuple
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@can_return_tuple
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[list[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    output_router_logits: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **loss_kwargs,
-) -> MoeCausalLMOutputWithPast:
-    r"""
-    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-    logits_to_keep (`int` or `torch.Tensor`, *optional*):
-        If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-        `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-        token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-        If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-        This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, Qwen2MoeForCausalLM
-
-    >>> model = Qwen2MoeForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
-    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)
-
-    >>> prompt = "Hey, are you conscious? Can you talk to me?"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-    ```"""
-
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_router_logits = (
-        output_router_logits
-        if output_router_logits is not None
-        else self.config.output_router_logits
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs: MoeModelOutputWithPast = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        output_router_logits=output_router_logits,
-        cache_position=cache_position,
-    )
-
-    hidden_states = outputs.last_hidden_state
-    loss = None
-    logits = None
-
-    if hidden_states is None:
-        raise ValueError("hidden_states is None")
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **loss_kwargs,
-        )
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-
-        if labels is not None:
-            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
-
-    aux_loss = None
-    if output_router_logits:
-        aux_loss = load_balancing_loss_func(
-            outputs.router_logits,
-            self.num_experts,
-            self.num_experts_per_tok,
-            attention_mask,
-        )
-        if labels is not None:
-            loss += self.router_aux_loss_coef * aux_loss.to(  # type: ignore
-                loss.device  # type: ignore
-            )  # make sure to reside in the same device
-
-    return MoeCausalLMOutputWithPast(
-        loss=loss,
-        aux_loss=aux_loss,  # type: ignore
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-        router_logits=outputs.router_logits,
-    )
-
-
-def patch_qwen2_moe(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-
-    from transformers.models.qwen2_moe import modeling_qwen2_moe
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_qwen2_moe.Qwen2MoeForCausalLM
-        ), f"Expected a Qwen2MoeForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(forward, maybe_model)
-
-        return maybe_model
-
-    modeling_qwen2_moe.Qwen2MoeForCausalLM.forward = forward
-    return None
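The MoE branch folds the router load-balancing term into the language-modeling loss, scaled by `router_aux_loss_coef`. A tiny numeric sketch of that combination (all values made up):

```python
import torch

# Dummy stand-ins for the model outputs above.
lm_loss = torch.tensor(2.31)    # token-level cross-entropy / LCE loss
aux_loss = torch.tensor(0.042)  # load-balancing loss over router logits
router_aux_loss_coef = 0.001    # config value on the MoE model

# Same-device add, mirroring
# `loss += self.router_aux_loss_coef * aux_loss.to(loss.device)`.
total = lm_loss + router_aux_loss_coef * aux_loss.to(lm_loss.device)
print(f"{total.item():.6f}")  # ~2.310042
```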
@@ -1,249 +0,0 @@
-"""Qwen2 VL CCE patch. Adapted from transformers v4.51.2"""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from torch.nn import CrossEntropyLoss
-from transformers.models.qwen2_vl.modeling_qwen2_vl import (
-    _CONFIG_FOR_DOC,
-    QWEN2_VL_INPUTS_DOCSTRING,
-    Qwen2VLCausalLMOutputWithPast,
-)
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward_multimodal(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[list[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    pixel_values: Optional[torch.Tensor] = None,
-    pixel_values_videos: Optional[torch.FloatTensor] = None,
-    image_grid_thw: Optional[torch.LongTensor] = None,
-    video_grid_thw: Optional[torch.LongTensor] = None,
-    rope_deltas: Optional[torch.LongTensor] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
-    r"""
-    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from PIL import Image
-    >>> import requests
-    >>> from transformers import AutoProcessor, Qwen2VLForConditionalGeneration
-
-    >>> model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
-    >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
-
-    >>> messages = [
-            {
-                "role": "user",
-                "content": [
-                    {"type": "image"},
-                    {"type": "text", "text": "What is shown in this image?"},
-                ],
-            },
-        ]
-    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-    >>> image = Image.open(requests.get(url, stream=True).raw)
-
-    >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
-    >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
-    ```"""
-
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    if inputs_embeds is None:
-        inputs_embeds = self.model.embed_tokens(input_ids)
-        if pixel_values is not None:
-            pixel_values = pixel_values.type(self.visual.get_dtype())
-            image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
-            n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
-            n_image_features = image_embeds.shape[0]
-            if n_image_tokens != n_image_features:
-                raise ValueError(
-                    f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-                )
-            image_mask = (
-                (input_ids == self.config.image_token_id)
-                .unsqueeze(-1)
-                .expand_as(inputs_embeds)
-                .to(inputs_embeds.device)
-            )
-            image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)  # type: ignore
-
-        if pixel_values_videos is not None:
-            pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
-            video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
-            n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
-            n_video_features = video_embeds.shape[0]
-            if n_video_tokens != n_video_features:
-                raise ValueError(
-                    f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
-                )
-            video_mask = (
-                (input_ids == self.config.video_token_id)
-                .unsqueeze(-1)
-                .expand_as(inputs_embeds)
-                .to(inputs_embeds.device)
-            )
-            video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-            inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)  # type: ignore
-
-        if attention_mask is not None:
-            attention_mask = attention_mask.to(inputs_embeds.device)
-
-    # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
-    if position_ids is None and (attention_mask is None or attention_mask.ndim == 2):
-        # calculate RoPE index once per generation in the pre-fill stage only
-        if (
-            (cache_position is not None and cache_position[0] == 0)
-            or self.rope_deltas is None
-            or (past_key_values is None or past_key_values.get_seq_length() == 0)  # type: ignore
-        ):
-            position_ids, rope_deltas = self.get_rope_index(
-                input_ids, image_grid_thw, video_grid_thw, attention_mask
-            )
-            self.rope_deltas = rope_deltas
-        # then use the prev pre-calculated rope-deltas to get the correct position ids
-        else:
-            batch_size, seq_length, _ = inputs_embeds.shape
-            delta = (
-                cache_position[0] + self.rope_deltas
-                if cache_position is not None
-                else 0
-            )
-            position_ids = torch.arange(seq_length, device=inputs_embeds.device)  # type: ignore
-            position_ids = position_ids.view(1, -1).expand(batch_size, -1)  # type: ignore
-            if cache_position is not None:  # otherwise `deltas` is an int `0`
-                delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)  # type: ignore
-                delta = delta.to(position_ids.device)  # type: ignore
-            position_ids = position_ids.add(delta)  # type: ignore
-            position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)  # type: ignore
-
-    outputs = self.model(
-        input_ids=None,
-        position_ids=position_ids,
-        attention_mask=attention_mask,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-    )
-
-    hidden_states = outputs[0]
-    logits = None
-    loss = None
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states,
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-        )
-    else:
-        logits = self.lm_head(hidden_states)
-
-        if labels is not None:
-            # Upcast to float if we need to compute the loss to avoid potential precision issues
-            logits = logits.float()
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return Qwen2VLCausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-        rope_deltas=self.rope_deltas,
-    )
-
-
-def patch_qwen2_vl(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-
-    from transformers.models.qwen2_vl import modeling_qwen2_vl
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_qwen2_vl.Qwen2VLForConditionalGeneration
-        ), f"Expected a Qwen2VLForConditionalGeneration model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
-
-        return maybe_model
-
-    modeling_qwen2_vl.Qwen2VLForConditionalGeneration.forward = cce_forward_multimodal
-    return None
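Both VL patches splice vision features into the text embedding stream with `masked_scatter`: every placeholder-token position is overwritten, in order, by one row of the vision embeddings. A toy reproduction of that move:

```python
import torch

IMAGE_TOKEN_ID = 9  # stand-in for config.image_token_id
input_ids = torch.tensor([[1, 9, 9, 2]])  # two image placeholders
inputs_embeds = torch.zeros(1, 4, 3)      # (batch, seq, hidden)
image_embeds = torch.ones(2, 3)           # one row per image token

# Expand the boolean placeholder mask to the hidden dimension, then scatter
# the image rows into exactly those positions.
mask = (input_ids == IMAGE_TOKEN_ID).unsqueeze(-1).expand_as(inputs_embeds)
inputs_embeds = inputs_embeds.masked_scatter(mask, image_embeds)
assert inputs_embeds[0, 1].sum() == 3 and inputs_embeds[0, 0].sum() == 0
```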
@@ -1,35 +0,0 @@
-"""Qwen3 CCE patch. The model inherits Llama's modeling code and uses the same forward method."""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-)
-
-
-def patch_qwen3(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    from transformers.models.qwen3 import modeling_qwen3
-
-    # Set the _PATCH_OPTS in the llama patch file
-    import axolotl.integrations.cut_cross_entropy.monkeypatch.llama as llama_patch
-
-    llama_patch._PATCH_OPTS = patch_options  # pylint: disable=protected-access
-
-    from axolotl.integrations.cut_cross_entropy.monkeypatch.llama import cce_forward
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_qwen3.Qwen3ForCausalLM
-        ), f"Expected a Qwen3ForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_qwen3.Qwen3ForCausalLM.forward = cce_forward
-    return None
@@ -1,194 +0,0 @@
-"""Qwen3 MoE CCE patch. Adapted from transformers v4.51.2"""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.qwen3_moe.modeling_qwen3_moe import (
-    _CONFIG_FOR_DOC,
-    QWEN3_MOE_INPUTS_DOCSTRING,
-    KwargsForCausalLM,
-    MoeCausalLMOutputWithPast,
-    MoeModelOutputWithPast,
-    load_balancing_loss_func,
-)
-from transformers.processing_utils import Unpack
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-from transformers.utils.generic import can_return_tuple
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@can_return_tuple
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[list[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    output_router_logits: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs: Unpack[KwargsForCausalLM],
-) -> MoeCausalLMOutputWithPast:
-    r"""
-    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-    logits_to_keep (`int` or `torch.Tensor`, *optional*):
-        If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-        `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-        token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-        If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-        This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, Qwen3MoeForCausalLM
-
-    >>> model = Qwen3MoeForCausalLM.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")
-    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-MoE-15B-A2B")
-
-    >>> prompt = "Hey, are you conscious? Can you talk to me?"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-    ```"""
-
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_router_logits = (
-        output_router_logits
-        if output_router_logits is not None
-        else self.config.output_router_logits
-    )
-
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs: MoeModelOutputWithPast = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        output_router_logits=output_router_logits,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs.last_hidden_state
-
-    if hidden_states is None:
-        raise ValueError("hidden_states is None")
-
-    loss = None
-    logits = None
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **kwargs,
-        )
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-
-        if labels is not None:
-            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
-
-    aux_loss = None
-    if output_router_logits:
-        aux_loss = load_balancing_loss_func(
-            outputs.router_logits,
-            self.num_experts,
-            self.num_experts_per_tok,
-            attention_mask,
-        )
-        if labels is not None:
-            loss += self.router_aux_loss_coef * aux_loss.to(  # type: ignore
-                loss.device  # type: ignore
-            )  # make sure to reside in the same device
-
-    return MoeCausalLMOutputWithPast(
-        loss=loss,
-        aux_loss=aux_loss,  # type: ignore
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-        router_logits=outputs.router_logits,
-    )
-
-
-def patch_qwen3_moe(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-
-    from transformers.models.qwen3_moe import modeling_qwen3_moe
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_qwen3_moe.Qwen3MoeForCausalLM
-        ), f"Expected a Qwen3MoeForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(forward, maybe_model)
-
-        return maybe_model
-
-    modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = forward
-    return None
@@ -35,9 +35,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
         sequence_len,
         roles_to_train=None,
         train_on_eos=None,
-        train_on_eot=None,
-        eot_tokens=None,
-        split_thinking: bool | None = False,
         logprobs_field="logprobs",
         gen_temperature=1.0,
         kd_temperature=1.0,
@@ -53,9 +50,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
             sequence_len,
             roles_to_train=roles_to_train,
            train_on_eos=train_on_eos,
-            train_on_eot=train_on_eot,
-            eot_tokens=eot_tokens,
-            split_thinking=split_thinking,
        )

    @property
@@ -23,8 +23,8 @@ import logging
 import sys

 from axolotl.integrations.base import BasePlugin
-from axolotl.utils.distributed import is_main_process

+from ...utils.distributed import zero_only
 from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401
 from .utils import patch_with_compile_disable

@@ -85,7 +85,7 @@ class LigerPlugin(BasePlugin):
                kwargs["geglu"] = cfg.liger_glu_activation
            elif "swiglu" in liger_fn_sig.parameters:
                kwargs["swiglu"] = cfg.liger_glu_activation
-        if is_main_process(use_environ=True):
+        with zero_only():
            LOG.info(
                f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}"
            )
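`zero_only` here is axolotl's rank-zero context manager; switching to it keeps the log line from being emitted once per distributed worker. A small sketch of the pattern (assuming `zero_only` runs its body only on the main process, as its use above implies):

```python
import logging

from axolotl.utils.distributed import zero_only

LOG = logging.getLogger(__name__)


def log_kernel_config(model_config_type: str, kwargs: dict) -> None:
    # Body executes only on rank 0, so a multi-GPU run logs the message
    # once instead of once per worker.
    with zero_only():
        LOG.info(f"Applying LIGER to {model_config_type} with kwargs: {kwargs}")
```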
@@ -151,30 +151,6 @@ class LigerPlugin(BasePlugin):
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
-        elif cfg.model_config_type == "qwen3":
-            from axolotl.integrations.liger.models.qwen3 import (
-                apply_liger_kernel_to_qwen3,
-            )
-
-            apply_liger_kernel_to_qwen3(
-                cross_entropy=cfg.liger_cross_entropy,
-                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
-                glu_activation=cfg.liger_glu_activation,
-                rms_norm=cfg.liger_rms_norm,
-                layer_norm=cfg.liger_layer_norm,
-            )
-        elif cfg.model_config_type == "qwen3_moe":
-            from axolotl.integrations.liger.models.qwen3_moe import (
-                apply_liger_kernel_to_qwen3_moe,
-            )
-
-            apply_liger_kernel_to_qwen3_moe(
-                cross_entropy=cfg.liger_cross_entropy,
-                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
-                glu_activation=cfg.liger_glu_activation,
-                rms_norm=cfg.liger_rms_norm,
-                layer_norm=cfg.liger_layer_norm,
-            )
        else:
            logging.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
@@ -1,160 +0,0 @@
-"""
-Liger FLCE for Qwen3. Based on transformers v4.51.3.
-"""
-
-import sys
-from typing import Optional, Tuple, Union
-
-import torch
-from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
-
-def lce_forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Cache] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-    Args:
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-    """
-
-    # pylint: disable=duplicate-code
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-
-    logits = None
-    loss = None
-    # if in training mode, don't materialize logits
-    if self.training and (labels is not None):
-        loss = LigerForCausalLMLoss(
-            hidden_states=hidden_states,
-            lm_head_weight=self.lm_head.weight,
-            labels=labels,
-            hidden_size=self.config.hidden_size,
-            **kwargs,
-        )
-
-    else:  # if in inference mode materialize logits
-        slice_indices = (
-            slice(-logits_to_keep, None)
-            if isinstance(logits_to_keep, int)
-            else logits_to_keep
-        )
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def apply_liger_kernel_to_qwen3(
-    cross_entropy: bool = False,
-    fused_linear_cross_entropy: bool = False,
-    rms_norm: bool = False,
-    glu_activation: bool = False,
-    layer_norm: bool = False,
-    **kwargs,  # pylint: disable=unused-argument
-) -> None:
-    # pylint: disable=duplicate-code
-    """
-    Apply Liger kernels to replace original implementation in HuggingFace Qwen3 models.
-
-    Args:
-        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
-        fused_linear_cross_entropy (bool):
-            Whether to apply Liger's fused linear cross entropy loss. Default is False.
-            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
-            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory efficient.
-        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
-        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
-        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
-    """
-
-    import transformers.models.qwen3.modeling_qwen3  # noqa: F401 # pylint: disable=unused-import
-    from liger_kernel.transformers.functional import liger_cross_entropy
-    from liger_kernel.transformers.layer_norm import LigerLayerNorm
-    from liger_kernel.transformers.rms_norm import LigerRMSNorm
-    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
-
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
-
-    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
-
-    if rms_norm:
-        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
-
-    if glu_activation:
-        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
-
-    if layer_norm:
-        modeling_qwen3.nn.LayerNorm = LigerLayerNorm
-
-    if cross_entropy:
-        from transformers.loss.loss_utils import nn
-
-        nn.functional.cross_entropy = liger_cross_entropy
-
-    if fused_linear_cross_entropy:
-        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
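For reference, a sketch of how an `apply_liger_kernel_to_*` function like the one above is meant to be called: before the model is instantiated, so the class-level swaps take effect on the modules being built. The flag combination is illustrative:

```python
# Illustrative usage sketch; apply the kernel swaps before from_pretrained so
# newly-constructed Qwen3 modules pick up the Liger replacements.
from transformers import AutoModelForCausalLM

apply_liger_kernel_to_qwen3(
    fused_linear_cross_entropy=True,  # Qwen3ForCausalLM.forward -> lce_forward
    rms_norm=True,                    # Qwen3RMSNorm -> LigerRMSNorm
    glu_activation=True,              # Qwen3MLP -> LigerSwiGLUMLP
)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-8B")
```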
@@ -1,191 +0,0 @@
-"""
-Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
-"""
-
-import sys
-from copy import deepcopy
-from typing import List, Optional, Union
-
-import torch
-from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
-from transformers.modeling_outputs import MoeCausalLMOutputWithPast
-from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func
-
-
-def lce_forward(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    output_router_logits: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs,
-) -> MoeCausalLMOutputWithPast:
-    r"""
-    Args:
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for
-            that token can save memory, which becomes pretty significant for long sequences or large vocabulary sizes.
-            If a `torch.Tensor`, must be 1D, corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-    """
-
-    # pylint: disable=duplicate-code
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_router_logits = (
-        output_router_logits
-        if output_router_logits is not None
-        else self.config.output_router_logits
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        output_router_logits=output_router_logits,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-
-    logits = None
-    loss = None
-    # if in training mode, don't materialize logits
-    if self.training and (labels is not None):
-        loss = LigerForCausalLMLoss(
-            hidden_states=hidden_states,
-            lm_head_weight=self.lm_head.weight,
-            labels=labels,
-            hidden_size=self.config.hidden_size,
-            **kwargs,
-        )
-    else:  # if in inference mode, materialize logits
-        slice_indices = (
-            slice(-logits_to_keep, None)
-            if isinstance(logits_to_keep, int)
-            else logits_to_keep
-        )
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    aux_loss = None
-    if output_router_logits:
-        aux_loss = load_balancing_loss_func(
-            outputs.router_logits,
-            self.num_experts,
-            self.num_experts_per_tok,
-            attention_mask,
-        )
-        if labels is not None:
-            loss += self.router_aux_loss_coef * aux_loss.to(
-                loss.device
-            )  # make sure to reside on the same device
-
-    return MoeCausalLMOutputWithPast(
-        loss=loss,
-        aux_loss=aux_loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def apply_liger_kernel_to_qwen3_moe(
-    cross_entropy: bool = False,
-    fused_linear_cross_entropy: bool = False,
-    rms_norm: bool = False,
-    glu_activation: bool = False,
-    layer_norm: bool = False,
-    **kwargs,  # pylint: disable=unused-argument
-) -> None:
-    # pylint: disable=duplicate-code
"""
|
|
||||||
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
|
|
||||||
fused_linear_cross_entropy (bool):
|
|
||||||
Whether to apply Liger's fused linear cross entropy loss. Default is False.
|
|
||||||
`cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
|
|
||||||
If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
|
|
||||||
rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
|
|
||||||
glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
|
|
||||||
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
|
|
||||||
"""
|
|
||||||
-
-    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401 # pylint: disable=unused-import
-    from liger_kernel.transformers.functional import liger_cross_entropy
-    from liger_kernel.transformers.layer_norm import LigerLayerNorm
-    from liger_kernel.transformers.rms_norm import LigerRMSNorm
-    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
-
-    assert not (
-        cross_entropy and fused_linear_cross_entropy
-    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
-
-    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
-
-    if rms_norm:
-        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
-
-    if glu_activation:
-
-        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
-            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
-            # clone config to avoid modifying the original
-            config = deepcopy(config)
-            if intermediate_size:
-                setattr(config, "intermediate_size", intermediate_size)
-            return LigerSwiGLUMLP(config, **kwargs)
-
-        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
-
-    if layer_norm:
-        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm
-
-    if cross_entropy:
-        from transformers.loss.loss_utils import nn
-
-        nn.functional.cross_entropy = liger_cross_entropy
-
-    if fused_linear_cross_entropy:
-        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward
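What `lce_forward` buys is visible in the training branch above: when `self.training` and labels are present, the `(batch, seq_len, vocab)` logits tensor is never materialized and `LigerForCausalLMLoss` consumes hidden states directly. A back-of-envelope sketch of the saving (numbers are illustrative assumptions, not measurements from this PR):

```python
# Back-of-envelope memory for materialized logits in bf16 (2 bytes/elem).
batch, seq_len, vocab = 1, 32_768, 151_936  # Qwen3-style vocab size, long context

logits_bytes = batch * seq_len * vocab * 2
print(f"{logits_bytes / 2**30:.1f} GiB")  # ~9.3 GiB for the logits tensor alone

# A fused linear + cross entropy kernel computes the loss in chunks over the
# sequence, so peak memory scales with the chunk size rather than with seq_len.
```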
@@ -12,8 +12,10 @@ import torch
 import torch.distributed as dist
 from accelerate.logging import get_logger
+from axolotl.logging_config import configure_logging
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids

+configure_logging()
 LOG = get_logger(__name__)

@@ -23,42 +23,22 @@ from axolotl.utils.dict import DictDefault

 LOG = get_logger(__name__)

-QKV_PATCHES = [
-    (
-        """
+ORIGINAL_QKV_CODE = """
     query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
     key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
     value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
 """.lstrip(
     "\n"
-        ),
-        """
+)
+
+PATCHED_QKV_CODE = """
     query_states, key_states, value_states = self.apply_qkv(hidden_states)
     query_states = query_states.view(hidden_shape).transpose(1, 2)
     key_states = key_states.view(hidden_shape).transpose(1, 2)
     value_states = value_states.view(hidden_shape).transpose(1, 2)
 """.lstrip(
     "\n"
-        ),
-    ),
-    (
-        """
-    query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
-    key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
-    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-    """.lstrip(
-        "\n"
-    ),
-    """
-    query_states, key_states, value_states = self.apply_qkv(hidden_states)
-    query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
-    key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2)
-    value_states = value_states.view(hidden_shape).transpose(1, 2)
-    """.lstrip(
-        "\n"
-    ),
-    ),
-]

 ORIGINAL_O_CODE = """
     attn_output = self.o_proj(attn_output)
@@ -148,11 +128,10 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
     try:
         # Dynamically import the module and attention class
         module_path = f"transformers.models.{model_type}.modeling_{model_type}"
-        model_cls_prefix = "".join(
-            [part.capitalize() for part in model_type.split("_")]
-        )
-        module = __import__(module_path, fromlist=[f"{model_cls_prefix}Attention"])
-        attention_cls = getattr(module, f"{model_cls_prefix}Attention")
+        module = __import__(
+            module_path, fromlist=[f"{model_type.capitalize()}Attention"]
+        )
+        attention_cls = getattr(module, f"{model_type.capitalize()}Attention")

         return attention_cls
     except (ImportError, AttributeError) as e:
@@ -189,18 +168,10 @@ def patch_self_attn_lora(cfg: DictDefault):
     attention_cls._original_forward = self_attn_forward
     self_attn_forward, _ = detab_code(self_attn_forward)

-    assert any(
-        qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES
-    ), "Original QKV code not found"
+    assert ORIGINAL_QKV_CODE in self_attn_forward, "Original QKV code not found"
     assert ORIGINAL_O_CODE in self_attn_forward, "Original O code not found"

-    for qkv_orig, qkv_patched in QKV_PATCHES:
-        if qkv_orig in self_attn_forward:
-            self_attn_forward = self_attn_forward.replace(
-                qkv_orig,
-                qkv_patched,
-            )
-            break
+    self_attn_forward = self_attn_forward.replace(ORIGINAL_QKV_CODE, PATCHED_QKV_CODE)
     self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
     self_attn_forward = self_attn_forward.replace(
         "def forward(",
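The hunk above is part of axolotl's source-level LoRA attention patching: it asserts that a known QKV code block exists in the attention `forward`, textually replaces it, and re-evaluates the rewritten source. A minimal sketch of that general technique (the helper name and renaming scheme are hypothetical, not axolotl's actual API):

```python
# Minimal sketch of source-level monkeypatching: fetch a method's source,
# textually replace a known block, and re-exec it in the class's module.
import inspect
import textwrap


def patch_method_source(cls, method_name: str, old: str, new: str):
    """Rewrite a method by textually editing its source and re-exec'ing it."""
    source = textwrap.dedent(inspect.getsource(getattr(cls, method_name)))
    assert old in source, "expected code block not found; did upstream change?"
    patched = source.replace(old, new).replace(
        f"def {method_name}(", f"def {method_name}_patched("
    )
    namespace: dict = {}
    # Use the defining module's globals so the new function resolves the
    # same names the original did.
    exec(patched, vars(inspect.getmodule(cls)), namespace)  # nosec B102
    setattr(cls, method_name, namespace[f"{method_name}_patched"])
```

The assert-before-replace step matters: if the upstream library rewrites the targeted block, the patch fails loudly instead of silently leaving the original code in place.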
@@ -1,42 +0,0 @@
-"""
-monkeypatch for Trainer _get_learning_rate method
-"""
-
-import logging
-
-import torch
-
-LOG = logging.getLogger(__name__)
-
-
-# TODO remove this patch once https://github.com/huggingface/transformers/pull/37881 is included in a release
-def _get_learning_rate(self):
-    if self.is_deepspeed_enabled:
-        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
-        # not run for the first few dozen steps while loss scale is too large, and thus during
-        # that time `get_last_lr` will fail if called during that warm up stage, so work around it:
-        try:
-            last_lr = self.lr_scheduler.get_last_lr()[0]
-        except AssertionError as e:
-            if "need to call step" in str(e):
-                LOG.warning(
-                    "tried to get lr value before scheduler/optimizer started stepping, returning lr=0"
-                )
-                last_lr = 0
-            else:
-                raise
-    else:
-        if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
-            last_lr = self.optimizer.param_groups[0]["lr"]
-        else:
-            last_lr = self.lr_scheduler.get_last_lr()[0]
-
-    if torch.is_tensor(last_lr):
-        last_lr = last_lr.item()
-    return last_lr
-
-
-def patch_trainer_get_lr():
-    from transformers.trainer import Trainer
-
-    Trainer._get_learning_rate = _get_learning_rate  # pylint: disable=protected-access
@@ -4,7 +4,7 @@ HF Chat Templates prompt strategy

 import logging
 from collections import defaultdict
-from typing import Any, Dict, List, Set, Union
+from typing import Any, Dict, List, Optional, Set, Union

 from pydantic import BaseModel
 from transformers import ProcessorMixin
@@ -29,12 +29,11 @@ class ChatTemplatePrompter(Prompter):
         chat_template: str,
         processor=None,
         max_length=2048,
-        message_property_mappings: Dict[str, str] | None = None,
-        message_field_training: str | None = None,
-        message_field_training_detail: str | None = None,
+        message_property_mappings: Optional[Dict[str, str]] = None,
+        message_field_training: Optional[str] = None,
+        message_field_training_detail: Optional[str] = None,
         field_messages: str = "messages",
-        field_system: str = "system",
-        roles: Dict[str, List[str]] | None = None,
+        roles: Optional[Dict[str, List[str]]] = None,
         drop_system_message: bool = False,
     ):
         # check if message_property_mappings is None or empty dict
@@ -42,7 +41,6 @@ class ChatTemplatePrompter(Prompter):
             message_property_mappings = {
                 "role": "role",
                 "content": "content",
-                "reasoning_content": "reasoning_content",
             }

         if roles:
@@ -64,9 +62,8 @@ class ChatTemplatePrompter(Prompter):
         self.message_field_training = message_field_training
         self.message_field_training_detail = message_field_training_detail
         self.field_messages = field_messages
-        self.field_system = field_system
         self.tokenizer = tokenizer
-        self.processor: ProcessorMixin | None = processor
+        self.processor: Optional[ProcessorMixin] = processor
         self.chat_template = chat_template
         self.max_length = max_length
         self.drop_system_message = drop_system_message
@@ -223,13 +220,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
         self,
         prompter: "ChatTemplatePrompter",
         tokenizer,
-        train_on_inputs: bool,
-        sequence_len: int,
-        roles_to_train: list[str] | None = None,
-        train_on_eos: str | None = None,
-        train_on_eot: str | None = None,
-        eot_tokens: list[str] | None = None,
-        split_thinking: bool | None = False,
+        train_on_inputs,
+        sequence_len,
+        roles_to_train=None,
+        train_on_eos=None,
     ):
         super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
         self.prompter: ChatTemplatePrompter = prompter
@@ -242,88 +236,12 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
         ]

         self.train_on_eos = train_on_eos
-        # Backward compatibility, load from train_on_eos
-        self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
-
-        # Default to eos_token if eot_tokens not provided
-        self.eot_tokens = (
-            eot_tokens if eot_tokens is not None else [self.tokenizer.eos_token]
-        )
-        self.split_thinking = split_thinking
-
         self.images = "images"

         LOG.debug(
             f"The chat template uses the following properties on the message: {self.prompter.chat_template_msg_variables}"
         )

-        self._validate_eot_and_eos_tokens()
-
-    def _validate_eot_and_eos_tokens(self):
-        """
-        - Validates that EOT tokens (or eos_token) are in the chat_template
-        - Checks if EOT tokens are encoded as multiple tokens in the tokenizer.
-        - Checks for potential conflicts between train_on_eos and train_on_eot.
-        """
-        if self.prompter.chat_template is None:
-            # Usually this should not happen
-            LOG.warning(
-                "No chat template provided, skipping EOT and EOS token validation"
-            )
-            return
-
-        # If the EOT token is the same as the EOS token, we need to check differently
-        if len(self.eot_tokens) == 1 and self.eot_tokens[0] == self.tokenizer.eos_token:
-            # Check if the eos_token is in the chat_template or as a variable `eos_token`
-            # Note: we check for `eos_token` in the string, but it could possibly not be a variable
-            if (
-                self.tokenizer.eos_token not in self.prompter.chat_template
-                and "eos_token" not in self.prompter.chat_template
-            ):
-                LOG.warning(
-                    f"EOS token '{self.tokenizer.eos_token}' not found in chat_template. Please check if your template/EOS token is correct."
-                )
-            return
-
-        # Create a new list to store tokens that should be kept
-        valid_eot_tokens = []
-        for token in self.eot_tokens:
-            # Check if EOT token is in the chat_template
-            if token not in self.prompter.chat_template:
-                LOG.warning(f"EOT token '{token}' not found in chat_template.")
-                # Don't add to the valid tokens list
-                continue
-
-            valid_eot_tokens.append(token)
-
-        # Replace the original list with the filtered one
-        self.eot_tokens = valid_eot_tokens
-
-        for token in self.eot_tokens:
-            # If the token is in the template, check that it is in the tokenizer and not encoded as multiple tokens
-            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
-            if not token_ids:
-                raise ValueError(
-                    "EOT token encoding failed. Please check if the token is valid and can be encoded."
-                )
-            if token_ids and len(token_ids) > 1:
-                raise ValueError(
-                    f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config "
-                    "or (recommended) override unused added_tokens via `added_tokens_overrides: `."
-                )
-
-        # If eos_token is in eot_tokens and train_on_eos conflicts with train_on_eot, raise an error
-        if (
-            self.tokenizer.eos_token in self.eot_tokens
-            and self.train_on_eos != self.train_on_eot
-        ):
-            raise ValueError(
-                "Conflict between train_on_eos and train_on_eot: eos_token is in eot_tokens and train_on_eos != train_on_eot. "
-                f"train_on_eos: {self.train_on_eos}, train_on_eot: {self.train_on_eot}, "
-                f"eot_tokens: {self.eot_tokens}, "
-                f"eos_token: {self.tokenizer.eos_token}"
-            )
-
     @property
     def supports_batched(self) -> bool:
         # Let calling code know we can handle lists of examples
@@ -367,7 +285,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
         if (
             not self.roles_to_train
             and not self.train_on_eos
-            and not self.train_on_eot
             and not self.prompter.message_field_training  # type: ignore
             and not self.prompter.message_field_training_detail  # type: ignore
         ):
@@ -403,7 +320,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
         labels = [IGNORE_TOKEN_ID] * len(input_ids)

         last_eos_idx = -1
-        last_eot_idx = -1
        for index, turn in enumerate(turns):
             role = turn.get("role")
             content = turn.get("content")
@@ -452,46 +368,25 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):

             LOG.debug(f"Labels after processing turn {index}: {labels}")

-            # Handle special tokens (EOT and EOS)
-            for token_type, find_func, train_option in [
-                ("EOT", self.find_first_eot_token, self.train_on_eot),
-                ("EOS", self.find_first_eos_token, self.train_on_eos),
-            ]:
-                token_idx = find_func(input_ids, start_idx=turn_end_idx)
-
-                if (
-                    token_idx != -1 and abs(token_idx - turn_end_idx) <= 3
-                ):  # Allow for some template padding
-                    # Update the last token index
-                    if token_type == "EOT":  # nosec B105
-                        last_eot_idx = token_idx
-                    else:
-                        last_eos_idx = token_idx
-
-                    # Set labels if needed for this turn
-                    if train_option == "all" or (
-                        train_option == "turn" and should_train
-                    ):
-                        labels[token_idx] = input_ids[token_idx]
-                        LOG.debug(
-                            f"{token_type} token set for training at index {token_idx}"
-                        )
-                else:
-                    LOG.debug(
-                        f"{token_type} token missing after turn {turn}. {token_type.lower()}_idx: {token_idx}, turn_end_idx: {turn_end_idx}"
-                    )
-
-        # Handle 'last' option for special tokens
-        for token_type, last_idx, train_option in [
-            ("EOT", last_eot_idx, self.train_on_eot),
-            ("EOS", last_eos_idx, self.train_on_eos),
-        ]:
-            if train_option == "last" and last_idx != -1:
-                labels[last_idx] = input_ids[last_idx]
-                LOG.debug(
-                    f"Last {token_type} token set for training at index {last_idx}"
-                )
+            # Handle EOS token
+            eos_idx = self.find_first_eos_token(input_ids, start_idx=turn_end_idx)
+            if abs(eos_idx - turn_end_idx) <= 3:  # Allow for some template padding
+                last_eos_idx = eos_idx
+                if self.train_on_eos == "all" or (
+                    self.train_on_eos == "turn" and should_train
+                ):
+                    labels[eos_idx] = input_ids[eos_idx]
+                    LOG.debug(f"EOS token set for training at index {eos_idx}")
+            else:
+                LOG.debug(
+                    f"EOS token missing after turn {turn}. eos_idx: {eos_idx}, turn_end_idx: {turn_end_idx}"
+                )

+        # Handle 'last' option for train_on_eos
+        if self.train_on_eos == "last" and last_eos_idx != -1:
+            labels[last_eos_idx] = input_ids[last_eos_idx]
+            LOG.debug(f"Last EOS token set for training at index {last_eos_idx}")

         LOG.debug(f"Final labels: {labels}")

         return {
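The masking scheme above initializes every label to the ignore index and then copies input ids back only for spans that should contribute to the loss. A toy illustration of the pattern (token ids are made up):

```python
# Toy illustration of the label-masking pattern above; token ids are
# illustrative, and axolotl's IGNORE_TOKEN_ID is -100.
IGNORE_TOKEN_ID = -100

input_ids = [1, 15, 27, 9, 2, 16, 31, 2]   # 2 = eos; turns end at idx 4 and 7
labels = [IGNORE_TOKEN_ID] * len(input_ids)

# Suppose the assistant turn spans indices 5..6 and its EOS sits at index 7.
for idx in range(5, 7):
    labels[idx] = input_ids[idx]           # train on the assistant content
labels[7] = input_ids[7]                   # train_on_eos == "turn"

print(labels)  # [-100, -100, -100, -100, -100, 16, 31, 2]
```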
@@ -507,25 +402,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                 return i
         return -1

-    def find_first_eot_token(self, input_ids, start_idx):
-        """Find the first EOT token in the input_ids starting from start_idx."""
-        # Get token IDs for all EOT tokens
-        eot_token_ids = []
-        for token in self.eot_tokens:
-            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
-            if len(token_ids) != 1:
-                raise ValueError(
-                    f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config."
-                )
-
-            eot_token_ids.append(token_ids[0])
-
-        # Search for any of the EOT token IDs
-        for i in range(start_idx, len(input_ids)):
-            if input_ids[i] in eot_token_ids:
-                return i
-        return -1
-
     def find_turn(self, turns: list[dict], turn_idx: int):
         """
         Locate the starting and ending indices of the specified turn in a conversation.
@@ -612,17 +488,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):

     def get_conversation_thread(self, prompt):
         turns = []
-
-        possible_sys_turn = self.transform_message(
-            prompt[self.prompter.field_messages][0]
-        )
-        if (
-            possible_sys_turn["role"] != "system"
-            and self.prompter.field_system in prompt
-        ):
-            turn = {"role": "system", "content": prompt[self.prompter.field_system]}
-            turns.append(turn)
-
         for message in prompt[self.prompter.field_messages]:
             transformed_message = self.transform_message(message)

@@ -658,52 +523,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
             transformed_message["role"], transformed_message["role"]
         )

-        # TODO handle reasoning_content with split_thinking
-        # if the role is assistant, we want to use reasoning_content
-        if self.split_thinking and transformed_message["role"] == "assistant":
-            content = transformed_message["content"]
-            thinking_pairs = [
-                ("<think>", "</think>"),
-                ("<reasoning>", "</reasoning>"),
-                ("<|begin_of_thought|>", "<|end_of_thought|>"),
-            ]
-            content_pairs = [("<|begin_of_solution|>", "<|end_of_solution|>")]
-            for tpair in thinking_pairs:
-                # check if the thinking pair is in the content
-                if tpair[0] in content and tpair[1] in content:
-                    # find the start and end index of the thinking pair
-                    t_start_idx = content.find(tpair[0])
-                    t_end_idx = content.find(tpair[1])
-
-                    # get the thinking content
-                    thinking_content = content[t_start_idx + len(tpair[0]) : t_end_idx]
-                    transformed_message["reasoning_content"] = thinking_content.strip()
-
-                    # take the remainder of the content and strip whitespace
-                    # from its beginning (thinking tokens)
-                    remainder = content[t_end_idx + len(tpair[1]) :].lstrip()
-
-                    # check if the content pair is in the remainder
-                    cpair_found = False
-                    for cpair in content_pairs:
-                        if cpair[0] in remainder and cpair[1] in remainder:
-                            # find the start and end index of the content pair
-                            c_start_idx = remainder.find(cpair[0])
-                            c_end_idx = remainder.find(cpair[1])
-
-                            # get the solution content
-                            content_content = remainder[
-                                c_start_idx + len(cpair[0]) : c_end_idx
-                            ]
-                            transformed_message["content"] = content_content.strip()
-                            cpair_found = True
-                            break
-
-                    # else, the content is the remainder
-                    if not cpair_found:
-                        transformed_message["content"] = remainder
-                    break
-
         # Determine which keys in the original message were not mapped
         mapped_values = set(self.prompter.message_property_mappings.values())
         remaining_keys = set(message) - mapped_values
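The removed `split_thinking` block is tag-delimited extraction: find a thinking pair, move its contents into `reasoning_content`, and keep the remainder as the visible answer. A compact sketch of the same technique for a single tag pair:

```python
# Compact sketch of the split_thinking extraction removed above: pull the
# reasoning out of <think>...</think> and keep the remainder as the answer.
def split_thinking(content: str) -> tuple[str | None, str]:
    open_tag, close_tag = "<think>", "</think>"
    if open_tag in content and close_tag in content:
        start = content.find(open_tag) + len(open_tag)
        end = content.find(close_tag)
        reasoning = content[start:end].strip()
        answer = content[end + len(close_tag):].lstrip()
        return reasoning, answer
    return None, content


reasoning, answer = split_thinking("<think>2+2=4</think>The answer is 4.")
assert reasoning == "2+2=4" and answer == "The answer is 4."
```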
@@ -736,16 +555,13 @@ class StrategyLoader:
             "sequence_len": cfg.sequence_len,
             "roles_to_train": ds_cfg.get("roles_to_train", ["assistant"]),
             "train_on_eos": ds_cfg.get("train_on_eos", "turn"),
-            "train_on_eot": ds_cfg.get("train_on_eot", None),
-            "eot_tokens": cfg.get("eot_tokens", None),  # loads from cfg, not ds_cfg
-            "split_thinking": ds_cfg.get("split_thinking", False),
         }

     def __call__(
         self,
         tokenizer,
         cfg,
-        ds_cfg: Union[Dict[str, Any], DatasetConfig] | None = None,
+        ds_cfg: Optional[Union[Dict[str, Any], DatasetConfig]] = None,
         processor=None,
     ):
         if ds_cfg is None:
@@ -6,7 +6,6 @@ import os
 import signal
 import sys
 import weakref
-from contextlib import nullcontext
 from pathlib import Path
 from typing import Any, Dict

@@ -26,10 +25,7 @@ from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
     fix_untrained_tokens,
 )
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
-from axolotl.core.trainers.mixins.sequence_parallel import (
-    SequenceParallelContextManager,
-)
-from axolotl.integrations.base import PluginManager
+from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.freeze import freeze_layers_except
@@ -41,6 +37,7 @@ try:
 except ImportError:
     BetterTransformer = None

+configure_logging()
 LOG = get_logger(__name__)

@@ -188,28 +185,16 @@ def execute_training(
         trainer: The configured trainer object.
         resume_from_checkpoint: Path to checkpoint to resume from, if applicable.
     """
-    # Define the context managers to use
-    flash_context = (
-        torch.backends.cuda.sdp_kernel(
-            enable_flash=True,
-            enable_math=True,
-            enable_mem_efficient=True,
-        )
-        if cfg.flash_optimum
-        else nullcontext()
-    )
-    sequence_parallel_context = (
-        SequenceParallelContextManager(
-            model=trainer.model,
-            sequence_parallel_degree=cfg.sequence_parallel_degree,
-            ring_attn_func=cfg.ring_attn_func,
-        )
-        if cfg.sequence_parallel_degree > 1
-        else nullcontext()
-    )
-
-    LOG.info("Starting trainer...")
-    with flash_context, sequence_parallel_context:
-        trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    LOG.info("Starting trainer...")
+    if cfg.flash_optimum:
+        with torch.backends.cuda.sdp_kernel(
+            # TODO configure these from the YAML w/ sdp_kernel_kwargs: ...
+            enable_flash=True,
+            enable_math=True,
+            enable_mem_efficient=True,
+        ):
+            trainer.train(resume_from_checkpoint=resume_from_checkpoint)
+    else:
         trainer.train(resume_from_checkpoint=resume_from_checkpoint)
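The removed version of `execute_training` composes optional context managers with `nullcontext()` so a single `with` statement and a single `trainer.train(...)` call site cover every configuration; the replacement duplicates the call instead. A minimal sketch of the composition pattern:

```python
# Minimal sketch of the nullcontext composition pattern used in the removed
# code: optional context managers collapse to no-ops instead of branching.
from contextlib import contextmanager, nullcontext


@contextmanager
def feature_context(name: str):
    print(f"enter {name}")
    yield
    print(f"exit {name}")


use_flash, use_sp = True, False
flash_ctx = feature_context("flash") if use_flash else nullcontext()
sp_ctx = feature_context("sp") if use_sp else nullcontext()

with flash_ctx, sp_ctx:
    print("train()")  # single call site regardless of configuration
```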
@@ -286,19 +271,7 @@ def save_trained_model(
             os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
         except FileNotFoundError:
             pass
-    elif cfg.local_rank == 0:
-        if cfg.flash_optimum and BetterTransformer:
-            model = BetterTransformer.reverse(model)
-
-        if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
-            trainer.model.save_pretrained(
-                cfg.output_dir, safe_serialization=safe_serialization
-            )
-
-        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
-
-    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
-        # TODO: add integration support so this can be implemented completely within the plugin
+    elif hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
         from axolotl.integrations.llm_compressor.utils import (
             save_compressed_model,
         )
@@ -311,6 +284,17 @@
             save_compressed=cfg.llmcompressor.save_compressed,
         )

+    elif cfg.local_rank == 0:
+        if cfg.flash_optimum and BetterTransformer:
+            model = BetterTransformer.reverse(model)
+
+        if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
+            trainer.model.save_pretrained(
+                cfg.output_dir, safe_serialization=safe_serialization
+            )
+
+        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
+

 def create_model_card(cfg: DictDefault, trainer: Trainer):
     """
@@ -547,7 +531,4 @@ def train(
     if not cfg.use_ray:
         cleanup_distributed()

-    plugin_manager = PluginManager.get_instance()
-    plugin_manager.post_train(cfg, model)
-
     return model, tokenizer, trainer
@@ -3,7 +3,6 @@
 from __future__ import annotations

 import gc
-import json
 import logging
 import os
 import traceback
@@ -809,44 +808,11 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
                     artifact.add_file(temp_file.name)
                     wandb.log_artifact(artifact)
                     wandb.save(temp_file.name)
                     LOG.info(
                         "The Axolotl config has been saved to the WandB run under files."
                     )
             except (FileNotFoundError, ConnectionError) as err:
                 LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
-
-            if args.deepspeed:
-                try:
-                    # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/ policy = 'now', so let the OS delete it later.
-                    with NamedTemporaryFile(
-                        mode="w",
-                        delete=False,
-                        suffix=".json",
-                        prefix="deepspeed_config_",
-                    ) as temp_file:
-                        skip_upload = False
-                        if isinstance(args.deepspeed, dict):
-                            json.dump(args.deepspeed, temp_file, indent=4)
-                        elif isinstance(args.deepspeed, str) and os.path.exists(
-                            args.deepspeed
-                        ):
-                            copyfile(args.deepspeed, temp_file.name)
-                        else:
-                            skip_upload = True
-                    if not skip_upload:
-                        artifact = wandb.Artifact(
-                            f"deepspeed-config-{wandb.run.id}",
-                            type="deepspeed-config",
-                        )
-                        artifact.add_file(temp_file.name)
-                        wandb.log_artifact(artifact)
-                        wandb.save(temp_file.name)
-                        LOG.info(
-                            "The DeepSpeed config has been saved to the WandB run under files."
-                        )
-                except (FileNotFoundError, ConnectionError) as err:
-                    LOG.warning(f"Error while saving DeepSpeed config to WandB: {err}")
-
             return control
File diff suppressed because one or more lines are too long
@@ -1,12 +1,20 @@
-"""Data collators for axolotl to pad labels and position_ids for packed sequences"""
+"""
+Data collators for axolotl to pad labels and position_ids for packed sequences. Also
+includes logic for handling sequence parallelism collation.
+"""

 from dataclasses import dataclass
 from typing import Any

 import numpy as np
+import torch
+import torch.distributed as dist
 from transformers import PreTrainedTokenizerBase
 from transformers.utils import PaddingStrategy

+from axolotl.monkeypatch.attention.ring_attn import update_ring_attn_params
+from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
+

 @dataclass
 class DataCollatorForSeq2Seq:
@@ -41,6 +49,8 @@ class DataCollatorForSeq2Seq:
             The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
         return_tensors (`str`):
             The type of Tensor to return. Allowable values are "np", "pt" and "tf".
+        sequence_parallel_degree (`int`):
+            The degree of sequence parallelism. Defaults to 1 for no sequence parallelism.
     """

     tokenizer: PreTrainedTokenizerBase
@@ -51,6 +61,17 @@ class DataCollatorForSeq2Seq:
     label_pad_token_id: int = -100
     position_pad_token_id: int = 0
     return_tensors: str = "pt"
+    sequence_parallel_degree: int = 1
+    ring_attn_func: RingAttnFunc | None = None
+
+    def __post_init__(self):
+        if self.sequence_parallel_degree > 1:
+            from axolotl.monkeypatch.attention.ring_attn import get_ring_attn_group
+
+            # Get information about our position in the SP group
+            sp_group = get_ring_attn_group()
+            self.local_rank = dist.get_rank(group=sp_group)
+            self.local_world_size = dist.get_world_size(group=sp_group)

     def __call__(self, features, return_tensors=None):
         has_attn_mask = "attention_mask" in features[0].keys()
@@ -120,8 +141,62 @@ class DataCollatorForSeq2Seq:
             )
             features["decoder_input_ids"] = decoder_input_ids

+        if self.sequence_parallel_degree > 1:
+            features = self.apply_sequence_parallelism(features)
+
         return features

+    def apply_sequence_parallelism(
+        self, batch: dict[str, torch.Tensor]
+    ) -> dict[str, torch.Tensor]:
+        """
+        Apply sequence parallelism slicing to a batch.
+
+        Args:
+            batch: Batch dictionary from parent collator.
+
+        Returns:
+            Sliced batch dictionary.
+        """
+        # Total sequence length before slicing
+        total_seq_len = batch["input_ids"].size(1)
+
+        # Update params for varlen ring attention calculation
+        if batch.get("position_ids") is not None:
+            update_ring_attn_params(position_ids=batch["position_ids"])
+
+        # Slice batch for sequence parallel processing
+        for key in batch:
+            if batch[key].size(1) == total_seq_len:
+                if self.ring_attn_func in [
+                    RingAttnFunc.VARLEN_LLAMA3,
+                    RingAttnFunc.BATCH_RING,
+                ]:
+                    batch[key] = (
+                        batch[key]
+                        .chunk(self.local_world_size, dim=1)[self.local_rank]
+                        .contiguous()
+                    )
+                elif self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
+                    chunks = batch[key].chunk(2 * self.local_world_size, dim=1)
+
+                    # Take rank's chunk and opposing chunk for zigzag pattern
+                    selected_chunks = [
+                        chunks[self.local_rank],
+                        chunks[2 * self.local_world_size - self.local_rank - 1],
+                    ]
+                    batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
+                elif self.ring_attn_func is RingAttnFunc.BATCH_STRIPE:
+                    # TODO(djsaunde): This doesn't seem to work as expected
+                    # Split into striped data and stack
+                    tensor = torch.stack(
+                        batch[key].split(self.local_world_size, dim=1),
+                        dim=1,
+                    ).transpose(1, 2)
+                    batch[key] = tensor[:, self.local_rank].contiguous()
+
+        return batch
+

 @dataclass
 class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
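In the `BATCH_ZIGZAG` branch above, rank `i` takes chunks `i` and `2 * world_size - 1 - i`, pairing an early slice with a late one so causal-attention work is balanced across the ring. A small runnable check of that indexing (sizes are illustrative):

```python
# Small runnable check of the BATCH_ZIGZAG slicing above; sizes illustrative.
import torch

world_size, seq_len = 4, 16
x = torch.arange(seq_len).unsqueeze(0)   # one sequence of token positions

chunks = x.chunk(2 * world_size, dim=1)  # 8 chunks of 2 tokens each
for rank in range(world_size):
    shard = torch.cat([chunks[rank], chunks[2 * world_size - rank - 1]], dim=1)
    print(rank, shard.tolist())
# rank 0 gets positions [0, 1, 14, 15]; rank 3 gets [6, 7, 8, 9]. Early and
# late chunks are paired so causal-attention cost is balanced across ranks.
```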
@@ -67,7 +67,7 @@ def resolve_dtype(cfg):
         else:
             LOG.debug("bf16 support not detected, disabling for this configuration.")
             cfg.bf16 = False
-            if cfg.fp16 is None and not cfg.float16:
+            if cfg.fp16 is None:
                 cfg.fp16 = True

     if cfg.device == "mps":
@@ -126,6 +126,9 @@ def normalize_config(cfg):
         with open(ds_config_path, encoding="utf-8") as f:
             cfg.deepspeed = json.load(f)

+    if cfg.sequence_parallel_degree is None:
+        cfg.sequence_parallel_degree = 1
+
     if cfg.saves_per_epoch:
         save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
         if save_steps < 1.0:  # prevent saves on every step
@@ -204,37 +204,7 @@ def load_prepare_preference_datasets(cfg):
     else:
         eval_dataset = load_split(cfg.test_datasets, cfg)
     if not eval_dataset:
-        if cfg.val_set_size:
-            # ensure we end up with the same fingerprint by doing rank0 first and being able to cache
-            to_hash_train = (
-                train_dataset._fingerprint  # pylint: disable=protected-access
-                + "|"
-                + str(cfg.val_set_size)
-                + "|"
-                + "train"
-                + "|"
-                + str(cfg.seed or 42)
-            )
-            to_hash_test = (
-                train_dataset._fingerprint  # pylint: disable=protected-access
-                + "|"
-                + str(cfg.val_set_size)
-                + "|"
-                + "test"
-                + "|"
-                + str(cfg.seed or 42)
-            )
-            train_fingerprint = md5(to_hash_train)
-            test_fingerprint = md5(to_hash_test)
-            ds_w_test_split = train_dataset.train_test_split(
-                test_size=cfg.val_set_size,
-                seed=cfg.seed,
-                shuffle=False,
-                train_new_fingerprint=train_fingerprint,
-                test_new_fingerprint=test_fingerprint,
-            )
-            eval_dataset = ds_w_test_split["test"]
-            train_dataset = ds_w_test_split["train"]
+        eval_dataset = None

     if not train_is_preprocessed:
         _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
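The block removed above pins explicit `datasets` fingerprints so that every rank derives an identical train/test split from the shared cache. A sketch of the fingerprint construction (axolotl uses its own `md5` helper; plain `hashlib` stands in here):

```python
# Sketch of the deterministic-fingerprint idea in the removed block; axolotl
# uses its own md5 helper, plain hashlib is used here for illustration.
import hashlib


def split_fingerprint(base_fingerprint: str, val_set_size, seed, which: str) -> str:
    to_hash = f"{base_fingerprint}|{val_set_size}|{which}|{seed or 42}"
    return hashlib.md5(to_hash.encode("utf-8")).hexdigest()  # nosec B324


# Passing these as train_new_fingerprint/test_new_fingerprint to
# Dataset.train_test_split makes every rank hit the same cache entry.
print(split_fingerprint("abc123", 0.05, None, "train"))
```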
@@ -134,9 +134,10 @@ def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None):
                 "csv", data_files=f.name, split="train", streaming=True
             )
         else:
-            iter_ds = load_dataset(
-                path, streaming=True, split=split, name=name, data_files=data_files
-            )
+            if is_local_main_process():
+                iter_ds = load_dataset(
+                    path, streaming=True, split=split, name=name, data_files=data_files
+                )

     if skip:
         LOG.info(f"Skipping {skip} samples from the dataset")
@@ -69,27 +69,17 @@ def barrier():
         dist.barrier()


-def is_main_process(use_environ=False):
+def is_main_process():
     """
     Check if the current process is the main process. If not in distributed mode,
     always return `True`.
-
-    Args:
-    - use_environ (bool, optional): Use environment variable to determine main process.
-
-    Returns:
-    - bool: `True` if the current process is the main process, `False` otherwise.
     """
-    if use_environ:
-        return os.environ.get("LOCAL_RANK", "0") == "0"
     if not is_distributed():
         return True
     return dist.get_rank() == 0


-def is_local_main_process(use_environ=False):
-    if use_environ:
-        return os.environ.get("LOCAL_RANK", "0") == "0"
+def is_local_main_process():
     return PartialState().is_local_main_process
@@ -109,6 +99,17 @@ def cleanup_distributed():
         torch.distributed.destroy_process_group()


+@contextmanager
+def zero_only():
+    """
+    Context manager that only runs the enclosed block on the main rank.
+    """
+    if is_main_process():
+        yield
+    else:
+        yield None
+
+
 @contextmanager
 def zero_first(is_main):
     """
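One subtlety with `zero_only` as added above: a generator-based context manager always executes the `with` body between `yield` and resumption, and a bare `yield` also yields `None`, so the `if`/`else` does not by itself skip work on non-main ranks. A hedged sketch of a variant that lets callers gate explicitly on the yielded flag (the stand-in rank check is an assumption, not axolotl's helper):

```python
# Sketch: expose the rank check via the yielded value so callers can skip
# work explicitly; the `with` body itself always runs on every rank.
from contextlib import contextmanager

import torch.distributed as dist


def is_main_process() -> bool:
    # Stand-in for the helper above: rank 0, or not distributed at all.
    return not dist.is_initialized() or dist.get_rank() == 0


@contextmanager
def zero_only_flag():
    yield is_main_process()


with zero_only_flag() as is_main:
    if is_main:
        print("only rank 0 does this")
```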
@@ -1,7 +1,5 @@
 """custom checkpointing utils"""

-from functools import partial
-
 from axolotl.utils.gradient_checkpointing.unsloth import (
     Unsloth_Offloaded_Gradient_Checkpointer,
 )
@@ -11,10 +9,6 @@ def hf_grad_checkpoint_offload_wrapper(
     decoder_layer, *args, use_reentrant=None
 ):  # pylint: disable=unused-argument
     return Unsloth_Offloaded_Gradient_Checkpointer.apply(
-        (
-            decoder_layer.func.__self__
-            if isinstance(decoder_layer, partial)
-            else decoder_layer.__self__
-        ),
+        decoder_layer.__self__,
         *args,
     )
@@ -53,7 +53,6 @@ from transformers.integrations.deepspeed import (
 )

 from axolotl.common.architectures import MOE_ARCH_BLOCK
-from axolotl.integrations.base import PluginManager
 from axolotl.models.mamba import fix_mamba_attn_for_loss
 from axolotl.monkeypatch.multipack import (
     SUPPORTED_MULTIPACK_MODEL_TYPES,
@@ -68,14 +67,13 @@ from axolotl.utils.distributed import (
     get_device_count,
     get_device_type,
     is_local_main_process,
-    is_main_process,
+    zero_only,
 )
 from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrapper
 from axolotl.utils.lora_embeddings import get_linear_embedding_layers
 from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant

 LOG = logging.getLogger(__name__)
-PLUGIN_MANAGER = PluginManager.get_instance()

 MULTIMODAL_AUTO_MODEL_MAPPING = {
     "mllama": MllamaForConditionalGeneration,
@@ -453,7 +451,7 @@ def load_tokenizer(cfg):
             {"additional_special_tokens": additional_special_tokens}
         )

-    if is_main_process(use_environ=True):
+    with zero_only():
         LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
         LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
         LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
@@ -589,8 +587,10 @@ class ModelLoader:
             patch_gemma3conditionalgeneration_forward()

         # load any patches from plugins
+        from axolotl.integrations.base import PluginManager

-        PLUGIN_MANAGER.pre_model_load(self.cfg)
+        plugin_manager = PluginManager.get_instance()
+        plugin_manager.pre_model_load(self.cfg)

         # monkey patch to allow additional Accelerator init kwargs
         if self.cfg.fp8:
@@ -1268,7 +1268,6 @@ class ModelLoader:

         try:
             skip_move_to_device = self.build_model(qlora_fsdp)
-            PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
         except Exception as err:  # pylint: disable=broad-exception-caught
             LOG.exception(err)
             raise err
@@ -1348,8 +1347,6 @@ class ModelLoader:
             before_kbit_train_or_finetune=False,
         )

-        PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
-
         # ---------------------------------------------------------
         # load lora or adapter
         # ---------------------------------------------------------
@@ -1411,7 +1408,7 @@ class ModelLoader:
         gc.collect()
         torch.cuda.empty_cache()

-        PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
+        # TODO resume_from_checkpoint handling
         return self.model, lora_config


@@ -1446,13 +1443,9 @@ def load_adapter(model, cfg, adapter, inference=False):
     if hasattr(model, "enable_input_require_grads"):
         model.enable_input_require_grads()
     if adapter in ["lora", "qlora"]:
-        model, lora_config = load_lora(model, cfg, inference=inference)
-        PLUGIN_MANAGER.post_lora_load(cfg, model)
-        return model, lora_config
+        return load_lora(model, cfg, inference=inference)
     if adapter == "llama-adapter":
-        model, lora_config = load_llama_adapter(model, cfg)
-        PLUGIN_MANAGER.post_lora_load(cfg, model)
-        return model, lora_config
+        return load_llama_adapter(model, cfg)

     raise NotImplementedError(f"{adapter} peft adapter not available")
@@ -309,7 +309,6 @@ class AxolotlInputConfig(
|
|||||||
| Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")]
|
| Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")]
|
||||||
) | None = None
|
) | None = None
|
||||||
chat_template_jinja: str | None = None
|
chat_template_jinja: str | None = None
|
||||||
eot_tokens: list[str] | None = None
|
|
||||||
default_system_message: str | None = None
|
default_system_message: str | None = None
|
||||||
|
|
||||||
fix_untrained_tokens: int | list[int] | None = None
|
fix_untrained_tokens: int | list[int] | None = None
|
||||||
@@ -512,17 +511,10 @@ class AxolotlInputConfig(
     @model_validator(mode="before")
     @classmethod
     def hint_sample_packing_padding(cls, data):
-        if data.get("sample_packing"):
-            pad_to_sequence_len = data.get("pad_to_sequence_len")
-            if pad_to_sequence_len is False:
-                LOG.warning(
-                    "`pad_to_sequence_len: true` is recommended when using sample_packing"
-                )
-            elif pad_to_sequence_len is None:
-                LOG.info(
-                    "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
-                )
-                data["pad_to_sequence_len"] = True
+        if data.get("sample_packing") and not data.get("pad_to_sequence_len"):
+            LOG.warning(
+                "`pad_to_sequence_len: true` is recommended when using sample_packing"
+            )
        return data
 
     @model_validator(mode="before")
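Both sides of this hunk use a pydantic v2 `mode="before"` model validator, which receives the raw input mapping and may inspect or mutate it before field validation runs. A runnable sketch of the simplified right-hand behavior, with the field names taken from the hunk and `warnings.warn` standing in for the project's logger:

```python
import warnings

from pydantic import BaseModel, model_validator


class PackingConfig(BaseModel):
    sample_packing: bool = False
    pad_to_sequence_len: bool | None = None

    @model_validator(mode="before")
    @classmethod
    def hint_sample_packing_padding(cls, data):
        # `data` is the raw dict here, not a validated model instance
        if isinstance(data, dict):
            if data.get("sample_packing") and not data.get("pad_to_sequence_len"):
                warnings.warn(
                    "`pad_to_sequence_len: true` is recommended when using sample_packing"
                )
        return data


PackingConfig(sample_packing=True)  # emits the warning
```

The left-hand version also auto-set `pad_to_sequence_len: true` when it was unset; the right-hand version only warns and leaves the value alone.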
@@ -1157,29 +1149,22 @@ class AxolotlInputConfig(
 
         return data
 
-    @model_validator(mode="before")
+    @field_validator("sequence_parallel_degree", mode="after")
     @classmethod
-    def check_grpo_peft_liger(cls, data):
-        if (
-            data.get("rl") == "grpo"
-            and data.get("trl", {})
-            and data.get("trl").get("use_liger_loss")
-            and data.get("adapter")
-        ):
-            raise ValueError("PEFT + GRPO + Liger is not yet supported")
-        return data
+    def check_sequence_parallel_degree(cls, value, info):
+        if not value:
+            value = 1
 
-    @model_validator(mode="after")
-    def check_sequence_parallel_degree(self):
-        if not self.sequence_parallel_degree:
-            self.sequence_parallel_degree = 1
-        elif self.sequence_parallel_degree > 1:
-            if not self.flash_attention:
+        if value > 1:
+            if not info.data.get("flash_attention"):
                 raise ValueError(
                     "flash_attention: true must be set with sequence_parallel_degree > 1"
                 )
 
-            if self.sample_packing and self.micro_batch_size > 1:
+            if (
+                info.data.get("sample_packing")
+                and not info.data["micro_batch_size"] == 1
+            ):
                 raise ValueError(
                     "micro_batch_size must be set to 1 when sample_packing is enabled"
                     "due to a `ring-flash-attn` requirement"
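The right-hand side migrates `check_sequence_parallel_degree` from a `mode="after"` model validator to a per-field `field_validator`. One subtlety: in pydantic v2, `info.data` only contains fields validated *before* the target field, so `flash_attention`, `sample_packing`, and `micro_batch_size` must be declared ahead of `sequence_parallel_degree` for the lookups in this hunk to work. A runnable sketch under that assumption:

```python
from pydantic import BaseModel, ValidationInfo, field_validator


class SPConfig(BaseModel):
    # declared first, so they are available in info.data below
    flash_attention: bool = False
    sample_packing: bool = False
    micro_batch_size: int = 1
    sequence_parallel_degree: int | None = None

    @field_validator("sequence_parallel_degree", mode="after")
    @classmethod
    def check_sequence_parallel_degree(cls, value, info: ValidationInfo):
        if not value:
            value = 1
        if value > 1 and not info.data.get("flash_attention"):
            raise ValueError(
                "flash_attention: true must be set with sequence_parallel_degree > 1"
            )
        return value


assert SPConfig(sequence_parallel_degree=None).sequence_parallel_degree == 1
SPConfig(flash_attention=True, sequence_parallel_degree=2)  # validates cleanly
```

Note that pydantic does not run field validators on defaults unless `validate_default=True`, so the `None`-to-`1` coercion above only applies to explicitly supplied values.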
@@ -1199,40 +1184,42 @@ class AxolotlInputConfig(
             # according to the proportion of non-padding tokens per rank.
             LOG.warning(
                 "Sequence parallelism (SP) is enabled with "
-                f"sequence_parallel_degree={self.sequence_parallel_degree}. "
-                "Please note that logged losses may differ slightly to the non-SP "
-                "losses due to transformers Trainer implementation details. "
-                "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
+                f"sequence_parallel_degree={value}. Please note that logged losses may "
+                "differ slightly to the non-SP losses due to transformers Trainer "
+                "implementation details. Please see "
+                "https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
                 "for more details."
             )
 
-        return self
+        return value
 
-    @model_validator(mode="after")
-    def validate_ring_attn_func(self):
-        if getattr(self, "sequence_parallel_degree", 1) == 1:
-            return self
+    @field_validator("ring_attn_func", mode="after")
+    @classmethod
+    def check_ring_attn_func(cls, value, info):
+        if not info.data.get("sequence_parallel_degree", 1) > 1:
+            return value
 
         from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
 
-        if self.ring_attn_func is not None:
+        if value is not None:
+            # Set the ring attention function if passed in config
             valid_funcs = list(RingAttnFunc)
-            if self.ring_attn_func in valid_funcs:
-                self.ring_attn_func = RingAttnFunc(self.ring_attn_func)
+            if value in valid_funcs:
+                value = RingAttnFunc(value)
             else:
                 raise ValueError(
-                    f"ring_attn_func: {self.ring_attn_func} must be in {valid_funcs}"
+                    f"ring_attn_func: {value} must be one of {valid_funcs}"
                 )
         else:
             # Default ring attention function selection
-            sample_packing = getattr(self, "sample_packing", False)
-            self.ring_attn_func = (
+            sample_packing = info.data.get("sample_packing")
+            value = (
                 RingAttnFunc.VARLEN_LLAMA3
                 if sample_packing
                 else RingAttnFunc.BATCH_RING
             )
 
-        return self
+        return value
 
     @model_validator(mode="before")
     @classmethod
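The `ring_attn_func` validator follows the same `field_validator` migration and coerces a raw string into the `RingAttnFunc` enum. The membership test `value in valid_funcs` only matches plain strings if `RingAttnFunc` is a `str`-backed enum, which the sketch below assumes; only the member names appear in the hunk, so the member values here are illustrative:

```python
from enum import Enum


class RingAttnFunc(str, Enum):
    VARLEN_LLAMA3 = "varlen_llama3"  # assumed value
    BATCH_RING = "batch_ring"        # assumed value


def coerce_ring_attn_func(value, sample_packing: bool):
    valid_funcs = list(RingAttnFunc)
    if value is not None:
        if value in valid_funcs:  # str-enum members compare equal to their values
            return RingAttnFunc(value)
        raise ValueError(f"ring_attn_func: {value} must be one of {valid_funcs}")
    # default selection mirrors the hunk
    return RingAttnFunc.VARLEN_LLAMA3 if sample_packing else RingAttnFunc.BATCH_RING


assert coerce_ring_attn_func("varlen_llama3", False) is RingAttnFunc.VARLEN_LLAMA3
assert coerce_ring_attn_func(None, True) is RingAttnFunc.VARLEN_LLAMA3
```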
@@ -1334,57 +1321,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
             )
         return data
 
-    @model_validator(mode="before")
-    @classmethod
-    def check_auto_enable_lora_kernels(cls, data):
-        # Only proceed if using LoRA or QLoRA adapter
-        if data.get("rl"):
-            # RL trainers not tested so don't enable kernels by default
-            return data
-        if data.get("adapter") in ["lora", "qlora"]:
-            # Skip if already set, using unsloth optimizations, or using 8-bit
-            unsloth_fields = ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"]
-            kernel_fields = ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
-            if (
-                any(data.get(k) is not None for k in kernel_fields)
-                or any(data.get(k) for k in unsloth_fields)
-                or data.get("adapter") == "lora"
-                and data.get("load_in_8bit")
-            ):
-                return data
-
-            # Check multi-GPU compatibility
-            capabilities = data.get("capabilities")
-            is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
-            is_fsdp = data.get("fsdp") is not None
-            is_fsdp2 = (
-                data.get("fsdp_config") is not None
-                and str(data.get("fsdp_config").get("fsdp_version")) == "2"
-            )
-
-            if (
-                not is_multi_gpu
-                or (is_multi_gpu and not is_fsdp)
-                or (is_multi_gpu and is_fsdp2)
-            ):
-                # Auto-enable kernels if not explicitly set by user
-                if data.get("lora_mlp_kernel") is None:
-                    data["lora_mlp_kernel"] = True
-
-                if data.get("lora_qkv_kernel") is None:
-                    data["lora_qkv_kernel"] = True
-
-                if data.get("lora_o_kernel") is None:
-                    data["lora_o_kernel"] = True
-
-                LOG.warning(
-                    "Auto-enabling LoRA kernel optimizations for faster training. "
-                    + "Please explicitly set `lora_*_kernel` config values to `false` to disable. "
-                    + "See https://docs.axolotl.ai/docs/lora_optims.html for more info."
-                )
-
-        return data
-
     @model_validator(mode="before")
     @classmethod
     def check_adopt_torch_version(cls, data):
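The removed `check_auto_enable_lora_kernels` validator is mostly gating logic: LoRA kernels were switched on by default only on a single GPU, on multi-GPU without FSDP, or on multi-GPU with FSDP2. A compact restatement of that gate as a standalone function, with a signature invented here purely for illustration:

```python
def lora_kernels_eligible(n_gpu: int, is_fsdp: bool, fsdp_version: str | None) -> bool:
    # mirrors the condition in the removed validator
    is_multi_gpu = n_gpu > 1
    is_fsdp2 = is_fsdp and fsdp_version == "2"
    return not is_multi_gpu or not is_fsdp or is_fsdp2


assert lora_kernels_eligible(1, False, None)       # single GPU
assert lora_kernels_eligible(4, False, None)       # multi-GPU, no FSDP
assert lora_kernels_eligible(4, True, "2")         # multi-GPU + FSDP2
assert not lora_kernels_eligible(4, True, "1")     # multi-GPU + FSDP1 is excluded
```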
|
@@ -50,7 +50,6 @@ class SFTDataset(BaseModel):
     message_property_mappings: dict[str, str] | None = None
     message_field_training: str | None = None
     message_field_training_detail: str | None = None
-    split_thinking: bool | None = None
     logprobs_field: str | None = None
     temperature: float | None = None
     roles_to_train: list[str] | None = None
|
@@ -35,7 +35,6 @@ class ChatTemplate(str, Enum):
     jamba = "jamba"  # pylint: disable=invalid-name
     jinja = "jinja"  # pylint: disable=invalid-name
     qwen_25 = "qwen_25"  # pylint: disable=invalid-name
-    qwen3 = "qwen3"  # pylint: disable=invalid-name
     tokenizer_default = "tokenizer_default"  # pylint: disable=invalid-name
     exaone = "exaone"  # pylint: disable=invalid-name
     metharme = "metharme"  # pylint: disable=invalid-name
|
@@ -67,12 +67,6 @@ class TRLConfig(BaseModel):
         default=False,
         json_schema_extra={"description": "Whether to log completions"},
     )
-    num_completions_to_print: int | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Number of completions to print. If `log_completions` is `True`, this will be the number of completions logged."
-        },
-    )
     sync_ref_model: bool | None = Field(
         default=False,
         json_schema_extra={
@@ -139,25 +133,3 @@ class TRLConfig(BaseModel):
             "description": "Epsilon value for clipping in the GRPO algorithm."
         },
     )
-    epsilon_high: float | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Upper-bound epsilon value for clipping in the GRPO algorithm."
-        },
-    )
-    use_liger_loss: bool | None = Field(
-        default=None,
-        json_schema_extra={"description": "Whether to use Liger loss for GRPO."},
-    )
-    loss_type: str | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Specifies the loss formulation to use. Supported values are `grpo`, `bnpo`, and `dr_grpo`."
-        },
-    )
-    mask_truncated_completions: bool = Field(
-        default=False,
-        json_schema_extra={
-            "description": "When enabled, truncated completions are excluded from the loss calculation."
-        },
-    )
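The trimmed `TRLConfig` fields all follow the same pydantic pattern: `Field(default=..., json_schema_extra={"description": ...})`, where the extra dict is merged into the generated JSON schema. A minimal sketch showing how one such description surfaces:

```python
from pydantic import BaseModel, Field


class MiniTRLConfig(BaseModel):
    epsilon_high: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Upper-bound epsilon value for clipping in the GRPO algorithm."
        },
    )


schema = MiniTRLConfig.model_json_schema()
# the extra dict was merged into the property's schema
print(schema["properties"]["epsilon_high"]["description"])
```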
|
@@ -348,7 +348,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
             load_from_cache_file=not cfg.is_preprocess,
             desc="Add position_id column (PoSE)",
         )
-    elif cfg.sample_packing:
+    elif cfg.sample_packing or cfg.sequence_parallel_degree > 1:
         drop_long_kwargs = {}
         if filter_map_kwargs:
             drop_long_kwargs["desc"] = "Add position_id column (Sample Packing)"
@@ -358,7 +358,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
             **filter_map_kwargs,
             **drop_long_kwargs,
         )
-        if cfg.eval_sample_packing:
+        if cfg.eval_sample_packing or cfg.sequence_parallel_degree > 1:
             if eval_dataset:
                 eval_dataset = eval_dataset.map(
                     add_position_ids,
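Both hunks widen the packing branch so that `add_position_ids` also runs when only sequence parallelism is enabled. The helper itself is not shown in this diff; the following is a hypothetical stand-in illustrating why packed or sequence-parallel batches need an explicit `position_ids` column at all:

```python
def add_position_ids(sample):
    # hypothetical stand-in: position ids must restart at 0 per sequence so
    # that packed (or sequence-parallel-sharded) batches keep correct
    # positional offsets instead of inheriting them from batch position
    sample["position_ids"] = list(range(len(sample["input_ids"])))
    return sample


example = {"input_ids": [101, 2023, 2003, 102]}
assert add_position_ids(example)["position_ids"] == [0, 1, 2, 3]
```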
@@ -528,13 +528,6 @@ def setup_torch_compile_env(cfg):
 def setup_deepspeed_env(cfg, stage=None):
     from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig
 
-    from axolotl.utils.distributed import distributed_state
-
-    if distributed_state and distributed_state.initialized:
-        raise RuntimeError(
-            "Distributed State already initialized before Deepspeed setup"
-        )
-
     os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
     os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
     if stage:
@@ -597,8 +590,6 @@ def prepare_optim_env(cfg):
         os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
     elif cfg.fp16:
         os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
-    else:
-        os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
 
 
 def prepare_opinionated_env(cfg):
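Both environment helpers communicate with Hugging Face Accelerate purely through `ACCELERATE_*` environment variables. A condensed sketch of the right-hand behavior; note that after this change the helper no longer force-resets mixed precision to `"no"` when neither flag is set:

```python
import os


def set_mixed_precision_env(bf16: bool, fp16: bool) -> None:
    # right-hand side: only set the variable when a precision flag is active,
    # leaving any pre-existing value untouched otherwise
    if bf16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
    elif fp16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"


set_mixed_precision_env(bf16=True, fp16=False)
assert os.environ["ACCELERATE_MIXED_PRECISION"] == "bf16"
```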
|
@@ -79,9 +79,9 @@ def download_smollm2_135m_model():
 
 
 @pytest.fixture(scope="session", autouse=True)
-def download_smollm2_135m_gptq_model():
+def download_llama_68m_random_model():
     # download the model
-    snapshot_download_w_retry("lilmeaty/SmolLM2-135M-Instruct-GPTQ", repo_type="model")
+    snapshot_download_w_retry("JackFram/llama-68m", repo_type="model")
 
 
 @pytest.fixture(scope="session", autouse=True)
@@ -90,12 +90,6 @@ def download_qwen_2_5_half_billion_model():
     snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")
 
 
-@pytest.fixture(scope="session", autouse=True)
-def download_qwen3_half_billion_model():
-    # download the model
-    snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model")
-
-
 @pytest.fixture(scope="session", autouse=True)
 def download_tatsu_lab_alpaca_dataset():
     # download the dataset
|
@@ -1,184 +0,0 @@
-"""
-e2e tests to make sure all the hooks are fired on the plugin
-"""
-
-import os
-from pathlib import Path
-
-from axolotl.cli.args import TrainerCliArgs
-from axolotl.common.datasets import load_datasets
-from axolotl.integrations.base import BasePlugin
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
-from axolotl.utils.dict import DictDefault
-
-from ..utils import check_model_output_exists
-
-
-class LogHooksPlugin(BasePlugin):
-    """
-    fixture to capture in a log file each hook that was fired
-    """
-
-    base_dir = Path("/tmp/axolotl-log-hooks")
-
-    def __init__(self):
-        self.base_dir.mkdir(parents=True, exist_ok=True)
-        try:
-            os.remove(self.base_dir.joinpath("plugin_hooks.log"))
-        except FileNotFoundError:
-            pass
-
-    def pre_model_load(self, cfg):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("pre_model_load\n")
-
-    def post_model_build(self, cfg, model):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("post_model_build\n")
-
-    def pre_lora_load(self, cfg, model):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("pre_lora_load\n")
-
-    def post_lora_load(self, cfg, model):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("post_lora_load\n")
-
-    def post_model_load(self, cfg, model):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("post_model_load\n")
-
-    def create_optimizer(self, cfg, trainer):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("create_optimizer\n")
-
-    def get_trainer_cls(self, cfg):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("get_trainer_cls\n")
-
-    def create_lr_scheduler(
-        self, cfg, trainer, optimizer, num_training_steps
-    ):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("create_lr_scheduler\n")
-
-    def add_callbacks_pre_trainer(self, cfg, model):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("add_callbacks_pre_trainer\n")
-        return []
-
-    def add_callbacks_post_trainer(
-        self, cfg, trainer
-    ):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("add_callbacks_post_trainer\n")
-        return []
-
-    def post_train(self, cfg, model):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("post_train\n")
-
-    def post_train_unload(self, cfg):  # pylint: disable=unused-argument
-        with open(
-            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
-        ) as f:
-            f.write("post_train_unload\n")
-
-
-class TestPluginHooks:
-    """
-    e2e tests to make sure all the hooks are fired during the training
-    """
-
-    def test_plugin_hooks(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "plugins": [
-                    "tests.e2e.integrations.test_hooks.LogHooksPlugin",
-                ],
-                "tokenizer_type": "AutoTokenizer",
-                "sequence_len": 1024,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "flash_attention": True,
-                "bf16": "auto",
-            }
-        )
-
-        cfg = validate_config(cfg)
-        prepare_plugins(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(temp_dir, cfg)
-
-        with open(
-            "/tmp/axolotl-log-hooks" + "/plugin_hooks.log", "r", encoding="utf-8"
-        ) as f:
-            file_contents = f.readlines()
-            file_contents = "\n".join(file_contents)
-            assert "pre_model_load" in file_contents
-            assert "post_model_build" in file_contents
-            assert "pre_lora_load" in file_contents
-            assert "post_lora_load" in file_contents
-            assert "post_model_load" in file_contents
-            # assert "create_optimizer" in file_contents  # not implemented yet
-            assert "get_trainer_cls" in file_contents
-            assert "create_lr_scheduler" in file_contents
-            assert "add_callbacks_pre_trainer" in file_contents
-            assert "add_callbacks_post_trainer" in file_contents
-            assert "post_train" in file_contents
-            # assert "post_train_unload" in file_contents  # not called from test train call
-
-        try:
-            os.remove("/tmp/axolotl-log-hooks" + "/plugin_hooks.log")
-        except FileNotFoundError:
-            pass
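The deleted file above doubles as documentation of the plugin surface: every method on `LogHooksPlugin` is a hook that `BasePlugin` exposes and that training fires roughly in order (`pre_model_load`, `post_model_build`, `pre_lora_load`, `post_lora_load`, `post_model_load`, through `post_train`). A much smaller plugin in the same style, assuming only that `BasePlugin` tolerates overriding a subset of hooks:

```python
from axolotl.integrations.base import BasePlugin


class PrintHooksPlugin(BasePlugin):
    """Minimal sketch: trace two of the hooks the deleted test asserted on."""

    def pre_model_load(self, cfg):
        print("pre_model_load fired")

    def post_train(self, cfg, model):
        print("post_train fired")
```

As in the deleted test, such a plugin would be registered through the config's `plugins:` list by its dotted import path.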
@@ -12,11 +12,7 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 
-from tests.e2e.utils import (
-    check_model_output_exists,
-    require_llmcompressor,
-    require_torch_2_4_1,
-)
+from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1
 
 MODELS = [
     "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
@@ -35,13 +31,10 @@ class TestLLMCompressorIntegration:
     e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
     """
 
-    @require_llmcompressor
     @require_torch_2_4_1
     def test_llmcompressor_plugin(
         self, temp_dir, base_model: str, save_compressed: bool
     ):
-        from llmcompressor import active_session
-
         # core cfg
         cfg = DictDefault(
             {
@@ -91,18 +84,18 @@ class TestLLMCompressorIntegration:
         cli_args = TrainerCliArgs()
         dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
 
-        try:
-            train(cfg=cfg, dataset_meta=dataset_meta)
-            check_model_output_exists(temp_dir, cfg)
-            _check_llmcompressor_model_outputs(temp_dir, save_compressed)
-        finally:
-            active_session().reset()
+        train(cfg=cfg, dataset_meta=dataset_meta)
+        check_model_output_exists(temp_dir, cfg)
+        _check_llmcompressor_model_outputs(temp_dir, save_compressed)
 
 
 def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
-    if save_compressed:
-        assert (Path(temp_dir) / "recipe.yaml").exists()
+    # recipe.yaml should exist
+    assert (Path(temp_dir) / "recipe.yaml").exists()
 
+    # sparsity config exists if save_compressed
+    if save_compressed:
         from compressed_tensors import ModelCompressor
         from compressed_tensors.config import Sparse24BitMaskConfig
 
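The left-hand test wrapped training in `try/finally` and called `active_session().reset()` so that llmcompressor's global session state could not leak into the next parametrized run; the right-hand side drops both the `@require_llmcompressor` guard and that cleanup. The pattern itself, kept here as a sketch where `run_training` is a placeholder for the `train(...)` call:

```python
from llmcompressor import active_session


def run_with_clean_session(run_training):
    try:
        run_training()
    finally:
        # clear llmcompressor's global session state between runs
        active_session().reset()
```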
|
@@ -4,14 +4,11 @@ GRPO test suite
 
 import os
 import random
-import shutil
 import subprocess  # nosec B404
 import sys
-import tempfile
 import time
 from pathlib import Path
 
-import psutil
 import pytest
 import requests
 import yaml
@@ -24,8 +21,8 @@ from tests.e2e.utils import require_vllm
 
 
 def start_vllm(
-    model: str, env: dict, wait: int | None = None, quiet=False, **kwargs
-) -> subprocess.Popen:
+    model: str, env: dict | None = None, wait: int | None = None, quiet=False, **kwargs
+) -> int:
     """
     helper function to start the VLLM server in the background, mostly for testing purposes
     """
@@ -49,41 +46,10 @@ def start_vllm(
     # print out the command to be executed
     print(" ".join(cmd))
 
-    vllm_logging_json = Path(tempfile.mkdtemp()) / "vllm_logging.json"
-    with open(vllm_logging_json, "w", encoding="utf-8") as temp_file:
-        temp_file.write(
-            """{
-    "formatters": {
-        "json": {
-            "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
-        }
-    },
-    "handlers": {
-        "file": {
-            "class": "logging.FileHandler",
-            "formatter": "json",
-            "level": "DEBUG",
-            "filename": "/tmp/vllm.log",
-            "mode": "a"
-        }
-    },
-    "loggers": {
-        "vllm": {
-            "handlers": ["file"],
-            "level": "DEBUG",
-            "propagate": false
-        }
-    },
-    "version": 1
-}"""
-        )
-
-    cmd_env = env.copy()
-    cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json})
     # start `trl vllm-serve` command in the background and capture the process id
     process = subprocess.Popen(  # pylint: disable=consider-using-with
         cmd,
-        env=cmd_env,
+        env=env,
         stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
         stderr=subprocess.DEVNULL if quiet else subprocess.PIPE,
     )  # nosec B603
@@ -92,51 +58,32 @@ def start_vllm(
     print(f"VLLM server process started (PID: {process.pid})")
 
     # wait until the http server is ready, even if it 404s, but timeout after 60 seconds
-    period_seconds = 5
     started = False
     if wait and host and port:
-        for i in range(0, int(wait), period_seconds):
+        for _ in range(int(wait)):
             try:
                 response = requests.get(f"http://{host}:{port}", timeout=1)
-                print(f"{i}: VLLM server (status: {response.status_code})")
                 if int(response.status_code) in [200, 404]:
                     started = True
                     break
-            except requests.exceptions.RequestException as exc:
-                print(f"{i}: VLLM server failed to start: {str(exc)}")
+            except requests.exceptions.RequestException:
+                pass
 
             # also check if the process.pid is still running
             if not process.poll() is None:
                 break
 
-            time.sleep(period_seconds)
+            time.sleep(1)
 
     if wait and not started:
         print(
             f"VLLM server process did not start within {wait} seconds. Please check your server logs."
         )
-        recursive_kill(process)
-        with open("/tmp/vllm.log", "r", encoding="utf-8") as log_file:
-            print(log_file.read())
-        shutil.rmtree("/tmp/vllm.log")
+        process.kill()
         raise RuntimeError(f"VLLM server process did not start within {wait} seconds.")
 
-    # return the process
-    return process
+    # return the process id
+    return process.pid
 
 
-def recursive_kill(process: subprocess.Popen):
-    """
-    Recursively kill a process and its children
-    """
-    process = psutil.Process(process.pid)
-    for child in psutil.Process(process.pid).children(recursive=True):
-        child.terminate()
-        child.kill()
-        os.kill(child.pid, 9)
-    process.terminate()
-    process.kill()
-    os.kill(process.pid, 9)
-
-
 class TestGRPO:
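The removed `recursive_kill` kills children before the parent, but it terminates, kills, and then `os.kill`s each process back-to-back without waiting, which can raise `psutil.NoSuchProcess` once the first signal lands. A slightly more defensive sketch of the same idea using `psutil.wait_procs`:

```python
import subprocess

import psutil


def kill_process_tree(process: subprocess.Popen) -> None:
    parent = psutil.Process(process.pid)
    children = parent.children(recursive=True)
    for child in children:
        child.terminate()
    # give children a grace period, then escalate only for survivors
    _, alive = psutil.wait_procs(children, timeout=5)
    for child in alive:
        child.kill()
    parent.terminate()
    try:
        parent.wait(timeout=5)
    except psutil.TimeoutExpired:
        parent.kill()
```

The right-hand replacement, `os.kill(pid, 9)`, only signals the parent, so any vLLM worker subprocesses would rely on the parent's own cleanup to exit.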
@@ -227,17 +174,16 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
 
         current_env = os.environ.copy()
         env = {
-            "NCCL_P2P_LEVEL": "NVL",
+            "NCCL_P2P_LEVEL": "LOC",
             **current_env,
             "CUDA_VISIBLE_DEVICES": "1",
-            "VLLM_DISABLE_COMPILE_CACHE": "1",
-            # "VLLM_USE_V1": "0",
+            "VLLM_USE_V1": "0",
         }
-        vllm_process = start_vllm(
+        vllm_process_id = start_vllm(
             cfg.base_model,
             env=env,
             quiet=True,
-            wait=300,
+            wait=120,
             gpu_memory_utilization=0.15,
             max_model_len=cfg.vllm.max_model_len,
             enable_prefix_caching=cfg.vllm.enable_prefix_caching,
@@ -256,14 +202,10 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                     "--main-process-port",
                     f"{get_torch_dist_unique_port()}",
                 ],
-                env={
-                    "NCCL_P2P_LEVEL": "NVL",
-                    "NCCL_DEBUG": "INFO",
-                    **current_env,
-                },
+                env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
             )
         finally:
-            recursive_kill(vllm_process)
+            os.kill(vllm_process_id, 9)
 
     @pytest.mark.parametrize(
         "num_gpus",
@@ -320,17 +262,16 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
 
         current_env = os.environ.copy()
         env = {
-            "NCCL_P2P_LEVEL": "NVL",  # nccl can be brittle, assume P2P isn't reliable
+            "NCCL_P2P_LEVEL": "LOC",  # nccl can be brittle, assume P2P isn't reliable
             **current_env,
             "CUDA_VISIBLE_DEVICES": "1",
-            "VLLM_DISABLE_COMPILE_CACHE": "1",
-            # "VLLM_USE_V1": "0",
+            "VLLM_USE_V1": "0",
         }
-        vllm_process = start_vllm(
+        vllm_process_id = start_vllm(
             cfg.base_model,
             env=env,
             quiet=True,
-            wait=300,
+            wait=120,
             gpu_memory_utilization=0.15,
             max_model_len=cfg.vllm.max_model_len,
             enable_prefix_caching=cfg.vllm.enable_prefix_caching,
@@ -349,11 +290,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                     "--main-process-port",
                     f"{get_torch_dist_unique_port()}",
                 ],
-                env={
-                    "NCCL_P2P_LEVEL": "NVL",
-                    "NCCL_DEBUG": "INFO",
-                    **current_env,
-                },
+                env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
             )
         finally:
-            recursive_kill(vllm_process)
+            os.kill(vllm_process_id, 9)
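The recurring `NCCL_P2P_LEVEL` flip in these hunks is the substantive change: `"NVL"` lets NCCL use GPU peer-to-peer transfers over NVLink, while `"LOC"` disables P2P entirely, the safer choice on flaky multi-GPU test hardware (as the inline comment notes). Constructing such an environment for a subprocess looks like:

```python
import os

# "LOC": never use peer-to-peer; "NVL": allow P2P only across NVLink
test_env = {
    "NCCL_P2P_LEVEL": "LOC",
    **os.environ,          # placed after the default so an outer override wins
    "NCCL_DEBUG": "INFO",  # verbose NCCL logging, useful when debugging hangs
}
```

Note the ordering in the hunks: the P2P default comes before `**current_env`, so an externally exported value takes precedence, whereas `CUDA_VISIBLE_DEVICES` comes after it and is always forced.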
|
@@ -2,19 +2,14 @@
 
 # pylint: disable=redefined-outer-name
 
-from pathlib import Path
-
 import pytest
 import torch
-import yaml
 from accelerate.state import PartialState
 from peft import PeftModelForCausalLM, get_peft_config
 from transformers import AutoModelForCausalLM, LlamaForCausalLM
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaAttention
-from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeAttention
 
-from axolotl.cli.config import load_cfg
 from axolotl.kernels.lora import (
     apply_lora_mlp_geglu,
     apply_lora_mlp_swiglu,
@@ -71,36 +66,29 @@ def small_llama_model():
     return LlamaForCausalLM(LlamaConfig(**config))
 
 
-@pytest.mark.parametrize(
-    "model_name,attention_cls",
-    [
-        ("HuggingFaceTB/SmolLM2-135M", LlamaAttention),
-        ("Qwen/Qwen3-30B-A3B", Qwen3MoeAttention),
-    ],
-)
-def test_attention_patching_integration(model_name, attention_cls):
+def test_attention_patching_integration():
     """Test attention patching in integration context."""
-    cfg = {"base_model": model_name}
+    cfg = {"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
 
     # Store the original implementation
-    original_forward = getattr(attention_cls, "forward")
+    original_forward = getattr(LlamaAttention, "forward")
 
     # Apply patch
     patch_self_attn_lora(cfg)
 
     # Get the new forward method
-    patched_forward = attention_cls.forward
+    patched_forward = LlamaAttention.forward
 
     # Check the forward method was replaced
     assert original_forward is not patched_forward
     assert patched_forward.__name__ == "axolotl_attn_forward"
 
     # Check original implementation was stored
-    assert hasattr(attention_cls, "_original_forward")
+    assert hasattr(LlamaAttention, "_original_forward")
 
     # Clean up
-    setattr(attention_cls, "forward", original_forward)
-    delattr(attention_cls, "_original_forward")
+    setattr(LlamaAttention, "forward", original_forward)
+    delattr(LlamaAttention, "_original_forward")
 
 
 def test_swiglu_mlp_integration(small_llama_model):
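The rewritten test drops the Qwen3-MoE parametrization and pins the patch check to `LlamaAttention`. The store/patch/assert/restore dance it performs generalizes to any class attribute; here is a self-contained sketch of the same pattern without axolotl, where the patched name mirrors the `axolotl_attn_forward` assertion in the test:

```python
class FakeAttention:
    def forward(self, x):
        return x


original_forward = FakeAttention.forward


def axolotl_attn_forward(self, x):  # stand-in for the patched implementation
    return original_forward(self, x)


# patch, verify, and restore: the same lifecycle the test asserts on
FakeAttention.forward = axolotl_attn_forward
FakeAttention._original_forward = original_forward
assert FakeAttention.forward.__name__ == "axolotl_attn_forward"
FakeAttention.forward = original_forward
del FakeAttention._original_forward
```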
@@ -425,42 +413,3 @@ def test_kernel_training_integration():
     # Verify correct activation function
     layer = model.model.model.layers[0]
     assert layer.mlp.forward.__func__ is apply_lora_mlp_swiglu
-
-
-def test_kernel_training_integration_auto_enable(temp_dir):
-    """Test model loading with auto-enabled kernel patches."""
-    # Create minimal config without explicitly setting kernel options
-    cfg = DictDefault(
-        {
-            "base_model": "HuggingFaceTB/SmolLM2-135M",
-            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
-            "learning_rate": 0.000001,
-            "datasets": [
-                {
-                    "path": "mhenrichsen/alpaca_2k_test",
-                    "type": "alpaca",
-                }
-            ],
-            "micro_batch_size": 1,
-            "gradient_accumulation_steps": 1,
-            "adapter": "lora",
-            "lora_r": 8,
-            "lora_alpha": 16,
-            "lora_dropout": 0.0,
-            "lora_target_linear": True,
-            "sequence_len": 1024,
-        }
-    )
-
-    # Write cfg to yaml file
-    path = Path(temp_dir) / "config.yaml"
-    with open(path, "w", encoding="utf-8") as fout:
-        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
-
-    # Load config
-    cfg = load_cfg(str(path))
-
-    # Verify kernel options were auto-enabled in the config
-    assert cfg.lora_mlp_kernel is True
-    assert cfg.lora_qkv_kernel is True
-    assert cfg.lora_o_kernel is True
|
@@ -28,7 +28,7 @@ class Test4dMultipackLlama(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                 "flash_attention": False,
                 "sdp_attention": True,
                 "sample_packing": True,
@@ -41,9 +41,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                 "lora_target_linear": True,
                 "sequence_len": 1024,
                 "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
                 "datasets": [
                     {
                         "path": "mhenrichsen/alpaca_2k_test",
@@ -76,7 +73,7 @@ class Test4dMultipackLlama(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                 "flash_attention": False,
                 "sdp_attention": False,
                 "sample_packing": True,
@@ -89,9 +86,6 @@ class Test4dMultipackLlama(unittest.TestCase):
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
                 "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
                 "datasets": [
                     {
                         "path": "mhenrichsen/alpaca_2k_test",
|
@@ -1,77 +0,0 @@
-"""
-E2E tests for activation checkpointing
-"""
-
-import pytest
-import transformers
-from torch.utils.checkpoint import checkpoint
-
-from axolotl.cli.args import TrainerCliArgs
-from axolotl.common.datasets import load_datasets
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
-from axolotl.utils.dict import DictDefault
-
-from ..utils import check_model_output_exists
-
-
-@pytest.fixture()
-def fix_checkpoint_after_test():
-    yield
-    transformers.modeling_utils.checkpoint = checkpoint
-
-
-class TestActivationCheckpointing:
-    """
-    E2E tests for activation checkpointing
-    """
-
-    def test_activation_checkpointing_offload(
-        self,
-        temp_dir,
-        fix_checkpoint_after_test,  # pylint: disable=unused-argument,redefined-outer-name
-    ):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 1024,
-                "val_set_size": 0.0,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                    "eos_token": "<|im_end|>",
-                },
-                "datasets": [
-                    {
-                        "chat_template": "chatml",
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "split": "train[:10%]",
-                        "field_messages": "conversations",
-                        "message_field_role": "from",
-                        "message_field_content": "value",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 5,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "sample_packing": True,
-                "bf16": True,
-                "save_safetensors": True,
-                "gradient_checkpointing": "offload",
-            }
-        )
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(temp_dir, cfg)
@@ -32,7 +32,7 @@ class TestFusedLlama(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "base_model": "JackFram/llama-68m",
                 "flash_attention": True,
                 "pad_to_sequence_len": True,
                 "flash_attn_fuse_qkv": True,
@@ -41,7 +41,9 @@ class TestFusedLlama(unittest.TestCase):
                 "sequence_len": 1024,
                 "val_set_size": 0.02,
                 "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                 },
                 "datasets": [
                     {
|
@@ -31,8 +31,8 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "tokenizer_type": "AutoTokenizer",
+                "base_model": "JackFram/llama-68m",
+                "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 16384,
                 "sample_packing": False,
                 "flash_attention": True,
@@ -44,9 +44,7 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
                 "lora_dropout": 0.05,
                 "lora_target_linear": True,
                 "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
+                "special_tokens": {},
                 "datasets": [
                     {
                         "path": "Yukang/LongAlpaca-12k",
@@ -80,16 +78,14 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "tokenizer_type": "AutoTokenizer",
+                "base_model": "JackFram/llama-68m",
+                "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 16384,
                 "sample_packing": False,
                 "flash_attention": True,
                 "s2_attention": True,
                 "val_set_size": 0.02,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
+                "special_tokens": {},
                 "datasets": [
                     {
                         "path": "Yukang/LongAlpaca-12k",
|
@@ -31,8 +31,8 @@ class TestLoraLlama(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "tokenizer_type": "AutoTokenizer",
+                "base_model": "JackFram/llama-68m",
+                "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 1024,
                 "sample_packing": True,
                 "flash_attention": True,
@@ -44,7 +44,9 @@ class TestLoraLlama(unittest.TestCase):
                 "lora_target_linear": True,
                 "val_set_size": 0.2,
                 "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                 },
                 "datasets": [
                     {
@@ -82,9 +84,9 @@ class TestLoraLlama(unittest.TestCase):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
-                "base_model": "lilmeaty/SmolLM2-135M-Instruct-GPTQ",
+                "base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
                 "model_type": "AutoModelForCausalLM",
-                "tokenizer_type": "AutoTokenizer",
+                "tokenizer_type": "LlamaTokenizer",
                 "sequence_len": 1024,
                 "sample_packing": True,
                 "flash_attention": True,
@@ -98,7 +100,9 @@ class TestLoraLlama(unittest.TestCase):
                 "lora_target_linear": True,
                 "val_set_size": 0.02,
                 "special_tokens": {
-                    "pad_token": "<|endoftext|>",
+                    "unk_token": "<unk>",
+                    "bos_token": "<s>",
+                    "eos_token": "</s>",
                 },
                 "datasets": [
                     {
|
@@ -99,7 +99,6 @@ class TestMixtral(unittest.TestCase):
                 "bf16": "auto",
             }
         )
-        cfg = validate_config(cfg)
         normalize_config(cfg)
         cli_args = TrainerCliArgs()
         dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|||||||
Some files were not shown because too many files have changed in this diff.