Compare commits

..

7 Commits

Author SHA1 Message Date
Dan Saunders
cbcc795bb3 commenting out unused 2025-06-16 01:53:13 +00:00
Dan Saunders
e34b6f4dfe temp: trying another approach 2025-06-15 21:32:10 +00:00
Dan Saunders
f8f87321bd progress 2025-06-14 17:40:21 +00:00
Dan Saunders
7a88de4fa8 finish basic impl; change naming from SP -> CP to match torch 2025-06-13 09:51:06 -04:00
Dan Saunders
aced809989 progress (messy :O) 2025-06-12 18:54:41 +00:00
Dan Saunders
ae73123eae progress; move validation to pydantic model config 2025-06-07 06:58:59 +00:00
Dan Saunders
10d1e44943 SDPA context parallel 2025-06-06 00:34:12 +00:00
184 changed files with 4960 additions and 9795 deletions

View File

@@ -16,7 +16,6 @@ on:
jobs:
build-base:
if: github.repository_owner == 'axolotl-ai-cloud'
timeout-minutes: 480
# this job needs to be run on self-hosted GPU runners...
runs-on: ubuntu-latest-m
strategy:
@@ -48,14 +47,14 @@ jobs:
cuda_version: 12.6.3
cudnn_version: ""
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base"
- cuda: "128"
cuda_version: 12.6.3
cudnn_version: ""
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-base"
- cuda: "128"
@@ -107,7 +106,6 @@ jobs:
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
build-base-uv:
if: github.repository_owner == 'axolotl-ai-cloud'
timeout-minutes: 480
runs-on: ubuntu-latest-m
strategy:
fail-fast: false
@@ -124,7 +122,7 @@ jobs:
cuda_version: 12.8.1
cudnn_version: ""
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
dockerfile: "Dockerfile-uv-base"
steps:

View File

@@ -23,7 +23,7 @@ jobs:
- name: Install dependencies
run: |
python3 -m pip install jupyter quartodoc
python3 -m pip install -e .
python3 -m pip install -e . --no-deps
- name: Build autodoc
run: quartodoc build
- name: Publish to GitHub Pages (and render)

View File

@@ -29,12 +29,12 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:
@@ -97,12 +97,12 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
axolotl_extras:
runs-on: axolotl-gpu-runner
steps:

View File

@@ -8,7 +8,7 @@ on:
- 'setup.py'
- 'pyproject.toml'
- '.github/workflows/multi-gpu-e2e.yml'
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
- 'src/axolotl/core/trainers/mixins/context_parallel.py'
- 'src/axolotl/utils/distributed.py'
workflow_dispatch:
schedule:
@@ -43,7 +43,7 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
axolotl_extras:
num_gpus: 2
nightly_build: "true"

View File

@@ -8,9 +8,7 @@ on:
paths:
- '**/*.md' # any Markdown file
- '**/*.qmd' # any Quarto file
- '_quarto.yml'
- docs/scripts/generate_config_docs.py
- src/axolotl/utils/schemas/**.py
- '_quarto.yaml'
permissions:
checks: write
@@ -40,7 +38,7 @@ jobs:
- name: Install dependencies
run: |
python3 -m pip install jupyter quartodoc
python3 -m pip install -e .
python3 -m pip install -e . --no-deps
- name: Build autodoc
run: quartodoc build

View File

@@ -52,7 +52,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
timeout-minutes: 20
steps:
@@ -125,7 +125,7 @@ jobs:
fail-fast: false
matrix:
python_version: ["3.11"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
timeout-minutes: 20
steps:
@@ -188,7 +188,7 @@ jobs:
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
timeout-minutes: 90
needs: [pre-commit, pytest, pytest-sdist]
strategy:
@@ -238,7 +238,7 @@ jobs:
if: github.repository_owner == 'axolotl-ai-cloud'
# this job needs to be run on self-hosted GPU runners...
runs-on: [self-hosted, modal]
timeout-minutes: 120
timeout-minutes: 90
# Only run the remainder of the matrix if the first e2e check passed;
# this is to save on wasted compute costs for known failures that get caught in the first run
needs: [pre-commit, pytest, docker-e2e-tests-1st]
@@ -262,13 +262,13 @@ jobs:
- cuda: 126
cuda_version: 12.6.3
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
num_gpus: 1
axolotl_extras:
- cuda: 128
cuda_version: 12.8.1
python_version: "3.11"
pytorch: 2.7.1
pytorch: 2.7.0
num_gpus: 1
axolotl_extras:
steps:

View File

@@ -328,7 +328,7 @@ The following optimizers are supported:
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html).
For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).
### Errors:

View File

@@ -22,32 +22,28 @@
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
</p>
## 🎉 Latest Updates
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
## ✨ Overview
Axolotl is a tool designed to streamline post-training for various AI models.
Post-training refers to any modifications or additional training performed on
pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
techniques. With support for multiple model architectures and training configurations,
Axolotl makes it easy to get started with these techniques.
Axolotl is designed to work with YAML config files that contain everything you need to
preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
and much more.
Features:
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
- Train various Huggingface models such as llama, pythia, falcon, mpt
- Supports fullfinetune, lora, qlora, relora, and gptq
- Customize configurations using a simple yaml file or CLI overwrite
- Load different dataset formats, use custom formats, or bring your own tokenized datasets
- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
- Works with single GPU or multiple GPUs via FSDP or Deepspeed
- Easily run with Docker locally or on the cloud
- Log results and optionally checkpoints to wandb, mlflow or Comet
- And more!
## 🚀 Quick Start
@@ -85,12 +81,19 @@ axolotl train examples/llama-3/lora-1b.yml
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
## ✨ Key Features
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
- **Easy Configuration**: Simple YAML files to control your training setup
- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
- **Flexible Dataset Handling**: Use various formats and custom datasets
- **Cloud Ready**: Run on cloud platforms or local hardware
## 📚 Documentation
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
- [Configuration Guide](https://docs.axolotl.ai/docs/config-reference.html) - Full configuration options and examples
- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
@@ -109,6 +112,31 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
## Supported Models
| | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
| llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Mistral | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Mixtral-MoE | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| Mixtral8X22 | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| Pythia | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
| cerebras | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
| btlm | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
| mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ |
| falcon | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
| gpt-j | ✅ | ✅ | ✅ | ❌ | ❌ | ❓ | ❓ |
| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ |
| phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
| Gemma | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
| Jamba | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
✅: supported
❌: not supported
❓: untested
## ❤️ Sponsors
Thank you to our sponsors who help make Axolotl possible:

View File

@@ -1,6 +1,5 @@
project:
type: website
pre-render: docs/scripts/generate_config_docs.py
quartodoc:
dir: docs/api
@@ -76,7 +75,7 @@ quartodoc:
- title: Context Managers
desc: Context managers for altering trainer behaviors
contents:
- utils.ctx_managers.sequence_parallel
- utils.ctx_managers.context_parallel
- title: Prompt Strategies
desc: Prompt formatting strategies
contents:
@@ -236,7 +235,7 @@ website:
- docs/installation.qmd
- docs/inference.qmd
- docs/cli.qmd
- docs/config-reference.qmd
- docs/config.qmd
- text: "API Reference"
href: docs/api
@@ -275,7 +274,7 @@ website:
- docs/unsloth.qmd
- docs/torchao.qmd
- docs/custom_integrations.qmd
- docs/sequence_parallelism.qmd
- docs/context_parallelism.qmd
- section: "Troubleshooting"
contents:

View File

@@ -6,7 +6,7 @@ from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
@app.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=120 * 60, # 90 min
timeout=90 * 60, # 90 min
cpu=8.0,
memory=131072,
volumes=VOLUME_CONFIG,

View File

@@ -69,7 +69,7 @@ def run_cmd(cmd: str, run_folder: str):
@app.function(
image=cicd_image,
gpu=GPU_CONFIG,
timeout=120 * 60,
timeout=90 * 60,
cpu=16.0,
memory=131072 * N_GPUS,
volumes=VOLUME_CONFIG,

View File

@@ -1,31 +0,0 @@
{
"compile": {
"disable": false,
"backend": "inductor"
},
"zero_optimization": {
"stage": 2,
"offload_optimizer": {
"device": "cpu"
},
"contiguous_gradients": true,
"overlap_comm": true
},
"bf16": {
"enabled": "auto"
},
"fp16": {
"enabled": "auto",
"auto_cast": false,
"loss_scale": 0,
"initial_scale_power": 32,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"gradient_accumulation_steps": "auto",
"gradient_clipping": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"wall_clock_breakdown": false
}

View File

@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
# The base image ships with `pydantic==1.8.2` which is not working
pip3 install -U --no-cache-dir pydantic==1.10.10
RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
pip3 install flash-attn==2.7.4.post1; \
fi

View File

@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
WORKDIR /workspace
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

View File

@@ -29,12 +29,8 @@ RUN uv venv --no-project --relocatable axolotl-venv
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
RUN uv pip install packaging setuptools wheel psutil \
RUN uv pip install packaging setuptools wheel \
&& uv pip install torch==${PYTORCH_VERSION} \
&& uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
&& uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
&& uv pip install awscli pydantic
RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
fi

1
docs/.gitignore vendored
View File

@@ -2,4 +2,3 @@
_site/
/api/*.qmd
/api/*.html
config-reference.qmd

795
docs/config.qmd Normal file
View File

@@ -0,0 +1,795 @@
---
title: Config Reference
description: A complete list of all configuration options.
---
```yaml
# This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# This can also be a relative path to a model on disk
base_model: ./llama-7b-hf
# You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
base_model_ignore_patterns:
# If the base_model repo on hf hub doesn't include configuration .json files,
# You can set that here, or leave this empty to default to base_model
base_model_config: ./llama-7b-hf
# You can specify to choose a specific model revision from huggingface hub
revision_of_model:
# Optional tokenizer configuration path in case you want to use a different tokenizer
# than the one defined in the base model
tokenizer_config:
# If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
model_type: AutoModelForCausalLM
# Corresponding tokenizer for the model AutoTokenizer is a good choice
tokenizer_type: AutoTokenizer
# Trust remote code for untrusted source
trust_remote_code:
# use_fast option for tokenizer loading from_pretrained, default to True
tokenizer_use_fast:
# Whether to use the legacy tokenizer setting, defaults to True
tokenizer_legacy:
# Resize the model embeddings to multiples of 32 when new tokens are added
# This is reported to improve training speed on some models
resize_token_embeddings_to_32x:
# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
shrink_embeddings:
# Optional[bool] Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs
embeddings_skip_upcast:
# Whether to load the model with randomly initialized weights. Useful for
# pre-training a model from scratch or debugging purposes.
random_init_weights:
# (Internal use only)
# Used to identify what the model is based on
is_falcon_derived_model:
is_llama_derived_model:
is_qwen_derived_model:
# Please note that if you set this to true, `padding_side` will be set to "left" by default
is_mistral_derived_model:
# optional overrides to the base model configuration
overrides_of_model_config:
# RoPE Scaling https://github.com/huggingface/transformers/pull/24653
rope_scaling:
type: # linear | dynamic
factor: # float
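# e.g., setting type: linear with factor: 2.0 stretches positional embeddings to roughly
# double the original context length (illustrative values, not defaults)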
# optional overrides the base model loading from_pretrained
overrides_of_model_kwargs:
# use_cache: False
# optional overrides to the bnb 4bit quantization configuration
# https://huggingface.co/docs/transformers/main/main_classes/quantization#transformers.BitsAndBytesConfig
bnb_config_kwargs:
# These are default values
llm_int8_has_fp16_weight: false
bnb_4bit_quant_type: nf4
bnb_4bit_use_double_quant: true
# quantization aware training
qat:
activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
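# Illustrative example (values are assumptions, not defaults): int8 fake quantization for
# both weights and activations with per-group size 32 might look like:
# qat:
#   activation_dtype: int8
#   weight_dtype: int8
#   group_size: 32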
# post-training quantization
quantization:
weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8
activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"
group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.
# Whether you are training a 4-bit GPTQ quantized model
gptq: true
# This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
load_in_8bit: true
# Use bitsandbytes 4 bit
load_in_4bit:
# Use CUDA bf16
bf16: true # bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere
# Use CUDA fp16
fp16: true
# Use CUDA tf32
tf32: true # require >=ampere
# Note: if bf16 is set to 'auto', and fp16 is set to true, we will prefer the explicit fp16 setting
# No AMP (automatic mixed precision)
bfloat16: true # require >=ampere
float16: true
# Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
gpu_memory_limit: 20GiB
# Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
lora_on_cpu: true
# List[str]. Add plugins to extend the pipeline.
# See `src/axolotl/integrations` for the available plugins or doc below for more details.
# https://docs.axolotl.ai/docs/custom_integrations.html
plugins:
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
# A list of one or more datasets to finetune the model with
# See https://docs.axolotl.ai/docs/dataset_loading.html for guide on loading datasets
# See https://docs.axolotl.ai/docs/dataset-formats/ for guide on dataset formats
datasets:
# HuggingFace dataset repo | s3:// | gs:// | path to local file or directory
- path: vicgalle/alpaca-gpt4
# The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]
type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
data_files: # Optional[str] path to source data files
shards: # Optional[int] split dataset into N pieces (use with shards_idx)
shards_idx: # Optional[int] = 0 the index of sharded dataset to use
preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)
name: # Optional[str] name of dataset configuration to load
split: train # Optional[str] name of dataset split to load from
revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
trust_remote_code: # Optional[bool] Trust remote code for untrusted source
# Custom user instruction prompt
- path: repo
type:
# The below are defaults. only set what's needed if you use a different column name.
system_prompt: ""
system_format: "{system}"
field_system: system
field_instruction: instruction
field_input: input
field_output: output
# Customizable to be single line or multi-line
# Use {instruction}/{input} as key to be replaced
# 'format' can include {input}
format: |-
User: {instruction} {input}
Assistant:
# 'no_input_format' cannot include {input}
no_input_format: "{instruction} "
# For `completion` datasets only, uses the provided field instead of `text` column
field:
# Using chat template
- path: ...
# Set type to `chat_template` to use this strategy
type: chat_template
# Specify the name of the chat template to use
# The name of the chat template to use for training, following values are supported:
# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml.
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
chat_template: tokenizer_default
# Custom jinja chat template. Used only if `chat_template: jinja` or empty.
chat_template_jinja:
# Key containing the messages (default: "messages")
field_messages: messages
# Key containing the system message (default: "system")
# If the system message is not present in the dataset sample, it will be loaded from the field_system property.
field_system: system
# Mapping of properties from the input dataset to the chat template.
# (default: message_property_mappings={'role':'role', 'content':'content'})
# If a property exists in the template but not in this mapping, the system will attempt
# to load it directly from the message using the property name as the key.
# Example: In the mapping below, 'from' is loaded from input dataset and used as 'role',
# while 'value' is loaded and used as 'content' in the chat template.
message_property_mappings:
role: from
content: value
# ...
# Optional[Dict[str, List]]. Roles mapping in the messages.
# The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
# The default is:
roles:
user: ["human", "user"]
assistant: ["gpt", "assistant"]
system: ["system"]
tool: ["tool"]
# Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.
# This does not drop the default system message from chat_template if it exists. If you wish to,
# we recommend using a custom jinja template with the default system message removed or
# adding a system turn with empty content.
drop_system_message:
# Optional[bool]. (for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags
# See example at `docs/dataset-formats/conversation.qmd`
split_thinking:
# IMPORTANT: The following fields determine which parts of the conversation to train on.
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
# See examples at `docs/dataset-formats/conversation.qmd`
# Note: If the below 5 fields are empty, defaults to training only on the last message.
# Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
roles_to_train: ["assistant"] # default
# Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
# - all: train on all EOS tokens
# - turn (default): train on the EOS token at the end of each trainable turn
# - last: train on the last EOS token in the conversation
# TIP: Please make sure that your `tokenizer.eos_token` is the same as the EOS/EOT token in the template. Otherwise, set `eos_token` under `special_tokens`.
train_on_eos: turn
# Optional[str]. Which EOT (End-of-Turn) tokens to train on in the conversation. Possible values are:
# - all: train on all EOT tokens
# - turn: train on the EOT token at the end of each trainable turn
# - last: train on the last EOT token in the conversation
# If not specified, defaults to the value of train_on_eos for backward compatibility.
train_on_eot:
# The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
message_field_training: training
# The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
# The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
message_field_training_detail: train_detail
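# Putting the above together, a minimal chat_template dataset entry might look like the
# following (the dataset path and message field name are illustrative assumptions):
# - path: some_org/some_chat_dataset
#   type: chat_template
#   field_messages: conversations
#   roles_to_train: ["assistant"]
#   train_on_eos: turn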
# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true
# Deduplicates datasets and test_datasets with identical entries.
dataset_exact_deduplication: true
# A list of one or more datasets to eval the model with.
# You can use either test_datasets, or val_set_size, but not both.
test_datasets:
- path: /workspace/data/eval.jsonl
ds_type: json
# You need to specify a split. For "json" datasets the default split is called "train".
split: train
type: completion
data_files:
- /workspace/data/eval.jsonl
# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
rl:
rl_beta: # Optional[float]. The beta parameter for the RL training.
# dpo
dpo_use_weighting: # Optional[bool]. Whether to perform weighting.
rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.
# orpo
orpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.
# kto
kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.
kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.
# simpo
cpo_alpha: 1.0 # Weight of the BC regularizer
simpo_gamma: 0.5 # Target reward margin for the SimPO loss
# grpo
trl:
use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
vllm_server_host: # Optional[str]. Host of the vLLM server to connect to.
vllm_server_port: # Optional[int]. Port of the vLLM server to connect to.
vllm_server_timeout: # Optional[int]. Total timeout (in seconds) to wait for the vLLM server to respond.
vllm_guided_decoding_regex: # Optional[str]. Regex for vLLM guided decoding.
beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`; use one or the other.
max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.
reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.
num_generations: # Optional[int]. Number of generations to sample.
log_completions: # Optional[bool]. Whether to log completions.
num_completions_to_print: # Optional[int]. Number of completions to print when log_completions is True.
sync_ref_model: # Optional[bool]. Whether to sync the reference model.
ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
scale_rewards: # Optional[bool]. Whether to scale rewards by their standard deviation.
temperature: # Optional[float]. Sampling temperature for the GRPO policy.
top_p: # Optional[float]. Top-p sampling probability for the generation policy.
top_k: # Optional[int]. Top-k sampling for the generation policy.
min_p: # Optional[float]. Minimum probability for the generation policy.
repetition_penalty: # Optional[float]. Penalty for tokens that appear in prompt and generated text.
num_iterations: # Optional[int]. Number of iterations per batch (μ) for GRPO.
epsilon: # Optional[float]. Epsilon value for clipping in the GRPO algorithm.
epsilon_high: # Optional[float]. Upper-bound epsilon value for clipping in the GRPO algorithm.
use_liger_loss: # Optional[bool]. Whether to use Liger loss for GRPO.
loss_type: # Optional[str]. Loss formulation to use. Supported values: grpo, bnpo, dr_grpo.
mask_truncated_completions: # Optional[bool]. Whether to exclude truncated completions from loss calculation.
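# Illustrative example (values are assumptions, not recommendations): a minimal GRPO
# setup using the options above might look like:
# rl: grpo
# trl:
#   num_generations: 4
#   max_completion_length: 512
#   log_completions: true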
# reward modelling: `True` or `False`
reward_model:
# process reward modelling: `True` or `False`
process_reward_model:
# The name of the chat template to use for training, following values are supported:
# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
# The selected chat template will be saved to the tokenizer_config.json for easier inferencing
# Note: It is recommended to set train_on_inputs to true when using a chat template that is different from the model's default chat template.
chat_template: tokenizer_default
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
chat_template_jinja: null
# Optional[List[str]]. Custom EOT (End-of-Turn) tokens to mask/unmask during training.
# These tokens mark the boundaries between conversation turns.
# For example: ["/INST", "</s>", "[/SYSTEM_PROMPT]"]
# If not specified, defaults to just the model's eos_token.
# This is useful for templates that use multiple delimiter tokens.
eot_tokens:
# - "</s>"
# - "[/INST]"
# - "[/SYSTEM_PROMPT]"
# Changes the default system message
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
# Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared
# Push prepared dataset to hub
push_dataset_to_hub: # Optional[str] repo_org/repo_name
# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# if not set.
dataset_processes: # defaults to os.cpu_count() if not set
# Keep dataset in memory while preprocessing
# Only needed if cached dataset is taking too much storage
dataset_keep_in_memory:
# push checkpoints to hub
hub_model_id: # private repo path to push finetuned model
# how to push checkpoints to hub
# https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
hub_strategy:
# Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# Required to be true when used in combination with `push_dataset_to_hub`
hf_use_auth_token: # boolean
# How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
val_set_size: 0.04
# Num shards for whole dataset
dataset_shard_num:
# Index of shard to use for whole dataset
dataset_shard_idx:
# The maximum length of an input to train with, this should typically be less than 2048
# as most models have a token/context limit of 2048
sequence_len: 2048
# Pad inputs so each step uses constant sized buffers
# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
pad_to_sequence_len:
# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommended to set to 'true'
sample_packing:
# Set to 'false' if getting errors during eval with sample_packing on.
eval_sample_packing:
# You can set these packing optimizations AFTER starting a training at least once.
# The trainer will provide recommended values for these values.
sample_packing_eff_est:
total_num_tokens:
# Increasing the following values helps with packing, but usually only slightly (<1%).
# The number of samples packed at a time.
sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200
sample_pack_sequentially: # Optional[bool]. Whether to pack samples sequentially.
# whether to concatenate samples during pretraining
pretraining_sample_concatenation:
curriculum_sampling: # Optional[bool]. Whether to use sequential sampling for curriculum learning
# Use batch flattening for speedups when not using sample_packing
batch_flattening:
# Passed through to transformers when loading the model when launched without accelerate
# Use `sequential` when training w/ model parallelism to limit memory
device_map:
# Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model.
max_memory:
# If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
adapter: lora
# If you already have a lora model trained that you want to load, put that here.
# This means after training, if you want to test the model, you should set this to the value of `output_dir`.
# Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`.
lora_model_dir:
# LoRA hyperparameters
# For more details about the following options, see:
# https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
- q_proj
- v_proj
# - k_proj
# - o_proj
# - gate_proj
# - down_proj
# - up_proj
lora_target_linear: # If true, will target all linear modules
# List[int] | int. The layer indices to transform; otherwise, apply to all layers
# https://huggingface.co/docs/peft/v0.15.0/en/package_reference/lora#peft.LoraConfig.layers_to_transform
peft_layers_to_transform:
# Optional[bool]. Whether to use DoRA.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#weight-decomposed-low-rank-adaptation-dora
peft_use_dora:
# Optional[bool]. Whether to use RSLoRA.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#rank-stabilized-lora
peft_use_rslora:
# Optional[list[tuple[int, int]]]. List of layer indices to replicate.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#memory-efficient-layer-replication-with-lora
peft_layer_replication:
# bool | Literal["gaussian", "eva", "olora", "pissa", "pissa_niter_[number of iters]", "corda", "loftq"]
# How to initialize LoRA weights. Default to True which is MS original implementation.
# https://huggingface.co/docs/peft/v0.15.0/en/developer_guides/lora#initialization
peft_init_lora_weights:
# If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
# `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
# https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
lora_modules_to_save:
# - embed_tokens
# - lm_head
lora_fan_in_fan_out: false
# Apply custom LoRA autograd functions and activation function Triton kernels for
# speed and memory savings
# See: https://docs.axolotl.ai/docs/lora_optims.html
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true
# LoRA+ hyperparameters
# For more details about the following options, see:
# https://arxiv.org/abs/2402.12354 and `src/axolotl/core/train_builder.py`
loraplus_lr_ratio: # loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4.
loraplus_lr_embedding: # loraplus learning rate for lora embedding layers. Default value is 1e-6.
peft:
# Configuration options for loftq initialization for LoRA
# https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
loftq_config:
loftq_bits: # typically 4 bits
# ReLoRA configuration
# Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
relora_steps: # Number of steps per ReLoRA restart
relora_warmup_steps: # Number of per-restart warmup steps
relora_anneal_steps: # Number of anneal steps for each relora cycle
relora_prune_ratio: # threshold for optimizer magnitude when pruning
relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
# wandb configuration if you're using it
# Make sure your `WANDB_API_KEY` environment variable is set (recommended) or you login to wandb with `wandb login`.
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
wandb_project: # Your wandb project name
wandb_entity: # A wandb Team name if using a Team
wandb_watch:
wandb_name: # Set the name of your wandb run
wandb_run_id: # Set the ID of your wandb run
wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
# mlflow configuration if you're using it
mlflow_tracking_uri: # URI to mlflow
mlflow_experiment_name: # Your experiment name
mlflow_run_name: # Your run name
hf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry
# Comet configuration if you're using it
# Make sure your `COMET_API_KEY` environment variable is set (recommended) or you login to Comet with `comet login`.
# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start
use_comet: # Enable or disable Comet integration.
comet_api_key: # API key for Comet. Recommended to set via `comet login`.
comet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.
comet_project_name: # Project name in Comet. Defaults to Uncategorized.
comet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key.
comet_mode: # Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration.
comet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.
comet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.
# Tensorboard
use_tensorboard: # Optional[bool]
# Where to save the full-finetuned model to
output_dir: ./completed-model
# Whether to use torch.compile and which backend to use
# setting to `auto` will enable torch compile when torch>=2.5.1
torch_compile: # Optional[Union[Literal["auto"], bool]]
torch_compile_backend: # Optional[str]
torch_compile_mode: # 'default' | 'reduce-overhead' | 'max-autotune'
# Training hyperparameters
# If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
gradient_accumulation_steps: 1
# The number of samples to include in each batch. This is the number of samples sent to each GPU.
# Batch size per gpu = micro_batch_size * gradient_accumulation_steps
micro_batch_size: 2
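# For example, micro_batch_size: 2 with gradient_accumulation_steps: 4 gives an effective
# batch size of 8 samples per GPU per optimizer step (illustrative values)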
eval_batch_size:
num_epochs: 4
warmup_steps: 100 # cannot use with warmup_ratio
warmup_ratio: 0.05 # cannot use with warmup_steps
learning_rate: 0.00003
lr_quadratic_warmup:
logging_steps:
eval_steps: # Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps
evals_per_epoch: # number of times per epoch to run evals, mutually exclusive with eval_steps
eval_strategy: # Set to `"no"` to skip evaluation, `"epoch"` at end of each epoch, leave empty to infer from `eval_steps`.
save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of each epoch, `"best"` when better result is achieved, leave empty to infer from `save_steps`.
save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
save_total_limit: # Checkpoints saved at a time
save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
# Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
# if both are set, num_epochs will not be guaranteed.
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
max_steps:
# bool of whether to include tokens trained per second in the training metrics. This iterates over the entire dataset once, so it takes some time.
include_tokens_per_second: # Optional[bool]
# whether to find batch size that fits in memory. Passed to underlying transformers Trainer
auto_find_batch_size: # Optional[bool]
eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]
profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
# see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information
# snapshots can be visualized @ https://pytorch.org/memory_viz
loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
# Save model as safetensors (require safetensors package). Default True
save_safetensors:
# Whether to mask out or include the human's prompt from the training labels
train_on_inputs: false
# Group similarly sized data to minimize padding.
# May be slower to start, as it must download and sort the entire dataset.
# Note that training loss may have an oscillating pattern with this enabled.
group_by_length: false
# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
gradient_checkpointing: false
# additional kwargs to pass to the trainer for gradient checkpointing
# gradient_checkpointing_kwargs:
# use_reentrant: true
# Stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
# Specify a scheduler and kwargs to use with the optimizer
# Valid values are driven by the Transformers SchedulerType class, see:
# https://github.com/huggingface/transformers/blob/5f4ecf2d9f867a1255131d2461d75793c0cf1db2/src/transformers/trainer_utils.py#L420
# Valid values include
# - 'linear'
# - 'cosine' (default)
# - 'cosine_with_restarts'
# - 'polynomial'
# - 'constant'
# - 'constant_with_warmup'
# - 'inverse_sqrt'
# - 'reduce_lr_on_plateau'
# - 'cosine_with_min_lr'
# - 'warmup_stable_decay'
# Additional schedulers include:
# - 'one_cycle'
# - 'rex'
lr_scheduler:
lr_scheduler_kwargs:
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
# For one_cycle optim
lr_div_factor: # Learning rate div factor
# Specify optimizer
# Valid values are driven by the Transformers OptimizerNames class, see:
# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189
#
# Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# in the examples/ for your model and fine-tuning use case.
#
# Valid values for 'optimizer' include:
# - adamw_torch
# - adamw_torch_fused (default)
# - adamw_torch_xla
# - adamw_torch_npu_fused
# - adamw_apex_fused
# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
# - adafactor
# - adamw_anyprecision
# - adamw_torch_4bit
# - ademamix
# - sgd
# - adagrad
# - adamw_bnb_8bit
# - adamw_8bit # alias for adamw_bnb_8bit
# - ademamix_8bit
# - lion_8bit
# - lion_32bit
# - paged_adamw_32bit
# - paged_adamw_8bit
# - paged_ademamix_32bit
# - paged_ademamix_8bit
# - paged_lion_32bit
# - paged_lion_8bit
# - rmsprop
# - rmsprop_bnb
# - rmsprop_bnb_8bit
# - rmsprop_bnb_32bit
# - galore_adamw
# - galore_adamw_8bit
# - galore_adafactor
# - galore_adamw_layerwise
# - galore_adamw_8bit_layerwise
# - galore_adafactor_layerwise
# - lomo
# - adalomo
# - grokadamw
# - schedule_free_adamw
# - schedule_free_sgd
# - apollo_adamw
# - apollo_adamw_layerwise
#
# Additional custom optimizers include:
# - optimi_adamw
# - ao_adamw_8bit
# - ao_adamw_fp8
# - came_pytorch
optimizer:
# Dictionary of arguments to pass to the optimizer
optim_args:
# For Galore Optimizers the following optim_args are available
# rank: # type: int
# update_proj_gap # type: int
# scale # type: float
# proj_type: # type: str, default = std
# The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm
optim_target_modules:
# - self_attn # for llama
# - mlp
# Specify weight decay
weight_decay:
# adamw hyperparams
adam_beta1:
adam_beta2:
adam_beta3: # only used for CAME Optimizer
adam_epsilon:
adam_epsilon2: # only used for CAME Optimizer
# Gradient clipping max norm
max_grad_norm:
# Augmentation techniques
# NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
# currently only supported on Llama and Mistral
neftune_noise_alpha:
# Optional[bool]. Whether to use BetterTransformer
flash_optimum:
# Note: Only one of the following attention patches can be used at a time.
# For example, if you set `xformers_attention` to `true`, do not set `flash_attention` to `true`.
# Optional[bool]. Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
xformers_attention:
# Optional[bool]. Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
flash_attention:
flash_attn_cross_entropy: # Optional[bool]. Whether to use flash-attention cross entropy implementation - advanced use only
flash_attn_rms_norm: # Optional[bool]. Whether to use flash-attention rms norm implementation - advanced use only
flash_attn_fuse_qkv: # Optional[bool]. Whether to fuse QKV into a single operation
flash_attn_fuse_mlp: # Optional[bool]. Whether to fuse part of the MLP into a single operation
# Optional[bool]. Whether to use scaled-dot-product attention
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
sdp_attention:
# Optional[bool]. Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
s2_attention:
# Optional[bool]. Whether to use low_cpu_mem_usage
low_cpu_mem_usage:
# Optional[str]. Resume from a specific checkpoint dir
resume_from_checkpoint:
# Optional[bool]. If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# Be careful with this being turned on between different models.
auto_resume_from_checkpoints: false
## Multimodal section
# int | tuple[int, int] | None . Size to resize images to, width x height.
# Will read from model/processor config if not set.
image_size:
# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear".
image_resize_algorithm: 'bilinear'
## End of multimodal section
# Don't mess with this, it's here for accelerate and torchrun
local_rank:
# Add or change special tokens.
# If you add tokens here, you don't need to add them to the `tokens` list.
special_tokens:
# bos_token: "<s>"
# eos_token: "</s>"
# unk_token: "<unk>"
# pad_token: "[PAD]"
# Optional[list[str]]. Add extra tokens to the tokenizer.
tokens:
# - "<|startoftext|>"
# - "<|endoftext|>"
# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
# Only works for tokens that are not part of the base vocab (aka are added_tokens).
# Can be checked if they exist in tokenizer.json added_tokens.
added_tokens_overrides: # Dict[int, str]
# 128041: "<|im_start|>"
# 128042: "<|im_end|>"
# FSDP
fsdp:
fsdp_config:
# Deepspeed config path. e.g., deepspeed_configs/zero3.json
deepspeed:
# Advanced DDP Arguments
ddp_timeout:
ddp_bucket_cap_mb:
ddp_broadcast_buffers:
# Context parallelism
# Set to a divisor of the number of GPUs available to split sequences into chunks of equal size.
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
# subsequences, or set to 4 to split into four equal-sized subsequences.
# See https://docs.axolotl.ai/docs/context_parallelism.html for more details.
context_parallel_degree:
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
# Must evenly divide the number of KV heads in your model.
heads_k_stride: 1
# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
# in the sample packing case, and "batch_ring" in the non-sample packing case.
ring_attn_func:
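# Illustrative example (values are assumptions): on a 4-GPU node, the settings below split
# each sequence across 2 GPUs, leaving the remaining parallelism to data parallelism:
# context_parallel_degree: 2
# heads_k_stride: 1
# ring_attn_func: varlen_llama3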
# Path to torch distx for optim 'adamw_anyprecision'
torchdistx_path:
# Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
pretraining_dataset:
# Debug mode
debug:
# Seed
seed:
# Allow overwriting the yml config from the cli
strict:
```

View File

@@ -12,7 +12,7 @@ Chat Template strategy uses a jinja2 template that converts a list of messages i
{"conversations": [{"role": "...", "content": "..."}]}
```
See [configs](../config-reference.qmd) for full configs and supported templates.
See [configs](../config.qmd) for full configs and supported templates.
### Migrating from sharegpt
@@ -52,9 +52,7 @@ We recommend checking the below examples for other usecases.
### Examples
#### Training on last message
(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
```yaml
datasets:
@@ -68,9 +66,7 @@ datasets:
If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
:::
#### Overriding default chat template
Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
```yaml
chat_template: gemma # this overwrites the tokenizer's chat_template
@@ -80,13 +76,7 @@ datasets:
roles_to_train: ["assistant"] # default value
```
::: {.callout-note}
If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
:::
#### Using default chat template with fallback
Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
```yaml
chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
@@ -95,9 +85,7 @@ datasets:
type: chat_template
```
#### Custom Jinja template
Using a custom jinja template on OpenAI messages format, training on all assistant messages.
4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
```yaml
# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
@@ -112,9 +100,7 @@ datasets:
Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
:::
#### Using template with different token for EOT and EOS
- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
```yaml
eot_tokens:
@@ -130,16 +116,16 @@ datasets:
```
::: {.callout-tip}
See [config documentation](../config-reference.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
See [config documentation](../config.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
:::
::: {.callout-note}
Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.
You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config-reference.qmd) for more details.
You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
:::
- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
```yaml
eot_tokens:
@@ -159,73 +145,7 @@ If EOS token only appears at the end of a prompt, `train_on_eos: last` is equiva
:::
#### Using tool use
Instead of passing `tools` via the system prompt, an alternative is to keep the `tools` in a separate column that is loaded via the `chat_template`, letting the template build the prompt dynamically.
```json
{
"tools": [
{
"type": "...",
"function": {
"name": "...",
"description": "...",
"parameters": {
"type": "...",
"properties": {
// ...
},
"required": ["..."],
},
},
},
],
"messages": [
// ...
{
"role": "assistant", // call the function via assistant
"tool_calls": [
{
"type": "function",
"function": {
"name": "...",
"arguments": {
"...": "...",
}
}
}
]
},
{
"role": "tool",
"name": "...",
"content": "..."
},
],
}
```
::: {.callout-note}
Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
:::
```yaml
chat_template: llama4
datasets:
- path: ...
type: chat_template
# field_tools: tools # default is `tools`
```
::: {.callout-tip}
Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
:::
#### Using fine-grained control over token masking
(Advanced) Using fine-grained control over tokens and turns to train in a conversation
7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
For a data sample that looks like:
@@ -276,9 +196,7 @@ datasets:
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
:::
#### Reasoning split
(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
```yaml
datasets:

View File

@@ -186,4 +186,4 @@ datasets:
no_input_format: "[INST] {instruction} [/INST]"
```
See full config options under [here](../config-reference.qmd).
See full config options under [here](../config.qmd).

View File

@@ -36,7 +36,7 @@ This matches the API of [`datasets.load_dataset`](https://github.com/huggingface
For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).
For full details on the config, see [config-reference.qmd](config-reference.qmd).
For full details on the config, see [config.qmd](config.qmd).
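For instance, a minimal entry mirrors the `load_dataset` call it wraps (a quick sketch; the dataset path below is only a placeholder, not a recommendation):

```yaml
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test  # any HF Hub repo id, or a local path
    type: chat_template                          # how rows should be tokenized
```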
::: {.callout-note}

View File

@@ -9,7 +9,7 @@ format:
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
::: {.callout-important}
For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.
For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
:::
## Base
@@ -32,8 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
Tags examples:
- `main-base-py3.11-cu128-2.7.1`
- `main-base-py3.11-cu126-2.7.1`
- `main-base-py3.11-cu128-2.7.0`
- `main-base-py3.11-cu126-2.7.0`
- `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`

View File

@@ -9,11 +9,11 @@ description: Frequently asked questions
> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)
**Q: exitcode: -9**
**Q: Exitcode -9**
> A: This usually happens when you run out of system RAM.
**Q: exitcode: -7 while using deepspeed**
**Q: Exitcode -7 while using deepspeed**
> A: Try upgrading deepspeed with: `pip install -U deepspeed`

View File

@@ -55,7 +55,7 @@ output_dir: ./outputs/lora-out
- To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`.
:::
See our [config options](config-reference.qmd) for more details.
See our [Config options](config.qmd) for more details.
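In config terms, the QLoRA variant mentioned in the note above is a small swap (a minimal sketch; every other key in the quickstart config stays the same):

```yaml
load_in_8bit: false   # assuming the LoRA quickstart loads in 8-bit by default
load_in_4bit: true
adapter: qlora
```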
### Training {#sec-training}
@@ -179,7 +179,7 @@ Now that you have the basics, you might want to:
Check our other guides for details on these topics:
- [Configuration Guide](config-reference.qmd) - Full configuration options
- [Configuration Guide](config.qmd) - Full configuration options
- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
- [Dataset Formats](dataset-formats) - Working with different data formats
- [Multi-GPU Training](multi-gpu.qmd)

View File

@@ -14,7 +14,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir
## Requirements {#sec-requirements}
- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
- Python ≥3.11
- Python ≥3.10
- PyTorch ≥2.5.1
## Installation Methods {#sec-installation-methods}
@@ -153,7 +153,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
### Conda/Pip venv {#sec-conda}
1. Install Python ≥3.11
1. Install Python ≥3.10
2. Install PyTorch: https://pytorch.org/get-started/locally/
3. Install Axolotl:
```{.bash}

View File

@@ -18,7 +18,7 @@ Axolotl supports several methods for multi-GPU training:
- DeepSpeed (recommended)
- FSDP (Fully Sharded Data Parallel)
- Sequence parallelism
- Context parallelism
- FSDP + QLoRA
## DeepSpeed {#sec-deepspeed}
@@ -80,14 +80,14 @@ fsdp_config:
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```
## Sequence parallelism {#sec-sequence-parallelism}
## Context parallelism {#sec-sequence-parallelism}
We support sequence parallelism (SP) via the
We support context parallelism (CP) via the
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
allows one to split up sequences across GPUs, which is useful in the event that a
single sequence causes OOM errors during model training.
See our [dedicated guide](sequence_parallelism.qmd) for more information.
See our [dedicated guide](context_parallelism.qmd) for more information.
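As a quick illustration (a minimal sketch; the dedicated guide covers the remaining options such as `heads_k_stride` and `ring_attn_func`):

```yaml
# must be a divisor (> 1) of the number of available GPUs
context_parallel_degree: 2
```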
### FSDP + QLoRA {#sec-fsdp-qlora}

View File

@@ -29,4 +29,4 @@ qat:
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
```
Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.
Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize` command](./quantize.md) to do this.
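For example, the same block can drive both the QAT run and the follow-up quantization step (a hypothetical sketch; only `fake_quant_after_n_steps` is taken from the reference above, and the value and output path are placeholders):

```yaml
qat:
  fake_quant_after_n_steps: 1000  # placeholder value for illustration
output_dir: ./outputs/qat-out     # the quantized weights land in {output_dir}/quantized
```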

View File

@@ -32,7 +32,7 @@ output_dir: # The path to the output directory.
Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.
You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.qmd) - you can do this by using the existing QAT configuration file which
You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.md) - you can do this by using the existing QAT configuration file which
you used to train the model:
```yaml

View File

@@ -500,7 +500,7 @@ The input format is a simple JSON input with customizable fields based on the ab
### GRPO
::: {.callout-tip}
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
:::
In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:

View File

@@ -1,752 +0,0 @@
# type: ignore
"""
Quarto documentation generation from Pydantic models. Uses Pydantic model source code
to automatically group fields, including inherited fields from parent classes.
"""
import ast
import inspect
import textwrap
import types
import typing
from typing import Any, FrozenSet, Type, Union
from pydantic import BaseModel
from axolotl.utils.schemas.config import AxolotlInputConfig
class QuartoGenerator:
"""Generate Quarto documentation from Pydantic models."""
def __init__(self):
self._class_fields_cache = {}
self._inheritance_map_cache = {}
self._nested_models_cache = {}
def _get_direct_fields(self, cls: Type[BaseModel]) -> FrozenSet[str]:
"""Get fields defined directly in a single class (not inherited)."""
if cls in self._class_fields_cache:
return self._class_fields_cache[cls]
fields = set()
# Get annotated fields
if hasattr(cls, "__annotations__"):
fields.update(cls.__annotations__.keys())
# Filter out private/special methods
fields = {f for f in fields if not f.startswith("_")}
result = frozenset(fields)
self._class_fields_cache[cls] = result
return result
def _is_pydantic_model(self, type_obj) -> bool:
"""Check if a type is a Pydantic BaseModel."""
return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)
# pylint: disable=too-many-return-statements
def _extract_nested_type(self, field_type) -> Any:
"""Extract the actual type from complex type annotations."""
# Handle Annotated types (Python 3.9+)
if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
origin = typing.get_origin(field_type)
args = typing.get_args(field_type)
if origin is not None:
# Handle Annotated[SomeType, ...] - extract the first argument
if hasattr(typing, "Annotated") and origin is typing.Annotated:
if args:
return self._extract_nested_type(
args[0]
) # Recursively process the actual type
# Handle list[SomeType], List[SomeType], etc.
elif origin in (list, typing.List):
if args:
return self._extract_nested_type(
args[0]
) # Extract element type
# Handle Union types (including | syntax)
elif origin is typing.Union:
# Get non-None types from the Union
non_none_types = [arg for arg in args if arg is not type(None)]
if len(non_none_types) >= 1:
# Prioritize Pydantic models over primitive types
pydantic_models = [
arg
for arg in non_none_types
if self._is_pydantic_model(arg)
]
if pydantic_models:
# Return the first Pydantic model found
return self._extract_nested_type(pydantic_models[0])
# No Pydantic models, return the first non-None type
return self._extract_nested_type(non_none_types[0])
# Handle new Python 3.10+ union syntax (PeftConfig | None)
if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
# Get non-None types from the Union
non_none_types = [
arg for arg in field_type.__args__ if arg is not type(None)
]
if len(non_none_types) >= 1:
# Prioritize Pydantic models over primitive types
pydantic_models = [
arg for arg in non_none_types if self._is_pydantic_model(arg)
]
if pydantic_models:
return self._extract_nested_type(pydantic_models[0])
return self._extract_nested_type(non_none_types[0])
# Handle old typing.Union syntax (fallback)
if hasattr(field_type, "__origin__"):
if field_type.__origin__ is Union:
# Get non-None types from the Union
non_none_types = [
arg for arg in field_type.__args__ if arg is not type(None)
]
if len(non_none_types) >= 1:
# Prioritize Pydantic models over primitive types
pydantic_models = [
arg for arg in non_none_types if self._is_pydantic_model(arg)
]
if pydantic_models:
return self._extract_nested_type(pydantic_models[0])
return self._extract_nested_type(non_none_types[0])
# Handle other generic types like dict[str, Any], etc.
elif hasattr(field_type, "__args__"):
return field_type
return field_type
# pylint: disable=too-many-return-statements
def _extract_all_pydantic_models_from_type(
self, field_type
) -> list[type[BaseModel]]:
"""Extract all Pydantic models from a type annotation, including from Unions."""
models = []
if field_type is None:
return models
# Handle Annotated types
if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
origin = typing.get_origin(field_type)
args = typing.get_args(field_type)
if origin is not None:
# Handle Annotated[SomeType, ...] - extract from the first argument
if hasattr(typing, "Annotated") and origin is typing.Annotated:
if args:
models.extend(
self._extract_all_pydantic_models_from_type(args[0])
)
return models
# Handle list[SomeType], List[SomeType], etc.
if origin in (list, typing.List):
if args:
models.extend(
self._extract_all_pydantic_models_from_type(args[0])
)
return models
# Handle Union types
if origin is typing.Union:
for arg in args:
if arg is not type(None): # Skip None type
models.extend(
self._extract_all_pydantic_models_from_type(arg)
)
return models
# Handle new Python 3.10+ union syntax
if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
for arg in field_type.__args__:
if arg is not type(None): # Skip None type
models.extend(self._extract_all_pydantic_models_from_type(arg))
return models
# Handle old typing.Union syntax (fallback)
if hasattr(field_type, "__origin__") and field_type.__origin__ is Union:
for arg in field_type.__args__:
if arg is not type(None): # Skip None type
models.extend(self._extract_all_pydantic_models_from_type(arg))
return models
# Check if this type itself is a Pydantic model
if self._is_pydantic_model(field_type):
models.append(field_type)
return models
def _get_nested_models(
self, model_class: type[BaseModel], visited=None
) -> dict[str, type[BaseModel]]:
"""Get all nested Pydantic models from a model class."""
if visited is None:
visited = set()
# Avoid infinite recursion
if model_class in visited:
return {}
if model_class in self._nested_models_cache:
return self._nested_models_cache[model_class]
visited.add(model_class)
nested_models = {}
# Check all fields in the model
for field_info in model_class.model_fields.values():
field_type = self._extract_nested_type(field_info.annotation)
if self._is_pydantic_model(field_type):
nested_models[field_type.__name__] = field_type
# Recursively get nested models from this nested model
deeper_nested = self._get_nested_models(field_type, visited.copy())
nested_models.update(deeper_nested)
self._nested_models_cache[model_class] = nested_models
return nested_models
def _build_inheritance_map(self, child_class: Type[BaseModel]):
"""Build inheritance map for a class and all its parents."""
if child_class in self._inheritance_map_cache:
return self._inheritance_map_cache[child_class]
inheritance_map = {}
# Get MRO and filter out BaseModel and object
mro_classes = [
cls
for cls in child_class.__mro__
if cls not in (BaseModel, object) and hasattr(cls, "__annotations__")
]
# Process each class in the MRO
for cls in mro_classes:
inheritance_map[cls] = self._get_direct_fields(cls)
self._inheritance_map_cache[child_class] = inheritance_map
return inheritance_map
def _wrap_comment(self, text: str, width: int = 88) -> list[str]:
"""Wrap a comment to specified width, accounting for '# ' prefix."""
if not text.strip():
return ["#"]
# Account for "# " prefix (2 characters)
content_width = width - 2
wrapped_lines = textwrap.wrap(text, width=content_width)
return [f"# {line}" for line in wrapped_lines]
def _extract_type_from_source(
self, model_class: type[BaseModel], field_name: str
) -> str:
"""Extract the actual type annotation text from source code, checking inheritance chain."""
# Use inheritance map to check classes efficiently
inheritance_map = self._build_inheritance_map(model_class)
# Check classes in MRO order
for cls in model_class.__mro__:
if cls in inheritance_map and field_name in inheritance_map[cls]:
type_annotation = self._get_type_from_class_source(cls, field_name)
if type_annotation != "unknown":
return type_annotation
return "unknown"
def _get_type_from_class_source(self, class_obj: type, field_name: str) -> str:
"""Extract type annotation from a specific class's source code."""
try:
source = inspect.getsource(class_obj)
tree = ast.parse(source)
except (OSError, TypeError):
return "unknown"
# Find the class definition
for node in tree.body:
if isinstance(node, ast.ClassDef) and node.name == class_obj.__name__:
# Find the field assignment
for body_node in node.body:
if isinstance(body_node, ast.AnnAssign) and isinstance(
body_node.target, ast.Name
):
if body_node.target.id == field_name and body_node.annotation:
return ast.unparse(body_node.annotation)
break
return "unknown"
def _extract_field_groups_from_all_classes(
self, model_class: type[BaseModel]
) -> list[dict]:
"""Extract field groups from all classes in the inheritance hierarchy."""
all_groups = []
inheritance_map = self._build_inheritance_map(model_class)
# Get all Pydantic base classes in MRO order (most specific first)
# This puts AxolotlInputConfig fields first, then parent class fields
pydantic_classes = [
cls
for cls in model_class.__mro__
if cls in inheritance_map and inheritance_map[cls]
]
# Extract groups from each class
for cls in pydantic_classes:
class_groups = self._extract_field_groups_from_source(cls)
for group in class_groups:
all_groups.append(group)
# If no groups found, create a default grouping by class
if not all_groups:
for cls in pydantic_classes:
fields_in_class = inheritance_map[cls]
if fields_in_class:
all_groups.append(
{
"fields": list(fields_in_class),
}
)
return all_groups
# pylint: disable=too-many-return-statements
def _extract_field_groups_from_source(
self, model_class: type[BaseModel]
) -> list[dict]:
"""Extract field groups from source code based on blank lines and comments."""
try:
source = inspect.getsource(model_class)
tree = ast.parse(source)
except (OSError, TypeError):
# Fallback if we can't get source code
fields_in_class = self._get_direct_fields(model_class)
if fields_in_class:
return [
{
"fields": list(fields_in_class),
}
]
return []
groups = []
current_group_fields = []
current_group_comment = None
# Find the class definition
class_node = None
for node in ast.walk(tree):
if isinstance(node, ast.ClassDef) and node.name == model_class.__name__:
class_node = node
break
if not class_node:
fields_in_class = self._get_direct_fields(model_class)
if fields_in_class:
return [
{
"fields": list(fields_in_class),
}
]
return []
# Parse the source lines to detect groupings
source_lines = source.split("\n")
# Get fields that are actually defined in this specific class
fields_in_class = self._get_direct_fields(model_class)
# Find assignments that correspond to model fields for THIS class only
field_assignments = []
for node in class_node.body:
if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
field_name = node.target.id
if field_name in fields_in_class:
field_assignments.append(
{
"name": field_name,
"lineno": node.lineno,
"end_lineno": getattr(node, "end_lineno", node.lineno),
}
)
if not field_assignments:
if fields_in_class:
return [
{
"fields": list(fields_in_class),
}
]
return []
# Sort by line number
field_assignments.sort(key=lambda x: x["lineno"])
# Group fields based on blank lines and comments
for i, field_info in enumerate(field_assignments):
field_name = field_info["name"]
current_line = field_info["lineno"]
# Check if this starts a new group (blank line before or significant gap)
is_new_group = False
if i == 0:
is_new_group = True
else:
prev_end_line = field_assignments[i - 1]["end_lineno"]
# Check for blank lines or comments between fields
lines_between = source_lines[prev_end_line : current_line - 1]
has_blank_line = any(line.strip() == "" for line in lines_between)
has_comment = any(
line.strip().startswith("#") for line in lines_between
)
# Start new group if there's a blank line or comment, or significant gap
if has_blank_line or has_comment or (current_line - prev_end_line > 3):
is_new_group = True
if is_new_group and current_group_fields:
# Save the previous group
groups.append(
{
"fields": current_group_fields.copy(),
"description": current_group_comment,
}
)
current_group_fields = []
current_group_comment = None
current_group_fields.append(field_name)
# Add the final group
if current_group_fields:
groups.append(
{
"fields": current_group_fields,
"description": current_group_comment,
}
)
return groups
def _generate_field_documentation(
self,
model_class: type[BaseModel],
field_name: str,
field_info: dict,
field_type_str: str,
is_required: bool,
indent_level: int = 0,
visited_models: set = None,
) -> list[str]:
"""Generate documentation for a single field, expanding nested models inline."""
if visited_models is None:
visited_models = set()
lines = []
indent = " " * indent_level
# Get the actual field type for nested model detection
if field_name in model_class.model_fields:
pydantic_field_info = model_class.model_fields[field_name]
actual_field_type = pydantic_field_info.annotation
else:
actual_field_type = None
# Add description comment if available
description = field_info.get("description", "")
if description:
wrapped_lines = self._wrap_comment(description, width=88 - len(indent))
for line in wrapped_lines:
lines.append(f"{indent}{line}")
# Extract nested Pydantic models from the type annotation
nested_models = self._extract_all_pydantic_models_from_type(actual_field_type)
# Filter out already visited models to prevent infinite recursion
expandable_models = [
model for model in nested_models if model not in visited_models
]
if expandable_models:
# This field contains Pydantic models that can be expanded
# Show the field with its full type annotation
field_line = f"{indent}{field_name}: {field_type_str}"
if field_info.get("default") is not None:
field_line += f" = {field_info['default']}"
if is_required:
field_line += " (required)"
lines.append(field_line)
# Add to visited to prevent infinite recursion
new_visited = visited_models.copy()
new_visited.update(expandable_models)
# Expand each nested Pydantic model
for i, nested_model in enumerate(expandable_models):
if i > 0:
lines.append("\n")
lines.append(f"{indent} # For {nested_model.__name__}:")
# Get nested model schema
try:
nested_schema = nested_model.model_json_schema()
nested_properties = nested_schema.get("properties", {})
nested_required = nested_schema.get("required", [])
except Exception: # pylint: disable=broad-exception-caught
# Fallback: use model fields directly
nested_properties = {}
nested_required = []
for (
nested_field_name,
nested_field_info,
) in nested_model.model_fields.items():
nested_description = ""
if (
hasattr(nested_field_info, "json_schema_extra")
and nested_field_info.json_schema_extra
):
nested_description = (
nested_field_info.json_schema_extra.get(
"description", ""
)
)
elif (
hasattr(nested_field_info, "description")
and nested_field_info.description
):
nested_description = nested_field_info.description
nested_default_val = None
if (
hasattr(nested_field_info, "default")
and nested_field_info.default is not None
):
if str(nested_field_info.default) != "PydanticUndefined":
nested_default_val = nested_field_info.default
nested_properties[nested_field_name] = {
"type": "unknown",
"description": nested_description,
"default": nested_default_val,
}
if nested_field_info.is_required():
nested_required.append(nested_field_name)
# Get field groups for the nested model
nested_field_groups = self._extract_field_groups_from_all_classes(
nested_model
)
# Generate nested fields with increased indentation
for i, group in enumerate(nested_field_groups):
if not group["fields"]:
continue
# Add blank line between groups (except before first group)
if i > 0:
lines.append("")
# Process nested fields
for nested_field_name in group["fields"]:
if nested_field_name not in nested_properties:
continue
nested_field_info = nested_properties[nested_field_name]
nested_field_type = self._extract_type_from_source(
nested_model, nested_field_name
)
nested_is_required = nested_field_name in nested_required
# Recursively generate documentation for nested field
nested_lines = self._generate_field_documentation(
nested_model,
nested_field_name,
nested_field_info,
nested_field_type,
nested_is_required,
indent_level + 1,
new_visited,
)
lines.extend(nested_lines)
else:
# Regular field (no expandable nested models)
field_line = f"{indent}{field_name}: {field_type_str}"
if field_info.get("default") is not None:
field_line += f" = {field_info['default']}"
if is_required:
field_line += " (required)"
lines.append(field_line)
return lines
def generate_qmd(
self,
model_class: type[BaseModel],
title: str | None = None,
expand_nested: bool = True,
) -> str:
"""Auto-generate config reference documentation including inherited fields."""
if title is None:
title = f"{model_class.__name__} Reference"
# Try to get JSON schema, with fallback for serialization issues
try:
schema = model_class.model_json_schema()
properties = schema.get("properties", {})
required = schema.get("required", [])
except Exception as e: # pylint: disable=broad-exception-caught
print(
f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
)
# Fallback: use model fields directly
properties = {}
required = []
for field_name, field_info in model_class.model_fields.items():
# Extract description from json_schema_extra or field info
description = ""
if (
hasattr(field_info, "json_schema_extra")
and field_info.json_schema_extra
):
description = field_info.json_schema_extra.get("description", "")
elif hasattr(field_info, "description") and field_info.description:
description = field_info.description
# Get default value
default_val = None
if hasattr(field_info, "default") and field_info.default is not None:
# Handle special Pydantic default markers
if str(field_info.default) != "PydanticUndefined":
default_val = field_info.default
properties[field_name] = {
"type": "unknown",
"description": description,
"default": default_val,
}
if field_info.is_required():
required.append(field_name)
# Extract field groups from all classes in inheritance hierarchy
field_groups = self._extract_field_groups_from_all_classes(model_class)
# Start building QMD content
qmd_lines = [
"---",
f"title: {title}",
"description: A complete list of all configuration options.",
"---",
"",
]
# Generate one big code block with all fields (inline nested expansion)
qmd_lines.append("```yaml")
for i, group in enumerate(field_groups):
if not group["fields"]:
continue
# Add blank line between groups (except before first group)
if i > 0:
qmd_lines.append("")
# Process fields in the order they appear in source
for field_name in group["fields"]:
if field_name not in properties:
continue
field_info = properties[field_name]
field_type = self._extract_type_from_source(model_class, field_name)
is_required = field_name in required
if expand_nested:
# Check if this field has nested models
if field_name in model_class.model_fields:
pydantic_field_info = model_class.model_fields[field_name]
nested_models = self._extract_all_pydantic_models_from_type(
pydantic_field_info.annotation
)
has_nested = bool(nested_models)
else:
has_nested = False
# Add blank line before nested config
if has_nested:
qmd_lines.append("")
# Use the new inline generation method
field_lines = self._generate_field_documentation(
model_class,
field_name,
field_info,
field_type,
is_required,
indent_level=0,
visited_models=set(),
)
qmd_lines.extend(field_lines)
# Add blank line after nested config
if has_nested:
qmd_lines.append("")
else:
# Original simple approach
description = field_info.get("description", "")
default = field_info.get("default")
# Add wrapped comment for description
if description:
wrapped_lines = self._wrap_comment(description)
qmd_lines.extend(wrapped_lines)
line = f"{field_name}: {field_type}"
if default is not None:
line += f" = {default}"
if is_required:
line += " (required)"
qmd_lines.append(line)
qmd_lines.append("```")
# Join all lines and clean up any double newlines
content = "\n".join(qmd_lines)
# Replace multiple consecutive newlines with just two newlines (one blank line)
import re
content = re.sub(r"\n{3,}", "\n\n", content)
# Ensure single newline at the very end
content = content.rstrip("\n") + "\n"
return content
def main():
generator = QuartoGenerator()
print("Generating config reference content...")
qmd_content = generator.generate_qmd(AxolotlInputConfig, "Config Reference", True)
print("Writing to file...")
with open("docs/config-reference.qmd", "w", encoding="utf-8") as f:
f.write(qmd_content)
print("Done!")
if __name__ == "__main__":
main()

View File

@@ -1,16 +1,16 @@
---
title: Sequence Parallelism
title: Context Parallelism
description: Train with long sequences split across multiple GPUs.
---
Sequence parallelism is a technique that splits sequences across multiple GPUs,
Context parallelism is a technique that splits sequences across multiple GPUs,
allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
GPU processes a different portion of the sequence, and the results are aggregated
through a ring communication pattern.
## When to Use Sequence Parallelism
## When to Use Context Parallelism
Use sequence parallelism when:
Use context parallelism when:
- You need to train with sequence lengths that don't fit into a single GPU's memory
- You have multiple GPUs available
@@ -18,11 +18,11 @@ Use sequence parallelism when:
## Configuration
To enable sequence parallelism, add the following to your configuration file:
To enable context parallelism, add the following to your configuration file:
```yaml
# Set to a divisor (> 1) of the number of GPUs available
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
context_parallel_degree: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -30,23 +30,23 @@ heads_k_stride: 1
ring_attn_func:
```
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
The `context_parallel_degree` should be a divisor of the total number of GPUs. For example:
- With 8 GPUs, valid values would be 2, 4, or 8
- With 4 GPUs, valid values would be 2 or 4
## Implementation Details
When sequence parallelism is enabled:
When context parallelism is enabled:
1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
1. Each sequence is divided into equal chunks across the GPUs in a context parallel group
2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
3. Position IDs are adjusted to maintain proper relative positions
4. The trainer uses special ring communication patterns for attention operations
## Requirements
To use sequence parallelism, you need:
To use context parallelism, you need:
- Multiple GPUs (at least 2)
- The `ring-flash-attn` package. Install with:
@@ -66,7 +66,7 @@ sequence_len: 8192
...
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
context_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
@@ -79,22 +79,22 @@ ring_attn_func:
This will train the Llama 3 8B model with 8K context length, with each sequence split
into 4 subsequences of length 2048 across the 4 GPUs in the context parallel group.
## Sample Packing with Sequence Parallelism
## Sample Packing with Context Parallelism
Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
Context parallelism is compatible with Axolotl's sample packing functionality. When using both features together:
1. Samples are first packed together
2. The packed sequences are then divided across GPUs in the sequence parallel group
2. The packed sequences are then divided across GPUs in the context parallel group
3. Position IDs are automatically adjusted to maintain proper relative positions
## Effect on Batch Size
When using sequence parallelism, your effective global batch size is **divided** by the `sequence_parallel_degree`. This happens because:
When using context parallelism, your effective global batch size is **divided** by the `context_parallel_degree`. This happens because:
- Each group of `sequence_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
- Each group of `context_parallel_degree` GPUs works on the same batch (just different parts of each sequence)
- The number of batches processed per step decreases
For example:
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
- With 8 GPUs and `sequence_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- With 8 GPUs and no context parallelism: 8 different batches processed per step
- With 8 GPUs and `context_parallel_degree=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4
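The same arithmetic in config form (a minimal sketch; `gradient_accumulation_steps: 1` and the 8-GPU setup are assumed from the example above):

```yaml
micro_batch_size: 2             # per-GPU batch size
gradient_accumulation_steps: 1  # assumed for this example
context_parallel_degree: 4      # 8 GPUs -> 2 context-parallel groups
# effective global batch size: (8 / 4) * 2 * 1 = 4  (16 without context parallelism)
```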

View File

@@ -5,10 +5,6 @@ tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
special_tokens:
pad_token: <|finetune_right_pad_id|>
eos_token: <|eot_id|>
load_in_8bit: true
load_in_4bit: false

View File

@@ -1,71 +0,0 @@
# Finetune Magistral Small with Axolotl
Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
MistralAI has also released a proprietary medium-sized version called Magistral Medium.
Thanks to the team at MistralAI for giving us early access to prepare for this release.
## Getting started
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Magistral is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
Here is an example of how to install from main for pip:
```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
```
2. Download the example config:
```bash
axolotl fetch examples
```
3. Run the finetuning example:
```bash
axolotl train examples/magistral/magistral-small-qlora.yaml
```
This config uses about 24GB VRAM.
Let us know how it goes. Happy finetuning! 🚀
### TIPS
- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
## Optimization Guides
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
## Limitations
We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
The tokenizer does not work with `dataset.map` with multiprocessing, so we had to disable it. In addition, we do not support overriding tokens yet.
## Related Resources
- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
## Future Work
- Add parity to Preference Tuning, RL, Multi-modal, etc.
- Add parity to other tokenizer configs like overriding tokens.

View File

@@ -1,72 +0,0 @@
base_model: mistralai/Magistral-Small-2506
# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: true
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing:
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
fsdp:
- full_shard
- auto_wrap
fsdp_config:
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
fsdp_activation_checkpointing: true

View File

@@ -1,63 +0,0 @@
base_model: mistralai/Magistral-Small-2506
# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
load_in_8bit: false
load_in_4bit: true
datasets:
- path: fozziethebeat/alpaca_messages_2k_test
type: chat_template
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
adapter: qlora
lora_model_dir:
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
- gate_proj
- down_proj
- up_proj
- q_proj
- v_proj
- k_proj
- o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

View File

@@ -25,7 +25,7 @@ pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:

Binary image file changed; content not shown (before: 4.7 KiB, after: 4.5 KiB).

View File

@@ -13,12 +13,12 @@ packaging==23.2
huggingface_hub==0.32.2
peft==0.15.2
transformers==4.52.4
transformers==4.52.3
tokenizers>=0.21.1
accelerate==1.7.0
datasets==3.6.0
deepspeed>=0.17.0
trl==0.18.2
trl==0.18.1
hf_xet==1.1.2
optimum==1.16.2
@@ -67,5 +67,3 @@ schedulefree==1.4.1
axolotl-contribs-lgpl==0.0.6
axolotl-contribs-mit==0.0.3
mistral-common==1.6.0

View File

@@ -118,7 +118,7 @@ extras_require = {
"yunchang==0.6.0",
],
"deepspeed": [
"deepspeed==0.17.1",
"deepspeed==0.17.0",
"deepspeed-kernels",
],
"mamba-ssm": [

View File

@@ -4,4 +4,4 @@ import pkgutil
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
__version__ = "0.11.0.dev"
__version__ = "0.10.0.dev0"

View File

@@ -26,7 +26,7 @@ from axolotl.utils.mlflow_ import setup_mlflow_env_vars
from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
from axolotl.utils.wandb_ import setup_wandb_env_vars
LOG = get_logger(__name__)
LOG = get_logger(__name__, use_environ=True)
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:

View File

@@ -73,7 +73,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
load_in_8bit=False,
load_in_4bit=False,
flash_attention=False,
sequence_parallel_degree=None,
context_parallel_degree=None,
deepspeed=None,
fsdp=None,
fsdp_config=None,

View File

@@ -1,3 +1,5 @@
"""Various shared constants"""
"""
Various shared constants
"""
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"

View File

@@ -3,13 +3,15 @@
import math
import random
from dataclasses import dataclass
from typing import Optional, Union
from datasets import Dataset
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RLType
@@ -28,7 +30,16 @@ class TrainDatasetMeta:
def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
"""Randomly sample `num_samples` samples with replacement from `dataset`."""
"""
Randomly sample `num_samples` samples from `dataset`.
Args:
dataset: Dataset.
num_samples: Number of samples to return.
Returns:
Random sample (with replacement) of examples in `dataset`.
"""
return dataset.select(
[random.randrange(0, len(dataset) - 1) for _ in range(num_samples)] # nosec
)
@@ -40,37 +51,44 @@ def load_datasets(
cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
debug: bool = False,
) -> TrainDatasetMeta:
"""Loads one or more training or evaluation datasets, calling
`axolotl.utils.data.prepare_datasets`. Optionally, logs out debug information.
"""
Loads one or more training or evaluation datasets, calling
`axolotl.utils.data.prepare_dataset`. Optionally, logs out debug information.
Args:
cfg: Dictionary mapping `axolotl` config keys to values.
cli_args: Command-specific CLI arguments.
debug: Whether to print out tokenization of sample. This is duplicated in
`cfg` and `cli_args`, but is kept due to use in our Colab notebooks.
debug: Whether to print out tokenization of sample
Returns:
Dataclass with fields for training and evaluation datasets and the computed
`total_num_steps`.
`total_num_steps`.
"""
tokenizer = load_tokenizer(cfg)
processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
preprocess_iterable = getattr(cli_args, "iterable", False)
preprocess_iterable = (
cli_args
and hasattr(cli_args, "iterable")
and cli_args.iterable is not None
and cli_args.iterable
)
train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset(
cfg,
tokenizer,
processor=processor,
preprocess_iterable=preprocess_iterable,
)
if (
cfg.debug
or getattr(cli_args, "debug", False)
or getattr(cli_args, "debug_text_only", False)
or getattr(cli_args, "debug_num_examples", 0) > 0
or debug
):
if ( # pylint: disable=too-many-boolean-expressions
cli_args
and (
cli_args.debug
or cfg.debug
or cli_args.debug_text_only
or int(cli_args.debug_num_examples) > 0
)
) or debug:
LOG.info("check_dataset_labels...")
num_examples = cli_args.debug_num_examples if cli_args else 1
@@ -95,10 +113,13 @@ def load_datasets(
def load_preference_datasets(
*, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None
*,
cfg: DictDefault,
cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
) -> TrainDatasetMeta:
"""Loads one or more training or evaluation datasets for RL training using paired
preference data, calling `axolotl.utils.data.rl.prepare_preference_datasets`.
"""
Loads one or more training or evaluation datasets for RL training using paired
preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`.
Optionally, logs out debug information.
Args:
@@ -109,28 +130,23 @@ def load_preference_datasets(
Dataclass with fields for training and evaluation datasets and the computed
`total_num_steps`.
"""
tokenizer = load_tokenizer(cfg)
train_dataset, eval_dataset = prepare_preference_datasets(cfg, tokenizer)
train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
total_num_steps: Optional[int] = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
)
if cfg.rl is RLType.GRPO:
total_num_steps = None
total_num_steps: int | None = None
if cfg.rl is not RLType.GRPO:
total_num_steps = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
)
if (cli_args and cli_args.debug) or cfg.debug:
if cli_args.debug or cfg.debug:
LOG.info("check_dataset_labels...")
num_examples = cli_args.debug_num_examples if cli_args else 1
text_only = cli_args.debug_text_only if cli_args else False
tokenizer = load_tokenizer(cfg)
train_samples = sample_dataset(train_dataset, num_examples)
train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
check_dataset_labels(
dataset=train_samples,
tokenizer=tokenizer,
num_examples=num_examples,
text_only=text_only,
train_samples,
tokenizer,
num_examples=cli_args.debug_num_examples,
text_only=cli_args.debug_text_only,
rl_mode=True,
)

View File

@@ -380,16 +380,14 @@ class TrainerBuilderBase(abc.ABC):
)
# eval_strategy and eval_steps
if not self.eval_dataset and self.cfg.val_set_size == 0:
# do not eval if no eval_dataset and val_set_size=0
if not self.eval_dataset or self.cfg.val_set_size == 0:
# do not eval if no eval_dataset or val_set_size=0
training_args_kwargs["eval_strategy"] = "no"
elif self.cfg.eval_steps:
training_args_kwargs["eval_strategy"] = "steps"
training_args_kwargs["eval_steps"] = self.cfg.eval_steps
training_args_kwargs["eval_on_start"] = True
elif self.cfg.eval_strategy:
training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy
training_args_kwargs["eval_on_start"] = True
def _configure_reporting(self, training_args_kwargs: dict):
report_to = []
@@ -492,9 +490,6 @@ class TrainerBuilderBase(abc.ABC):
training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs
if self.cfg.dataset_processes:
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
# max_length is not used in CausalTrainer
if self.cfg.reward_model or self.cfg.rl:
training_args_kwargs["max_length"] = self.cfg.sequence_len

View File

@@ -21,12 +21,18 @@ from axolotl.core.trainers import (
AxolotlTrainer,
ReLoRATrainer,
)
from axolotl.core.training_args import (
AxolotlPRMConfig,
AxolotlRewardConfig,
AxolotlTrainingArguments,
)
from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
from axolotl.monkeypatch.relora import ReLoRACallback
from axolotl.processing_strategies import get_processing_strategy
from axolotl.utils import is_comet_available, is_mlflow_available
from axolotl.utils.callbacks import (
EvalFirstStepCallback,
LossWatchDogCallback,
SaveBetterTransformerModelCallback,
bench_eval_callback_factory,
@@ -57,6 +63,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
def get_callbacks(self):
callbacks = super().get_callbacks()
callbacks.append(EvalFirstStepCallback())
if self.cfg.relora_steps:
callbacks.append(ReLoRACallback(self.cfg))
@@ -123,9 +130,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return callbacks
def _get_trainer_cls(self):
"""
Gets the trainer class for the given configuration.
"""
if self.cfg.plugins:
plugin_manager = PluginManager.get_instance()
trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
@@ -142,12 +146,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return AxolotlTrainer
def build(self, total_num_steps):
from axolotl.core.training_args import (
AxolotlPRMConfig,
AxolotlRewardConfig,
AxolotlTrainingArguments,
)
training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
total_num_steps
)
@@ -316,12 +314,20 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs["image_resize_algorithm"] = (
self.cfg.image_resize_algorithm
)
if self.cfg.plugins:
plugin_manager = PluginManager.get_instance()
plugin_training_args = plugin_manager.get_training_args(self.cfg)
if plugin_training_args:
training_arguments_kwargs.update(plugin_training_args)
if self.cfg.kd_ce_alpha is not None:
training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
if self.cfg.kd_alpha is not None:
training_arguments_kwargs["kd_alpha"] = self.cfg.kd_alpha
if self.cfg.kd_temperature is not None:
training_arguments_kwargs["kd_temperature"] = self.cfg.kd_temperature
if self.cfg.kd_zscore_base_temp is not None:
training_arguments_kwargs["kd_zscore_base_temp"] = (
self.cfg.kd_zscore_base_temp
)
if self.cfg.kd_top_k_before_softmax is not None:
training_arguments_kwargs["kd_top_k_before_softmax"] = (
self.cfg.kd_top_k_before_softmax
)
if self.cfg.reward_model:
training_args_cls = AxolotlRewardConfig
@@ -375,7 +381,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
elif "tokenizer" in sig.parameters:
trainer_kwargs["tokenizer"] = self.tokenizer
if (
trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
not (trainer_cls in [AxolotlRewardTrainer, AxolotlPRMTrainer])
and self.cfg.datasets is not None
):
trainer_kwargs["dataset_tags"] = [
@@ -402,10 +408,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
return trainer
def build_collator(
self,
training_args, # type: "AxolotlTrainingArguments" # type: ignore
is_eval=False,
**kwargs,
self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs
):
if training_args.pretraining:
if (
@@ -434,19 +437,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
]
]
collator_args = [self.tokenizer]
collator_cls_and_kwargs = None
if self.cfg.plugins:
plugin_manager = PluginManager.get_instance()
collator_cls_and_kwargs = plugin_manager.get_collator_cls_and_kwargs(
self.cfg, is_eval=is_eval
)
if collator_cls_and_kwargs:
collator = collator_cls_and_kwargs[0]
if kwargs and isinstance(kwargs, dict):
kwargs.update(collator_cls_and_kwargs[1])
elif self.cfg.reward_model:
if self.cfg.reward_model:
collator = RewardDataCollatorWithPadding
elif use_batch_sampler_collator:
# Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
@@ -477,6 +468,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
collator_args.pop(0)
kwargs.pop("pad_to_multiple_of", None)
kwargs.pop("padding", None)
elif self.cfg.kd_trainer:
from axolotl.integrations.kd.collator import (
DataCollatorForKD,
KDBatchSamplerDataCollatorForSeq2Seq,
)
if self.cfg.sample_packing:
collator = KDBatchSamplerDataCollatorForSeq2Seq
else:
collator = DataCollatorForKD
else:
collator = DataCollatorForSeq2Seq

View File

@@ -12,9 +12,13 @@ from axolotl.core.trainers import (
from axolotl.core.trainers.dpo import DPOStrategy
from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
from axolotl.core.trainers.grpo import GRPOStrategy
from axolotl.core.training_args import (
AxolotlCPOConfig,
AxolotlKTOConfig,
AxolotlORPOConfig,
)
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import ensure_dtype
from axolotl.utils.callbacks.qat import QATCallback
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RLType
@@ -27,9 +31,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
def get_callbacks(self):
callbacks = super().get_callbacks()
if self.cfg.qat:
callbacks.append(QATCallback(self.cfg.qat))
return callbacks
def get_post_trainer_create_callbacks(self, trainer):
@@ -53,7 +54,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.rl is RLType.GRPO:
trainer_cls = GRPOStrategy.get_trainer_class(
sequence_parallel=self.cfg.sequence_parallel_degree > 1
context_parallel=self.cfg.context_parallel_degree > 1
)
trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
@@ -78,12 +79,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
"""
Returns training_args and trainer_kwargs
"""
from axolotl.core.training_args import (
AxolotlCPOConfig,
AxolotlKTOConfig,
AxolotlORPOConfig,
)
training_args_kwargs, trainer_kwargs = self._set_base_training_args(
total_num_steps=total_num_steps
)
@@ -95,6 +90,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
else:
training_args_kwargs["remove_unused_columns"] = False
# only rlhf
if self.cfg.dataset_processes:
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
if self.cfg.trl and self.cfg.trl.beta is not None:
training_args_kwargs["beta"] = self.cfg.trl.beta
elif self.cfg.rl_beta is not None:
@@ -143,7 +142,22 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
training_args_cls = AxolotlDPOConfig
training_args_kwargs.update(DPOStrategy.set_training_args_kwargs(self.cfg))
if self.cfg.rl is RLType.IPO:
training_args_kwargs["loss_type"] = "ipo"
# Not compatible with IPO
if self.cfg.rl is RLType.DPO and self.cfg.dpo_label_smoothing:
training_args_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
training_args_kwargs["max_completion_length"] = None
training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
if self.cfg.dpo_use_weighting is not None:
training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
if self.cfg.dpo_use_logits_to_keep is not None:
training_args_kwargs["use_logits_to_keep"] = (
self.cfg.dpo_use_logits_to_keep
)
else:
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
@@ -151,12 +165,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if blocklist_key in training_args_kwargs:
del training_args_kwargs[blocklist_key]
if self.cfg.plugins:
plugin_manager = PluginManager.get_instance()
plugin_training_args = plugin_manager.get_training_args(self.cfg)
if plugin_training_args:
training_args_kwargs.update(plugin_training_args)
training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
logging_first_step=True,
**training_args_kwargs,

View File

@@ -5,7 +5,7 @@
from .base import AxolotlTrainer
from .dpo.trainer import AxolotlDPOTrainer
from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer
from .grpo.trainer import AxolotlGRPOContextParallelTrainer, AxolotlGRPOTrainer
from .mamba import AxolotlMambaTrainer
from .relora import ReLoRATrainer
from .trl import (

View File

@@ -7,11 +7,13 @@ from __future__ import annotations
import os
from collections import defaultdict
from functools import partial, wraps
from typing import Callable, Literal, Optional
from typing import Any, Callable, Literal, Optional
from axolotl.utils.ctx_managers.context_parallel.distributed import get_context_parallel_manager
import datasets
import torch
from datasets import Dataset
from torch import nn
from torch.utils.data import (
BatchSampler,
DataLoader,
@@ -25,7 +27,6 @@ from trl.trainer.utils import pad_to_length
from typing_extensions import override
from axolotl.core.trainers.mixins import (
CheckpointSaveMixin,
OptimizerMixin,
RngLoaderMixin,
SchedulerMixin,
@@ -34,16 +35,13 @@ from axolotl.core.trainers.utils import (
sanitize_kwargs_for_ds_tagging,
sanitize_kwargs_for_tagging,
)
from axolotl.utils import get_not_null
from axolotl.utils.logging import get_logger
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
LOG = get_logger(__name__)
class AxolotlTrainer(
SchedulerMixin, OptimizerMixin, RngLoaderMixin, CheckpointSaveMixin, Trainer
):
class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
"""Extend the base Trainer for axolotl helpers"""
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
@@ -69,6 +67,32 @@ class AxolotlTrainer(
if self.args.orpo_alpha:
self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
# SDPA device mesh init (dp x cp mesh; cp degree fixed at 2 for now)
import torch.distributed as dist
world_size = dist.get_world_size()
mesh_shape = (
world_size // 2,
2,
)
self.world_mesh = dist.DeviceMesh(
"cuda",
torch.tensor(list(range(world_size))).reshape(mesh_shape),
mesh_dim_names=("dp", "cp"),
)
def training_step(
self, model: nn.Module, inputs: dict[str, torch.Tensor | Any], num_items_in_batch=None
) -> torch.Tensor:
ctx_manager = get_context_parallel_manager(
world_mesh=self.world_mesh,
model=model,
)
# only shard tensors that have a sequence dimension
to_shard = {k: v for k, v in inputs.items() if isinstance(v, torch.Tensor) and v.ndim > 1}
with ctx_manager(list(to_shard.values())):
return super().training_step(model, inputs, num_items_in_batch)
def _wrap_model(self, model, training=True, dataloader=None):
if self.args.torch_compile:
torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
@@ -105,7 +129,7 @@ class AxolotlTrainer(
)
batch_max_len = train_batch_size * self.args.max_seq_length
sampler = MultipackBatchSampler(
return MultipackBatchSampler(
base_sampler,
lengths=get_dataset_lengths(dataset),
packing_efficiency_estimate=self.args.sample_packing_efficiency,
@@ -115,12 +139,8 @@ class AxolotlTrainer(
bin_size=self.args.sample_packing_bin_size,
sequential=self.args.sample_packing_sequentially,
drop_last=True,
num_processes=self.args.dataset_num_proc,
)
len(sampler)  # eagerly compute the sampler length before training
return sampler
def _get_train_sampler(
self, train_dataset: Optional[Dataset] = None
) -> Optional[Sampler]:
@@ -228,9 +248,7 @@ class AxolotlTrainer(
}
if not isinstance(dataset, torch.utils.data.IterableDataset):
dataloader_params["drop_last"] = get_not_null(
self.args.dataloader_drop_last, True
)
dataloader_params["drop_last"] = self.args.dataloader_drop_last
if sampler_fn is not None:
sampler = sampler_fn(dataset)
if isinstance(sampler, BatchSampler):
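The `training_step` override above wraps the parent call in a context manager that shards every sequence-shaped input across the "cp" dimension of the device mesh. As a rough, illustrative sketch of what that split amounts to (this is not the `get_context_parallel_manager` helper from this diff; `shard_for_cp` and `seq_dim=1` are assumptions, and the attention-side handling that real SDPA context parallelism needs is omitted):

```python
import torch


def shard_for_cp(inputs: dict[str, torch.Tensor], world_mesh, seq_dim: int = 1):
    """Return the slice of each sequence-dimension tensor owned by this rank's CP group."""
    cp_mesh = world_mesh["cp"]
    cp_rank, cp_size = cp_mesh.get_local_rank(), cp_mesh.size()
    sharded = {}
    for name, tensor in inputs.items():
        if isinstance(tensor, torch.Tensor) and tensor.ndim > 1:
            # split the sequence dimension into cp_size contiguous chunks
            sharded[name] = tensor.chunk(cp_size, dim=seq_dim)[cp_rank].contiguous()
        else:
            sharded[name] = tensor
    return sharded
```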

View File

@@ -22,19 +22,10 @@ class DPOStrategy:
training_args_kwargs = {}
if cfg.rl is RLType.IPO:
training_args_kwargs["loss_type"] = "ipo"
# Label smoothing is not compatible with IPO
if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
training_args_kwargs["max_completion_length"] = None
training_args_kwargs["max_length"] = cfg.sequence_len
training_args_kwargs["max_completion_length"] = None
training_args_kwargs["max_prompt_length"] = cfg.sequence_len
training_args_kwargs["generate_during_eval"] = cfg.use_wandb
if cfg.dpo_use_weighting is not None:
training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
if cfg.dpo_padding_free is not None:
training_args_kwargs["padding_free"] = cfg.dpo_padding_free
if cfg.dpo_norm_loss is not None:
training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss
if cfg.dpo_use_logits_to_keep is not None:
training_args_kwargs["use_logits_to_keep"] = cfg.dpo_use_logits_to_keep
return training_args_kwargs

View File

@@ -14,5 +14,3 @@ class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig):
"""
DPO config for DPO training
"""
dpo_norm_loss: bool | None = False

View File

@@ -83,20 +83,3 @@ class AxolotlDPOTrainer(
gc.collect()
torch.cuda.empty_cache()
return loss
def concatenated_forward(
self,
model: nn.Module,
batch: dict[str, Union[list, torch.LongTensor]],
is_ref_model: bool = False,
) -> dict[str, torch.Tensor]:
if self.args.dpo_norm_loss:
# fmt: off
loss_type: str = self.loss_type # type: ignore[has-type] # pylint: disable=access-member-before-definition
# fmt: on
# concatenated_forward handles avg token logprob for ipo case already
self.loss_type = "ipo" # pylint: disable=attribute-defined-outside-init
res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
self.loss_type = loss_type # pylint: disable=attribute-defined-outside-init
return res
return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
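The `loss_type` swap above matters because the IPO branch of `concatenated_forward` averages per-token log-probabilities instead of summing them, which removes the completion-length bias that `dpo_norm_loss` is meant to address. A toy illustration with made-up numbers:

```python
import torch

logps = torch.tensor([-1.2, -0.8, -1.0, -0.9])  # per-token log-probs of one completion
summed = logps.sum()     # default (sigmoid) behavior: magnitude grows with length
averaged = logps.mean()  # ipo behavior: length-normalized
print(summed.item(), averaged.item())  # -3.9 vs -0.975
```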

View File

@@ -8,7 +8,7 @@ from trl.trainer.grpo_trainer import RewardFunc
from axolotl.core.trainers.grpo.args import AxolotlGRPOConfig
from axolotl.core.trainers.grpo.trainer import (
AxolotlGRPOSequenceParallelTrainer,
AxolotlGRPOContextParallelTrainer,
AxolotlGRPOTrainer,
)
from axolotl.utils.dict import DictDefault
@@ -23,10 +23,10 @@ class GRPOStrategy:
@classmethod
def get_trainer_class(
cls, sequence_parallel: bool
) -> type[AxolotlGRPOTrainer] | type[AxolotlGRPOSequenceParallelTrainer]:
if sequence_parallel:
return AxolotlGRPOSequenceParallelTrainer
cls, context_parallel: bool
) -> type[AxolotlGRPOTrainer] | type[AxolotlGRPOContextParallelTrainer]:
if context_parallel:
return AxolotlGRPOContextParallelTrainer
return AxolotlGRPOTrainer
@classmethod
@@ -69,8 +69,8 @@ class GRPOStrategy:
grpo_args_kwargs["log_completions"] = trl.log_completions
grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print
if cfg.sequence_parallel_degree > 1:
grpo_args_kwargs["sequence_parallel_degree"] = cfg.sequence_parallel_degree
if cfg.context_parallel_degree > 1:
grpo_args_kwargs["context_parallel_degree"] = cfg.context_parallel_degree
if trl.reward_weights:
grpo_args_kwargs["reward_weights"] = trl.reward_weights

View File

@@ -13,4 +13,4 @@ from axolotl.core.training_args import AxolotlTrainingMixins
class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
"""Axolotl GRPO Config for GRPO training"""
sequence_parallel_degree: int | None = None
context_parallel_degree: int | None = None

View File

@@ -1,7 +1,7 @@
"""Repeat random sampler (similar to the one implemented in
https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
sequence parallelism functionality; i.e., duplicating data across ranks in the same
sequence parallel group.
context parallelism functionality; i.e., duplicating data across ranks in the same
context parallel group.
"""
from typing import Iterator, Sized
@@ -10,26 +10,26 @@ import torch
from torch.utils.data import Sampler
class SequenceParallelRepeatRandomSampler(Sampler):
"""Sampler for GRPO training with sequence parallelism.
class ContextParallelRepeatRandomSampler(Sampler):
"""Sampler for GRPO training with context parallelism.
This sampler ensures:
- Ranks in the same sequence parallel (SP) group receive identical data.
- Ranks in the same context parallel (CP) group receive identical data.
- Each index is repeated multiple times for sampling different completions.
- Entire batches are repeated for reuse in multiple updates.
- Data is properly distributed across SP groups.
- Data is properly distributed across CP groups.
In the table below, the values represent dataset indices. Each SP group has
`sequence_parallel_degree = 2` GPUs working together on the same data. There are 2
SP groups (SP0 and SP1), with `world_size = 4` total GPUs.
In the table below, the values represent dataset indices. Each CP group has
`context_parallel_degree = 2` GPUs working together on the same data. There are 2
CP groups (CP0 and CP1), with `world_size = 4` total GPUs.
Sequence Parallel Groups
Context Parallel Groups
| CP0 | CP1 |
| GPU 0 | GPU 1 | GPU 2 | GPU 3 |
global_step step <---> mini_repeat_count=3
<----------> batch_size=2 per SP group
grad_accum=2 ▲ ▲ 0 0 [0 0 0 1 1 1] [2 2 2 3 3 3] <- SP groups get different data
▼ | 0 1 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Same data for each SP group GPU
<----------> batch_size=2 per CP group
grad_accum=2 ▲ ▲ 0 0 [0 0 0 1 1 1] [2 2 2 3 3 3] <- CP groups get different data
▼ | 0 1 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Same data for each CP group GPU
|
| 1 2 [0 0 0 1 1 1] [2 2 2 3 3 3] <- Repeat same indices for iterations
num_iterations=2 ▼ 1 3 [0 0 0 1 1 1] [2 2 2 3 3 3] <- When using gradient accumulation
@@ -45,7 +45,7 @@ class SequenceParallelRepeatRandomSampler(Sampler):
rank: Rank of current process.
batch_size: Number of samples per batch.
repeat_count: How many times to repeat the full sampling process.
sequence_parallel_degree: Number of ranks in a sequence parallel group.
context_parallel_degree: Number of ranks in a context parallel group.
shuffle: Whether to shuffle the dataset.
seed: Random seed for shuffling.
drop_last: Whether to drop the last incomplete batch.
@@ -59,7 +59,7 @@ class SequenceParallelRepeatRandomSampler(Sampler):
rank: int,
batch_size: int = 1,
repeat_count: int = 1,
sequence_parallel_degree: int = 1,
context_parallel_degree: int = 1,
shuffle: bool = True,
seed: int = 0,
drop_last: bool = False,
@@ -76,16 +76,16 @@ class SequenceParallelRepeatRandomSampler(Sampler):
self.world_size = world_size
self.rank = rank
# Sequence parallelism parameters
self.sequence_parallel_degree = sequence_parallel_degree
self.num_sp_groups = world_size // sequence_parallel_degree
self.sp_group_id = rank // sequence_parallel_degree
# Context parallelism parameters
self.context_parallel_degree = context_parallel_degree
self.num_sp_groups = world_size // context_parallel_degree
self.sp_group_id = rank // context_parallel_degree
# Adjust dataset size for distributed sampling
self.num_samples = len(self.dataset)
self.total_size = self.num_samples
# Calculate effective number of samples per SP group
# Calculate effective number of samples per CP group
if (
self.drop_last
and self.total_size % (self.num_sp_groups * self.batch_size) != 0
@@ -125,8 +125,8 @@ class SequenceParallelRepeatRandomSampler(Sampler):
padding = indices[: self.batch_size - len(indices) % self.batch_size]
indices += padding
# Subsample based on SP group ID
# Each SP group gets distinct batches of data
# Subsample based on CP group ID
# Each CP group gets distinct batches of data
batch_indices = []
for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
start_idx = i + self.sp_group_id * self.batch_size
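A quick way to sanity-check the subsampling arithmetic above is a toy loop with made-up sizes, mirroring the docstring's example of 4 GPUs and a context parallel degree of 2:

```python
# Illustrative only: reproduce the CP-group index selection with toy values.
world_size, cp_degree, batch_size = 4, 2, 2
num_cp_groups = world_size // cp_degree  # 2 CP groups
indices = list(range(8))                 # toy dataset indices

for rank in range(world_size):
    cp_group_id = rank // cp_degree
    batch_indices = []
    for i in range(0, len(indices), batch_size * num_cp_groups):
        start_idx = i + cp_group_id * batch_size
        batch_indices.extend(indices[start_idx : start_idx + batch_size])
    print(f"rank {rank} (CP group {cp_group_id}): {batch_indices}")
# ranks 0/1 both see [0, 1, 4, 5]; ranks 2/3 both see [2, 3, 6, 7]
```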

View File

@@ -1,9 +1,8 @@
"""Axolotl GRPO trainers (with and without sequence parallelism handling)"""
"""Axolotl GRPO trainers (with and without context parallelism handling)"""
# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
import warnings
from functools import partial
from typing import Any
import datasets
@@ -42,7 +41,7 @@ from trl.trainer.grpo_config import GRPOConfig
from trl.trainer.grpo_trainer import RewardFunc, nanstd
from trl.trainer.utils import pad
from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
from axolotl.core.trainers.grpo.sampler import ContextParallelRepeatRandomSampler
from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
from axolotl.monkeypatch.ring_attn import get_ring_attn_group
@@ -59,45 +58,9 @@ class AxolotlGRPOTrainer(
_tag_names = ["trl", "grpo", "axolotl"]
def get_train_dataloader(self):
if self.train_dataset is None:
raise ValueError("Trainer: training requires a train_dataset.")
train_dataset = self.train_dataset
data_collator = self.data_collator
if isinstance(train_dataset, datasets.Dataset):
train_dataset = self._remove_unused_columns(
train_dataset, description="training"
)
else:
data_collator = self._get_collator_with_removed_columns(
data_collator, description="training"
)
dataloader_params = {
"batch_size": self._train_batch_size
* self.args.steps_per_generation, # < this is the change
"collate_fn": data_collator,
"num_workers": self.args.dataloader_num_workers,
"pin_memory": self.args.dataloader_pin_memory,
"persistent_workers": self.args.dataloader_persistent_workers,
}
if not isinstance(train_dataset, torch.utils.data.IterableDataset):
dataloader_params["sampler"] = self._get_train_sampler()
dataloader_params["drop_last"] = self.args.dataloader_drop_last
dataloader_params["worker_init_fn"] = partial(
seed_worker,
num_workers=self.args.dataloader_num_workers,
rank=self.args.process_index,
)
dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
"""Extend the base GRPOTrainer for sequence parallelism handling"""
class AxolotlGRPOContextParallelTrainer(AxolotlGRPOTrainer):
"""Extend the base GRPOTrainer for context parallelism handling"""
def __init__(
self,
@@ -134,11 +97,11 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
)
# Get number of SP groups (number of processes divided by SP degree)
# Get number of CP groups (number of processes divided by CP degree)
num_processes = self.accelerator.num_processes
num_sp_groups = num_processes // self.args.sequence_parallel_degree
num_sp_groups = num_processes // self.args.context_parallel_degree
# Calculate batch size per SP group (not per process)
# Calculate batch size per CP group (not per process)
sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
possible_values = [
n_gen
@@ -148,7 +111,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
if self.num_generations not in possible_values:
raise ValueError(
f"The batch size per SP group ({num_sp_groups} x "
f"The batch size per CP group ({num_sp_groups} x "
f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
f"the number of generations per prompt ({self.num_generations}). Given "
"the current configuration, the valid values for the number of "
@@ -156,7 +119,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
)
if self.args.eval_strategy != "no":
# If sequence parallelism is enabled, calculate batch size per SP group
# If context parallelism is enabled, calculate batch size per CP group
sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups # type: ignore[union-attr]
possible_values = [
n_gen
@@ -166,8 +129,8 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
if self.num_generations not in possible_values:
raise ValueError(
f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), "
f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
f"With context parallelism (degree {self.args.context_parallel_degree}), "
f"the eval batch size per CP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
f"must be evenly divisible by the number of generations per prompt "
f"({self.num_generations}). Given the current eval batch size, "
f"the valid values for the number of generations are: {possible_values}."
@@ -180,7 +143,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
self.local_world_size = 1
def train(self, *args, **kwargs):
# Initialize the SP group
# Initialize the CP group
self.sp_group = get_ring_attn_group()
self.rank = dist.get_rank()
self.world_size = dist.get_world_size()
@@ -196,16 +159,16 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
* self.args.gradient_accumulation_steps
)
return SequenceParallelRepeatRandomSampler(
return ContextParallelRepeatRandomSampler(
dataset=self.train_dataset,
mini_repeat_count=self.num_generations,
world_size=self.world_size,
rank=self.rank,
batch_size=effective_batch_size
// self.num_generations
// self.args.sequence_parallel_degree,
// self.args.context_parallel_degree,
repeat_count=self.num_iterations * self.args.gradient_accumulation_steps,
sequence_parallel_degree=self.args.sequence_parallel_degree,
context_parallel_degree=self.args.context_parallel_degree,
shuffle=True,
seed=self.args.seed,
drop_last=True,
@@ -263,11 +226,11 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
):
self.accelerator.even_batches = False
# Return unprepared dataloader if using sequence parallelism
# Return unprepared dataloader if using context parallelism
# TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
# if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
# slice each batch along the sequence dimension).
if self.args.sequence_parallel_degree > 1:
if self.args.context_parallel_degree > 1:
return dataloader
# Otherwise prepare with accelerator
@@ -340,21 +303,21 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
# Generate completions using vLLM: gather all prompts and use them in a single call in the main process
all_prompts_text = gather_object(prompts_text)
if self.accelerator.is_main_process:
if self.args.sequence_parallel_degree > 1:
# Calculate sequence parallel group information
if self.args.context_parallel_degree > 1:
# Calculate context parallel group information
world_size = self.accelerator.num_processes
sequence_parallel_degree = self.args.sequence_parallel_degree
num_sp_groups = world_size // sequence_parallel_degree
context_parallel_degree = self.args.context_parallel_degree
num_sp_groups = world_size // context_parallel_degree
# Since processes in the same SP group have the same prompts, we need to ensure
# we only take one copy of each prompt from each SP group
# Since processes in the same CP group have the same prompts, we need to ensure
# we only take one copy of each prompt from each CP group
ordered_set_of_prompts = []
for sp_group_id in range(num_sp_groups):
# Get the first process from each SP group (typically the group leader)
group_leader_rank = sp_group_id * sequence_parallel_degree
# Get the first process from each CP group (typically the group leader)
group_leader_rank = sp_group_id * context_parallel_degree
# Extract prompts from this SP group, accounting for num_generations duplicates
# We only need prompts from one rank in each SP group
# Extract prompts from this CP group, accounting for num_generations duplicates
# We only need prompts from one rank in each CP group
group_prompts = all_prompts_text[
group_leader_rank
* len(prompts_text) : (group_leader_rank + 1)
@@ -367,7 +330,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
# num_generations outputs for each one. This is faster than generating outputs for each duplicate
# prompt individually.
ordered_set_of_prompts = all_prompts_text[
:: self.num_generations * self.args.sequence_parallel_degree
:: self.num_generations * self.args.context_parallel_degree
]
with profiling_context(self, "vLLM.generate"):
@@ -384,28 +347,28 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
)
else:
completion_ids = [None] * (
len(all_prompts_text) // self.args.sequence_parallel_degree
len(all_prompts_text) // self.args.context_parallel_degree
)
# Broadcast the completions from the main process to all processes
completion_ids = broadcast_object_list(completion_ids, from_process=0)
# Determine the appropriate slice based on sequence parallelism
if self.args.sequence_parallel_degree > 1:
# Calculate SP group ID (which group of ranks this rank belongs to)
# Determine the appropriate slice based on context parallelism
if self.args.context_parallel_degree > 1:
# Calculate CP group ID (which group of ranks this rank belongs to)
sp_group_id = self.accelerator.process_index // self.local_world_size
# Calculate the start index for this SP group
# Calculate the start index for this CP group
sp_group_start = sp_group_id * len(prompts) * self.local_world_size
# All ranks in the same SP group get the same data slice
# All ranks in the same CP group get the same data slice
process_slice = slice(
sp_group_start,
sp_group_start + len(prompts),
)
completion_ids = completion_ids[process_slice]
else:
# Original behavior for non-sequence parallel case
# Original behavior for non-context parallel case
process_slice = slice(
self.accelerator.process_index * len(prompts),
(self.accelerator.process_index + 1) * len(prompts),
@@ -615,20 +578,20 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
advantages = advantages / (std_grouped_rewards + 1e-4)
# Slice to keep only the local part of the data
if self.args.sequence_parallel_degree > 1:
# Calculate SP group ID (which group of ranks this rank belongs to)
if self.args.context_parallel_degree > 1:
# Calculate CP group ID (which group of ranks this rank belongs to)
sp_group_id = self.accelerator.process_index // self.local_world_size
# Calculate the start index for this SP group
# Calculate the start index for this CP group
sp_group_start = sp_group_id * len(prompts) * self.local_world_size
# All ranks in the same SP group get the same data slice
# All ranks in the same CP group get the same data slice
process_slice = slice(
sp_group_start,
sp_group_start + len(prompts),
)
else:
# Original behavior for non-sequence parallel case
# Original behavior for non-context parallel case
process_slice = slice(
self.accelerator.process_index * len(prompts),
(self.accelerator.process_index + 1) * len(prompts),
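The slice computation above (also used for the completion broadcast earlier in this file) reduces to a small helper. The sketch below is illustrative only and assumes `local_world_size` equals the context parallel degree, as the trainer sets it when CP is enabled:

```python
def completion_slice(process_index: int, n_prompts: int, cp_degree: int = 1) -> slice:
    """Which slice of a gathered, world-sized list this rank should keep."""
    if cp_degree > 1:
        # every rank in a CP group keeps the same slice
        cp_group_id = process_index // cp_degree
        start = cp_group_id * n_prompts * cp_degree
        return slice(start, start + n_prompts)
    # non-CP case: each rank keeps its own slice
    return slice(process_index * n_prompts, (process_index + 1) * n_prompts)


# e.g. with 4 ranks, cp_degree=2 and 3 prompts per rank:
# ranks 0/1 -> slice(0, 3), ranks 2/3 -> slice(6, 9)
```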

View File

@@ -3,7 +3,6 @@
# pylint: disable=unused-import
# flake8: noqa
from .checkpoints import CheckpointSaveMixin
from .optimizer import OptimizerMixin
from .rng_state_loader import RngLoaderMixin
from .scheduler import SchedulerMixin

View File

@@ -1,21 +0,0 @@
"""Custom handling to not fail training if fsdp optimizer is not savable"""
from transformers import Trainer
from axolotl.utils.logging import get_logger
LOG = get_logger(__name__)
class CheckpointSaveMixin(Trainer):
"""Mixin to handle saving the optimizer and scheduler if they are not savable."""
def _save_optimizer_and_scheduler(self, output_dir):
try:
super()._save_optimizer_and_scheduler(output_dir)
except NotImplementedError as exc:
LOG.warning(
f"Trainer does not support saving optimizer and scheduler: {exc}\n"
"Optimizer and scheduler states were not saved - resuming from checkpoints "
"for this training run will not be possible."
)

View File

@@ -2,17 +2,238 @@
extra axolotl specific training args
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional, Type
from typing import Optional
from PIL.Image import Resampling
from transformers import TrainingArguments
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
from axolotl.integrations.config import merge_training_args
AxolotlTrainingMixins: Type = merge_training_args()
@dataclass
class AxolotlTrainingMixins:
"""
Mixin class for the Axolotl training args.
"""
# pylint: disable=duplicate-code
model_type: Optional[str] = field(
default=None, metadata={"help": "HF model configuration model_type."}
)
lr_quadratic_warmup: bool = field(
default=False,
metadata={"help": "Use quadratic warmup for cosine scheduling."},
)
pretraining: bool = field(
default=False,
metadata={
"help": "Indicates to trainer whether we are doing continued pretraining."
},
)
sample_packing: bool = field(
default=False,
metadata={"help": "Use sample packing for efficient training."},
)
sample_packing_sequentially: bool = field(
default=False,
metadata={
"help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
},
)
multipack_real_batches: bool = field(
default=False,
metadata={"help": "Use real batches for efficient training."},
)
eval_sample_packing: Optional[bool] = field(
default=None,
metadata={"help": "Use sample packing for efficient evals."},
)
sample_packing_efficiency: float = field(
default=1.0,
metadata={"help": "Sample packing efficiency for calculating batch length."},
)
sample_packing_bin_size: int = field(
default=200,
metadata={
"help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
},
)
sample_packing_group_size: int = field(
default=100000,
metadata={
"help": "The number of samples to group together for packing. Increase for better packing."
},
)
max_seq_length: int = field(
default=2048,
metadata={"help": "The maximum sequence length the model can handle"},
)
relora_steps: Optional[int] = field(
default=None,
metadata={"help": "how often to reset for ReLoRA"},
)
relora_warmup_steps: Optional[int] = field(
default=None,
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
)
relora_anneal_steps: Optional[int] = field(
default=None,
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
)
relora_prune_ratio: Optional[float] = field(
default=0.9,
metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
)
bench_split: Optional[str] = field(
default="eval", metadata={"help": "The benchmark split to run on"}
)
bench_dataset: Optional[str] = field(
default="pharaouk/dharma-1/dharma_1_mini.json",
metadata={
"help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
},
)
do_bench_eval: Optional[bool] = field(
default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
)
do_causal_lm_eval: Optional[bool] = field(
default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
)
max_bench_samples: Optional[int] = field(
default=None,
metadata={
"help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
},
)
bench_source_max_len: int = field(
default=2048, metadata={"help": "Maximum source sequence length for bench."}
)
dataloader_prefetch_factor: Optional[int] = field(
default=None,
metadata={"help": "prefetch_factor argument to the dataloader"},
)
cosine_min_lr_ratio: Optional[float] = field(
default=None,
metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
)
cosine_constant_lr_ratio: Optional[float] = field(
default=None,
metadata={
"help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
},
)
loraplus_lr_ratio: Optional[float] = field(
default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
)
loraplus_lr_embedding: Optional[float] = field(
default=1e-6,
metadata={"help": "loraplus learning rate for lora embedding layers."},
)
embedding_lr_scale: Optional[float] = field(
default=None,
metadata={"help": "Scale the learning rate for the embedding layers."},
)
lr_groups: Optional[list[dict]] = field(
default=None,
metadata={"help": "Specify learning rate groups for with different LRs."},
)
embedding_lr: Optional[float] = field(
default=None,
metadata={"help": "absolute learning rate for the embedding layers."},
)
qlora: bool = field(
default=False,
metadata={"help": "whether this is a qlora training"},
)
orpo_alpha: Optional[float] = field(
default=None,
)
lisa_n_layers: Optional[int] = field(
default=None,
metadata={"help": "the number of activate layers in LISA"},
)
lisa_step_interval: Optional[int] = field(
default=None,
metadata={"help": "how often to switch layers in LISA"},
)
lisa_layers_attribute: Optional[str] = field(
default=None,
metadata={"help": "path under the model to access the layers"},
)
curriculum_sampling: Optional[bool] = field(
default=None,
metadata={"help": "whether to use sequential sampling for curriculum learning"},
)
alternate_lr_scheduler_type: Optional[str] = field(
default=None,
metadata={
"help": "workaround to pass an alternate lr scheduler to the HF trainer"
},
)
chat_template: Optional[str] = field(
default=None,
metadata={"help": "Chat template converting chat messages to text"},
)
kd_ce_alpha: Optional[float] = field(
default=None,
metadata={
"help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
},
)
kd_alpha: Optional[float] = field(
default=1.0,
metadata={"help": "The alpha scaling parameter for KD loss"},
)
kd_temperature: Optional[float] = field(
default=1.0,
metadata={
"help": "the temperature parameter for KL divergence loss when using KD"
},
)
kd_zscore_base_temp: Optional[float] = field(
default=None,
metadata={
"help": "the base temperature parameter for KL divergence with z-score when using KD"
},
)
kd_top_k_before_softmax: Optional[bool] = field(
default=None,
metadata={
"help": "Whether to apply top_k_before_softmax to the logits when using KD"
},
)
adam_beta3: Optional[float] = field(
default=None,
metadata={
"help": "The beta3 hyperparameter used in some optimizers such as CAME"
},
)
adam_epsilon2: Optional[float] = field(
default=None,
metadata={
"help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
},
)
# multi-modal section
image_size: int | tuple[int, int] | None = field(
default=None,
metadata={"help": "The size of the image to resize to"},
)
image_resize_algorithm: Resampling | None = field(
default=None,
metadata={"help": "The algorithm to use for image resizing"},
)
# end of multi-modal section
@dataclass

View File

@@ -1,224 +0,0 @@
"""
Base Axolotl Training Mixins shared across various trainer configs
"""
from dataclasses import dataclass, field
from typing import Optional
from PIL.Image import Resampling
@dataclass
class AxolotlTrainingMixins:
"""
Mixin class for the Axolotl training args.
"""
# pylint: disable=duplicate-code
model_type: Optional[str] = field(
default=None, metadata={"help": "HF model configuration model_type."}
)
lr_quadratic_warmup: bool = field(
default=False,
metadata={"help": "Use quadratic warmup for cosine scheduling."},
)
pretraining: bool = field(
default=False,
metadata={
"help": "Indicates to trainer whether we are doing continued pretraining."
},
)
sample_packing: bool = field(
default=False,
metadata={"help": "Use sample packing for efficient training."},
)
sample_packing_sequentially: bool = field(
default=False,
metadata={
"help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
},
)
multipack_real_batches: bool = field(
default=False,
metadata={"help": "Use real batches for efficient training."},
)
eval_sample_packing: Optional[bool] = field(
default=None,
metadata={"help": "Use sample packing for efficient evals."},
)
sample_packing_efficiency: float = field(
default=1.0,
metadata={"help": "Sample packing efficiency for calculating batch length."},
)
sample_packing_bin_size: int = field(
default=200,
metadata={
"help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
},
)
sample_packing_group_size: int = field(
default=100000,
metadata={
"help": "The number of samples to group together for packing. Increase for better packing."
},
)
max_seq_length: int = field(
default=2048,
metadata={"help": "The maximum sequence length the model can handle"},
)
dataset_num_proc: int | None = field(
default=None,
metadata={"help": "The number of processes to use for data processing"},
)
relora_steps: Optional[int] = field(
default=None,
metadata={"help": "how often to reset for ReLoRA"},
)
relora_warmup_steps: Optional[int] = field(
default=None,
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
)
relora_anneal_steps: Optional[int] = field(
default=None,
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
)
relora_prune_ratio: Optional[float] = field(
default=0.9,
metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
)
bench_split: Optional[str] = field(
default="eval", metadata={"help": "The benchmark split to run on"}
)
bench_dataset: Optional[str] = field(
default="pharaouk/dharma-1/dharma_1_mini.json",
metadata={
"help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
},
)
do_bench_eval: Optional[bool] = field(
default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
)
do_causal_lm_eval: Optional[bool] = field(
default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
)
max_bench_samples: Optional[int] = field(
default=None,
metadata={
"help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
},
)
bench_source_max_len: int = field(
default=2048, metadata={"help": "Maximum source sequence length for bench."}
)
dataloader_prefetch_factor: Optional[int] = field(
default=None,
metadata={"help": "prefetch_factor argument to the dataloader"},
)
cosine_min_lr_ratio: Optional[float] = field(
default=None,
metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
)
cosine_constant_lr_ratio: Optional[float] = field(
default=None,
metadata={
"help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
},
)
loraplus_lr_ratio: Optional[float] = field(
default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
)
loraplus_lr_embedding: Optional[float] = field(
default=1e-6,
metadata={"help": "loraplus learning rate for lora embedding layers."},
)
embedding_lr_scale: Optional[float] = field(
default=None,
metadata={"help": "Scale the learning rate for the embedding layers."},
)
lr_groups: Optional[list[dict]] = field(
default=None,
metadata={"help": "Specify learning rate groups for with different LRs."},
)
embedding_lr: Optional[float] = field(
default=None,
metadata={"help": "absolute learning rate for the embedding layers."},
)
qlora: bool = field(
default=False,
metadata={"help": "whether this is a qlora training"},
)
orpo_alpha: Optional[float] = field(
default=None,
)
lisa_n_layers: Optional[int] = field(
default=None,
metadata={"help": "the number of activate layers in LISA"},
)
lisa_step_interval: Optional[int] = field(
default=None,
metadata={"help": "how often to switch layers in LISA"},
)
lisa_layers_attribute: Optional[str] = field(
default=None,
metadata={"help": "path under the model to access the layers"},
)
curriculum_sampling: Optional[bool] = field(
default=None,
metadata={"help": "whether to use sequential sampling for curriculum learning"},
)
alternate_lr_scheduler_type: Optional[str] = field(
default=None,
metadata={
"help": "workaround to pass an alternate lr scheduler to the HF trainer"
},
)
chat_template: Optional[str] = field(
default=None,
metadata={"help": "Chat template converting chat messages to text"},
)
# kd_ce_alpha: Optional[float] = field(
# default=None,
# metadata={
# "help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
# },
# )
#
# kd_alpha: Optional[float] = field(
# default=1.0,
# metadata={"help": "The alpha scaling parameter for KD loss"},
# )
#
# kd_temperature: Optional[float] = field(
# default=1.0,
# metadata={
# "help": "the temperature parameter for KL divergence loss when using KD"
# },
# )
adam_beta3: Optional[float] = field(
default=None,
metadata={
"help": "The beta3 hyperparameter used in some optimizers such as CAME"
},
)
adam_epsilon2: Optional[float] = field(
default=None,
metadata={
"help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
},
)
# multi-modal section
image_size: int | tuple[int, int] | None = field(
default=None,
metadata={"help": "The size of the image to resize to"},
)
image_resize_algorithm: Resampling | None = field(
default=None,
metadata={"help": "The algorithm to use for image resizing"},
)
# end of multi-modal section

View File

@@ -1,6 +1,7 @@
"""Module containing Dataset functionality"""
import os
from typing import List, Optional, Union
import torch
from datasets import Dataset, IterableDataset
@@ -19,21 +20,21 @@ LOG = get_logger(__name__)
class TokenizedPromptDataset(Dataset):
"""Dataset that returns tokenized prompts from a stream of text files.
Args:
prompt_tokenizer: The prompt tokenizing method for processing the data.
dataset: Dataset with text files.
process_count: Number of processes to use for tokenizing.
keep_in_memory: Whether to keep the tokenized dataset in memory.
"""
Dataset that returns tokenized prompts from a stream of text files.
Args:
prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
dataset (dataset.Dataset): Dataset with text files.
process_count (int): Number of processes to use for tokenizing.
keep_in_memory (bool): Whether to keep the tokenized dataset in memory.
"""
def __init__( # pylint: disable=super-init-not-called
self,
prompt_tokenizer: PromptTokenizingStrategy,
dataset: Dataset,
process_count: int | None = None,
keep_in_memory: bool | None = False,
process_count: Optional[int] = None,
keep_in_memory: Optional[bool] = False,
**kwargs,
):
self.prompt_tokenizer = prompt_tokenizer
@@ -48,13 +49,6 @@ class TokenizedPromptDataset(Dataset):
features = dataset.features.keys()
num_proc = min(64, self.process_count if self.process_count else os.cpu_count())
# Disable multiprocessing if the tokenizer doesn't support it (e.g., mistral_common)
if not getattr(self.prompt_tokenizer, "supports_multiprocessing", True):
LOG.info(
"Disabling multiprocessing for tokenizer as it doesn't support it (e.g., mistral_common)"
)
num_proc = 1
map_kwargs = {}
if self.prompt_tokenizer.supports_batched:
map_kwargs["batched"] = True
@@ -82,14 +76,14 @@ class TokenizedPromptDataset(Dataset):
def wrap_dataset_for_tokenized_prompt(
prompt_tokenizer: PromptTokenizingStrategy,
dataset: Dataset | IterableDataset,
dataset: Union[Dataset, IterableDataset],
**kwargs,
):
if isinstance(dataset, IterableDataset):
map_kwargs = {}
if prompt_tokenizer.supports_batched:
map_kwargs["batched"] = True
features = list(dataset.features.keys())
features = dataset.features.keys()
return dataset.map(
prompt_tokenizer.tokenize_prompt,
remove_columns=features,
@@ -100,13 +94,12 @@ def wrap_dataset_for_tokenized_prompt(
# TODO this isn't the best since it can't interleave datasets
class ConstantLengthDataset(IterableDataset):
"""Iterable dataset that returns constant length chunks of tokens from stream of
text files.
Args:
tokenizer: The processor used for processing the data.
dataset: Dataset with text files.
seq_length: Length of token sequences to return.
"""
Iterable dataset that returns constant length chunks of tokens from stream of text files.
Args:
tokenizer (Tokenizer): The processor used for processing the data.
dataset (dataset.Dataset): Dataset with text files.
seq_length (int): Length of token sequences to return.
"""
def __init__( # pylint: disable=super-init-not-called
@@ -117,7 +110,7 @@ class ConstantLengthDataset(IterableDataset):
):
self.tokenizer = tokenizer
self.concat_token_id = tokenizer.eos_token_id
self.datasets: list[IterableDataset] = datasets
self.datasets: List[IterableDataset] = datasets
self.seq_length = seq_length
vocab_size = len(tokenizer.get_vocab())
@@ -181,10 +174,7 @@ class ConstantLengthDataset(IterableDataset):
}
else:
LOG.warning(
"Dropping batch due to tensor size mismatch "
f"input_ids: {input_ids.size()}, "
f"labels: {labels.size()}, "
f"attention_mask: {attention_mask.size()}"
f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
)
buffer = {
"input_ids": [],

View File

@@ -7,6 +7,7 @@ from pathlib import Path
from typing import Dict, Optional
import torch
from accelerate.logging import get_logger
from datasets import Dataset
from transformers.trainer import Trainer
@@ -16,7 +17,6 @@ from axolotl.train import (
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.logging import get_logger
from axolotl.utils.trainer import setup_trainer
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))

View File

@@ -22,7 +22,6 @@ from __future__ import annotations
import collections
import importlib
import traceback
from typing import TYPE_CHECKING, Callable, OrderedDict, Union
from peft import PeftModel
@@ -33,7 +32,7 @@ from transformers import PreTrainedModel, Trainer
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
LOG = get_logger(__name__)
LOG = get_logger(__name__, use_environ=True)
if TYPE_CHECKING:
from axolotl.common.datasets import TrainDatasetMeta
@@ -84,11 +83,6 @@ class BasePlugin:
def get_input_args(self) -> str | None:
"""Returns a pydantic model for the plugin's input arguments."""
def get_training_args_mixin(self) -> str | None:
"""
Returns a dataclass model for the plugin's training arguments.
"""
def load_datasets(
self, cfg: DictDefault, preprocess: bool = False
) -> Union["TrainDatasetMeta", None]:
@@ -164,31 +158,6 @@ class BasePlugin:
trainer: The trainer object for training.
"""
def get_training_args(self, cfg: DictDefault): # pylint: disable=unused-argument):
"""
Returns custom training arguments to set on TrainingArgs.
Args:
cfg: The global axolotl configuration.
Returns:
object: dict containing the training arguments.
"""
def get_collator_cls_and_kwargs(
self, cfg: DictDefault, is_eval: bool = False
): # pylint: disable=unused-argument):
"""
Returns a custom class for the collator.
Args:
cfg: The global axolotl configuration.
is_eval: Whether this is an eval split.
Returns:
class: The class for the collator.
"""
# pylint: disable=unused-argument
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
"""Creates and returns an optimizer for training.
@@ -309,7 +278,7 @@ def load_plugin(plugin_name: str) -> BasePlugin:
return plugin
class PluginManager: # pylint: disable=too-many-public-methods
class PluginManager:
"""The `PluginManager` class is responsible for loading and managing plugins. It
should be a singleton so it can be accessed from anywhere in the codebase.
@@ -368,11 +337,8 @@ class PluginManager: # pylint: disable=too-many-public-methods
plugin = load_plugin(plugin_name)
self.plugins[plugin_name] = plugin
LOG.info(f"Plugin loaded successfully: {plugin_name}")
except ImportError as exc:
except ImportError:
LOG.error(f"Failed to load plugin: {plugin_name}")
# print stacktrace
traceback.print_exc()
print(f"Error: {exc}")
def get_input_args(self) -> list[str]:
"""Returns a list of Pydantic classes for all registered plugins' input arguments.'
@@ -387,20 +353,6 @@ class PluginManager: # pylint: disable=too-many-public-methods
input_args.append(input_args_from_plugin)
return input_args
def get_training_args_mixin(self):
"""
Returns a list of dataclass import paths for all registered plugins' training args mixins.
Returns:
list[str]: A list of dataclass import path strings.
"""
training_args = []
for plugin in self.plugins.values():
training_args_from_plugin = plugin.get_training_args_mixin()
if training_args_from_plugin is not None:
training_args.append(training_args_from_plugin)
return training_args
def load_datasets(
self, cfg: DictDefault, preprocess: bool = False
) -> Union["TrainDatasetMeta", None]:
@@ -490,42 +442,6 @@ class PluginManager: # pylint: disable=too-many-public-methods
return trainer_cls
return None
def get_training_args(self, cfg):
"""
Calls the get_training_args method of all registered plugins and returns the combined training arguments.
Parameters:
cfg (dict): The configuration for the plugins.
Returns:
dict: The combined training argument kwargs from all plugins.
"""
training_args_kwargs = {}
for plugin in self.plugins.values():
training_args = plugin.get_training_args(cfg)
if training_args is not None:
training_args_kwargs.update(training_args)
return training_args_kwargs
def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
"""
Calls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.
Parameters:
cfg (dict): The configuration for the plugins.
is_eval (bool): Whether this is an eval split.
Returns:
tuple: The collator class and its kwargs, or None if no plugin provides one.
"""
for plugin in self.plugins.values():
collator = plugin.get_collator_cls_and_kwargs(cfg, is_eval=is_eval)
if collator is not None:
collator_cls, collator_kwargs = collator
return collator_cls, collator_kwargs
return None
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
"""Calls the `post_trainer_create` method of all registered plugins.

View File

@@ -16,7 +16,7 @@ Module to handle merging the plugins' input arguments with the base configuratio
This was moved here to prevent circular imports.
"""
from typing import Any, Dict, List, Type
from typing import Any, Dict, List
from axolotl.utils.schemas.config import (
AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
@@ -61,43 +61,3 @@ def merge_input_args():
]
return AxolotlConfigWCapabilities, AxolotlInputConfig
return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
def merge_training_args() -> Type:
"""
Merges training arguments from registered plugins with the base TrainingArguments.
This function retrieves the training arguments from registered plugins using the PluginManager.
It then dynamically creates new classes, AxolotlTrainingMixins,
that inherit from the base configurations and include the training arguments from the plugins.
Returns:
Type: The merged AxolotlTrainingMixins class, or the base mixins class when no plugin contributes one.
"""
# pylint: disable=duplicate-code
from axolotl.core.training_args_base import (
AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
)
from axolotl.integrations.base import PluginManager
plugin_manager = PluginManager.get_instance()
training_args_mixins: List[str] = plugin_manager.get_training_args_mixin()
mixin_classes = []
dynamic_input = ""
for plugin_args in training_args_mixins:
plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
mixin_classes.append(plugin_cls)
if dynamic_input:
dynamic_input += f"class AxolotlTrainingMixins(AxolotlTrainingMixinsBase, {', '.join(mixin_classes)}):\n pass\n"
namespace: Dict[Any, Any] = {}
local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
exec( # pylint: disable=exec-used # nosec B102
dynamic_input, {**globals(), **local_vars}, namespace
)
AxolotlTrainingMixins = namespace[ # pylint: disable=invalid-name
"AxolotlTrainingMixins"
]
return AxolotlTrainingMixins
return AxolotlTrainingMixinsBase
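For reference, the string that `exec()` evaluates expands to roughly the following when a single plugin (e.g. the KD plugin elsewhere in this diff) contributes a mixin; in the real code `AxolotlTrainingMixinsBase` is supplied through `local_vars` rather than imported, so the import below only makes the sketch self-contained:

```python
from axolotl.core.training_args_base import (
    AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
)
from axolotl.integrations.kd.args import KDTrainingArgsMixin


class AxolotlTrainingMixins(AxolotlTrainingMixinsBase, KDTrainingArgsMixin):
    pass
```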

View File

@@ -24,14 +24,6 @@ pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transform
## Usage
**NOTE**: If you are training a VLM model, please use older version of Axolotl as upstream has applied a major VLM refactor, and our patches have not been updated yet.
```bash
git checkout 787880215b3ab32ccaf81c1b2e9588c6f3e6e764
pip3 install --no-build-isolation -e .
```
```yaml
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

View File

@@ -28,7 +28,7 @@ from axolotl.utils.logging import get_logger
from .args import CutCrossEntropyArgs # pylint: disable=unused-import. # noqa: F401
LOG = get_logger(__name__)
LOG = get_logger(__name__, use_environ=True)
_CCE_INSTALL_MESSAGE = (
"Please install cut_cross_entropy with transformers support using "

View File

@@ -15,12 +15,7 @@
"""
Plugin init to add KD support to Axolotl.
"""
from typing import Any
from transformers import Trainer
from axolotl.integrations.base import BasePlugin
from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback
from .args import KDArgs # pylint: disable=unused-import. # noqa: F401
@@ -33,75 +28,9 @@ class KDPlugin(BasePlugin):
def get_input_args(self):
return "axolotl.integrations.kd.KDArgs"
def get_training_args_mixin(self):
return "axolotl.integrations.kd.args.KDTrainingArgsMixin"
def get_trainer_cls(self, cfg):
if cfg.kd_trainer:
from .trainer import AxolotlKDTrainer
return AxolotlKDTrainer
return None
def get_training_args(self, cfg):
return {
"kd_ce_alpha": cfg.kd_ce_alpha,
"kd_alpha": cfg.kd_alpha,
"kd_temperature": cfg.kd_temperature,
"kd_beta": cfg.kd_beta,
"kd_normalize_topk": cfg.kd_normalize_topk,
}
def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
if not cfg.kd_trainer:
return None, None
from .collator import DataCollatorForKD, KDBatchSamplerDataCollatorForSeq2Seq
use_batch_sampler_collator = False
if is_eval is False and cfg.sample_packing:
use_batch_sampler_collator = True
if cfg.eval_sample_packing and is_eval:
use_batch_sampler_collator = True
if cfg.kd_online_server_base_url:
from .collator_online_teacher import OnlineTeacherCollator
return OnlineTeacherCollator, {
"kd_online_server_base_url": cfg.kd_online_server_base_url,
"kd_online_topk": cfg.kd_online_topk,
"kd_temperature": cfg.kd_temperature,
"kd_online_server": cfg.kd_online_server,
"kd_online_timeout": cfg.kd_online_timeout,
"kd_normalize_topk": cfg.kd_normalize_topk,
}
if use_batch_sampler_collator:
return KDBatchSamplerDataCollatorForSeq2Seq, {}
return DataCollatorForKD, {}
def pre_model_load(self, cfg):
from .kernels.models import apply_kernel
apply_kernel(cfg.model_config_type)
def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
"""
Adds temp scheduler callback to the Trainer instance.
Args:
cfg (Any): Configuration object containing the sparse recipe.
trainer (Trainer): Huggingface Trainer instance.
Returns:
list: List containing the configured callback instances.
"""
if cfg.kd_temperature_min is not None and cfg.kd_online_server_base_url:
callback = KDTemperatureSchedulerCallback(
cfg.kd_temperature,
cfg.kd_temperature_min,
trainer,
)
return [callback]
return []

View File

@@ -15,19 +15,9 @@
"""
Plugin args for KD support.
"""
from dataclasses import dataclass
from enum import Enum
from typing import Optional
from pydantic import BaseModel, Field
class InferenceServerType(str, Enum):
"""
Online inferences server types to handle different request args
"""
vllm = "vllm" # pylint: disable=invalid-name
sglang = "sglang" # pylint: disable=invalid-name
from pydantic import BaseModel
class KDArgs(BaseModel):
@@ -35,41 +25,13 @@ class KDArgs(BaseModel):
Input args for knowledge distillation.
"""
kd_trainer: bool | None = None  # whether to use KD trainer
kd_ce_alpha: float | None = (
kd_trainer: Optional[bool] = None # whether to use KD trainer
kd_ce_alpha: Optional[float] = (
None # loss coefficient for cross-entropy loss during KD
)
kd_alpha: float | None = None # loss coefficient for KD loss
kd_temperature: float | None = None # temperature for sampling during KD
kd_beta: float | None = 0.0 # beta coefficient for ratio of fwd and reverse KL
kd_normalize_topk: bool | None = (
None # whether to normalize student logits during KD
)
# TODO online kd
kd_online_server_base_url: str | None = None
kd_online_topk: int | None = None
kd_online_server: InferenceServerType | None = Field(
default_factory=lambda: InferenceServerType.vllm
)
kd_online_timeout: int | None = 120
kd_temperature_min: float | None = (
None # kd temperature scheduling during online kd
)
@dataclass
class KDTrainingArgsMixin:
"""
Additional args for KD training.
"""
kd_ce_alpha: float | None = (
None # loss coefficient for cross-entropy loss during KD
)
kd_alpha: float | None = None # loss coefficient for KD loss
kd_temperature: float | None = None # temperature for sampling during KD
kd_beta: float | None = None # beta coefficient for ratio of fwd and reverse KL
kd_normalize_topk: bool | None = (
None # whether to normalize student logits during KD
kd_alpha: Optional[float] = None # loss coefficient for KD loss
kd_temperature: Optional[float] = None # temperature for sampling during KD
kd_zscore_base_temp: Optional[float] = None # base temperature for zscore scaling
kd_top_k_before_softmax: Optional[bool] = (
None # whether to sample top k before softmax during KD
)

View File

@@ -1,36 +0,0 @@
"""
Transformers trainer callbacks to schedule the KD temperature during training
"""
import math
from transformers.trainer_callback import TrainerCallback
class KDTemperatureSchedulerCallback(TrainerCallback):
"""
KD temperature scheduler callback for the trainer.
"""
def __init__(self, temperature_start, temperature_min, trainer):
self.temperature_start = temperature_start
self.temperature_min = temperature_min
self.temperature = temperature_start
self.trainer = trainer
def on_step_end(
self, args, state, control, **kwargs
): # pylint: disable=unused-argument
# cosine decay temperature over the max steps
progress = state.global_step / state.max_steps
# Cosine decay factor: 0.5 * (1 + cos(pi * progress))
# This factor goes from 1 (at progress=0) to 0 (at progress=1)
decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
self.temperature = self.temperature_start - (
(self.temperature_start - self.temperature_min) * (1.0 - decay_factor)
)
if hasattr(self.trainer.data_collator, "kd_temperature"):
self.trainer.data_collator.kd_temperature = self.temperature
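Plugging a few made-up values into the schedule above shows how the collator temperature anneals cosine-style from `temperature_start` down to `temperature_min` over training:

```python
import math

t_start, t_min, max_steps = 2.0, 1.0, 100
for step in (0, 25, 50, 75, 100):
    progress = step / max_steps
    decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
    temperature = t_start - (t_start - t_min) * (1.0 - decay_factor)
    print(step, round(temperature, 3))
# 0 -> 2.0, 25 -> 1.854, 50 -> 1.5, 75 -> 1.146, 100 -> 1.0
```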

View File

@@ -15,15 +15,12 @@
"""
Chat template prompt strategy loader with KD support
"""
import logging
from typing import Any, Dict
import torch
from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader
LOG = logging.getLogger(__name__)
class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
"""
@@ -104,8 +101,10 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
# fill with -inf for padding_len tokens for top_k tokens
# extend target_logprobs with a padding_len x top_k 2D list filled with -inf
# we shift for causal models in the trainer, so start the range from 0
for _ in range(0, input_padding_len):
# for causal models, if we start the range at 1, then we don't need to shift in the trainer
# otherwise, we need to shift in the trainer
shift = 0
for _ in range(shift, input_padding_len):
target_logprobs.append([-float("inf")] * top_k)
target_token_ids.append(list(range(top_k)))
target_mask.append([0] * top_k)
@@ -144,10 +143,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
#
# Convert from log to probability
teacher_probs_t1 = position_logprobs_tensor.exp()
# normalize probabilities to sum to 1 in case they aren't already
teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
if teacher_probs_t1_sum > 1e-9:
teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
if self.kd_temperature != self.gen_temperature:
# Exponentiate by factor (T1 / T2)
exponent = self.gen_temperature / self.kd_temperature
@@ -167,115 +162,12 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
target_logprobs.append(position_logprobs_scaled)
target_token_ids.append(position_token_ids)
# Update sample with transformed logprobs
sample["target_logprobs"] = target_logprobs
sample["target_token_ids"] = target_token_ids
sample["target_mask"] = target_mask
return sample
def _tokenize_single_prompt(self, prompt):
logprobs = prompt.pop(self.logprobs_field)
tokenized_prompt = super()._tokenize_single_prompt(prompt)
tokenized_prompt[self.logprobs_field] = logprobs
tokenized_prompt = self.transform_logprobs(tokenized_prompt)
return tokenized_prompt
class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
"""
Strat for datasets with complete structured KD logprob data
"""
def transform_logprobs(self, sample):
"""
Transform logprobs to target format for KD training
"""
# pylint: disable=duplicate-code
logprobs = sample.pop(self.logprobs_field)
target_seq_len = len(logprobs)
input_seq_len = len(sample["input_ids"])
input_padding_len = input_seq_len - target_seq_len
# get non-zero top-k (prune None logprobs from vllm data step)
top_k_vals = [
len(logprobs[i])
for i in range(len(logprobs))
if logprobs[i] is not None and len(logprobs[i])
]
max_top_k = max(set(top_k_vals), key=top_k_vals.count)
min_top_k = min(set(top_k_vals), key=top_k_vals.count)
top_k = min(max_top_k, min_top_k)
if top_k == 0:
raise ValueError("No non-zero top-k logprobs found.")
target_logprobs = []
target_token_ids = []
target_mask = []
if input_padding_len < 0:
# logprobs is longer than target_seq_len,
# so we need to slice from the left/beginning of logprobs
logprobs = logprobs[:-input_seq_len]
input_padding_len = 0
# target_seq_len = input_seq_len
# truncate the second dimension of the logprobs to top_k
logprobs = [row[:top_k] for row in logprobs]
# fill with -inf for padding_len tokens for top_k tokens
# extend target_logprobs with a padding_len x top_k 2D list filled with -inf
# we shift for causal models in the trainer, so start the range from 0
for _ in range(0, input_padding_len):
if shift == 1:
# since we started at index 1 for causal, we need one more padding token
target_logprobs.append([-float("inf")] * top_k)
target_token_ids.append(list(range(top_k)))
target_mask.append([0] * top_k)
for position in range(input_padding_len, input_seq_len):
if sample["labels"][position] == -100:
target_mask.append([0] * top_k)
else:
target_mask.append([1] * top_k)
for token_pos_logprobs, pos_target_token_ids in zip(
logprobs, sample["target_token_ids"]
):
# Convert to a tensor for easier manipulation
position_logprobs_tensor = torch.tensor(
token_pos_logprobs, dtype=torch.float
)
# Now we have distribution at T1 in log form, i.e. log p_{T1}(k).
# Next, re-scale to T2 = self.kd_temperature via exponent-based trick
# p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z
#
# Convert from log to probability
teacher_probs_t1 = position_logprobs_tensor.exp()
# normalize probabilities to sum to 1 in case they aren't already
teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
if teacher_probs_t1_sum > 1e-9:
teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
if self.kd_temperature != self.gen_temperature:
# Exponentiate by factor (T1 / T2)
exponent = self.gen_temperature / self.kd_temperature
teacher_probs_t2 = teacher_probs_t1**exponent
else:
teacher_probs_t2 = teacher_probs_t1
# Re-normalize
teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum(
dim=0, keepdim=True
)
# Convert back to log
position_logprobs_tensor = torch.log(teacher_probs_t2)
# Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor
position_logprobs_scaled = position_logprobs_tensor.tolist()
target_logprobs.append(position_logprobs_scaled)
target_token_ids.append(pos_target_token_ids)
# Update sample with transformed logprobs
sample["target_logprobs"] = target_logprobs
sample["target_token_ids"] = target_token_ids
@@ -285,10 +177,8 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
def _tokenize_single_prompt(self, prompt):
logprobs = prompt.pop(self.logprobs_field)
target_token_ids = prompt.pop("target_token_ids")
tokenized_prompt = super()._tokenize_single_prompt(prompt)
tokenized_prompt[self.logprobs_field] = logprobs
tokenized_prompt["target_token_ids"] = target_token_ids
tokenized_prompt = self.transform_logprobs(tokenized_prompt)
return tokenized_prompt
@@ -299,7 +189,7 @@ class KDStrategyLoader(StrategyLoader):
Load ChatTemplateStrategy with KD support using StrategyLoader.
"""
def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument
def _get_strategy_cls(self):
return ChatTemplateStrategyWithKD
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
@@ -314,14 +204,4 @@ class KDStrategyLoader(StrategyLoader):
return strategy_params
class KDStrategyLoaderV2(KDStrategyLoader):
"""
Load KD chat template datasets with pre-tokenized logprob data
"""
def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument
return ChatTemplateStrategyWithKDv2
load_legacy = KDStrategyLoader()
load = KDStrategyLoaderV2()
load = KDStrategyLoader()

View File

@@ -47,16 +47,11 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
position_pad_token_id: int = 0
return_tensors: str = "pt"
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
padding_side = self.tokenizer.padding_side
max_len = 0
# Pad labels and position_ids first
for feature_name, pad_token_id in [
@@ -107,9 +102,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
target_mask_list.append(f.pop("target_mask"))
# Determine max lengths
max_teacher_seq_len = max_len or max(
len(seq) for seq in target_logprobs_list
)
max_teacher_seq_len = max(len(seq) for seq in target_logprobs_list)
max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq)
padded_target_logprobs = []
@@ -216,9 +209,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
# We want to produce a single "merged" feature dict for each sub-batch.
out_features = [{} for _ in features]
for i, sub_features in enumerate( # pylint: disable=too-many-nested-blocks
features
):
for i, sub_features in enumerate(features):
# sub_features is a list of dicts, each dict = one sequence's features
# We'll merge them into out_features[i].
#
@@ -252,17 +243,10 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
# For example, input_ids or labels are often arrays.
arrays = []
for feat in sub_features:
if field_name in feat and isinstance(
feat[field_name], (list, torch.Tensor)
):
if isinstance(
feat[field_name][0], (dict, str)
): # pylint: disable=too-many-nested-blocks
continue
if field_name in feat:
arr = np.array(feat[field_name])
arrays.append(arr)
if arrays:
out_features[i][field_name] = np.concatenate(arrays)
out_features[i][field_name] = np.concatenate(arrays)
# 3) Now call the parent collator, which will do:
# - padding of labels/position_ids

View File

@@ -1,561 +0,0 @@
"""
Packed data loader for online teacher training supporting vllm and sglang.
"""
import hashlib
import hmac
import logging
from typing import Any, Dict, List, Optional
import requests
import torch
from orjson import orjson
from axolotl.integrations.kd.collator import KDBatchSamplerDataCollatorForSeq2Seq
from axolotl.integrations.kd.utils import normalize_logprobs
from axolotl.utils.data.utils import retry_on_request_exceptions
LOG = logging.getLogger(__name__)
def hmac_sha_from_int_list(int_list, key, hash_func=hashlib.sha256):
"""
Create HMAC-SHA hash from a list of integers
Args:
int_list: List of integers
key: Secret key (string or bytes)
hash_func: Hash function (default: sha256)
Returns:
HMAC digest as hex string
"""
# Convert key to bytes if it's a string
if isinstance(key, str):
key = key.encode("utf-8")
# Convert list of ints to bytes
# Method 1: Convert each int to bytes and concatenate
data = b"".join(i.to_bytes(4, byteorder="big") for i in int_list)
# Create HMAC
h = hmac.new(key, data, hash_func)
return h.hexdigest()
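For reference, a hypothetical usage of the helper above as a cache key for one tokenized sequence:

# Illustrative only: cache key for one sequence's input_ids (values are made up)
input_ids = [1, 15043, 3186, 2]
cache_key = hmac_sha_from_int_list(input_ids, key="teacher-server:top20")
# cache_key is a hex string suitable for use as a filename, e.g. f"{cache_key}.parquet"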
class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
"""
Collator for online teacher training.
"""
DEFAULT_LABEL_PAD_TOKEN_ID: int = -100
def __init__(
self,
*args: Any,
kd_online_server_base_url: Optional[str] = None,
kd_online_topk: Optional[int] = None,
kd_temperature: Optional[float] = 1.0,
kd_online_server: Optional[str] = "vllm",
kd_online_timeout: Optional[int] = 120,
kd_cache_dir: Optional[str] = None,
kd_normalize_topk: Optional[bool] = True,
**kwargs: Any,
):
super().__init__(*args, **kwargs)
if kd_online_server_base_url is None:
raise ValueError(
"kd_online_server_base_url must be provided for OnlineTeacherDataloader"
)
if kd_online_topk is None or kd_online_topk <= 0:
raise ValueError(
"kd_online_topk must be a positive integer for OnlineTeacherDataloader"
)
self.kd_online_server_base_url = kd_online_server_base_url.rstrip("/")
self.kd_online_topk = kd_online_topk
self.kd_temperature = kd_temperature
self.kd_online_server = kd_online_server
self.http_session = requests.Session()
self.kd_online_timeout = kd_online_timeout
self.kd_cache_dir = kd_cache_dir
self.kd_normalize_topk = kd_normalize_topk
def _normalize_logprobs(self, raw_logprobs: List[float]) -> List[float]:
"""
Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs.
"""
if not raw_logprobs or self.kd_online_topk == 0:
return (
[-float("inf")] * self.kd_online_topk if self.kd_online_topk > 0 else []
)
raw_logprobs_tensor = torch.tensor(raw_logprobs, dtype=torch.float32)
return normalize_logprobs(raw_logprobs_tensor, self.kd_online_topk).tolist()
@retry_on_request_exceptions(max_retries=10, delay=5)
def fetch_online_logprobs_sglang(
self, batch_input_ids: List[List[int]], labels: List[List[int]]
):
"""
Fetches logprobs from an online teacher served by sglang for a batch of input_ids.
Assumes API returns token IDs as strings in logprob dictionary keys.
"""
api_endpoint = f"{self.kd_online_server_base_url}/generate"
payload = {
"input_ids": batch_input_ids,
"return_logprob": True,
"top_logprobs_num": self.kd_online_topk,
"logprob_start_len": 0,
"return_text_in_logprobs": True,
"echo": True,
"sampling_params": {
"max_new_tokens": 0,
"temperature": self.kd_temperature,
"skip_special_tokens": False,
},
}
# Initialize with empty lists, so if API call fails, these are returned.
ret_data_target_token_ids: List[List[List[int]]] = []
ret_data_target_logprobs: List[List[List[float]]] = []
ret_data_target_mask: List[List[List[int]]] = []
try:
response = self.http_session.post(
api_endpoint, json=payload, timeout=self.kd_online_timeout
)
response.raise_for_status()
api_data: list[dict] = response.json()
# Ensure api_data is a list, and its length matches batch_input_ids
if not isinstance(api_data, list) or len(api_data) != len(batch_input_ids):
LOG.error(
f"API response format error. Expected a list of {len(batch_input_ids)} "
f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
)
# Return empty data; items processed later will get default empty KD fields
return {
"target_token_ids": ret_data_target_token_ids,
"target_logprobs": ret_data_target_logprobs,
"target_mask": ret_data_target_mask,
}
for sequence_data, seq_input_ids, seq_labels in zip(
api_data, batch_input_ids, labels
):
current_target_logprobs = []
current_target_token_ids = []
current_target_mask = []
meta_info = sequence_data.pop("meta_info", {})
# Ensure input_top_logprobs is a list
input_top_logprobs: Optional[list[None | list[tuple]]] = meta_info.pop(
"input_top_logprobs", []
)
if not isinstance(input_top_logprobs, list):
LOG.warning(
f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
)
input_top_logprobs = [] # Treat as empty
# basic check that the logprob data len matches the input len, so no need to handle padding
assert len(seq_input_ids) == len(input_top_logprobs)
for i, _, label in zip(
range(len(seq_input_ids)), seq_input_ids, seq_labels
):
if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
# this is always the case for the first token.
# there is never logprob data for the first token since that's a true input
# so we replace the None value with padding data
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
current_target_token_ids.append([0] * self.kd_online_topk)
current_target_mask.append([0] * self.kd_online_topk)
elif (
i < len(input_top_logprobs)
and input_top_logprobs[i] is not None
):
pos_top_logprobs_data = input_top_logprobs[i]
# Ensure pos_top_logprobs_data is a list of lists as expected
if not (
isinstance(pos_top_logprobs_data, list)
and all(
isinstance(item, list) for item in pos_top_logprobs_data
)
and len(pos_top_logprobs_data) > 0
and len(pos_top_logprobs_data[0]) == 3
): # [logprob, token_id, token_str]
LOG.warning(
f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
)
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
current_target_token_ids.append([0] * self.kd_online_topk)
current_target_mask.append([0] * self.kd_online_topk)
continue
# pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
pos_logprobs_raw, pos_token_ids, _ = [
list(row) for row in zip(*pos_top_logprobs_data)
]
# Ensure correct length (top_k)
if len(pos_logprobs_raw) < self.kd_online_topk:
pad_len = self.kd_online_topk - len(pos_logprobs_raw)
pos_logprobs_raw.extend([-float("inf")] * pad_len)
pos_token_ids.extend([0] * pad_len) # Pad with 0 token_id
# truncate to top_k in case the response was longer
current_target_token_ids.append(
pos_token_ids[: self.kd_online_topk]
)
if self.kd_normalize_topk:
normalized_logprobs_for_position = self._normalize_logprobs(
pos_logprobs_raw[: self.kd_online_topk]
)
current_target_logprobs.append(
normalized_logprobs_for_position
)
else:
current_target_logprobs.append(
pos_logprobs_raw[: self.kd_online_topk]
)
# Mask depends on the corresponding label for the student
if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
current_target_mask.append([0] * self.kd_online_topk)
else:
current_target_mask.append([1] * self.kd_online_topk)
else:
# Pad if no logprobs for this position (either due to length mismatch or None entry)
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
current_target_token_ids.append([0] * self.kd_online_topk)
current_target_mask.append([0] * self.kd_online_topk)
ret_data_target_token_ids.append(current_target_token_ids)
ret_data_target_logprobs.append(current_target_logprobs)
ret_data_target_mask.append(current_target_mask)
except requests.exceptions.RequestException as e:
LOG.error(f"Error fetching logprobs from online teacher: {e}")
raise e
# ret_logprobs_data will be returned with empty lists, handled by the caller.
except Exception as e: # Catch other potential errors during processing
LOG.error(
f"Unexpected error processing API response in fetch_online_logprobs: {e}",
exc_info=True,
)
raise e
return {
"target_token_ids": ret_data_target_token_ids,
"target_logprobs": ret_data_target_logprobs,
"target_mask": ret_data_target_mask,
}
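A small sketch of the per-position payload the sglang parsing above assumes (values are illustrative, not from a real response):

# Illustrative values: one position's input_top_logprobs entry is a list of [logprob, token_id, token_str] triples
pos_top_logprobs_data = [[-0.10, 42, "the"], [-2.31, 17, "a"], [-3.05, 99, "an"]]
pos_logprobs_raw, pos_token_ids, _ = [list(row) for row in zip(*pos_top_logprobs_data)]
# pos_logprobs_raw -> [-0.10, -2.31, -3.05]; pos_token_ids -> [42, 17, 99]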
@retry_on_request_exceptions(max_retries=10, delay=5)
def fetch_online_logprobs_vllm(
self, batch_input_ids: List[List[int]], labels: List[List[int]]
):
"""
Fetches logprobs from an online teacher served by vllm for a batch of input_ids.
Assumes API returns token IDs as strings in logprob dictionary keys.
"""
api_endpoint = f"{self.kd_online_server_base_url}/v1/completions"
payload = {
"prompt": batch_input_ids,
"echo": True,
"logprobs": True,
"prompt_logprobs": self.kd_online_topk,
"top_logprobs": self.kd_online_topk,
"max_new_tokens": 0,
"skip_special_tokens": False,
"temperature": self.kd_temperature,
"sampling_params": {
"max_tokens": 0,
},
}
# Initialize with empty lists, so if API call fails, these are returned.
ret_data_target_token_ids: List[List[List[int]]] = []
ret_data_target_logprobs: List[List[List[float]]] = []
ret_data_target_mask: List[List[List[int]]] = []
try:
headers = {"Accept-Encoding": "deflate, gzip, br, zstd"}
response = self.http_session.post(
api_endpoint,
json=payload,
headers=headers,
timeout=self.kd_online_timeout,
)
response.raise_for_status()
api_data: dict = orjson.loads(response.content)
choices: list[dict] = api_data["choices"]
# Ensure api_data is a list, and its length matches batch_input_ids
if not isinstance(choices, list) or len(choices) != len(batch_input_ids):
LOG.error(
f"API response format error. Expected a list of {len(batch_input_ids)} "
f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
)
# Return empty data; items processed later will get default empty KD fields
return {
"target_token_ids": ret_data_target_token_ids,
"target_logprobs": ret_data_target_logprobs,
"target_mask": ret_data_target_mask,
}
for sequence_data, seq_input_ids, seq_labels in zip(
choices, batch_input_ids, labels
):
# seq_input_ids: List[int]
# seq_labels: List[int]
current_target_logprobs = []
current_target_token_ids = []
current_target_mask = []
# Ensure input_top_logprobs is a list
input_top_logprobs: Optional[list[None | dict[str, dict]]] = (
sequence_data.pop("prompt_logprobs", [])
)
if not isinstance(input_top_logprobs, list):
LOG.warning(
f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
)
input_top_logprobs = [] # Treat as empty
# basic check that the logprob data len matches the input len, so no need to handle padding
assert len(seq_input_ids) == len(input_top_logprobs)
seq_len = len(seq_input_ids)
for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels):
if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
# this is always the case for the first token.
# there is never logprob data for the first token since that's a true input
continue
if (
i < len(input_top_logprobs)
and input_top_logprobs[i] is not None
):
pos_top_logprobs_data: dict[str, dict] = input_top_logprobs[i] # type: ignore[assignment]
# Ensure pos_top_logprobs_data is a list of lists as expected
if not (
isinstance(pos_top_logprobs_data, dict)
and all(
isinstance(item, dict)
for item in pos_top_logprobs_data.values()
)
and len(pos_top_logprobs_data.keys()) > 0
): # [logprob, token_id, token_str]
LOG.warning(
f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
)
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
current_target_token_ids.append(
list(range(self.kd_online_topk))
)
current_target_mask.append([0] * self.kd_online_topk)
continue
# pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
pos_token_ids_str = list(pos_top_logprobs_data.keys())
pos_logprobs_dict = pos_top_logprobs_data.values()
pos_token_ids = [
int(token_id) for token_id in pos_token_ids_str
]
pos_logprobs_raw = [
float(logprob.get("logprob", -float("inf")))
for logprob in pos_logprobs_dict
]
# Ensure correct length (top_k)
if len(pos_logprobs_raw) < self.kd_online_topk:
pad_len = self.kd_online_topk - len(pos_logprobs_raw)
LOG.warning(
f"Padding position {i} with {pad_len} top-k tokens and logprobs."
)
pos_logprobs_raw.extend([-float("inf")] * pad_len)
pos_token_ids.extend([0] * pad_len) # Pad with 0 token_id
# truncate to top_k in case the response was longer
current_target_token_ids.append(
pos_token_ids[: self.kd_online_topk]
)
if self.kd_normalize_topk:
normalized_logprobs_for_position = self._normalize_logprobs(
pos_logprobs_raw[: self.kd_online_topk]
)
current_target_logprobs.append(
normalized_logprobs_for_position
)
else:
current_target_logprobs.append(
pos_logprobs_raw[: self.kd_online_topk]
)
# Mask depends on the corresponding label for the student
if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
current_target_mask.append([0] * self.kd_online_topk)
else:
current_target_mask.append([1] * self.kd_online_topk)
else:
# Pad if no logprobs for this position (either due to length mismatch or None entry)
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
current_target_token_ids.append(
list(range(self.kd_online_topk))
)
current_target_mask.append([0] * self.kd_online_topk)
for i in range(max(0, seq_len - len(current_target_logprobs))):
current_target_logprobs.append(
[-float("inf")] * self.kd_online_topk
)
current_target_token_ids.append(list(range(self.kd_online_topk)))
current_target_mask.append([0] * self.kd_online_topk)
ret_data_target_token_ids.append(current_target_token_ids)
ret_data_target_logprobs.append(current_target_logprobs)
ret_data_target_mask.append(current_target_mask)
# TODO save and load targets to disk for caching for next epoch
# generate a hmac SHA256 hash over the list seq_input_ids and convert it to an int
# if self.kd_cache_dir:
# hash_input_ids = hmac_sha_from_int_list(
# seq_input_ids, f"{self.kd_online_server_base_url}:{self.kd_online_topk}"
# )
# with open(f"{self.kd_cache_dir}/{hash_input_ids}.parquet", "wb") as f:
# pd.DataFrame(ret_logprobs_data).to_parquet(f, index=False)
except requests.exceptions.RequestException as e:
LOG.error(f"Error fetching logprobs from online teacher: {e}")
raise e
# ret_logprobs_data will be returned with empty lists, handled by the caller.
except Exception as e: # Catch other potential errors during processing
LOG.error(
f"Unexpected error processing API response in fetch_online_logprobs: {e}",
exc_info=True,
)
raise e
return {
"target_token_ids": ret_data_target_token_ids,
"target_logprobs": ret_data_target_logprobs,
"target_mask": ret_data_target_mask,
}
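Likewise, a sketch of the per-position prompt_logprobs entry the vllm branch above expects (illustrative values; keys are token IDs as strings):

# Illustrative values only
pos_top_logprobs_data = {"42": {"logprob": -0.10}, "17": {"logprob": -2.31}}
pos_token_ids = [int(token_id) for token_id in pos_top_logprobs_data.keys()]
pos_logprobs_raw = [float(entry.get("logprob", -float("inf"))) for entry in pos_top_logprobs_data.values()]
# -> pos_token_ids [42, 17], pos_logprobs_raw [-0.10, -2.31]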
def __call__(
self, features: List[List[Dict[str, Any]]], return_tensors: Optional[str] = None
) -> Dict[str, Any]:
if not features:
return super().__call__(features, return_tensors=return_tensors)
for (
sub_batch_features
) in features: # sub_batch_features is List[Dict[str, Any]]
if not sub_batch_features:
continue
input_ids_for_api_call: List[List[int]] = []
labels_for_api_call: List[List[int]] = []
# Store references to the original item dictionaries to update them in-place
items_for_api_call: List[Dict[str, Any]] = []
for item_dict in sub_batch_features:
if not isinstance(item_dict, dict):
LOG.warning(
f"Skipping non-dict item in sub_batch_features: {item_dict}"
)
continue
current_input_ids = item_dict.get("input_ids")
current_labels = item_dict.get("labels")
if current_input_ids is not None and current_labels is not None:
# Ensure input_ids and labels are lists of ints for JSON serialization
input_ids_list = (
current_input_ids.tolist()
if hasattr(current_input_ids, "tolist")
else list(current_input_ids)
)
labels_list = (
current_labels.tolist()
if hasattr(current_labels, "tolist")
else list(current_labels)
)
input_ids_for_api_call.append(input_ids_list)
labels_for_api_call.append(labels_list)
items_for_api_call.append(item_dict)
else:
# This item will not get teacher logprobs from the API.
# Initialize KD fields to empty lists so downstream collators handle them uniformly.
item_dict.setdefault("target_token_ids", [])
item_dict.setdefault("target_logprobs", [])
item_dict.setdefault("target_mask", [])
# print(items_for_api_call)
if items_for_api_call: # Only call API if there's something to process
if self.kd_online_server == "sglang":
api_responses_for_sub_batch = self.fetch_online_logprobs_sglang(
input_ids_for_api_call, labels_for_api_call
)
else:
api_responses_for_sub_batch = self.fetch_online_logprobs_vllm(
input_ids_for_api_call, labels_for_api_call
)
# api_responses_for_sub_batch has keys: "target_token_ids", "target_logprobs", "target_mask"
# Each value is a list, corresponding to items_for_api_call
for i, item_to_update in enumerate(items_for_api_call):
# TODO make sure to figure out which input in sub_batch_features to update the batch in the original `features` object so the super class can handle it properly.
if api_responses_for_sub_batch and i < len(
api_responses_for_sub_batch["target_token_ids"]
): # Check bounds
assert len(
api_responses_for_sub_batch["target_token_ids"][i]
) == len(item_to_update["input_ids"])
assert len(
api_responses_for_sub_batch["target_logprobs"][i]
) == len(item_to_update["input_ids"])
assert len(
api_responses_for_sub_batch["target_mask"][i]
) == len(item_to_update["labels"])
item_to_update["target_token_ids"] = (
api_responses_for_sub_batch["target_token_ids"][i]
)
item_to_update["target_logprobs"] = api_responses_for_sub_batch[
"target_logprobs"
][i]
item_to_update["target_mask"] = api_responses_for_sub_batch[
"target_mask"
][i]
else:
# API call failed for this item, or response was shorter than expected.
# Ensure KD fields are initialized as empty lists.
LOG.warning(
f" (index {i}), or API response was too short. "
f"API response keys: {list(api_responses_for_sub_batch.keys()) if api_responses_for_sub_batch else 'None'}"
)
item_to_update.setdefault("target_token_ids", [])
item_to_update.setdefault("target_logprobs", [])
item_to_update.setdefault("target_mask", [])
return super().__call__(features, return_tensors=return_tensors)

View File

@@ -1,8 +0,0 @@
"""
Liger Chunked loss optimizations module
"""
from .liger import LigerFusedLinearKLTopKLogprobLoss
from .models import apply_kernel
__all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"]

View File

@@ -1,485 +0,0 @@
"""
Liger Kernels for Chunked Top-K Log-Prob Distillation
"""
import torch
import torch.nn.functional as F
from liger_kernel.chunked_loss.fused_linear_distillation import (
LigerFusedLinearDistillationBase,
)
from axolotl.integrations.kd.utils import normalize_logprobs
class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
"""
Chunked kl-div loss for top-k logprobs
"""
@staticmethod
def distillation_loss_fn(
student_logits_temp_scaled: torch.Tensor, # [chunk_size, vocab_size], already temp-scaled
target_token_ids_chunk: torch.Tensor, # [chunk_size, top_k]
target_logprobs_chunk: torch.Tensor, # [chunk_size, top_k], already temp-scaled and normalized logprobs
target_mask_chunk: torch.Tensor, # [chunk_size, top_k]
beta: float = 0.0,
normalize_topk: bool = True,
) -> torch.Tensor:
"""
Compute Top-K KL divergence loss for a chunk.
Args:
student_logits_temp_scaled: Student logits, scaled by temperature. Shape: (N, V).
target_token_ids_chunk: Top-k teacher token IDs. Shape: (N, K).
target_logprobs_chunk: Top-k teacher log probabilities (temp-scaled, normalized). Shape: (N, K).
target_mask_chunk: Mask for valid top-k tokens. Shape: (N, K).
beta: Controls the type of KL divergence.
0.0 for Forward KL (P_teacher || P_student).
1.0 for Reverse KL (P_student || P_teacher).
0.5 for Symmetric KL (average of Forward and Reverse).
normalize_topk: Whether to normalize the log probabilities
Returns:
Sum of KL divergence losses for the chunk.
"""
topk = target_token_ids_chunk.shape[-1]
student_logits_temp_scaled = ( # [chunk_size, vocab_size]
student_logits_temp_scaled.float()
)
target_logprobs_chunk = target_logprobs_chunk.float()
# Gather student logits for the top-k teacher token IDs
# target_token_ids_chunk: [chunk_size, top_k]
# student_logits_topk_temp_scaled: [chunk_size, top_k]
student_logits_topk_temp_scaled = torch.gather(
student_logits_temp_scaled, dim=-1, index=target_token_ids_chunk
)
# Student log-probabilities for the gathered top-k tokens
student_lse = torch.logsumexp(
student_logits_temp_scaled, dim=-1, keepdim=True
) # [chunk_size, 1]
student_logprobs_topk_temp_scaled = (
student_logits_topk_temp_scaled - student_lse
)
# we have the top-k student logprobs, normalize them
if normalize_topk:
student_logprobs_topk_temp_scaled = normalize_logprobs(
student_logprobs_topk_temp_scaled, topk
)
valid_mask = target_mask_chunk.to(torch.bool) # [chunk_size, top_k]
student_logprobs_topk_valid = student_logprobs_topk_temp_scaled[valid_mask]
teacher_logprobs_valid = target_logprobs_chunk[valid_mask]
# Teacher probabilities P(y|x_teacher) from logprobs
# target_logprobs_valid are already normalized (log(softmax(teacher_logits/T)))
teacher_probs_valid = teacher_logprobs_valid.exp()
# Student probabilities P_student from log P_student
student_probs_topk_valid = student_logprobs_topk_valid.exp()
# kd_loss_per_token = torch.zeros_like(target_logprobs_valid)
# KL divergence: sum(P_teacher * (log P_teacher - log P_student))
# = sum(P_teacher * log P_teacher) - sum(P_teacher * log P_student)
# The distillation loss is often formulated as -sum(P_teacher * log P_student)
# or as sum(P_teacher * (log_softmax_teacher - log_softmax_student))
# Here, target_logprobs_valid are log_softmax_teacher.
# student_logprobs_topk_valid are log_softmax_student (for the selected K indices).
if beta == 0.0: # Contribution from Forward KL
fwd_kl_per_token = teacher_probs_valid * (
teacher_logprobs_valid - student_logprobs_topk_valid
)
kd_loss = fwd_kl_per_token.sum()
elif beta == 1.0: # Contribution from Reverse KL
rev_kl_per_token = student_probs_topk_valid * (
student_logprobs_topk_valid - teacher_logprobs_valid
)
kd_loss = rev_kl_per_token.sum()
else:
# JSD - Jensen-Shannon Divergence / Symmetric
mean_probs = (
1 - beta
) * student_probs_topk_valid + beta * teacher_probs_valid
log_mean_probs = mean_probs.log()
student_kl = F.kl_div(
log_mean_probs,
student_logprobs_topk_valid,
reduction="sum",
log_target=True,
)
teacher_kl = F.kl_div(
log_mean_probs, teacher_logprobs_valid, reduction="sum", log_target=True
)
jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
kd_loss = jsd_loss
return kd_loss
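A tiny numeric sanity check of the beta=0.0 (forward KL) branch above, on a single position with K=2 (values assumed):

# Illustrative values only
import torch
teacher_logprobs = torch.log(torch.tensor([0.7, 0.3]))
student_logprobs = torch.log(torch.tensor([0.6, 0.4]))
fwd_kl = (teacher_logprobs.exp() * (teacher_logprobs - student_logprobs)).sum()
# 0.7*log(0.7/0.6) + 0.3*log(0.3/0.4) ≈ 0.0216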
@staticmethod
def _compute_loss_kl_topk(
student_input_chunk: torch.Tensor,
student_weight: torch.Tensor,
# Args for student_bias, target_token_ids_chunk etc. are passed to the lambda wrapped by grad_and_value
# or through `partial`. Let's make them explicit here for clarity.
target_token_ids_chunk: torch.Tensor,
target_logprobs_chunk: torch.Tensor,
target_mask_chunk: torch.Tensor,
target_chunk: torch.Tensor, # For hard loss (true labels)
student_bias: torch.Tensor = None, # This will be one of the grad targets
# Other params passed via `partial` from `forward`
distillation_loss_fn=None,
ignore_index: int = -100,
weight_hard_loss: float = 0.5,
weight_soft_loss: float = 0.5,
compute_ce_loss: bool = True,
temperature: float = 1.0,
beta: float = 0.0,
normalize_topk: bool = True,
):
# Compute student logits for the chunk from hidden states and LM head
# student_input_chunk: [chunk_size, hidden_dim]
# student_lm_head_weight: [vocab_size, hidden_dim]
# student_logits_chunk: [chunk_size, vocab_size]
student_logits_chunk = F.linear(
student_input_chunk, student_weight, student_bias
)
ce_loss = torch.tensor(
0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype
)
if compute_ce_loss and weight_hard_loss > 0.0:
ce_loss = F.cross_entropy(
student_logits_chunk.view(-1, student_logits_chunk.shape[-1]),
target_chunk.view(-1),
reduction="sum",
ignore_index=ignore_index,
)
soft_loss = torch.tensor(
0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype
)
if weight_soft_loss > 0.0:
student_logits_chunk_temp_scaled = student_logits_chunk / temperature
# Assuming student_weight.shape[0] (vocab_size) is adequate for target_token_ids_chunk.max()
# No explicit padding here; user must ensure vocab alignment or pre-pad student_weight.
soft_loss = distillation_loss_fn(
student_logits_chunk_temp_scaled,
target_token_ids_chunk,
target_logprobs_chunk,
target_mask_chunk,
beta=beta,
normalize_topk=normalize_topk,
)
return soft_loss, ce_loss
@classmethod
def forward(
cls,
ctx,
student_input: torch.Tensor, # [batch_size, seq_len, dim]
student_lm_head_weight: torch.Tensor, # [dim, vocab_size]
target_token_ids: torch.Tensor, # [batch_size, seq_len, top_k]
target_logprobs: torch.Tensor, # [batch_size, seq_len, top_k]
target_mask: torch.Tensor, # [batch_size, seq_len, top_k]
true_labels: torch.Tensor, # [batch_size, seq_len]
student_lm_head_bias: torch.Tensor = None,
weight_hard_loss: float = 0.5,
weight_soft_loss: float = 0.5,
ignore_index: int = -100,
temperature: float = 1.0,
beta: float = 0.0,
compiled: bool = False,
chunk_size: int = 1024,
compute_ce_loss: bool = True,
normalize_topk: bool = True,
):
CHUNK_SIZE = chunk_size # pylint: disable=invalid-name
grad_weight_acc = torch.zeros_like(student_lm_head_weight)
grad_inputs_list = []
grad_bias_acc = (
torch.zeros_like(student_lm_head_bias)
if student_lm_head_bias is not None
else None
)
kd_loss_acc = torch.zeros(
(), device=student_input.device, dtype=student_input.dtype
)
ce_loss_acc = torch.zeros(
(), device=student_input.device, dtype=student_input.dtype
)
# This function will be what torch.func.grad_and_value differentiates.
# It takes student_input_chunk, student_weight (full), student_bias (full) as primals.
# Other necessary data (target_*, etc.) are passed as non-differentiable arguments.
def loss_fn_for_grad(
_student_input_chunk,
_student_lm_head_weight, # full weight
_student_lm_head_bias, # full bias
# Fixed arguments for a given chunk, not differentiated:
_target_token_ids_chunk,
_target_logprobs_chunk,
_target_mask_chunk,
_true_labels_chunk,
):
return cls._compute_loss_kl_topk(
student_input_chunk=_student_input_chunk,
student_weight=_student_lm_head_weight,
target_token_ids_chunk=_target_token_ids_chunk,
target_logprobs_chunk=_target_logprobs_chunk,
target_mask_chunk=_target_mask_chunk,
target_chunk=_true_labels_chunk,
student_bias=_student_lm_head_bias,
distillation_loss_fn=cls.distillation_loss_fn,
ignore_index=ignore_index,
weight_hard_loss=weight_hard_loss,
weight_soft_loss=weight_soft_loss,
compute_ce_loss=compute_ce_loss,
temperature=temperature,
beta=beta,
normalize_topk=normalize_topk,
)
def accumulate_chunk_grads(
student_input_chunk_ac,
target_token_ids_chunk_ac,
target_logprobs_chunk_ac,
target_mask_chunk_ac,
true_labels_chunk_ac,
):
# student_weight and student_bias are closed over from the outer scope (full tensors)
if student_lm_head_bias is not None:
(
(chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
(chunk_kd_loss, chunk_ce_loss),
) = torch.func.grad_and_value(
loss_fn_for_grad, argnums=(0, 1, 2), has_aux=True
)(
student_input_chunk_ac,
student_lm_head_weight,
student_lm_head_bias, # primals
target_token_ids_chunk_ac,
target_logprobs_chunk_ac,
target_mask_chunk_ac,
true_labels_chunk_ac,
) # non-primals
grad_bias_acc.add_(chunk_grad_bias)
else:
argnums_for_grad = (0, 1) # Differentiate wrt input_chunk, weight
(
(chunk_grad_input, chunk_grad_weight), # No grad for bias
(chunk_kd_loss, chunk_ce_loss),
) = torch.func.grad_and_value(
loss_fn_for_grad, argnums=argnums_for_grad, has_aux=True
)(
student_input_chunk_ac,
student_lm_head_weight,
None, # Pass None for student_bias primal
target_token_ids_chunk_ac,
target_logprobs_chunk_ac,
target_mask_chunk_ac,
true_labels_chunk_ac,
)
grad_weight_acc.add_(chunk_grad_weight)
kd_loss_acc.add_(chunk_kd_loss)
ce_loss_acc.add_(chunk_ce_loss)
return chunk_grad_input
if compiled:
accumulate_chunk_grads_compiled = torch.compile(
accumulate_chunk_grads, dynamic=True, backend="inductor"
) # dynamic=True often helpful
else:
accumulate_chunk_grads_compiled = accumulate_chunk_grads
# Use the same chunking logic as LigerFusedLinearDistillationBase.forward
B, N, D = student_input.shape # pylint: disable=invalid-name
K = target_token_ids.shape[-1] # pylint: disable=invalid-name
student_input_flat = student_input.reshape(-1, student_input.shape[-1])
target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])
target_logprobs_flat = target_logprobs.reshape(-1, target_logprobs.shape[-1])
target_mask_flat = target_mask.reshape(-1, target_mask.shape[-1])
# pad and shift for cross entropy loss
true_labels = torch.nn.functional.pad(true_labels, (0, 1), value=ignore_index)
true_labels_flat = true_labels[:, 1:].contiguous().view(-1)
num_chunks = max(1, student_input_flat.shape[0] // CHUNK_SIZE)
_student_input_chunks = torch.chunk(
student_input_flat, chunks=num_chunks, dim=0
)
_target_token_ids_chunks = torch.chunk(
target_token_ids_flat, chunks=num_chunks, dim=0
)
_target_logprobs_chunks = torch.chunk(
target_logprobs_flat, chunks=num_chunks, dim=0
)
_target_mask_chunks = torch.chunk(target_mask_flat, chunks=num_chunks, dim=0)
_true_labels_chunks = torch.chunk(true_labels_flat, chunks=num_chunks, dim=0)
for i in range(num_chunks):
grad_input_chunk = accumulate_chunk_grads_compiled(
_student_input_chunks[i],
_target_token_ids_chunks[i],
_target_logprobs_chunks[i],
_target_mask_chunks[i],
_true_labels_chunks[i],
)
grad_inputs_list.append(grad_input_chunk)
grad_inputs_combined = torch.cat(grad_inputs_list, dim=0)
ctx.save_for_backward(grad_inputs_combined, grad_weight_acc, grad_bias_acc)
# For matching None returns in backward for non-tensor/non-grad_requiring inputs
ctx.hyperparams_count = 9 # Corresponds to number of hyperparams after main tensors in fwd signature
ctx.bias_was_none = student_lm_head_bias is None
ctx.orig_dims = (B, N, D, K)
# since this is packed, there is simply a single batch, so batchmean reduction of kl-div is simply the accumulated sum
# we still need to scale the kd_loss by the temp^2
kd_loss_acc = kd_loss_acc * (temperature**2)
final_loss = weight_soft_loss * kd_loss_acc + weight_hard_loss * ce_loss_acc
return final_loss
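The forward above flattens the packed batch before splitting into chunks; a minimal shape sketch with assumed dimensions:

# Dimensions assumed for illustration
import torch
B, N, D, CHUNK_SIZE = 1, 4096, 64, 1024
student_input = torch.randn(B, N, D)
flat = student_input.reshape(-1, D)                   # [B*N, D]
num_chunks = max(1, flat.shape[0] // CHUNK_SIZE)      # 4 chunks of 1024 rows
chunks = torch.chunk(flat, chunks=num_chunks, dim=0)
assert sum(c.shape[0] for c in chunks) == B * N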
@staticmethod
def backward(ctx, grad_output):
grad_input_flat, grad_weight, grad_bias_maybe = (
ctx.saved_tensors
) # grad_input_flat is (B*N, D)
# Scale gradients by grad_output if it's not 1.0
if not torch.equal(
grad_output,
torch.tensor(1.0, device=grad_output.device, dtype=grad_output.dtype),
):
grad_input_flat = grad_input_flat * grad_output
grad_weight = grad_weight * grad_output
if grad_bias_maybe is not None:
grad_bias_maybe = grad_bias_maybe * grad_output
# Reshape grad_input_flat to match original student_input shape (B, N, D)
# ctx.orig_dims stores (B, N, D, K)
# We need the first three dimensions for student_input's shape.
# Ensure that orig_dims are not (0,0,0,K) for empty inputs leading to view errors
if (
ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0
and grad_input_flat.numel() == 0
):
# If original input was empty, gradient should also be empty with correct shape
grad_input_reshaped = torch.zeros(
ctx.orig_dims[0],
ctx.orig_dims[1],
ctx.orig_dims[2],
dtype=grad_input_flat.dtype,
device=grad_input_flat.device,
)
elif grad_input_flat.numel() == 0 and not (
ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0
):
# This case should ideally not happen if forward path is correct (non-empty input -> non-empty flat grad)
# but as a safeguard:
grad_input_reshaped = torch.zeros(
ctx.orig_dims[0],
ctx.orig_dims[1],
ctx.orig_dims[2],
dtype=grad_input_flat.dtype,
device=grad_input_flat.device,
)
else:
grad_input_reshaped = grad_input_flat.view(
ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2]
)
nones_for_hyperparams = [None] * ctx.hyperparams_count
grad_bias_return = grad_bias_maybe if not ctx.bias_was_none else None
return (
grad_input_reshaped, # Gradient for student_input (reshaped)
grad_weight, # Gradient for student_lm_head_weight
None, # Gradient for target_token_ids
None, # Gradient for target_logprobs
None, # Gradient for target_mask
None, # Gradient for true_labels
grad_bias_return, # Gradient for student_lm_head_bias
*nones_for_hyperparams, # Grads for weight_hard_loss, ..., compute_ce_loss
)
class LigerFusedLinearKLTopKLogprobLoss(torch.nn.Module):
"""
wrapper for chunked top-k logprob kl-d
"""
def __init__(
self,
weight_hard_loss: float = 0.5,
weight_soft_loss: float = 0.5,
temperature: float = 1.0, # This is the kd_temperature
beta: float = 1.0,
ignore_index: int = -100,
compiled: bool = True,
chunk_size: int = 1024,
compute_ce_loss: bool = True,
normalize_topk: bool = True,
):
super().__init__()
if not (0.0 <= weight_hard_loss <= 1.0 and 0.0 <= weight_soft_loss <= 1.0):
raise ValueError("Loss weights must be between 0.0 and 1.0.")
if temperature <= 0:
raise ValueError("Temperature must be positive.")
self.weight_hard_loss = weight_hard_loss
self.weight_soft_loss = weight_soft_loss
self.temperature = temperature
self.beta = beta
self.ignore_index = ignore_index
self.compiled = compiled
self.chunk_size = chunk_size
self.compute_ce_loss = compute_ce_loss
self.normalize_topk = normalize_topk
if not self.compute_ce_loss and self.weight_hard_loss > 0.0:
print(
f"Warning: compute_ce_loss is False, but weight_hard_loss ({self.weight_hard_loss}) > 0. Hard loss will effectively be zero."
)
# self.weight_hard_loss = 0.0 # Or let user manage this
if self.weight_soft_loss == 0.0:
print(
"Warning: weight_soft_loss is 0.0. Soft (KD) loss will not be computed."
)
def forward(
self,
lm_head_weight: torch.Tensor, # Weights of the linear layer in the LM head
student_hidden_states: torch.Tensor, # student_hidden_states before the lm_head
target_token_ids: torch.Tensor,
target_logprobs: torch.Tensor,
target_mask: torch.Tensor,
true_labels: torch.Tensor,
student_bias: torch.Tensor = None,
) -> torch.Tensor:
return LigerFusedLinearKLTopKLogprobFunction.apply(
student_hidden_states,
lm_head_weight,
target_token_ids,
target_logprobs,
target_mask,
true_labels,
student_bias,
self.weight_hard_loss,
self.weight_soft_loss,
self.ignore_index,
self.temperature,
self.beta,
self.compiled,
self.chunk_size,
self.compute_ce_loss,
self.normalize_topk,
)

View File

@@ -1,97 +0,0 @@
"""
model patcher for chunked top-k kl-div
"""
from typing import Optional, Union, Unpack
import torch
from transformers import Cache
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import LossKwargs
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
"""
placeholder kwargs for hf model classes
"""
def kldiv_forward_llama_like(
self,
input_ids: Optional[torch.LongTensor] = None,
target_logprobs: Optional[torch.Tensor] = None,
target_token_ids: Optional[torch.LongTensor] = None,
target_mask: Optional[torch.Tensor] = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[Cache] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
labels: Optional[torch.LongTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
logits_to_keep: Union[int, torch.Tensor] = 0, # pylint: disable=unused-argument
**kwargs: Unpack[KwargsForCausalLM], # type: ignore[misc]
) -> CausalLMOutputWithPast:
# pylint: disable=duplicate-code
output_attentions = (
output_attentions
if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states
if output_hidden_states is not None
else self.config.output_hidden_states
)
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
inputs_embeds=inputs_embeds,
use_cache=use_cache,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
cache_position=cache_position,
**kwargs,
)
hidden_states = outputs.last_hidden_state
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
# TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100
# self.loss_function should be LigerFusedLinearKLTopKLogprobLoss
loss = self.loss_function(
self.lm_head.weight,
hidden_states,
target_token_ids,
target_logprobs,
target_mask,
true_labels=labels,
)
num_items_in_batch = kwargs.pop("num_items_in_batch", -1)
if num_items_in_batch is not None and num_items_in_batch > 0:
loss = loss / num_items_in_batch
return CausalLMOutputWithPast(
loss=loss,
logits=None,
past_key_values=outputs.past_key_values,
hidden_states=outputs.hidden_states,
attentions=outputs.attentions,
)
def apply_kernel(model_type):
# Dynamically import the module and attention class
module_path = f"transformers.models.{model_type}.modeling_{model_type}"
model_cls_prefix = "".join([part.capitalize() for part in model_type.split("_")])
module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"])
model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
model_cls.forward = kldiv_forward_llama_like
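Hypothetical usage of apply_kernel, which resolves e.g. transformers.models.llama.modeling_llama.LlamaForCausalLM and swaps in the KD forward:

# Hypothetical usage sketch
apply_kernel("llama")  # LlamaForCausalLM.forward now routes hidden states through the chunked top-k KD loss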

View File

@@ -16,7 +16,40 @@
loss for top_k KL divergence
"""
import torch
from torch import nn
def zscore_standardize(
logits: torch.Tensor,
mask: torch.Tensor = None,
base_temperature: float = 1.0,
eps: float = 1e-9,
):
"""
Z-score standardize along the last dimension of `logits`.
i.e., for each [B, seq_len] row, across K entries:
z = (logits - mean) / std,
then scale by 1 / base_temperature if desired.
mask can be broadcastable or None. If None, we standardize all elements.
"""
if mask is None:
# shape: [B, seq_len, K]
# Mean and std over dim=-1
mean = logits.mean(dim=-1, keepdim=True)
var = logits.var(dim=-1, unbiased=False, keepdim=True)
else:
# If you have to exclude some tokens, multiply by mask, etc.
float_mask = mask.to(logits.dtype)
count = float_mask.sum(dim=-1, keepdim=True).clamp_min(1.0)
mean = (logits * float_mask).sum(dim=-1, keepdim=True) / count
var = (float_mask * (logits - mean) ** 2).sum(dim=-1, keepdim=True) / count
std = torch.sqrt(var.clamp_min(eps))
z = (logits - mean) / std
# Scale by 1 / base_temperature
z = z / base_temperature
return z
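A quick numeric check of zscore_standardize on a single row (illustrative values):

# Illustrative values only
import torch
logits = torch.tensor([[[1.0, 2.0, 3.0]]])  # [B=1, seq_len=1, K=3]
z = zscore_standardize(logits)               # mean=2.0, std=sqrt(2/3)
# z ≈ [[[-1.2247, 0.0, 1.2247]]]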
@torch.jit.script
@@ -27,6 +60,7 @@ def loss(
target_mask: torch.Tensor,
num_items_in_batch: int = -1, # Use -1 to indicate "None"
kd_temperature: float = 1.0,
top_k_before_softmax: int = 0,
) -> torch.Tensor:
"""
A KD loss function that is TorchScript-friendly.
@@ -43,6 +77,8 @@ def loss(
num_items_in_batch (int, optional): The number of items in the batch.
kd_temperature (float, optional): The temperature for KD.
Default: 1.0
top_k_before_softmax (int, optional): Flag of whether to apply softmax before gathering student top-k logits
Default: 0
"""
target_logprobs = target_logprobs.float()
@@ -52,24 +88,46 @@ def loss(
# student_logits shape: [B, student_seq_len, vocab_size]
teacher_seq_len = target_token_ids.shape[1]
# Slice student logits to match teacher-provided sequence length
student_logits_for_kd = (
student_logits[:, :teacher_seq_len, :] / kd_temperature
) # [B, teacher_seq_len, vocab_size]
if top_k_before_softmax:
# Slice student logits to match teacher-provided sequence length
student_logits_for_kd = student_logits[
:, :teacher_seq_len, :
] # [B, teacher_seq_len, vocab_size]
# keep in full precision for numerical stability of loss
student_logits_for_kd = student_logits_for_kd.float()
# Gather student logits for teacher's top-K tokens
student_logits_topk = torch.gather(
student_logits_for_kd, dim=-1, index=target_token_ids
) # [B, teacher_seq_len, K]
# Gather student logits for teacher's top-K tokens
student_logits_topk = torch.gather(
student_logits_for_kd, dim=-1, index=target_token_ids
) # [B, teacher_seq_len, K]
student_logits_topk = student_logits_topk.float()
# Compute logsumexp across full vocabulary
student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)
# Apply KD temperature to students logits
if kd_temperature != 1.0:
student_logits_topk = student_logits_topk / kd_temperature
# Convert just the top-k logits to logprobs
student_logprobs_topk = student_logits_topk - student_lse
# Convert student top-k logits to logprobs
student_logprobs_topk = student_logits_topk - torch.logsumexp(
student_logits_topk, dim=-1, keepdim=True
) # [B, teacher_seq_len, K]
else:
# Slice student logits to match teacher-provided sequence length
student_logits_for_kd = (
student_logits[:, :teacher_seq_len, :] / kd_temperature
) # [B, teacher_seq_len, vocab_size]
# keep in full precision for numerical stability of loss
student_logits_for_kd = student_logits_for_kd.float()
# Gather student logits for teacher's top-K tokens
student_logits_topk = torch.gather(
student_logits_for_kd, dim=-1, index=target_token_ids
) # [B, teacher_seq_len, K]
# Compute logsumexp across full vocabulary
student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)
# Convert just the top-k logits to logprobs
student_logprobs_topk = student_logits_topk - student_lse
# Convert teacher_mask to boolean for indexing
# In TorchScript, .bool() is sometimes unsupported, so we do:
@@ -86,6 +144,10 @@ def loss(
kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk)
kd_loss = kd_loss_per_token.sum()
# Multiply by T^2 (classical KD scaling)
if kd_temperature != 1.0:
kd_loss = kd_loss * (kd_temperature**2)
# Normalize by number of items (if provided) or by valid tokens
if num_items_in_batch > 0:
kd_loss = kd_loss / float(num_items_in_batch)
@@ -96,74 +158,80 @@ def loss(
return kd_loss
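The top_k_before_softmax flag above switches between normalizing over just the gathered top-k logits and normalizing over the full vocabulary; a small illustration of the difference (values assumed):

# Values assumed for illustration
import torch
logits = torch.tensor([2.0, 1.0, 0.0, -1.0])                        # full vocab, V=4
topk_logits = logits[torch.tensor([0, 1])]                          # teacher's top-k ids, K=2
lp_topk_only = topk_logits - torch.logsumexp(topk_logits, dim=-1)   # top_k_before_softmax=1
lp_full_vocab = topk_logits - torch.logsumexp(logits, dim=-1)       # top_k_before_softmax=0
# lp_topk_only.exp() sums to 1.0; lp_full_vocab.exp() sums to < 1.0 (mass remains on the other tokens)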
class ChunkedTopKKDLoss(nn.Module):
def topk_kd_loss_with_zscore(
student_logits: torch.Tensor, # [B, seq_len, vocab_size]
target_token_ids: torch.Tensor, # [B, seq_len, K]
target_logprobs: torch.Tensor, # [B, seq_len, K], sums to 1.0 in prob space
target_mask: torch.Tensor, # [B, seq_len, K] or [B, seq_len]
kd_temperature: float = 1.0, # classic KD temperature
zscore_base_temp: float = 1.0, # from the paper
num_items_in_batch: int = -1,
):
"""
A wrapper that chunks (splits) the student and teacher outputs along the time dimension
to reduce peak memory usage when upcasting from bf16 to fp32, especially for large vocabularies.
Usage is analogous to ForwardKLWithChunkedOutputLoss but adapted to top-K teacher logprobs.
A variant of top_k KL divergence with Z-score scaling
from "Logit Standardization in Knowledge Distillation".
"""
def __init__(self, num_output_chunks: int = 8, kd_temperature: float = 1.0):
super().__init__()
self.num_output_chunks = num_output_chunks
self.kd_temperature = kd_temperature
target_logprobs = target_logprobs.float()
def forward(
self,
student_logits: torch.Tensor, # [B, seq_len, vocab_size]
target_token_ids: torch.Tensor, # [B, seq_len, K]
target_logprobs: torch.Tensor, # [B, seq_len, K]
target_mask: torch.Tensor, # [B, seq_len, K]
num_items_in_batch: int = -1, # optional batch size for normalization
) -> torch.Tensor:
B, teacher_seq_len, K = target_logprobs.shape # pylint: disable=invalid-name
# 1) Gather the student's top-k logits to match teacher
student_logits_for_kd = student_logits[
:, :teacher_seq_len, :
] # [B, seq_len, vocab]
student_topk_logits = torch.gather(
student_logits_for_kd, dim=-1, index=target_token_ids
) # [B, seq_len, K]
# 1. Split along the "token" dimension (dim=1).
student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
logprobs_chunks = target_logprobs.chunk(self.num_output_chunks, dim=1)
mask_chunks = target_mask.chunk(self.num_output_chunks, dim=1)
student_topk_logits = student_topk_logits.float()
# We'll accumulate a global "sum of losses" and "sum of valid tokens"
# so that our final average is consistent with the entire sequence/batch.
total_loss = 0.0
total_valid_tokens = 0
# 2) If you want to keep the "classical" T scaling, apply it first
if kd_temperature != 1.0:
student_topk_logits = student_topk_logits / kd_temperature
# 2. Loop over each chunk and compute a chunk-specific loss.
for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks
):
# We pass num_items_in_batch=-1 so that the kd_loss
# will average over *this chunk's* valid tokens only.
chunk_loss = loss(
student_logits=st_chunk,
target_token_ids=tid_chunk,
target_logprobs=lp_chunk,
target_mask=msk_chunk,
num_items_in_batch=-1, # ensure per-chunk averaging by valid tokens
kd_temperature=self.kd_temperature,
)
# 3) Convert teacher logprobs -> treat them as “logits” for z-score
# (They differ by +some_constant from real logits, but in z-score
# that constant is subtracted out anyway.)
teacher_logits_for_zscore = target_logprobs # rename variable for clarity
# kd_loss returns an average over the chunk's valid tokens.
# We want a global average in the end, so we need to reweight
# by the number of valid tokens in this chunk and keep track of the total.
chunk_valid_mask = msk_chunk.to(torch.bool)
chunk_valid_count = chunk_valid_mask.sum() # scalar tensor
# 4) Z-score teacher and student
# If target_mask is 2D, expand to 3D for the K dimension
if target_mask.dim() == 2 and target_mask.shape[:2] == (B, teacher_seq_len):
target_mask = target_mask.unsqueeze(-1).expand(-1, -1, K)
# Re-scale "chunk average" back to "chunk sum"
chunk_loss_sum = chunk_loss * chunk_valid_count
teacher_z = zscore_standardize(
teacher_logits_for_zscore, mask=target_mask, base_temperature=zscore_base_temp
)
student_z = zscore_standardize(
student_topk_logits, mask=target_mask, base_temperature=zscore_base_temp
)
total_loss += chunk_loss_sum
total_valid_tokens += chunk_valid_count
# 5) Convert to log-probs for KL
teacher_logprobs_z = teacher_z - torch.logsumexp(teacher_z, dim=-1, keepdim=True)
student_logprobs_z = student_z - torch.logsumexp(student_z, dim=-1, keepdim=True)
# 3. Normalize *once* at the end.
if num_items_in_batch > 0:
# If the user gave us a manual denominator (e.g. total items in batch),
# we divide by it. Typically used if each item is of different length.
final_loss = total_loss / float(num_items_in_batch)
else:
# Otherwise, divide by total valid tokens across all chunks.
# to get the same result as a non-chunked approach.
final_loss = total_loss / float(total_valid_tokens)
# 6) Restrict to valid tokens if needed
valid_mask = target_mask.bool() # shape [B, seq_len, K]
teacher_probs_z = teacher_logprobs_z.exp()
teacher_probs_z = teacher_probs_z[valid_mask]
teacher_logprobs_z = teacher_logprobs_z[valid_mask]
student_logprobs_z = student_logprobs_z[valid_mask]
return final_loss
# 7) forward KL: sum( p_teacher * [log(p_teacher) - log(p_student)] )
kd_loss_per_token = teacher_probs_z * (teacher_logprobs_z - student_logprobs_z)
kd_loss = kd_loss_per_token.sum()
# 8) If using classical KD scaling by T^2
if kd_temperature != 1.0:
kd_loss = kd_loss * (kd_temperature**2)
# Optionally scale by zscore_base_temp**2 if you want (paper might differ).
# kd_loss = kd_loss * (zscore_base_temp**2)
# 9) Normalize
if num_items_in_batch is not None and num_items_in_batch > 0:
kd_loss = kd_loss / float(num_items_in_batch)
else:
kd_loss = kd_loss / float(kd_loss_per_token.size(0))
return kd_loss

View File

@@ -18,7 +18,8 @@ KD trainer
from axolotl.core.trainers.base import AxolotlTrainer
from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
from .topk_logprob.forward_kl import loss as topk_kd_loss
from .topk_logprob.forward_kl import topk_kd_loss_with_zscore
class AxolotlKDTrainer(AxolotlTrainer):
@@ -26,18 +27,6 @@ class AxolotlKDTrainer(AxolotlTrainer):
Custom trainer subclass for Knowledge Distillation (KD)
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.model_accepts_loss_kwargs = True
self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss(
self.args.kd_ce_alpha, # hard label loss
self.args.kd_alpha, # kd loss
self.args.kd_temperature,
self.args.kd_beta or 0.0,
compute_ce_loss=bool(self.args.kd_ce_alpha),
normalize_topk=self.args.kd_normalize_topk,
)
def _set_signature_columns_if_needed(self):
super()._set_signature_columns_if_needed()
columns_to_add = []
@@ -63,12 +52,12 @@ class AxolotlKDTrainer(AxolotlTrainer):
Subclass and override for custom behavior.
"""
if (
self.args.sample_packing
and hasattr(inputs, "attention_mask")
and hasattr(inputs, "position_ids")
):
del inputs["attention_mask"]
target_logprobs = inputs.pop("target_logprobs")
target_token_ids = inputs.pop("target_token_ids")
target_mask = inputs.pop("target_mask")
seq_len = target_token_ids.shape[1]
if self.model_accepts_loss_kwargs:
loss_kwargs = {}
@@ -76,4 +65,49 @@ class AxolotlKDTrainer(AxolotlTrainer):
loss_kwargs["num_items_in_batch"] = num_items_in_batch
inputs = {**inputs, **loss_kwargs}
outputs = model(**inputs)
return outputs[0]
# FIXME: account for tokenizer.padding_side
student_logits = outputs["logits"][:, : seq_len - 1, :].contiguous()
shift_logits = student_logits.contiguous()
target_logprobs_for_loss = target_logprobs[..., 1:, :].contiguous()
target_token_ids_for_loss = target_token_ids[..., 1:, :].contiguous()
target_mask_for_loss = target_mask[..., 1:, :].contiguous()
if self.args.kd_zscore_base_temp:
loss_kd = topk_kd_loss_with_zscore(
shift_logits,
target_token_ids_for_loss,
target_logprobs_for_loss,
target_mask_for_loss,
kd_temperature=self.args.kd_temperature,
zscore_base_temp=self.args.kd_zscore_base_temp,
num_items_in_batch=num_items_in_batch,
)
else:
loss_kd = topk_kd_loss(
shift_logits,
target_token_ids_for_loss,
target_logprobs_for_loss,
target_mask_for_loss,
num_items_in_batch=num_items_in_batch,
kd_temperature=self.args.kd_temperature,
top_k_before_softmax=1 if self.args.kd_top_k_before_softmax else 0,
)
if self.args.kd_ce_alpha > 0:
kd_alpha = self.args.kd_alpha
loss = self.args.kd_ce_alpha * outputs["loss"] + kd_alpha * loss_kd
else:
loss = loss_kd
# Save past state if it exists
# TODO: this needs to be fixed and made cleaner later.
if self.args.past_index >= 0:
self._past = outputs[ # pylint: disable=attribute-defined-outside-init
self.args.past_index
]
if self.args.average_tokens_across_devices and self.model_accepts_loss_kwargs:
loss *= self.accelerator.num_processes
return (loss, outputs) if return_outputs else loss
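For clarity, the causal shift performed above aligns student predictions with teacher targets one step ahead; a shape-only sketch with an assumed seq_len:

seq_len = 5                                   # assumed packed length
student_positions = list(range(seq_len - 1))  # [0, 1, 2, 3] -> logits[:, : seq_len - 1] predict tokens 1..4
teacher_positions = list(range(1, seq_len))   # [1, 2, 3, 4] -> kept by target_*[..., 1:, :]
assert len(student_positions) == len(teacher_positions)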

View File

@@ -1,100 +0,0 @@
"""Helper KD utils"""
import math
from typing import List, Union
import numpy as np
import torch
from torch import FloatTensor, Tensor
def normalize_logprobs(logprobs: FloatTensor, topk: int) -> FloatTensor:
"""
Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs.
"""
# Ensure raw_logprobs matches kd_online_topk length for tensor operations
# This should ideally be handled by the caller ensuring correct padding/truncation first
if logprobs.shape[-1] != topk:
# pad last dimension of logprobs to match topk length with -inf
padding_len = topk - logprobs.shape[-1]
padding_tensor = torch.full(
(
*logprobs.shape[:-1],
padding_len,
), # Takes all dimensions of logprobs except the last, then appends padding_needed
float("-inf"),
dtype=logprobs.dtype,
device=logprobs.device,
)
logprobs = torch.cat((logprobs, padding_tensor), dim=-1)
# Convert logprobs at T_online to probabilities
# use log sum exp trick to avoid underflow
position_logprobs_lse = torch.logsumexp(logprobs, dim=-1, keepdim=True)
teacher_probs_t_online = torch.exp(logprobs - position_logprobs_lse)
# Normalize probabilities (sum to 1)
# This is important if the top-k from server aren't a full distribution
teacher_probs_t_online_sum = teacher_probs_t_online.sum(dim=-1, keepdim=True)
teacher_probs_t_online = teacher_probs_t_online / teacher_probs_t_online_sum
final_logprobs_tensor = torch.log(teacher_probs_t_online)
return final_logprobs_tensor
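A hypothetical call to normalize_logprobs, padding three raw top-k entries out to topk=4 and re-normalizing:

# Illustrative values only
import torch
raw = torch.tensor([-0.5, -1.0, -2.0])
out = normalize_logprobs(raw, topk=4)
# out.exp().sum() ≈ 1.0; the padded fourth slot carries probability 0 (logprob -inf)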
def strided_chunk_views(
tensor: Union[np.ndarray, torch.Tensor],
chunks: int,
dim: int = 0,
stride: int = 1,
chunk_size: int | None = None,
) -> List[Union[np.ndarray, torch.Tensor]]:
"""
Split a tensor into chunks along a dimension with striding, prioritizing views over copies.
Args:
tensor: Input tensor (numpy array or torch tensor)
chunks: Number of chunks to create
dim: Dimension along which to chunk (default: 0)
stride: Stride between chunk starting positions (default: 1)
chunk_size: Size of each chunk. If None, calculated automatically (default: None)
Returns:
List of tensor chunks (views when possible, copies when necessary)
"""
# Get the size of the specified dimension
dim_size = tensor.shape[dim]
# Calculate chunk size if not provided
if chunk_size is None:
chunk_size = (dim_size + chunks - 1) // chunks # Ceiling division
chunks_list = []
for i in range(chunks):
start_idx = i * stride
end_idx = min(start_idx + chunk_size, dim_size)
# Break if we've gone beyond the tensor
if start_idx >= dim_size:
break
# Create slice objects for all dimensions
slices = [slice(None)] * tensor.ndim
slices[dim] = slice(start_idx, end_idx)
chunk = tensor[tuple(slices)]
chunks_list.append(chunk)
return chunks_list
def chunk_overlap(input_tensor: Tensor, chunks: int, dim: int = 0, overlap: int = 1):
dim_size = input_tensor.shape[dim]
stride = math.ceil(dim_size / chunks)
return strided_chunk_views(
input_tensor, chunks, dim, stride=stride, chunk_size=stride + overlap
)
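An illustrative call to chunk_overlap, splitting a length-10 tensor into 3 overlapping views:

# Illustrative usage
import torch
x = torch.arange(10)
parts = chunk_overlap(x, chunks=3, overlap=1)  # stride = ceil(10/3) = 4, chunk_size = 5
# parts -> [tensor([0, 1, 2, 3, 4]), tensor([4, 5, 6, 7, 8]), tensor([8, 9])]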

View File

@@ -27,7 +27,7 @@ from axolotl.utils.logging import get_logger
from .args import LigerArgs # pylint: disable=unused-import. # noqa: F401
from .utils import patch_with_compile_disable
LOG = get_logger(__name__)
LOG = get_logger(__name__, use_environ=True)
class LigerPlugin(BasePlugin):

View File

@@ -15,7 +15,6 @@
"""
Module for handling LIGER input arguments.
"""
from typing import Optional
from pydantic import BaseModel, model_validator

View File

@@ -166,17 +166,6 @@ class PatchManager:
def _apply_self_attention_lora_patch(self):
"""Apply self-attention LoRA patches if configured."""
if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
# Only patch if conditions are met
can_patch = (
self.cfg.lora_dropout == 0
if hasattr(self.cfg, "lora_dropout")
else True
) # default to True if lora_dropout is not set
if not can_patch:
LOG.warning("Cannot patch self-attention - requires no dropout")
return
from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
patch_self_attn_lora(self.cfg)

View File

@@ -7,14 +7,12 @@ import transformers
from transformers import (
AddedToken,
AutoTokenizer,
PreTrainedTokenizer,
)
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import (
barrier,
is_local_main_process,
@@ -119,21 +117,8 @@ def modify_tokenizer_files(
return tokenizer_dir
def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
def load_tokenizer(cfg):
"""Load and configure the tokenizer based on the provided config."""
def _load_mistral_common_tokenizer(cfg: DictDefault):
"""Load mistral-common tokenizer"""
from axolotl.utils.mistral_tokenizer import HFMistralTokenizer
# Load the HF-compatible wrapper around MistralTokenizer
tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config)
return tokenizer
if cfg.tokenizer_use_mistral_common:
return _load_mistral_common_tokenizer(cfg)
model_config = load_model_config(cfg)
tokenizer_kwargs = {}
use_fast = True # this is the default
@@ -222,12 +207,11 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
)
and k != "pad_token"
):
lora_modules_to_save_str = ", ".join(
lora_modules_to_save = ", ".join(
[f"`{x}`" for x in lora_modules_to_save]
)
raise ValueError(
f"Please set lora_modules_to_save to [{lora_modules_to_save_str}] "
"when using an adapter and changing the special tokens."
f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
)
tokenizer.add_special_tokens(
@@ -273,7 +257,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
{"additional_special_tokens": additional_special_tokens}
)
if is_main_process():
if is_main_process(use_environ=True):
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")

View File

@@ -25,20 +25,12 @@ class AxolotlOrWarnErrorFilter(logging.Filter):
def __init__(self, **kwargs):
super().__init__(**kwargs)
axolotl_log_level = os.getenv(
"AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL
).upper()
other_log_level = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper()
try:
# py311+ only
level_mapping = logging.getLevelNamesMapping()
self.axolotl_level = level_mapping[axolotl_log_level]
self.other_level = level_mapping[other_log_level]
except AttributeError:
# For py310, use getLevelName directly
self.axolotl_level = logging.getLevelName(axolotl_log_level)
self.other_level = logging.getLevelName(other_log_level)
self.axolotl_level = logging.getLevelNamesMapping()[
os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL)
]
self.other_level = logging.getLevelNamesMapping()[
os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL)
]
def filter(self, record: LogRecord) -> bool:
# General filter
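For reference, the level lookup above relies on logging.getLevelNamesMapping, which exists only on Python 3.11+ and uses uppercase keys; a small sketch of that behavior (not part of the diff):

import logging

# Illustrative only: names map to numeric levels, and lowercase names are not present,
# which is why an env value like "debug" would need .upper() before the lookup.
mapping = logging.getLevelNamesMapping()
print(mapping["DEBUG"], mapping["WARNING"])  # 10 30
print("debug" in mapping)                    # False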

View File

@@ -145,11 +145,6 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
return Qwen2Attention
if model_type == "mllama":
from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention
return MllamaTextSelfAttention
try:
# Dynamically import the module and attention class
module_path = f"transformers.models.{model_type}.modeling_{model_type}"
@@ -274,29 +269,6 @@ def find_mlp_in_layer(
)
def get_layers(model: PeftModelForCausalLM) -> list[nn.Module]:
"""
Get the layers of the model. Handles text-only and multimodal models.
Args:
model: A PEFT model.
Returns:
A list of layers.
"""
pretrained_model = model.model
# check for multimodal models first
if hasattr(pretrained_model, "language_model"):
return pretrained_model.language_model.layers
if hasattr(pretrained_model, "model"):
return pretrained_model.model.layers
raise NotImplementedError(
f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
)
def apply_lora_kernel_patches(
model: PeftModelForCausalLM, cfg: DictDefault
) -> PeftModelForCausalLM:
@@ -368,7 +340,17 @@ def apply_lora_kernel_patches(
if activation not in SUPPORTED_ACTIVATIONS:
raise NotImplementedError(f"Activation {activation} is not supported")
layers = get_layers(model)
layers = []
# check for multimodal models first
pretrained_model = model.model
if hasattr(pretrained_model, "language_model"):
layers = pretrained_model.language_model.layers
elif hasattr(pretrained_model, "model"):
layers = pretrained_model.model.layers
else:
raise NotImplementedError(
f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
)
# Patch each layer
for layer in layers:

View File

@@ -2,10 +2,10 @@
Make use of the `ring-flash-attn` (https://github.com/zhuzilin/ring-flash-attention)
package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
their sequence parallel version of Flash Attention 2.
their context parallel version of Flash Attention 2.
We also provide some patches for accelerate functions to prepare the dataloader for
sequence parallelism training.
context parallelism training.
"""
import inspect
@@ -63,15 +63,15 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
def register_ring_attn(
sequence_parallel_degree: int,
context_parallel_degree: int,
heads_k_stride: int | None,
ring_attn_func: RingAttnFunc | None,
):
"""Create ring attention group and substitute flash attn with ring flash attn.
Args:
sequence_parallel_degree: Sequence parallelism factor.
heads_k_stride: Sequence parallelism K head stride size. Passed through to
context_parallel_degree: Context parallelism factor.
heads_k_stride: Context parallelism K head stride size. Passed through to
`varlen_llama3` `ring_flash_attn` implementation.
ring_attn_func: `ring_flash_attn` ring attention implementation. If sample
packing is enabled, it must be a `varlen` function; otherwise, it must be a
@@ -80,28 +80,18 @@ def register_ring_attn(
rank = dist.get_rank()
world_size = dist.get_world_size()
if rank == 0:
LOG.info(
"Enabling ring attention sequence parallelism: "
f"each sequence will be processed across {sequence_parallel_degree} GPUs"
)
assert sequence_parallel_degree <= world_size, (
f"sequence_parallel_degree ({sequence_parallel_degree}) "
f"must be less than or equal to world_size ({world_size})"
)
assert world_size % sequence_parallel_degree == 0, (
f"sequence_parallel_degree ({sequence_parallel_degree}) "
f"must evenly divide world_size ({world_size})"
LOG.info(
"Enabling ring attention context parallelism: "
f"each sequence will be processed across {context_parallel_degree} GPUs"
)
# Assign ranks to sequence parallel groups
# Assign ranks to context parallel groups
group_assignments = {}
for i in range(world_size // sequence_parallel_degree):
for i in range(world_size // context_parallel_degree):
ring_attn_ranks = list(
range(
i * sequence_parallel_degree,
(i + 1) * sequence_parallel_degree,
i * context_parallel_degree,
(i + 1) * context_parallel_degree,
)
)
group = dist.new_group(ranks=ring_attn_ranks, backend="nccl")
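For intuition, the rank-to-group assignment produced by this loop works out as follows (a toy sketch with made-up sizes, no process groups are actually created):

# Illustrative only: 8 ranks with context_parallel_degree=2 yield 4 ring-attention groups.
world_size, context_parallel_degree = 8, 2
groups = [
    list(range(i * context_parallel_degree, (i + 1) * context_parallel_degree))
    for i in range(world_size // context_parallel_degree)
]
print(groups)  # [[0, 1], [2, 3], [4, 5], [6, 7]]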
@@ -113,9 +103,7 @@ def register_ring_attn(
if rank in ring_attn_ranks:
set_ring_attn_group(group)
# Log the GPU group assignments
if rank == 0:
LOG.info(f"Sequence parallel group assignments: {group_assignments}")
LOG.info(f"Context parallel group assignments: {group_assignments}")
if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
from ring_flash_attn import substitute_hf_flash_attn
@@ -150,7 +138,7 @@ def update_ring_attn_params(position_ids: torch.Tensor | None):
def patch_prepare_data_loader():
"""Patch `accelerate.data_loader.prepare_data_loader` to respect the SP degree.
"""Patch `accelerate.data_loader.prepare_data_loader` to respect the CP degree.
Raises:
RuntimeError: If source code to patch does not exist.
@@ -176,15 +164,15 @@ def patch_prepare_data_loader():
patched_function = namespace["prepare_data_loader"]
accelerate.data_loader.prepare_data_loader = patched_function
LOG.info("Patched accelerate.data_loader.prepare_data_loader for SP support")
LOG.info("Patched accelerate.data_loader.prepare_data_loader for CP support")
def patch_prepare_device_mesh(sequence_parallel_degree: int):
def patch_prepare_device_mesh(context_parallel_degree: int):
"""Patches the `Accelerator._prepare_device_mesh` method to create a device mesh
that includes sequence parallelism with the specified degree.
that includes context parallelism with the specified degree.
Args:
sequence_parallel_degree (int): The degree of sequence parallelism to use.
context_parallel_degree (int): The degree of context parallelism to use.
"""
def _prepare_device_mesh(self):
@@ -199,11 +187,11 @@ def patch_prepare_device_mesh(sequence_parallel_degree: int):
):
return self.state.ds_device_mesh
# Create device mesh with sequence parallelism
# Create device mesh with context parallelism
world_size = dist.get_world_size()
mesh_shape = (
world_size // sequence_parallel_degree,
sequence_parallel_degree,
world_size // context_parallel_degree,
context_parallel_degree,
)
device_ids = list(range(world_size))
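A quick worked example of the resulting mesh shape (illustrative numbers only):

# Illustrative only: 8 GPUs with context_parallel_degree=4 give a (2, 4) mesh,
# i.e. 2 data-parallel replicas, each spanning 4 context-parallel ranks.
world_size, context_parallel_degree = 8, 4
mesh_shape = (world_size // context_parallel_degree, context_parallel_degree)
print(mesh_shape)  # (2, 4)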
@@ -221,5 +209,5 @@ def patch_prepare_device_mesh(sequence_parallel_degree: int):
LOG.info(
"Successfully patched Accelerator._prepare_device_mesh "
f"with sequence_parallel_degree={sequence_parallel_degree}"
f"with context_parallel_degree={context_parallel_degree}"
)

View File

@@ -4,12 +4,12 @@ import inspect
import types
import torch
from accelerate.logging import get_logger
from peft import PeftModelForCausalLM
from torch import nn
from transformers.models.llama.modeling_llama import LlamaFlashAttention2
from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger
LOG = get_logger(__name__)

View File

@@ -17,10 +17,7 @@ def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
return messages_load(tokenizer, cfg, ds_cfg, processor=processor)
load_fn = "load"
package = "axolotl.prompt_strategies"
if (
strategy.split(".")[-1].startswith("load_")
or strategy.split(".")[-1] == "load"
):
if strategy.split(".")[-1].startswith("load_"):
load_fn = strategy.split(".")[-1]
strategy = ".".join(strategy.split(".")[:-1])
elif len(strategy.split(".")) > 1:

View File

@@ -2,10 +2,8 @@
HF Chat Templates prompt strategy
"""
# pylint: disable=too-many-lines
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Set, Union
from typing import Any, Dict, List, Set, Union
from pydantic import BaseModel
from transformers import ProcessorMixin
@@ -17,9 +15,6 @@ from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.datasets import DatasetConfig
if TYPE_CHECKING:
from axolotl.utils.mistral_tokenizer import HFMistralTokenizer
# Configure the logger
LOG = get_logger(__name__)
LOG.setLevel("INFO")
@@ -39,7 +34,6 @@ class ChatTemplatePrompter(Prompter):
message_field_training_detail: str | None = None,
field_messages: str = "messages",
field_system: str = "system",
field_tools: str = "tools",
roles: dict[str, list[str]] | None = None,
chat_template_kwargs: dict[str, Any] | None = None,
drop_system_message: bool = False,
@@ -72,7 +66,6 @@ class ChatTemplatePrompter(Prompter):
self.message_field_training_detail = message_field_training_detail
self.field_messages = field_messages
self.field_system = field_system
self.field_tools = field_tools
self.tokenizer = tokenizer
self.processor: ProcessorMixin | None = processor
self.chat_template = chat_template
@@ -84,38 +77,17 @@ class ChatTemplatePrompter(Prompter):
def chat_template_msg_variables(self) -> Set[str]:
return self._chat_template_msg_variables
def build_prompt(
self,
conversation: list[dict],
add_generation_prompt=False,
images=None,
tools=None,
):
"""
Build a prompt from a conversation.
Args:
conversation: A list of messages.
add_generation_prompt: Whether to add a generation prompt.
images: A list of images. (optional)
tools: A list of tools. (optional)
"""
chat_template_kwargs = {
"chat_template": self.chat_template,
"add_generation_prompt": add_generation_prompt,
}
if tools:
chat_template_kwargs["tools"] = tools
def build_prompt(self, conversation, add_generation_prompt=False, images=None):
if self.processor:
if not callable(self.processor):
raise TypeError("Processor must be callable")
text = self.processor.apply_chat_template(
conversation,
chat_template=self.chat_template,
tokenize=False,
**chat_template_kwargs,
add_generation_prompt=add_generation_prompt,
**self.chat_template_kwargs,
)
batch = self.processor(
text=text,
@@ -132,7 +104,9 @@ class ChatTemplatePrompter(Prompter):
return self.tokenizer.apply_chat_template(
conversation,
**chat_template_kwargs,
add_generation_prompt=add_generation_prompt,
chat_template=self.chat_template,
**self.chat_template_kwargs,
)
def get_offsets_for_train_detail(
@@ -276,15 +250,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
# Default to eos_token if eot_tokens not provided
self.eot_tokens = []
if eot_tokens is not None:
self.eot_tokens = eot_tokens
elif (
hasattr(self.tokenizer, "eos_token")
and self.tokenizer.eos_token is not None
):
self.eot_tokens = [self.tokenizer.eos_token]
self.eot_tokens = (
eot_tokens if eot_tokens is not None else [self.tokenizer.eos_token]
)
self.split_thinking = split_thinking
self.images = "images"
@@ -408,7 +376,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
and not self.prompter.message_field_training_detail # type: ignore
):
turns = self.get_conversation_thread(prompt)
images = self._get_images(prompt)
images = self.get_images(prompt)
prompt_ids = self.prompter.build_prompt( # type: ignore
turns[:-1],
add_generation_prompt=True,
@@ -437,8 +405,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
return tokenized_prompt
turns = self.get_conversation_thread(prompt)
tools = self._get_tools(prompt)
input_ids = self.prompter.build_prompt(turns, tools=tools) # type: ignore
input_ids = self.prompter.build_prompt(turns) # type: ignore
labels = [IGNORE_TOKEN_ID] * len(input_ids)
last_eos_idx = -1
@@ -477,9 +444,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
continue
turn_start_idx, turn_end_idx = self.find_turn(
turns=turns, turn_idx=index, tools=tools
)
turn_start_idx, turn_end_idx = self.find_turn(turns=turns, turn_idx=index)
LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")
@@ -581,9 +546,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
return i
return -1
def find_turn(
self, turns: list[dict], turn_idx: int, tools: list[dict] | None = None
):
def find_turn(self, turns: list[dict], turn_idx: int):
"""
Locate the starting and ending indices of the specified turn in a conversation.
"""
@@ -596,7 +559,11 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
if (
turn_idx == 0
and turns[0].get("role") == "system"
and ("mistral" in self.tokenizer.name_or_path.lower())
and (
"mistral" in self.tokenizer.name_or_path.lower()
or "gemma"
in self.tokenizer.name_or_path.lower() # gemma3 uses gemma tokenizer
)
):
return -1, -1
@@ -610,10 +577,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
turns_with_content = turns[: turn_idx + 1]
# Generate the conversation up to the turn, with final turn replaced with dummy content
dummy_ids = self.prompter.build_prompt(turns_with_empty, tools=tools) # type: ignore
dummy_ids = self.prompter.build_prompt(turns_with_empty) # type: ignore
# Generate the conversation up to the turn, with final turn included
full_ids = self.prompter.build_prompt(turns_with_content, tools=tools) # type: ignore
full_ids = self.prompter.build_prompt(turns_with_content) # type: ignore
if not full_ids or not dummy_ids:
LOG.warning(f"Empty template generated for turn {turn_idx}")
@@ -666,10 +633,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
def get_conversation_thread(self, prompt):
turns = []
messages = self._get_messages(prompt)
possible_sys_turn = self.transform_message(messages[0])
possible_sys_turn = self.transform_message(
prompt[self.prompter.field_messages][0]
)
if (
possible_sys_turn["role"] != "system"
and self.prompter.field_system in prompt
@@ -677,7 +643,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
turn = {"role": "system", "content": prompt[self.prompter.field_system]}
turns.append(turn)
for message in messages:
for message in prompt[self.prompter.field_messages]:
transformed_message = self.transform_message(message)
turn = {
@@ -695,7 +661,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
return turns
def transform_message(self, message: dict) -> dict:
def transform_message(self, message):
# Build the initial transformed message from the mappings
transformed_message = {}
for key, value in self.prompter.message_property_mappings.items():
@@ -772,135 +738,18 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
return transformed_message
def _get_images(self, prompt):
def get_images(self, prompt):
return prompt.get(self.images, None)
def _get_tools(self, prompt) -> list[dict] | None:
"""Get tools from prompt if available."""
tools = prompt.get(self.prompter.field_tools, None)
if tools is None:
return None
if isinstance(tools, list):
return tools
raise ValueError(
"Unknown tools format. Please convert it into a list[dict].\n"
f"Current format: {type(tools)}"
)
def _get_messages(self, prompt):
messages = prompt.get(self.prompter.field_messages, None)
if messages is None:
raise ValueError("Messages is null. Please check `field_messages`.")
if isinstance(messages, list):
return messages
raise ValueError(
"Unknown messages format. Please convert it into a list[dict].\n"
f"Current format: {type(messages)}"
)
class MistralStrategy(ChatTemplateStrategy):
"""
Mistral strategy for chat template.
"""
def __init__(
self,
prompter: "ChatTemplatePrompter",
tokenizer: "HFMistralTokenizer",
train_on_inputs: bool,
sequence_len: int,
roles_to_train: list[str] | None = None,
train_on_eos: str | None = None,
train_on_eot: str | None = None,
eot_tokens: list[str] | None = None,
split_thinking: bool | None = False,
):
# Call the parent's parent __init__ (PromptTokenizingStrategy) to skip ChatTemplateStrategy's validation
# pylint: disable=non-parent-init-called,super-init-not-called
PromptTokenizingStrategy.__init__(
self, prompter, tokenizer, train_on_inputs, sequence_len
)
self.prompter: ChatTemplatePrompter = prompter
self.roles_to_train = []
if roles_to_train:
# map roles if exist in prompter.roles else use the role as is
self.roles_to_train = [
prompter.roles.get(role, role) for role in roles_to_train
]
self.train_on_eos = train_on_eos
# Backward compatibility, load from train_on_eos
self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
# Default to eos_token if eot_tokens not provided
self.eot_tokens = []
if eot_tokens is not None:
self.eot_tokens = eot_tokens
else:
# set eot_tokens to the eos_token
self.eot_tokens = [self.tokenizer.eos_token]
self.split_thinking = split_thinking
self.images = "images"
LOG.debug(
f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}"
)
# Skip the validation that ChatTemplateStrategy calls
# TODO: address this in the future with mistral-specific checks
# self._validate_eot_and_eos_tokens()
@property
def supports_multiprocessing(self) -> bool:
"""
Whether this tokenizing strategy supports multiprocessing.
mistral_common tokenizers cannot be pickled for multiprocessing.
"""
return False
def find_first_eot_token(self, input_ids, start_idx):
"""Find the first EOT token in the input_ids starting from start_idx."""
# mistral-common tokenizer does not support eot_tokens
return self.find_first_eos_token(input_ids, start_idx)
class MistralPrompter(ChatTemplatePrompter):
"""
Mistral prompter for chat template.
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self._chat_template_msg_variables = set(["tool_call_id", "name", "tool_calls"])
class StrategyLoader:
"""
Load chat template strategy based on configuration.
"""
def _get_strategy_cls(self, cfg):
if cfg.tokenizer_use_mistral_common:
return MistralStrategy
def _get_strategy_cls(self):
return ChatTemplateStrategy
def _get_prompter_cls(self, cfg):
if cfg.tokenizer_use_mistral_common:
return MistralPrompter
return ChatTemplatePrompter
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
return {
"train_on_inputs": cfg.train_on_inputs,
@@ -926,14 +775,9 @@ class StrategyLoader:
else:
dataset_config = ds_cfg
if cfg.tokenizer_use_mistral_common:
# mistral-common does not use this, so we pass an empty string
chat_template_string = ""
else:
chat_template_string = get_chat_template_from_config(
cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
)
chat_template_string = get_chat_template_from_config(
cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
)
LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---")
prompter_params = {
@@ -959,11 +803,10 @@ class StrategyLoader:
}
strategy_params = self._get_strategy_params(cfg, dataset_config)
strategy_cls = self._get_strategy_cls(cfg)
prompter_cls = self._get_prompter_cls(cfg)
strategy_cls = self._get_strategy_cls()
strategy = strategy_cls(
prompter_cls(**prompter_params),
ChatTemplatePrompter(**prompter_params),
tokenizer=tokenizer,
**strategy_params,
)

View File

@@ -46,14 +46,6 @@ def default(
)
messages = sample[field_messages]
if isinstance(messages, str):
messages = [
{
message_property_mappings["role"]: "user",
message_property_mappings["content"]: messages,
}
]
messages = [
{
"role": role_map[m[message_property_mappings["role"]]],
@@ -61,35 +53,13 @@ def default(
}
for m in messages
]
chosen_raw = sample[field_chosen]
if isinstance(chosen_raw, str):
chosen_msg = {
message_property_mappings["role"]: "assistant",
message_property_mappings["content"]: chosen_raw,
}
elif isinstance(chosen_raw, dict):
chosen_msg = chosen_raw
else:
chosen_msg = chosen_raw[-1]
chosen = {
"role": role_map[chosen_msg[message_property_mappings["role"]]],
"content": chosen_msg[message_property_mappings["content"]],
"role": role_map[sample[field_chosen][message_property_mappings["role"]]],
"content": sample[field_chosen][message_property_mappings["content"]],
}
rejected_raw = sample[field_rejected]
if isinstance(rejected_raw, str):
rejected_msg = {
message_property_mappings["role"]: "assistant",
message_property_mappings["content"]: rejected_raw,
}
elif isinstance(rejected_raw, dict):
rejected_msg = rejected_raw
else:
rejected_msg = rejected_raw[-1]
rejected = {
"role": role_map[rejected_msg[message_property_mappings["role"]]],
"content": rejected_msg[message_property_mappings["content"]],
"role": role_map[sample[field_rejected][message_property_mappings["role"]]],
"content": sample[field_rejected][message_property_mappings["content"]],
}
dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}

View File

@@ -3,7 +3,6 @@
from typing import Dict, Optional, Set, TypedDict, Union
from jinja2 import Environment, meta, nodes
from jinja2.ext import Extension
class JinjaTemplateAnalysis(TypedDict):
@@ -28,18 +27,6 @@ class JinjaTemplateAnalysis(TypedDict):
iteration_target: Optional[Union[str, list[str]]]
class GenerationTagIgnore(Extension):
"""
Ignores the generation and endgeneration tags in Jinja templates.
"""
tags = {"generation", "endgeneration"}
def parse(self, parser):
parser.stream.skip(1)
return nodes.Const("")
class JinjaTemplateAnalyzer:
"""
Analyzes Jinja templates to extract information about variable usage,
@@ -70,9 +57,7 @@ class JinjaTemplateAnalyzer:
"""
def __init__(self, template: str):
self.env: Environment = Environment(
autoescape=True, extensions=[GenerationTagIgnore]
)
self.env: Environment = Environment(autoescape=True)
self.property_access: Dict[str, Set[str]] = {}
self.iteration_targets: Dict[str, Union[str, list[str]]] = {}
self.index_access: Dict[str, Set[Union[int, float]]] = {}

View File

@@ -32,3 +32,4 @@ def load(tokenizer, cfg, ds_cfg, processor=None):
except Exception as exc: # pylint: disable=broad-exception-caught
LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
raise exc
return None

View File

@@ -3,7 +3,6 @@
import abc
from typing import Callable, Dict, List, Optional, Tuple, Union
from datasets import Dataset
from transformers import BatchEncoding, PreTrainedTokenizer
from axolotl.prompters import Prompter
@@ -29,16 +28,6 @@ class DatasetWrappingStrategy(abc.ABC):
Abstract class for wrapping datasets for Chat Messages
"""
@abc.abstractmethod
def wrap_dataset(
self,
dataset,
process_count: int | None = None,
keep_in_memory: bool | None = False,
**kwargs,
) -> Dataset:
pass
class PromptTokenizingStrategy(abc.ABC):
"""
@@ -70,14 +59,6 @@ class PromptTokenizingStrategy(abc.ABC):
def supports_batched(self):
return False
@property
def supports_multiprocessing(self):
"""
Whether this tokenizing strategy supports multiprocessing.
Should return False if the tokenizer has unpicklable objects.
"""
return True
def _tokenize(
self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
) -> BatchEncoding:

View File

@@ -1,13 +1,10 @@
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
from __future__ import annotations
import importlib
import inspect
import os
import signal
import sys
import typing
import weakref
from contextlib import ExitStack
from pathlib import Path
@@ -34,7 +31,7 @@ from axolotl.loaders import (
load_processor,
load_tokenizer,
)
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
from axolotl.utils.ctx_managers import ContextParallelContextManager
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
@@ -47,9 +44,6 @@ try:
except ImportError:
BetterTransformer = None
if typing.TYPE_CHECKING:
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
LOG = get_logger(__name__)
@@ -58,8 +52,8 @@ def setup_model_and_tokenizer(
) -> tuple[
PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
]:
"""Load the tokenizer, processor (for multimodal models), and model based on
configuration.
"""
Load the tokenizer, processor (for multimodal models), and model based on configuration.
Args:
cfg: Dictionary mapping `axolotl` config keys to values.
@@ -153,7 +147,7 @@ def determine_resume_checkpoint(cfg: DictDefault) -> str | None:
def setup_signal_handler(
cfg: DictDefault, model: PreTrainedModel, safe_serialization: bool
cfg: DictDefault, model: PeftModel | PreTrainedModel, safe_serialization: bool
):
"""
Set up signal handler for graceful termination.
@@ -207,15 +201,20 @@ def execute_training(
)
)
if cfg.sequence_parallel_degree > 1:
if cfg.context_parallel_degree > 1 and not cfg.sdp_attention:
# Models to enter context parallel manager for
models = [trainer.model]
if hasattr(trainer, "ref_model") and trainer.ref_model:
models.append(trainer.ref_model)
# Attention backend
backend = "sdp_attention" if cfg.sdp_attention else "flash_attention"
stack.enter_context(
SequenceParallelContextManager(
ContextParallelContextManager(
models=models,
sequence_parallel_degree=cfg.sequence_parallel_degree,
backend=backend,
context_parallel_degree=cfg.context_parallel_degree,
gradient_accumulation_steps=cfg.gradient_accumulation_steps,
ring_attn_func=cfg.ring_attn_func,
heads_k_stride=cfg.heads_k_stride,
@@ -229,7 +228,7 @@ def execute_training(
def save_trained_model(
cfg: DictDefault,
trainer: Any,
model: PreTrainedModel,
model: PeftModel | PreTrainedModel,
safe_serialization: bool,
):
"""
@@ -380,7 +379,7 @@ def create_model_card(cfg: DictDefault, trainer: Trainer):
def save_initial_configs(
cfg: DictDefault,
tokenizer: PreTrainedTokenizer,
model: PreTrainedModel,
model: PeftModel | PreTrainedModel,
peft_config: PeftConfig | None,
processor: ProcessorMixin | None,
):
@@ -434,7 +433,7 @@ def setup_model_card(cfg: DictDefault):
def handle_untrained_tokens_fix(
cfg: DictDefault,
model: PreTrainedModel,
model: PeftModel | PreTrainedModel,
tokenizer: PreTrainedTokenizer,
train_dataset: Dataset,
safe_serialization: bool,
@@ -477,7 +476,7 @@ def handle_untrained_tokens_fix(
def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
"HFRLTrainerBuilder" | "HFCausalTrainerBuilder",
Trainer,
PeftModel | PreTrainedModel,
PreTrainedTokenizer,
PeftConfig | None,

View File

@@ -52,10 +52,3 @@ def patch_optimized_env():
if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
set_pytorch_cuda_alloc_conf()
def get_not_null(value, default=None):
"""
return the value if it's not None, otherwise return the default value
"""
return value if value is not None else default

View File

@@ -53,6 +53,25 @@ IGNORE_INDEX = -100
LOG = get_logger(__name__)
class EvalFirstStepCallback(
TrainerCallback
): # pylint: disable=too-few-public-methods disable=unused-argument
"""
Callback to trigger evals on the first step
"""
def on_step_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
):
if args.eval_strategy == IntervalStrategy.STEPS and state.global_step == 1:
control.should_evaluate = True
return control
class SaveBetterTransformerModelCallback(
TrainerCallback
): # pylint: disable=too-few-public-methods

File diff suppressed because one or more lines are too long

View File

@@ -1,7 +1,7 @@
"""Data collators for axolotl to pad labels and position_ids for packed sequences"""
from dataclasses import dataclass
from typing import Any, List
from typing import Any
import numpy as np
from transformers import PreTrainedTokenizerBase
@@ -81,11 +81,9 @@ class DataCollatorForSeq2Seq:
padding_side = self.tokenizer.padding_side
for feature in features:
remainder_len = max_feature_length - len(feature[feature_name])
if feature_name == "position_ids":
remainder = list(range(remainder_len))
else:
remainder = [pad_token_id] * remainder_len
remainder = [pad_token_id] * (
max_feature_length - len(feature[feature_name])
)
if isinstance(feature[feature_name], list):
feature[feature_name] = (
feature[feature_name] + remainder
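To make the difference between the two padding variants in this hunk concrete (toy values only):

# Illustrative only: padding a 4-token position_ids row out to length 6.
position_ids = [0, 1, 2, 3]
pad_token_id = 0
remainder_len = 6 - len(position_ids)
# One variant pads position_ids with a fresh range...
variant_a = position_ids + list(range(remainder_len))      # [0, 1, 2, 3, 0, 1]
# ...the other pads it with the pad token id like any other feature.
variant_b = position_ids + [pad_token_id] * remainder_len  # [0, 1, 2, 3, 0, 0]
print(variant_a, variant_b)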
@@ -163,7 +161,7 @@ class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
def __call__(self, features, return_tensors=None):
if not isinstance(features[0], list):
features: List[List[dict]] = [features]
features = [features]
out_features = [{} for _ in features]
for i, features_ in enumerate(features):
for feature in features_[0].keys():

View File

@@ -21,7 +21,7 @@ from axolotl.utils.schemas.config import (
from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase
from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset
LOG = get_logger(__name__)
LOG = get_logger(__name__, use_environ=True)
def choose_device(cfg):

View File

@@ -1,6 +1,5 @@
"""Init for context manager submodule"""
"""Init for context manager submodule."""
# pylint: disable=unused-import
# flake8: noqa
from .context_parallel.manager import ContextParallelContextManager
from .sequence_parallel import SequenceParallelContextManager
__all__ = ["ContextParallelContextManager"]

View File

@@ -0,0 +1,146 @@
# BSD 3-Clause License
# Copyright 2024 Meta
# Redistribution and use in source and binary forms, with or without modification,
# are permitted provided that the following conditions are met:
# 1. Redistributions of source code must retain the above copyright notice,this list
# of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright notice, this
# list of conditions and the following disclaimer in the documentation
# and/or other materials provided with the distribution.
# 3. Neither the name of the copyright holder nor the names of its contributors may
# be used to endorse or promote products derived from this software without specific
# prior written permission.
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS “AS IS” AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
# OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
# SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
# TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
# DAMAGE.
"""
Distributed utils for SDPA context parallel implementation. Slightly modified from
https://github.com/pytorch/torchtune/blob/2344509cf83bd886538fe3e8263e5145d1afb5c2/torchtune/training/_distributed.py.
"""
import contextlib
from typing import Callable, Generator, Optional, Union
import torch
from torch import nn
from torch.distributed.tensor.experimental import context_parallel
from torch.distributed.tensor.experimental._attention import set_rotate_method
from torch.nn.attention import SDPBackend, sdpa_kernel
from torch.nn.attention.flex_attention import BlockMask
def _get_sdpa_context() -> (
Callable[[Optional[Generator[None, None, None]]], Generator[None, None, None]]
):
"""
Creates a context manager that restricts SDPA to the flash/efficient/cuDNN attention backends.
Returns:
A context manager function that takes an optional context parallel context.
"""
@contextlib.contextmanager
def context(cp_context: Union[Generator[None, None, None], None] = None):
with contextlib.ExitStack() as stack:
if cp_context is not None:
stack.enter_context(
sdpa_kernel(
[
SDPBackend.FLASH_ATTENTION,
SDPBackend.EFFICIENT_ATTENTION,
SDPBackend.CUDNN_ATTENTION,
]
)
)
stack.enter_context(cp_context)
yield
return context
def get_context_parallel_manager(
*,
world_mesh: torch.distributed.DeviceMesh,
model: nn.Module,
) -> Callable[[list[torch.Tensor]], Generator[None, None, None]]:
"""
Context manager for applying context parallelism to a model. In addition to applying the
standard context manager that patches SDPA and shards model inputs and buffers along the
sequence dimension, this context manager also calls into _get_sdpa_context to restrict SDPA to acceptable backends.
Args:
world_mesh: Global device mesh.
model: Model to apply context parallelism to.
Returns:
A context manager that applies context parallelism and restricts SDPA to the
flash/efficient/cuDNN backends (the math backend is excluded).
Raises:
ValueError: If world_mesh does not contain a "cp" dimension.
"""
if "cp" not in world_mesh.mesh_dim_names:
raise ValueError(
"Context parallel is enabled but no context parallel device mesh is provided."
)
# TODO: context parallel for multimodal models requires extra work
# if not isinstance(model, TransformerDecoder):
# raise ValueError("Context parallel is only supported for text models")
# model_buffers = list(model.buffers())
# def get_all_buffers(module, prefix=""):
# buffers = {}
# for name, buffer in module.named_buffers(recurse=False):
# full_name = f"{prefix}.{name}" if prefix else name
# buffers[full_name] = buffer
# for name, child in module.named_children():
# child_prefix = f"{prefix}.{name}" if prefix else name
# buffers.update(get_all_buffers(child, child_prefix))
# return buffers
# model_buffers = get_all_buffers(model)
@contextlib.contextmanager
def context(model_inputs: list[torch.Tensor]):
# Create context parallel context if enabled
cp_context = None
if any(isinstance(model_input, BlockMask) for model_input in model_inputs):
raise ValueError(
"Context parallel with flex attention is not yet supported"
)
set_rotate_method("allgather")
cp_context = context_parallel(
world_mesh["cp"],
# buffers=model_inputs + model_buffers,
buffers=model_inputs,
# buffer_seq_dims=[1] * len(model_inputs) + [0] * len(model_buffers),
buffer_seq_dims=[1] * len(model_inputs),
no_restore_buffers=set(model_inputs),
)
# Create and enter the train context with the optional cp_context
sdpa_context = _get_sdpa_context()
with sdpa_context(cp_context):
yield
return context
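A rough sketch of how the returned manager is meant to be wrapped around each training step; the mesh layout, model, and dataloader below are assumptions for illustration, not the project's actual wiring:

from torch.distributed.device_mesh import init_device_mesh

# Assumed setup (illustrative): 8 ranks arranged as 2-way data parallel x 4-way context parallel.
device_mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("dp", "cp"))
cp_manager = get_context_parallel_manager(world_mesh=device_mesh, model=model)  # `model` assumed to exist

for batch in dataloader:  # `dataloader` assumed to exist
    input_ids, labels, position_ids = batch["input_ids"], batch["labels"], batch["position_ids"]
    # Inputs passed as buffers are sharded along the sequence dimension inside the context,
    # and SDPA is confined to the flash/efficient/cuDNN backends.
    with cp_manager([input_ids, labels, position_ids]):
        loss = model(input_ids=input_ids, labels=labels, position_ids=position_ids).loss
        loss.backward()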

Some files were not shown because too many files have changed in this diff.