coderabbit comments

update posthog dep
lint
2025-06-07 04:50:29 +00:00 · 2025-06-05 23:46:20 +00:00 · 2025-06-05 23:41:46 +00:00 · 2025-06-05 23:33:46 +00:00 · 2025-06-05 23:33:46 +00:00 · 2025-06-05 23:33:46 +00:00
152 changed files with 4619 additions and 6324 deletions
--- a/.github/workflows/base.yml
+++ b/.github/workflows/base.yml
@@ -16,7 +16,6 @@ on:
 jobs:
  build-base:
    if: github.repository_owner == 'axolotl-ai-cloud'
    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
    runs-on: ubuntu-latest-m
    strategy:
@@ -48,14 +47,14 @@ jobs:
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
          - cuda: "128"
            cuda_version: 12.6.3
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
          - cuda: "128"
@@ -107,7 +106,6 @@ jobs:
            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
  build-base-uv:
    if: github.repository_owner == 'axolotl-ai-cloud'
    timeout-minutes: 480
    runs-on: ubuntu-latest-m
    strategy:
      fail-fast: false
@@ -124,7 +122,7 @@ jobs:
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
    steps:
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -29,12 +29,12 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
@@ -97,12 +97,12 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -43,7 +43,7 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -52,7 +52,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20
    steps:
@@ -125,7 +125,7 @@ jobs:
      fail-fast: false
      matrix:
        python_version: ["3.11"]
-        pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
+        pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
    timeout-minutes: 20
    steps:
@@ -188,7 +188,7 @@ jobs:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+    timeout-minutes: 90
    needs: [pre-commit, pytest, pytest-sdist]
    strategy:
@@ -238,7 +238,7 @@ jobs:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
-    timeout-minutes: 120
+    timeout-minutes: 90
    # Only run the remainder of the matrix if the first e2e check passed;
    # this is to save on wasted compute costs for known failures that get caught in the first run
    needs: [pre-commit, pytest, docker-e2e-tests-1st]
@@ -262,13 +262,13 @@ jobs:
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            num_gpus: 1
            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
-            pytorch: 2.7.1
+            pytorch: 2.7.0
            num_gpus: 1
            axolotl_extras:
    steps:
--- a/README.md
+++ b/README.md
@@ -22,32 +22,28 @@
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
 </p>
 ## 🎉 Latest Updates
 - 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
 - 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
 - 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
 - 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
 - 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
 - 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
 - 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
 - 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
 ## ✨ Overview
 Axolotl is a tool designed to streamline post-training for various AI models.
 Post-training refers to any modifications or additional training performed on
 pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
 LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
 techniques. With support for multiple model architectures and training configurations,
 Axolotl makes it easy to get started with these techniques.
 Axolotl is designed to work with YAML config files that contain everything you need to
 preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
 and much more.
 Features:
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
+- Train various Hugging Face models such as LLaMA, Pythia, Falcon, and MPT
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
+- Supports full fine-tuning, LoRA, QLoRA, ReLoRA, and GPTQ
- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
+- Customize configurations using a simple yaml file or CLI overwrite
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!
+- Load different dataset formats, use custom formats, or bring your own tokenized datasets
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
+- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
+- Works with a single GPU or multiple GPUs via FSDP or DeepSpeed
-
+- Easily run with Docker locally or on the cloud
-
+- Log results and optionally checkpoints to wandb, mlflow or Comet
 - And more!
 ## 🚀 Quick Start
@@ -85,12 +81,19 @@ axolotl train examples/llama-3/lora-1b.yml
 That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
 ## ✨ Key Features
 - **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
 - **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
 - **Easy Configuration**: Simple YAML files to control your training setup
 - **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
 - **Flexible Dataset Handling**: Use various formats and custom datasets
 - **Cloud Ready**: Run on cloud platforms or local hardware
 ## 📚 Documentation
 - [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
 - [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
 - [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
 - [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
 - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
 - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
@@ -109,6 +112,38 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
 Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
 ## 📈 Telemetry
 Axolotl has opt-in telemetry that helps us understand how the project is being used
 and prioritize improvements. We collect basic system information, model types, and
 error rates—never personal data or file paths. Telemetry is disabled by default. To
 enable it, set `AXOLOTL_DO_NOT_TRACK=0`. For more details, see our [telemetry documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html).
 ## Supported Models
 |             | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
 |-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
 | llama       | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
 | Mistral     | ✅         | ✅    | ✅     | ✅             | ✅                 | ✅          | ✅            |
 | Mixtral-MoE | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
 | Mixtral8X22 | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
 | Pythia      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
 | cerebras    | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
 | btlm        | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
 | mpt         | ✅         | ❌    | ❓     | ❌             | ❌                 | ❌          | ❓            |
 | falcon      | ✅         | ✅    | ✅     | ❌             | ❌                 | ❌          | ❓            |
 | gpt-j       | ✅         | ✅    | ✅     | ❌             | ❌                 | ❓          | ❓            |
 | XGen        | ✅         | ❓    | ✅     | ❓             | ❓                 | ❓          | ✅            |
 | phi         | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
 | RWKV        | ✅         | ❓    | ❓     | ❓             | ❓                 | ❓          | ❓            |
 | Qwen        | ✅         | ✅    | ✅     | ❓             | ❓                 | ❓          | ❓            |
 | Gemma       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
 | Jamba       | ✅         | ✅    | ✅     | ❓             | ❓                 | ✅          | ❓            |
 ✅: supported
 ❌: not supported
 ❓: untested
 ## ❤️ Sponsors
 Thank you to our sponsors who help make Axolotl possible:
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -236,6 +236,7 @@ website:
            - docs/inference.qmd
            - docs/cli.qmd
            - docs/config.qmd
            - docs/telemetry.qmd
            - text: "API Reference"
              href: docs/api
--- a/deepspeed_configs/zero2_torch_compile.json
+++ b/deepspeed_configs/zero2_torch_compile.json
@@ -1,31 +0,0 @@
 {
  "compile": {
    "disable": false,
    "backend": "inductor"
  },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu"
    },
    "contiguous_gradients": true,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
 }
--- a/docker/Dockerfile-base
+++ b/docker/Dockerfile-base
@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10
-RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
+RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
        pip3 install flash-attn==2.7.4.post1; \
    fi
--- a/docker/Dockerfile-base-next
+++ b/docker/Dockerfile-base-next
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace
 RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
-    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
--- a/docker/Dockerfile-uv-base
+++ b/docker/Dockerfile-uv-base
@@ -29,12 +29,8 @@ RUN uv venv --no-project --relocatable axolotl-venv
 ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
-RUN uv pip install packaging setuptools wheel psutil \
+RUN uv pip install packaging setuptools wheel \
    && uv pip install torch==${PYTORCH_VERSION} \
    && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
    && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
    && uv pip install awscli pydantic
 RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
        uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
    fi
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -27,8 +27,6 @@ trust_remote_code:
 tokenizer_use_fast:
 # Whether to use the legacy tokenizer setting, defaults to True
 tokenizer_legacy:
 # Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer.
 tokenizer_use_mistral_common:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
@@ -175,10 +173,6 @@ datasets:
    # Key containing the messages (default: "messages")
    field_messages: messages
    # Key containing the tools (default: "tools")
    # Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
    field_tools: tools
    # Key containing the system message (default: "system")
    # If the system message is not present in the dataset sample, it will be loaded from the field_system property.
    field_system: system
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -52,9 +52,7 @@ We recommend checking the below examples for other usecases.
 ### Examples
-#### Training on last message
+1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
 (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
 ```yaml
 datasets:
@@ -68,9 +66,7 @@ datasets:
 If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
 :::
-#### Overriding default chat template
+2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
 Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
 ```yaml
 chat_template: gemma # this overwrites the tokenizer's chat_template
@@ -80,13 +76,7 @@ datasets:
    roles_to_train: ["assistant"]  # default value
 ```
-::: {.callout-note}
+3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
 If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
 :::
 #### Using default chat template with fallback
 Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
 ```yaml
 chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
@@ -95,9 +85,7 @@ datasets:
    type: chat_template
 ```
-#### Custom Jinja template
+4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
 Using a custom jinja template on OpenAI messages format, training on all assistant messages.
 ```yaml
 # chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
@@ -112,9 +100,7 @@ datasets:
 Please make sure that your `tokenizer.eos_token` is the same as the EOS (End-of-Sequence) token in the template. Otherwise, set `eos_token` under `special_tokens: `.
 :::
-#### Using template with different token for EOT and EOS
+5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
 - If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
 ```yaml
 eot_tokens:
@@ -139,7 +125,7 @@ Using `eot_tokens` requires each token that exists in `chat_template` to be a si
 You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
 :::
- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
+6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
 ```yaml
 eot_tokens:
@@ -159,73 +145,7 @@ If EOS token only appears at the end of a prompt, `train_on_eos: last` is equiva
 :::
-#### Using tool use
+7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
 Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it.
 ```json
 {
    "tools": [
        {
            "type": "...",
            "function": {
                "name": "...",
                "description": "...",
                "parameters": {
                    "type": "...",
                    "properties": {
                        // ...
                    },
                    "required": ["..."],
                },
            },
        },
    ],
    "messages": [
        // ...
        {
            "role": "assistant", // call the function via assistant
            "tool_calls": [
                {
                    "type": "function",
                    "function": {
                        "name": "...",
                        "arguments": {
                            "...": "...",
                        }
                    }
                }
            ]
        },
        {
            "role": "tool",
            "name": "...",
            "content": "..."
        },
    ],
 }
 ```
 ::: {.callout-note}
 Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
 :::
 ```yaml
 chat_template: llama4
 datasets:
  - path: ...
    type: chat_template
    # field_tools: tools # default is `tools`
 ```
 ::: {.callout-tip}
 Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
 :::
 #### Using fine-grained control over token masking
 (Advanced) Using fine-grained control over tokens and turns to train in a conversation
 For a data sample that looks like:
@@ -276,9 +196,7 @@ datasets:
 It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
 :::
-#### Reasoning split
+8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
 (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
 ```yaml
 datasets:
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -9,7 +9,7 @@ format:
 This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
 ::: {.callout-important}
-For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.
+For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
 :::
 ## Base
@@ -32,8 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
 Tags examples:
- `main-base-py3.11-cu128-2.7.1`
+- `main-base-py3.11-cu128-2.7.0`
- `main-base-py3.11-cu126-2.7.1`
+- `main-base-py3.11-cu126-2.7.0`
 - `main-base-py3.11-cu124-2.6.0`
 - `main-base-py3.11-cu124-2.5.1`
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -29,4 +29,4 @@ qat:
  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
 ```
-Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.
+Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize` command](./quantize.qmd) to do this.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -500,7 +500,7 @@ The input format is a simple JSON input with customizable fields based on the ab
 ### GRPO
 ::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
 :::
 In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
--- a/docs/telemetry.qmd
+++ b/docs/telemetry.qmd
@@ -0,0 +1,59 @@
 ---
 title: Telemetry
 description: A description of the opt-in telemetry implementation in Axolotl.
 ---
 # Telemetry in Axolotl
 Axolotl implements anonymous telemetry to help maintainers understand how the library
 is used and where users encounter issues. This data helps prioritize features, optimize
 performance, and fix bugs.
 ## Data Collection
 We collect:
 - System info: OS, Python version, Axolotl version, PyTorch version, Transformers
 version, etc.
 - Hardware info: CPU count, memory, GPU count and models
 - Runtime metrics: Training progress, memory usage, timing information
 - Usage patterns: Models (from a whitelist) and configurations used
 - Error tracking: Stack traces and error messages (sanitized to remove personal
 information)
 Personally identifiable information (PII) is not collected.
 ## Implementation
 Telemetry is implemented using PostHog and consists of:
 - `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the
 telemetry system and provides methods for tracking events.
 - `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
 sends sanitized stack traces.
 - `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
 runtime metrics during training.
 - `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
 runtime metrics telemetry.
 The telemetry system will block training startup for 15 seconds to ensure users are
 aware of data collection, unless telemetry is explicitly enabled or disabled.
 ## Opt-In Mechanism
 Telemetry is **disabled by default** on an opt-in basis. To enable it, set `AXOLOTL_DO_NOT_TRACK=0`.
 To suppress the telemetry warning message that is displayed when commands such as
 `train` start up, explicitly set `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or
 `AXOLOTL_DO_NOT_TRACK=1` (disable telemetry).
 **Note**: Telemetry will move to an opt-out model in a later release.
 ## Privacy
 - All path-like config information is automatically redacted from telemetry data
 - Model information is only collected for whitelisted organizations
    - See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations
 - Each run generates a unique anonymous ID
    - This allows us to link different telemetry events within the same training run
 - Telemetry is only sent from the main process to avoid duplicate events
--- a/examples/llama-3/instruct-dpo-lora-8b.yml
+++ b/examples/llama-3/instruct-dpo-lora-8b.yml
@@ -5,10 +5,6 @@ tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>
 load_in_8bit: true
 load_in_4bit: false
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -1,71 +0,0 @@
 # Finetune Magistral Small with Axolotl
 Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
 MistralAI has also released a proprietary medium-sized version called Magistral Medium.
 Thanks to the team at MistralAI for giving us early access to prepare for this release.
 ## Getting started
 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Magistral is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
    Here is an example of how to install from main for pip:
 ```bash
 # Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
 ```
 2. Download the example config:
 ```bash
 axolotl fetch examples
 ```
 3. Run the finetuning example:
 ```bash
 axolotl train examples/magistral/magistral-small-qlora.yaml
 ```
 This config uses about 24GB VRAM.
 Let us know how it goes. Happy finetuning! 🚀
 ### TIPS
 - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 ## Optimization Guides
 - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
 - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
 - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
 ## Limitations
 We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
 The tokenizer does not work with `dataset.map` with multiprocessing, so we had to disable it. In addition, we do not support overriding tokens yet.
 ## Related Resources
 - [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
 - [Axolotl Docs](https://docs.axolotl.ai)
 - [Axolotl Website](https://axolotl.ai)
 - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
 - [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
 ## Future Work
 - Add parity to Preference Tuning, RL, Multi-modal, etc.
 - Add parity to other tokenizer configs like overriding tokens.
--- a/examples/magistral/magistral-small-fsdp-qlora.yaml
+++ b/examples/magistral/magistral-small-fsdp-qlora.yaml
@@ -1,72 +0,0 @@
 base_model: mistralai/Magistral-Small-2506
 # Enable to use mistral-common tokenizer
 tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: true
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: false
 pad_to_sequence_len: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: false
 gradient_checkpointing:
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 fsdp:
  - full_shard
  - auto_wrap
 fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_activation_checkpointing: true
--- a/examples/magistral/magistral-small-qlora.yaml
+++ b/examples/magistral/magistral-small-qlora.yaml
@@ -1,63 +0,0 @@
 # Example axolotl config: QLoRA adapter training of Magistral-Small-2506.
 base_model: mistralai/Magistral-Small-2506
 # Enable to use mistral-common tokenizer
 tokenizer_use_mistral_common: true
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 # Quantized loading: 4-bit on, 8-bit off.
 load_in_8bit: false
 load_in_4bit: true
 # Training data: a single dataset entry processed with the `chat_template` type.
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
 # Adapter type; lora_model_dir is left empty.
 adapter: qlora
 lora_model_dir:
 # Sequence length and sample packing.
 sequence_len: 2048
 sample_packing: true
 pad_to_sequence_len: true
 # LoRA hyperparameters and the explicitly listed target modules.
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
 # wandb_* options are all left empty in this example.
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 # Batch size, epochs, optimizer and learning-rate schedule.
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 # Precision settings.
 bf16: auto
 tf32: false
 # Checkpointing, logging, attention and evaluation/save cadence.
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
-lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
+lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
 wandb_project:
 wandb_entity:
--- a/requirements.txt
+++ b/requirements.txt
@@ -68,4 +68,5 @@ schedulefree==1.4.1
 axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.3
-mistral-common==1.6.0
+# telemetry
 posthog>=4.2.0
--- a/src/axolotl/init.py
+++ b/src/axolotl/init.py
@@ -4,4 +4,4 @@ import pkgutil
 __path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package
-__version__ = "0.10.0"
+__version__ = "0.10.0.dev0"
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -14,6 +14,8 @@ import yaml
 from transformers.utils import is_torch_bf16_gpu_available
 from axolotl.integrations.base import PluginManager
 from axolotl.telemetry.errors import send_errors
 from axolotl.telemetry.manager import TelemetryManager
 from axolotl.utils.comet_ import setup_comet_env_vars
 from axolotl.utils.config import (
    normalize_cfg_datasets,
@@ -28,6 +30,8 @@ from axolotl.utils.wandb_ import setup_wandb_env_vars
 LOG = get_logger(__name__, use_environ=True)
 TELEMETRY_MANAGER = TelemetryManager.get_instance()
 def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
    """
@@ -159,6 +163,7 @@ def plugin_set_cfg(cfg: DictDefault):
        plugin_manager.cfg = cfg
@send_errors
 def load_cfg(
    config: str | Path | DictDefault = Path("examples/"), **kwargs
 ) -> DictDefault:
@@ -192,6 +197,8 @@ def load_cfg(
            temp_file.close()
        cfg.axolotl_config_path = temp_file.name
    TELEMETRY_MANAGER.send_event(event_type="config-loaded", properties=cfg)
    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
    cfg_keys = cfg.keys()
@@ -233,4 +240,6 @@ def load_cfg(
    setup_comet_env_vars(cfg)
    plugin_set_cfg(cfg)
    TELEMETRY_MANAGER.send_event(event_type="config-processed", properties=cfg)
    return cfg
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -16,6 +16,7 @@ from axolotl.cli.args import InferenceCliArgs
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.chat_templates import (
    get_chat_template,
    get_chat_template_from_config,
@@ -42,6 +43,7 @@ def get_multi_line_input() -> str:
    return instruction
@send_errors
 def do_inference(
    *,
    cfg: DictDefault,
@@ -135,6 +137,7 @@ def do_inference(
        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
@send_errors
 def do_inference_gradio(
    *,
    cfg: DictDefault,
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -9,12 +9,14 @@ from dotenv import load_dotenv
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
@send_errors
 def do_merge_lora(*, cfg: DictDefault) -> None:
    """
    Calls `transformers`' `merge_and_unload` on the model given in the `axolotl` config
--- a/src/axolotl/cli/merge_sharded_fsdp_weights.py
+++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py
@@ -24,6 +24,7 @@ from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
@@ -118,6 +119,7 @@ def _distributed_checkpoint_to_merged_weights(
    return save_path_
@send_errors
 def merge_fsdp_weights(
    checkpoint_dir: str,
    output_path: str,
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -18,6 +18,7 @@ from axolotl.cli.config import load_cfg
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 from axolotl.utils.trainer import disable_datasets_caching
@@ -25,6 +26,7 @@ from axolotl.utils.trainer import disable_datasets_caching
 LOG = get_logger(__name__)
@send_errors
 def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    """
    Preprocesses dataset specified in axolotl config.
--- a/src/axolotl/common/const.py
+++ b/src/axolotl/common/const.py
@@ -1,3 +1,5 @@
-"""Various shared constants"""
+"""
 Various shared constants
 """
 DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -3,13 +3,16 @@
 import math
 import random
 from dataclasses import dataclass
 from typing import Optional, Union
 from datasets import Dataset
 import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import  # noqa: F401
 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
 from axolotl.loaders import load_processor, load_tokenizer
-from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
+from axolotl.telemetry.errors import send_errors
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import RLType
@@ -28,49 +31,66 @@ class TrainDatasetMeta:
 def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
     """
     Randomly sample `num_samples` examples from `dataset`, with replacement.

     Args:
         dataset: Dataset to draw from; it must support `len()` and
             `.select(indices)`.
         num_samples: Number of samples to return.

     Returns:
         The result of `dataset.select` on `num_samples` randomly drawn indices
         (indices may repeat).

     Raises:
         ValueError: If `dataset` is empty and `num_samples` is positive.
     """
     # `randrange(n)` draws from the half-open range [0, n), so every row --
     # including the last one -- can be selected, and a single-row dataset
     # works instead of raising on an empty range.
     dataset_size = len(dataset)
     return dataset.select(
         [random.randrange(dataset_size) for _ in range(num_samples)]  # nosec
     )
@send_errors
 def load_datasets(
    *,
    cfg: DictDefault,
    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
    debug: bool = False,
 ) -> TrainDatasetMeta:
-    """Loads one or more training or evaluation datasets, calling
+    """
-    `axolotl.utils.data.prepare_datasets`. Optionally, logs out debug information.
+    Loads one or more training or evaluation datasets, calling
    `axolotl.utils.data.prepare_dataset`. Optionally, logs out debug information.
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.
-        debug: Whether to print out tokenization of sample. This is duplicated in
+        debug: Whether to print out tokenization of sample
            `cfg` and `cli_args`, but is kept due to use in our Colab notebooks.
    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
-            `total_num_steps`.
+        `total_num_steps`.
    """
    tokenizer = load_tokenizer(cfg)
    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
-    preprocess_iterable = getattr(cli_args, "iterable", False)
+    preprocess_iterable = (
        cli_args
        and hasattr(cli_args, "iterable")
        and cli_args.iterable is not None
        and cli_args.iterable
    )
-    train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
+    train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset(
        cfg,
        tokenizer,
        processor=processor,
        preprocess_iterable=preprocess_iterable,
    )
-    if (
+    if (  # pylint: disable=too-many-boolean-expressions
-        cfg.debug
+        cli_args
-        or getattr(cli_args, "debug", False)
+        and (
-        or getattr(cli_args, "debug_text_only", False)
+            cli_args.debug
-        or getattr(cli_args, "debug_num_examples", 0) > 0
+            or cfg.debug
-        or debug
+            or cli_args.debug_text_only
-    ):
+            or int(cli_args.debug_num_examples) > 0
        )
    ) or debug:
        LOG.info("check_dataset_labels...")
        num_examples = cli_args.debug_num_examples if cli_args else 1
@@ -94,11 +114,15 @@ def load_datasets(
    )
@send_errors
 def load_preference_datasets(
-    *, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None
+    *,
    cfg: DictDefault,
    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
 ) -> TrainDatasetMeta:
-    """Loads one or more training or evaluation datasets for RL training using paired
+    """
-    preference data, calling `axolotl.utils.data.rl.prepare_preference_datasets`.
+    Loads one or more training or evaluation datasets for RL training using paired
    preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`.
    Optionally, logs out debug information.
    Args:
@@ -109,28 +133,23 @@ def load_preference_datasets(
        Dataclass with fields for training and evaluation datasets and the computed
        `total_num_steps`.
    """
-    tokenizer = load_tokenizer(cfg)
+    train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
-    train_dataset, eval_dataset = prepare_preference_datasets(cfg, tokenizer)
+    total_num_steps: Optional[int] = int(
        math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
    )
    if cfg.rl is RLType.GRPO:
        total_num_steps = None
-    total_num_steps: int | None = None
+    if cli_args.debug or cfg.debug:
    if cfg.rl is not RLType.GRPO:
        total_num_steps = int(
            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
        )
    if (cli_args and cli_args.debug) or cfg.debug:
        LOG.info("check_dataset_labels...")
        num_examples = cli_args.debug_num_examples if cli_args else 1
        text_only = cli_args.debug_text_only if cli_args else False
        tokenizer = load_tokenizer(cfg)
-        train_samples = sample_dataset(train_dataset, num_examples)
+        train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
        check_dataset_labels(
-            dataset=train_samples,
+            train_samples,
-            tokenizer=tokenizer,
+            tokenizer,
-            num_examples=num_examples,
+            num_examples=cli_args.debug_num_examples,
-            text_only=text_only,
+            text_only=cli_args.debug_text_only,
            rl_mode=True,
        )
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -31,6 +31,8 @@ from transformers.training_args import OptimizerNames
 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
 from axolotl.telemetry.callbacks import TelemetryCallback
 from axolotl.telemetry.manager import TelemetryManager
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
    GCCallback,
@@ -145,6 +147,10 @@ class TrainerBuilderBase(abc.ABC):
        callbacks.append(GPUStatsCallback(cfg=self.cfg))
        telemetry_manager = TelemetryManager.get_instance()
        if telemetry_manager.enabled:
            callbacks.append(TelemetryCallback())
        return callbacks
    def get_post_trainer_create_callbacks(self, trainer):
@@ -380,16 +386,14 @@ class TrainerBuilderBase(abc.ABC):
        )
        # eval_strategy and eval_steps
-        if not self.eval_dataset and self.cfg.val_set_size == 0:
+        if not self.eval_dataset or self.cfg.val_set_size == 0:
-            # do not eval if no eval_dataset and val_set_size=0
+            # do not eval if no eval_dataset or val_set_size=0
            training_args_kwargs["eval_strategy"] = "no"
        elif self.cfg.eval_steps:
            training_args_kwargs["eval_strategy"] = "steps"
            training_args_kwargs["eval_steps"] = self.cfg.eval_steps
            training_args_kwargs["eval_on_start"] = True
        elif self.cfg.eval_strategy:
            training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy
            training_args_kwargs["eval_on_start"] = True
    def _configure_reporting(self, training_args_kwargs: dict):
        report_to = []
@@ -492,9 +496,6 @@ class TrainerBuilderBase(abc.ABC):
        training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
        training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs
        if self.cfg.dataset_processes:
            training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
        # max_length is not used in CausalTrainer
        if self.cfg.reward_model or self.cfg.rl:
            training_args_kwargs["max_length"] = self.cfg.sequence_len
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -21,12 +21,18 @@ from axolotl.core.trainers import (
    AxolotlTrainer,
    ReLoRATrainer,
 )
 from axolotl.core.training_args import (
    AxolotlPRMConfig,
    AxolotlRewardConfig,
    AxolotlTrainingArguments,
 )
 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback
 from axolotl.processing_strategies import get_processing_strategy
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    LossWatchDogCallback,
    SaveBetterTransformerModelCallback,
    bench_eval_callback_factory,
@@ -57,6 +63,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
    def get_callbacks(self):
        callbacks = super().get_callbacks()
        callbacks.append(EvalFirstStepCallback())
        if self.cfg.relora_steps:
            callbacks.append(ReLoRACallback(self.cfg))
@@ -123,9 +130,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        return callbacks
    def _get_trainer_cls(self):
        """
        Gets the trainer class for the given configuration.
        """
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
@@ -142,12 +146,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        return AxolotlTrainer
    def build(self, total_num_steps):
        from axolotl.core.training_args import (
            AxolotlPRMConfig,
            AxolotlRewardConfig,
            AxolotlTrainingArguments,
        )
        training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
            total_num_steps
        )
@@ -316,12 +314,20 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            training_arguments_kwargs["image_resize_algorithm"] = (
                self.cfg.image_resize_algorithm
            )
-
+        if self.cfg.kd_ce_alpha is not None:
-        if self.cfg.plugins:
+            training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
-            plugin_manager = PluginManager.get_instance()
+        if self.cfg.kd_alpha is not None:
-            plugin_training_args = plugin_manager.get_training_args(self.cfg)
+            training_arguments_kwargs["kd_alpha"] = self.cfg.kd_alpha
-            if plugin_training_args:
+        if self.cfg.kd_temperature is not None:
-                training_arguments_kwargs.update(plugin_training_args)
+            training_arguments_kwargs["kd_temperature"] = self.cfg.kd_temperature
        if self.cfg.kd_zscore_base_temp is not None:
            training_arguments_kwargs["kd_zscore_base_temp"] = (
                self.cfg.kd_zscore_base_temp
            )
        if self.cfg.kd_top_k_before_softmax is not None:
            training_arguments_kwargs["kd_top_k_before_softmax"] = (
                self.cfg.kd_top_k_before_softmax
            )
        if self.cfg.reward_model:
            training_args_cls = AxolotlRewardConfig
@@ -375,7 +381,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        elif "tokenizer" in sig.parameters:
            trainer_kwargs["tokenizer"] = self.tokenizer
        if (
-            trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
+            not (trainer_cls in [AxolotlRewardTrainer, AxolotlPRMTrainer])
            and self.cfg.datasets is not None
        ):
            trainer_kwargs["dataset_tags"] = [
@@ -402,10 +408,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        return trainer
    def build_collator(
-        self,
+        self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs
        training_args,  # type: "AxolotlTrainingArguments"  # type: ignore
        is_eval=False,
        **kwargs,
    ):
        if training_args.pretraining:
            if (
@@ -434,19 +437,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            ]
        ]
        collator_args = [self.tokenizer]
-
+        if self.cfg.reward_model:
        collator_cls_and_kwargs = None
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            collator_cls_and_kwargs = plugin_manager.get_collator_cls_and_kwargs(
                self.cfg, is_eval=is_eval
            )
        if collator_cls_and_kwargs:
            collator = collator_cls_and_kwargs[0]
            if kwargs and isinstance(kwargs, dict):
                kwargs.update(collator_cls_and_kwargs[1])
        elif self.cfg.reward_model:
            collator = RewardDataCollatorWithPadding
        elif use_batch_sampler_collator:
            # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
@@ -477,6 +468,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                collator_args.pop(0)
                kwargs.pop("pad_to_multiple_of", None)
                kwargs.pop("padding", None)
            elif self.cfg.kd_trainer:
                from axolotl.integrations.kd.collator import (
                    DataCollatorForKD,
                    KDBatchSamplerDataCollatorForSeq2Seq,
                )
                if self.cfg.sample_packing:
                    collator = KDBatchSamplerDataCollatorForSeq2Seq
                else:
                    collator = DataCollatorForKD
            else:
                collator = DataCollatorForSeq2Seq
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -12,9 +12,13 @@ from axolotl.core.trainers import (
 from axolotl.core.trainers.dpo import DPOStrategy
 from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
 from axolotl.core.trainers.grpo import GRPOStrategy
 from axolotl.core.training_args import (
    AxolotlCPOConfig,
    AxolotlKTOConfig,
    AxolotlORPOConfig,
 )
 from axolotl.integrations.base import PluginManager
 from axolotl.loaders.utils import ensure_dtype
 from axolotl.utils.callbacks.qat import QATCallback
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import RLType
@@ -27,9 +31,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
    def get_callbacks(self):
        callbacks = super().get_callbacks()
        if self.cfg.qat:
            callbacks.append(QATCallback(self.cfg.qat))
        return callbacks
    def get_post_trainer_create_callbacks(self, trainer):
@@ -78,12 +79,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        """
        Returns training_args and trainer_kwargs
        """
        from axolotl.core.training_args import (
            AxolotlCPOConfig,
            AxolotlKTOConfig,
            AxolotlORPOConfig,
        )
        training_args_kwargs, trainer_kwargs = self._set_base_training_args(
            total_num_steps=total_num_steps
        )
@@ -95,6 +90,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        else:
            training_args_kwargs["remove_unused_columns"] = False
        # only rlhf
        if self.cfg.dataset_processes:
            training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
        if self.cfg.trl and self.cfg.trl.beta is not None:
            training_args_kwargs["beta"] = self.cfg.trl.beta
        elif self.cfg.rl_beta is not None:
@@ -143,7 +142,22 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
            training_args_cls = AxolotlDPOConfig
-            training_args_kwargs.update(DPOStrategy.set_training_args_kwargs(self.cfg))
+            if self.cfg.rl is RLType.IPO:
                training_args_kwargs["loss_type"] = "ipo"
            # Not compatible with IPO
            if self.cfg.rl is RLType.DPO and self.cfg.dpo_label_smoothing:
                training_args_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
            training_args_kwargs["max_completion_length"] = None
            training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
            training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
            if self.cfg.dpo_use_weighting is not None:
                training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
            if self.cfg.dpo_use_logits_to_keep is not None:
                training_args_kwargs["use_logits_to_keep"] = (
                    self.cfg.dpo_use_logits_to_keep
                )
        else:
            raise ValueError(f"Unsupported RL: {self.cfg.rl}")
@@ -151,12 +165,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            if blocklist_key in training_args_kwargs:
                del training_args_kwargs[blocklist_key]
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            plugin_training_args = plugin_manager.get_training_args(self.cfg)
            if plugin_training_args:
                training_args_kwargs.update(plugin_training_args)
        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
            logging_first_step=True,
            **training_args_kwargs,
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -25,7 +25,6 @@ from trl.trainer.utils import pad_to_length
 from typing_extensions import override
 from axolotl.core.trainers.mixins import (
    CheckpointSaveMixin,
    OptimizerMixin,
    RngLoaderMixin,
    SchedulerMixin,
@@ -34,16 +33,13 @@ from axolotl.core.trainers.utils import (
    sanitize_kwargs_for_ds_tagging,
    sanitize_kwargs_for_tagging,
 )
 from axolotl.utils import get_not_null
 from axolotl.utils.logging import get_logger
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 LOG = get_logger(__name__)
-class AxolotlTrainer(
+class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
    SchedulerMixin, OptimizerMixin, RngLoaderMixin, CheckpointSaveMixin, Trainer
 ):
    """Extend the base Trainer for axolotl helpers"""
    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
@@ -105,7 +101,7 @@ class AxolotlTrainer(
            )
            batch_max_len = train_batch_size * self.args.max_seq_length
-        sampler = MultipackBatchSampler(
+        return MultipackBatchSampler(
            base_sampler,
            lengths=get_dataset_lengths(dataset),
            packing_efficiency_estimate=self.args.sample_packing_efficiency,
@@ -115,12 +111,8 @@ class AxolotlTrainer(
            bin_size=self.args.sample_packing_bin_size,
            sequential=self.args.sample_packing_sequentially,
            drop_last=True,
            num_processes=self.args.dataset_num_proc,
        )
        len(sampler)
        return sampler
    def _get_train_sampler(
        self, train_dataset: Optional[Dataset] = None
    ) -> Optional[Sampler]:
@@ -228,9 +220,7 @@ class AxolotlTrainer(
        }
        if not isinstance(dataset, torch.utils.data.IterableDataset):
-            dataloader_params["drop_last"] = get_not_null(
+            dataloader_params["drop_last"] = self.args.dataloader_drop_last
                self.args.dataloader_drop_last, True
            )
            if sampler_fn is not None:
                sampler = sampler_fn(dataset)
                if isinstance(sampler, BatchSampler):
--- a/src/axolotl/core/trainers/dpo/init.py
+++ b/src/axolotl/core/trainers/dpo/init.py
@@ -22,19 +22,10 @@ class DPOStrategy:
        training_args_kwargs = {}
        if cfg.rl is RLType.IPO:
            training_args_kwargs["loss_type"] = "ipo"
        # Label smoothing is not compatible with IPO
        if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
            training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
        training_args_kwargs["max_completion_length"] = None
        training_args_kwargs["max_length"] = cfg.sequence_len
        training_args_kwargs["max_completion_length"] = None
        training_args_kwargs["max_prompt_length"] = cfg.sequence_len
        training_args_kwargs["generate_during_eval"] = cfg.use_wandb
        if cfg.dpo_use_weighting is not None:
            training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
        if cfg.dpo_padding_free is not None:
            training_args_kwargs["padding_free"] = cfg.dpo_padding_free
        if cfg.dpo_norm_loss is not None:
            training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss
        if cfg.dpo_use_logits_to_keep is not None:
            training_args_kwargs["use_logits_to_keep"] = cfg.dpo_use_logits_to_keep
        return training_args_kwargs
--- a/src/axolotl/core/trainers/dpo/args.py
+++ b/src/axolotl/core/trainers/dpo/args.py
@@ -14,5 +14,3 @@ class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig):
    """
    DPO config for DPO training
    """
    dpo_norm_loss: bool | None = False
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -83,20 +83,3 @@ class AxolotlDPOTrainer(
        gc.collect()
        torch.cuda.empty_cache()
        return loss
    def concatenated_forward(
        self,
        model: nn.Module,
        batch: dict[str, Union[list, torch.LongTensor]],
        is_ref_model: bool = False,
    ) -> dict[str, torch.Tensor]:
        if self.args.dpo_norm_loss:
            # fmt: off
            loss_type: str = self.loss_type  # type: ignore[has-type]  # pylint: disable=access-member-before-definition
            # fmt: on
            # concatenated_forward handles avg token logprob for ipo case already
            self.loss_type = "ipo"  # pylint: disable=attribute-defined-outside-init
            res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
            self.loss_type = loss_type  # pylint: disable=attribute-defined-outside-init
            return res
        return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
--- a/src/axolotl/core/trainers/grpo/trainer.py
+++ b/src/axolotl/core/trainers/grpo/trainer.py
@@ -3,7 +3,6 @@
 # pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
 import warnings
 from functools import partial
 from typing import Any
 import datasets
@@ -59,42 +58,6 @@ class AxolotlGRPOTrainer(
    _tag_names = ["trl", "grpo", "axolotl"]
    def get_train_dataloader(self):
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        train_dataset = self.train_dataset
        data_collator = self.data_collator
        if isinstance(train_dataset, datasets.Dataset):
            train_dataset = self._remove_unused_columns(
                train_dataset, description="training"
            )
        else:
            data_collator = self._get_collator_with_removed_columns(
                data_collator, description="training"
            )
        dataloader_params = {
            "batch_size": self._train_batch_size
            * self.args.steps_per_generation,  # < this is the change
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }
        if not isinstance(train_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_train_sampler()
            dataloader_params["drop_last"] = self.args.dataloader_drop_last
            dataloader_params["worker_init_fn"] = partial(
                seed_worker,
                num_workers=self.args.dataloader_num_workers,
                rank=self.args.process_index,
            )
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
        return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
 class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
    """Extend the base GRPOTrainer for sequence parallelism handling"""
--- a/src/axolotl/core/trainers/mixins/init.py
+++ b/src/axolotl/core/trainers/mixins/init.py
@@ -3,7 +3,6 @@
 # pylint: disable=unused-import
 # flake8: noqa
 from .checkpoints import CheckpointSaveMixin
 from .optimizer import OptimizerMixin
 from .rng_state_loader import RngLoaderMixin
 from .scheduler import SchedulerMixin
--- a/src/axolotl/core/trainers/mixins/checkpoints.py
+++ b/src/axolotl/core/trainers/mixins/checkpoints.py
@@ -1,21 +0,0 @@
 """Custom handling to not fail training if fsdp optimizer is not savable"""
 from transformers import Trainer
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 class CheckpointSaveMixin(Trainer):
     """
     Trainer mixin that tolerates optimizer/scheduler state that cannot be saved.
     """

     def _save_optimizer_and_scheduler(self, output_dir):
         """
         Delegate to the parent implementation; if it raises `NotImplementedError`,
         log a warning instead of propagating the error.
         """
         try:
             super()._save_optimizer_and_scheduler(output_dir)
             return
         except NotImplementedError as exc:
             # Keep a reference: the `except` target is unbound once the clause exits.
             unsupported_reason = exc
         LOG.warning(
             f"Trainer does not support saving optimizer and scheduler:  {unsupported_reason}\n"
             "Optimizer and scheduler states were not saved - resuming from checkpoints "
             "for this training run will not be possible."
         )
--- a/src/axolotl/core/training_args.py
+++ b/src/axolotl/core/training_args.py
@@ -2,17 +2,238 @@
 extra axolotl specific training args
 """
 from __future__ import annotations
 from dataclasses import dataclass, field
-from typing import Optional, Type
+from typing import Optional
 from PIL.Image import Resampling
 from transformers import TrainingArguments
 from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
 from axolotl.integrations.config import merge_training_args
-AxolotlTrainingMixins: Type = merge_training_args()
+@dataclass
 class AxolotlTrainingMixins:
    """
    Mixin class for the Axolotl training args.
    """
    # pylint: disable=duplicate-code
    model_type: Optional[str] = field(
        default=None, metadata={"help": "HF model configuration model_type."}
    )
    lr_quadratic_warmup: bool = field(
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
    )
    pretraining: bool = field(
        default=False,
        metadata={
            "help": "Indicates to trainer whether we are doing continued pretraining."
        },
    )
    sample_packing: bool = field(
        default=False,
        metadata={"help": "Use sample packing for efficient training."},
    )
    sample_packing_sequentially: bool = field(
        default=False,
        metadata={
            "help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
        },
    )
    multipack_real_batches: bool = field(
        default=False,
        metadata={"help": "Use real batches for efficient training."},
    )
    eval_sample_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Use sample packing for efficient evals."},
    )
    sample_packing_efficiency: float = field(
        default=1.0,
        metadata={"help": "Sample packing efficiency for calculating batch length."},
    )
    sample_packing_bin_size: int = field(
        default=200,
        metadata={
            "help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
        },
    )
    sample_packing_group_size: int = field(
        default=100000,
        metadata={
            "help": "The number of samples to group together for packing. Increase for better packing."
        },
    )
    max_seq_length: int = field(
        default=2048,
        metadata={"help": "The maximum sequence length the model can handle"},
    )
    relora_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for ReLoRA"},
    )
    relora_warmup_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
    )
    # Anneal-phase length for ReLoRA; kept separate from `relora_warmup_steps`,
    # so the help text must not describe warmup.
    relora_anneal_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how many anneal steps to take for each ReLoRA cycle"},
    )
    relora_prune_ratio: Optional[float] = field(
        default=0.9,
        metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
    )
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
    bench_dataset: Optional[str] = field(
        default="pharaouk/dharma-1/dharma_1_mini.json",
        metadata={
            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
        },
    )
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
    do_causal_lm_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
        },
    )
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )
    dataloader_prefetch_factor: Optional[int] = field(
        default=None,
        metadata={"help": "prefetch_factor argument to the dataloader"},
    )
    cosine_min_lr_ratio: Optional[float] = field(
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
    cosine_constant_lr_ratio: Optional[float] = field(
        default=None,
        metadata={
            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
        },
    )
    loraplus_lr_ratio: Optional[float] = field(
        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
    )
    loraplus_lr_embedding: Optional[float] = field(
        default=1e-6,
        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )
    embedding_lr_scale: Optional[float] = field(
        default=None,
        metadata={"help": "Scale the learning rate for the embedding layers."},
    )
    lr_groups: Optional[list[dict]] = field(
        default=None,
        metadata={"help": "Specify learning rate groups for with different LRs."},
    )
    embedding_lr: Optional[float] = field(
        default=None,
        metadata={"help": "absolute learning rate for the embedding layers."},
    )
    qlora: bool = field(
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
    orpo_alpha: Optional[float] = field(
        default=None,
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )
    curriculum_sampling: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "Chat template converting chat messages to text"},
    )
    kd_ce_alpha: Optional[float] = field(
        default=None,
        metadata={
            "help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
        },
    )
    kd_alpha: Optional[float] = field(
        default=1.0,
        metadata={"help": "The alpha scaling parameter for KD loss"},
    )
    kd_temperature: Optional[float] = field(
        default=1.0,
        metadata={
            "help": "the temperature parameter for KL divergence loss when using KD"
        },
    )
    kd_zscore_base_temp: Optional[float] = field(
        default=None,
        metadata={
            "help": "the base temperature parameter for KL divergence with z-score when using KD"
        },
    )
    kd_top_k_before_softmax: Optional[bool] = field(
        default=None,
        metadata={
            "help": "Whether to apply top_k_before_softmax to the logits when using KD"
        },
    )
    adam_beta3: Optional[float] = field(
        default=None,
        metadata={
            "help": "The beta3 hyperparameter used in some optimizers such as CAME"
        },
    )
    adam_epsilon2: Optional[float] = field(
        default=None,
        metadata={
            "help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
        },
    )
    # multi-modal section
    image_size: int | tuple[int, int] | None = field(
        default=None,
        metadata={"help": "The size of the image to resize to"},
    )
    image_resize_algorithm: Resampling | None = field(
        default=None,
        metadata={"help": "The algorithm to use for image resizing"},
    )
    # end of multi-modal section
@dataclass
--- a/src/axolotl/core/training_args_base.py
+++ b/src/axolotl/core/training_args_base.py
@@ -1,224 +0,0 @@
 """
 Base Axolotl Training Mixins shared across various trainer configs
 """
 from dataclasses import dataclass, field
 from typing import Optional
 from PIL.Image import Resampling
@dataclass
 class AxolotlTrainingMixins:
    """
    Mixin class for the Axolotl training args.
    """
    # pylint: disable=duplicate-code
    model_type: Optional[str] = field(
        default=None, metadata={"help": "HF model configuration model_type."}
    )
    lr_quadratic_warmup: bool = field(
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
    )
    pretraining: bool = field(
        default=False,
        metadata={
            "help": "Indicates to trainer whether we are doing continued pretraining."
        },
    )
    sample_packing: bool = field(
        default=False,
        metadata={"help": "Use sample packing for efficient training."},
    )
    sample_packing_sequentially: bool = field(
        default=False,
        metadata={
            "help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
        },
    )
    multipack_real_batches: bool = field(
        default=False,
        metadata={"help": "Use real batches for efficient training."},
    )
    eval_sample_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Use sample packing for efficient evals."},
    )
    sample_packing_efficiency: float = field(
        default=1.0,
        metadata={"help": "Sample packing efficiency for calculating batch length."},
    )
    sample_packing_bin_size: int = field(
        default=200,
        metadata={
            "help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
        },
    )
    sample_packing_group_size: int = field(
        default=100000,
        metadata={
            "help": "The number of samples to group together for packing. Increase for better packing."
        },
    )
    max_seq_length: int = field(
        default=2048,
        metadata={"help": "The maximum sequence length the model can handle"},
    )
    dataset_num_proc: int | None = field(
        default=None,
        metadata={"help": "The number of processes to use for data processing"},
    )
    relora_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for ReLoRA"},
    )
    relora_warmup_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
    )
    # Anneal-phase length for ReLoRA; kept separate from `relora_warmup_steps`,
    # so the help text must not describe warmup.
    relora_anneal_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how many anneal steps to take for each ReLoRA cycle"},
    )
    relora_prune_ratio: Optional[float] = field(
        default=0.9,
        metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
    )
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
    bench_dataset: Optional[str] = field(
        default="pharaouk/dharma-1/dharma_1_mini.json",
        metadata={
            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
        },
    )
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
    do_causal_lm_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
        },
    )
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )
    dataloader_prefetch_factor: Optional[int] = field(
        default=None,
        metadata={"help": "prefetch_factor argument to the dataloader"},
    )
    cosine_min_lr_ratio: Optional[float] = field(
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
    cosine_constant_lr_ratio: Optional[float] = field(
        default=None,
        metadata={
            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
        },
    )
    loraplus_lr_ratio: Optional[float] = field(
        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
    )
    loraplus_lr_embedding: Optional[float] = field(
        default=1e-6,
        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )
    embedding_lr_scale: Optional[float] = field(
        default=None,
        metadata={"help": "Scale the learning rate for the embedding layers."},
    )
    lr_groups: Optional[list[dict]] = field(
        default=None,
        metadata={"help": "Specify learning rate groups for with different LRs."},
    )
    embedding_lr: Optional[float] = field(
        default=None,
        metadata={"help": "absolute learning rate for the embedding layers."},
    )
    qlora: bool = field(
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
    orpo_alpha: Optional[float] = field(
        default=None,
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )
    curriculum_sampling: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "Chat template converting chat messages to text"},
    )
    # kd_ce_alpha: Optional[float] = field(
    #     default=None,
    #     metadata={
    #         "help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
    #     },
    # )
    #
    # kd_alpha: Optional[float] = field(
    #     default=1.0,
    #     metadata={"help": "The alpha scaling parameter for KD loss"},
    # )
    #
    # kd_temperature: Optional[float] = field(
    #     default=1.0,
    #     metadata={
    #         "help": "the temperature parameter for KL divergence loss when using KD"
    #     },
    # )
    adam_beta3: Optional[float] = field(
        default=None,
        metadata={
            "help": "The beta3 hyperparameter used in some optimizers such as CAME"
        },
    )
    adam_epsilon2: Optional[float] = field(
        default=None,
        metadata={
            "help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
        },
    )
    # multi-modal section
    image_size: int | tuple[int, int] | None = field(
        default=None,
        metadata={"help": "The size of the image to resize to"},
    )
    image_resize_algorithm: Resampling | None = field(
        default=None,
        metadata={"help": "The algorithm to use for image resizing"},
    )
    # end of multi-modal section
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,6 +1,7 @@
 """Module containing Dataset functionality"""
 import os
 from typing import List, Optional, Union
 import torch
 from datasets import Dataset, IterableDataset
@@ -19,21 +20,21 @@ LOG = get_logger(__name__)
 class TokenizedPromptDataset(Dataset):
-    """Dataset that returns tokenized prompts from a stream of text files.
+    """
-
+    Dataset that returns tokenized prompts from a stream of text files.
-    Args:
+        Args:
-        prompt_tokenizer: The prompt tokenizing method for processing the data.
+            prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
-        dataset: Dataset with text files.
+            dataset (dataset.Dataset): Dataset with text files.
-        process_count: Number of processes to use for tokenizing.
+            process_count (int): Number of processes to use for tokenizing.
-        keep_in_memory: Whether to keep the tokenized dataset in memory.
+            keep_in_memory (bool): Whether to keep the tokenized dataset in memory.
    """
    def __init__(  # pylint: disable=super-init-not-called
        self,
        prompt_tokenizer: PromptTokenizingStrategy,
        dataset: Dataset,
-        process_count: int | None = None,
+        process_count: Optional[int] = None,
-        keep_in_memory: bool | None = False,
+        keep_in_memory: Optional[bool] = False,
        **kwargs,
    ):
        self.prompt_tokenizer = prompt_tokenizer
@@ -48,13 +49,6 @@ class TokenizedPromptDataset(Dataset):
        features = dataset.features.keys()
        num_proc = min(64, self.process_count if self.process_count else os.cpu_count())
        # Disable multiprocessing if the tokenizer doesn't support it (e.g., mistral_common)
        if not getattr(self.prompt_tokenizer, "supports_multiprocessing", True):
            LOG.info(
                "Disabling multiprocessing for tokenizer as it doesn't support it (e.g., mistral_common)"
            )
            num_proc = 1
        map_kwargs = {}
        if self.prompt_tokenizer.supports_batched:
            map_kwargs["batched"] = True
@@ -82,14 +76,14 @@ class TokenizedPromptDataset(Dataset):
 def wrap_dataset_for_tokenized_prompt(
    prompt_tokenizer: PromptTokenizingStrategy,
-    dataset: Dataset | IterableDataset,
+    dataset: Union[Dataset, IterableDataset],
    **kwargs,
 ):
    if isinstance(dataset, IterableDataset):
        map_kwargs = {}
        if prompt_tokenizer.supports_batched:
            map_kwargs["batched"] = True
-        features = list(dataset.features.keys())
+        features = dataset.features.keys()
        return dataset.map(
            prompt_tokenizer.tokenize_prompt,
            remove_columns=features,
@@ -100,13 +94,12 @@ def wrap_dataset_for_tokenized_prompt(
 # TODO this isn't the best since it can't interleave datasets
 class ConstantLengthDataset(IterableDataset):
-    """Iterable dataset that returns constant length chunks of tokens from stream of
+    """
-    text files.
+    Iterable dataset that returns constant length chunks of tokens from stream of text files.
-
+        Args:
-    Args:
+            tokenizer (Tokenizer): The processor used for processing the data.
-        tokenizer: The processor used for processing the data.
+            dataset (dataset.Dataset): Dataset with text files.
-        dataset: Dataset with text files.
+            seq_length (int): Length of token sequences to return.
        seq_length: Length of token sequences to return.
    """
    def __init__(  # pylint: disable=super-init-not-called
@@ -117,7 +110,7 @@ class ConstantLengthDataset(IterableDataset):
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
-        self.datasets: list[IterableDataset] = datasets
+        self.datasets: List[IterableDataset] = datasets
        self.seq_length = seq_length
        vocab_size = len(tokenizer.get_vocab())
@@ -181,10 +174,7 @@ class ConstantLengthDataset(IterableDataset):
                            }
                        else:
                            LOG.warning(
-                                "Dropping batch due to tensor size mismatch "
+                                f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
                                f"input_ids: {input_ids.size()}, "
                                f"labels: {labels.size()}, "
                                f"attention_mask: {attention_mask.size()}"
                            )
                    buffer = {
                        "input_ids": [],
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -7,16 +7,17 @@ from pathlib import Path
 from typing import Dict, Optional
 import torch
 from accelerate.logging import get_logger
 from datasets import Dataset
 from transformers.trainer import Trainer
 from axolotl.telemetry.errors import send_errors
 from axolotl.train import (
    TrainDatasetMeta,
    setup_model_and_tokenizer,
 )
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.logging import get_logger
 from axolotl.utils.trainer import setup_trainer
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -63,6 +64,7 @@ def evaluate_dataset(
    return metrics
@send_errors
 def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
    """
    Evaluate a model on training and validation datasets.
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -22,7 +22,6 @@ from __future__ import annotations
 import collections
 import importlib
 import traceback
 from typing import TYPE_CHECKING, Callable, OrderedDict, Union
 from peft import PeftModel
@@ -84,11 +83,6 @@ class BasePlugin:
    def get_input_args(self) -> str | None:
        """Returns a pydantic model for the plugin's input arguments.

        Per the return annotation the model is identified by a string rather than
        passed as a class. The base implementation has no body besides this
        docstring, so it returns `None`.

        Returns:
            str | None: String identifying the plugin's input-args model, or `None`.
        """
    def get_training_args_mixin(self) -> str | None:
        """
        Returns a dataclass model for the plugin's training arguments.

        Per the return annotation the dataclass is identified by a string rather
        than passed as a class. The base implementation has no body besides this
        docstring, so it returns `None`.

        Returns:
            str | None: String identifying the plugin's training-args dataclass,
                or `None`.
        """
    def load_datasets(
        self, cfg: DictDefault, preprocess: bool = False
    ) -> Union["TrainDatasetMeta", None]:
@@ -164,31 +158,6 @@ class BasePlugin:
            trainer: The trainer object for training.
        """
    def get_training_args(self, cfg: DictDefault):  # pylint: disable=unused-argument
        """
        Returns custom training arguments to set on TrainingArgs.

        The base implementation has no body besides this docstring, so it
        returns `None`.

        Args:
            cfg: The global axolotl configuration.

        Returns:
            object: dict containing the training arguments.
        """
    def get_collator_cls_and_kwargs(
        self, cfg: DictDefault, is_eval: bool = False
    ):  # pylint: disable=unused-argument
        """
        Returns a custom class for the collator.

        The base implementation has no body besides this docstring, so it
        returns `None`.

        Args:
            cfg: The global axolotl configuration.
            is_eval: Whether this is an eval split.

        Returns:
            class: The class for the collator.
        """
    # pylint: disable=unused-argument
    def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
        """Creates and returns an optimizer for training.
@@ -309,7 +278,7 @@ def load_plugin(plugin_name: str) -> BasePlugin:
    return plugin
-class PluginManager:  # pylint: disable=too-many-public-methods
+class PluginManager:
    """The `PluginManager` class is responsible for loading and managing plugins. It
    should be a singleton so it can be accessed from anywhere in the codebase.
@@ -368,11 +337,8 @@ class PluginManager:  # pylint: disable=too-many-public-methods
            plugin = load_plugin(plugin_name)
            self.plugins[plugin_name] = plugin
            LOG.info(f"Plugin loaded successfully: {plugin_name}")
-        except ImportError as exc:
+        except ImportError:
            LOG.error(f"Failed to load plugin: {plugin_name}")
            # print stacktrace
            traceback.print_exc()
            print(f"Error: {exc}")
    def get_input_args(self) -> list[str]:
        """Returns a list of Pydantic classes for all registered plugins' input arguments.'
@@ -387,20 +353,6 @@ class PluginManager:  # pylint: disable=too-many-public-methods
                input_args.append(input_args_from_plugin)
        return input_args
    def get_training_args_mixin(self):
        """
        Collects the training-args mixins declared by all registered plugins.

        Every plugin in `self.plugins` is queried exactly once, in the iteration
        order of that mapping; plugins that answer `None` are left out.

        Returns:
        list: The non-None values returned by the plugins'
        `get_training_args_mixin` methods.
        """
        declared = (
            plugin.get_training_args_mixin() for plugin in self.plugins.values()
        )
        return [mixin for mixin in declared if mixin is not None]
    def load_datasets(
        self, cfg: DictDefault, preprocess: bool = False
    ) -> Union["TrainDatasetMeta", None]:
@@ -490,42 +442,6 @@ class PluginManager:  # pylint: disable=too-many-public-methods
                return trainer_cls
        return None
    def get_training_args(self, cfg):
        """
        Merges the training arguments contributed by all registered plugins.

        Plugins are queried in the iteration order of `self.plugins`; when two
        plugins supply the same key, the later plugin's value overwrites the
        earlier one. Plugins that return None contribute nothing.

        Parameters:
        cfg (dict): The configuration for the plugins.

        Returns:
        dict: The combined training arguments (empty if no plugin supplies any).
        """
        merged_kwargs = {}
        for plugin in self.plugins.values():
            plugin_kwargs = plugin.get_training_args(cfg)
            if plugin_kwargs is None:
                continue
            merged_kwargs.update(plugin_kwargs)
        return merged_kwargs
    def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
        """
        Asks each registered plugin for a collator and returns the first usable one.

        A plugin declines by returning either None or a pair whose class is None
        (for example `(None, None)`). Declined answers are skipped so that a later
        plugin can still supply the collator, and so that callers never receive a
        truthy tuple that carries no collator class.

        Parameters:
        cfg (dict): The configuration for the plugins.
        is_eval (bool): Whether this is an eval split.

        Returns:
        tuple | None: `(collator_cls, collator_kwargs)` from the first plugin that
        supplies a collator class, or None if none was found.
        """
        for plugin in self.plugins.values():
            collator = plugin.get_collator_cls_and_kwargs(cfg, is_eval=is_eval)
            if collator is None:
                continue
            collator_cls, collator_kwargs = collator
            if collator_cls is None:
                # The plugin answered with a placeholder pair; keep searching.
                continue
            return collator_cls, collator_kwargs
        return None
    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
        """Calls the `post_trainer_create` method of all registered plugins.
--- a/src/axolotl/integrations/config.py
+++ b/src/axolotl/integrations/config.py
@@ -16,7 +16,7 @@ Module to handle merging the plugins' input arguments with the base configuratio
 This was moved here to prevent circular imports.
 """
-from typing import Any, Dict, List, Type
+from typing import Any, Dict, List
 from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
@@ -61,43 +61,3 @@ def merge_input_args():
        ]
        return AxolotlConfigWCapabilities, AxolotlInputConfig
    return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
 def merge_training_args() -> Type:
    """
    Merges training-args mixins from registered plugins with the base mixin class.

    The dotted paths reported by `PluginManager.get_training_args_mixin()` are
    resolved with `importlib` and combined with the base `AxolotlTrainingMixins`
    into a new class of the same name. The base class comes first in the bases,
    followed by the plugin mixins in the order the plugin manager returned them.
    No source code is generated or executed from the plugin-supplied strings.

    Returns:
        Type: The base `AxolotlTrainingMixins` class when no plugin declares a
        mixin, otherwise a new `AxolotlTrainingMixins` class deriving from the
        base class and every plugin mixin.

    Raises:
        ImportError: If a plugin's module cannot be imported or does not define
        the named class.
    """
    # pylint: disable=duplicate-code
    # Project imports stay inside the function, as in the original; presumably
    # this is what keeps the module free of circular imports (see module docstring).
    import importlib

    from axolotl.core.training_args_base import (
        AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
    )
    from axolotl.integrations.base import PluginManager

    plugin_manager = PluginManager.get_instance()
    training_args_mixins: List[str] = plugin_manager.get_training_args_mixin()
    if not training_args_mixins:
        return AxolotlTrainingMixinsBase

    mixin_classes = []
    for plugin_args in training_args_mixins:
        plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
        module = importlib.import_module(plugin_module)
        try:
            mixin_classes.append(getattr(module, plugin_cls))
        except AttributeError as exc:
            # Keep the exception type a `from module import name` would raise.
            raise ImportError(
                f"cannot import name {plugin_cls!r} from {plugin_module!r}"
            ) from exc

    return type(
        "AxolotlTrainingMixins",
        (AxolotlTrainingMixinsBase, *mixin_classes),
        {},
    )
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -24,14 +24,6 @@ pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transform
 ## Usage
 **NOTE**: If you are training a VLM, please use an older version of Axolotl: upstream has applied a major VLM refactor, and our patches have not been updated for it yet.
 ```bash
 git checkout 787880215b3ab32ccaf81c1b2e9588c6f3e6e764
 pip3 install --no-build-isolation -e .
 ```
 ```yaml
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
--- a/src/axolotl/integrations/kd/init.py
+++ b/src/axolotl/integrations/kd/init.py
@@ -15,12 +15,7 @@
 """
 Plugin init to add KD support to Axolotl.
 """
 from typing import Any
 from transformers import Trainer
 from axolotl.integrations.base import BasePlugin
 from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback
 from .args import KDArgs  # pylint: disable=unused-import. # noqa: F401
@@ -33,75 +28,9 @@ class KDPlugin(BasePlugin):
    def get_input_args(self):
        """Returns the dotted path of the pydantic model that holds the KD input args."""
        return "axolotl.integrations.kd.KDArgs"
    def get_training_args_mixin(self):
        """Returns the dotted path of the dataclass with the additional KD training args."""
        return "axolotl.integrations.kd.args.KDTrainingArgsMixin"
    def get_trainer_cls(self, cfg):
        """Returns `AxolotlKDTrainer` when `cfg.kd_trainer` is truthy, otherwise `None`.

        The trainer module is only imported when the KD trainer is requested.
        """
        if not cfg.kd_trainer:
            return None

        from .trainer import AxolotlKDTrainer

        return AxolotlKDTrainer
    def get_training_args(self, cfg):
        """Copies the KD settings from `cfg` into a dict of training arguments.

        Each key is read from `cfg` under the same name and copied as-is,
        including when the value is `None`.

        Returns:
            dict: Mapping of the five forwarded KD setting names to their values.
        """
        forwarded_keys = (
            "kd_ce_alpha",
            "kd_alpha",
            "kd_temperature",
            "kd_beta",
            "kd_normalize_topk",
        )
        return {key: getattr(cfg, key) for key in forwarded_keys}
    def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
        """Selects the KD collator class and its kwargs for the given split.

        Selection order:
          1. `(None, None)` when `cfg.kd_trainer` is falsy.
          2. `OnlineTeacherCollator` with the online-teacher settings when
             `cfg.kd_online_server_base_url` is set.
          3. `KDBatchSamplerDataCollatorForSeq2Seq` when sample packing applies to
             this split (`cfg.sample_packing` for training, `cfg.eval_sample_packing`
             for eval).
          4. `DataCollatorForKD` otherwise.

        Args:
            cfg: The axolotl configuration.
            is_eval: Whether the collator is for an eval split.

        Returns:
            tuple: `(collator_cls, collator_kwargs)`.
        """
        if not cfg.kd_trainer:
            return None, None

        from .collator import DataCollatorForKD, KDBatchSamplerDataCollatorForSeq2Seq

        # `is_eval is False` (not plain falsiness) is intentional and preserved.
        packs_train_split = is_eval is False and cfg.sample_packing
        packs_eval_split = cfg.eval_sample_packing and is_eval

        if cfg.kd_online_server_base_url:
            from .collator_online_teacher import OnlineTeacherCollator

            online_setting_names = (
                "kd_online_server_base_url",
                "kd_online_topk",
                "kd_temperature",
                "kd_online_server",
                "kd_online_timeout",
                "kd_normalize_topk",
            )
            online_kwargs = {name: getattr(cfg, name) for name in online_setting_names}
            return OnlineTeacherCollator, online_kwargs

        if packs_train_split or packs_eval_split:
            return KDBatchSamplerDataCollatorForSeq2Seq, {}
        return DataCollatorForKD, {}
    def pre_model_load(self, cfg):
        """Calls `apply_kernel` with `cfg.model_config_type` before the model is loaded.

        NOTE(review): presumably this patches the model implementation with the
        KD kernels for that model type - confirm against `kernels.models`.
        """
        # Imported inside the method, so the kernels module is only loaded when this hook runs.
        from .kernels.models import apply_kernel
        apply_kernel(cfg.model_config_type)
    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Adds the KD temperature scheduler callback when temperature scheduling is configured.

        The callback is created only if `cfg.kd_temperature_min` is not None and
        `cfg.kd_online_server_base_url` is truthy.

        Args:
            cfg (Any): Configuration object providing the KD settings.
            trainer (Trainer): Huggingface Trainer instance.

        Returns:
            list: A single-element list with the configured callback, or an empty list.
        """
        scheduling_enabled = (
            cfg.kd_temperature_min is not None and cfg.kd_online_server_base_url
        )
        if not scheduling_enabled:
            return []

        return [
            KDTemperatureSchedulerCallback(
                cfg.kd_temperature,
                cfg.kd_temperature_min,
                trainer,
            )
        ]
--- a/src/axolotl/integrations/kd/args.py
+++ b/src/axolotl/integrations/kd/args.py
@@ -15,19 +15,9 @@
 """
 Plugin args for KD support.
 """
-from dataclasses import dataclass
+from typing import Optional
 from enum import Enum
-from pydantic import BaseModel, Field
+from pydantic import BaseModel
 class InferenceServerType(str, Enum):
    """
    Online inference server types, used to handle different request args.

    Members also subclass `str`, so each one compares equal to its string value.
    """
    vllm = "vllm"  # pylint: disable=invalid-name
    sglang = "sglang"  # pylint: disable=invalid-name
 class KDArgs(BaseModel):
@@ -35,41 +25,13 @@ class KDArgs(BaseModel):
    Input args for knowledge distillation.
    """
-    kd_trainer: float | None = None  # whether to use KD trainer
+    kd_trainer: Optional[bool] = None  # whether to use KD trainer
-    kd_ce_alpha: float | None = (
+    kd_ce_alpha: Optional[float] = (
        None  # loss coefficient for cross-entropy loss during KD
    )
-    kd_alpha: float | None = None  # loss coefficient for KD loss
+    kd_alpha: Optional[float] = None  # loss coefficient for KD loss
-    kd_temperature: float | None = None  # temperature for sampling during KD
+    kd_temperature: Optional[float] = None  # temperature for sampling during KD
-    kd_beta: float | None = 0.0  # beta coefficient for ratio of fwd and reverse KL
+    kd_zscore_base_temp: Optional[float] = None  # base temperature for zscore scaling
-    kd_normalize_topk: bool | None = (
+    kd_top_k_before_softmax: Optional[bool] = (
-        None  # whether to normalize student logits during KD
+        None  # whether to sample top k before softmax during KD
    )
    # TODO online kd
    kd_online_server_base_url: str | None = None
    kd_online_topk: int | None = None
    kd_online_server: InferenceServerType | None = Field(
        default_factory=lambda: InferenceServerType.vllm
    )
    kd_online_timeout: int | None = 120
    kd_temperature_min: float | None = (
        None  # kd temperature scheduling during online kd
    )
@dataclass
 class KDTrainingArgsMixin:
    """
    Additional args for KD training.
    """
    kd_ce_alpha: float | None = (
        None  # loss coefficient for cross-entropy loss during KD
    )
    kd_alpha: float | None = None  # loss coefficient for KD loss
    kd_temperature: float | None = None  # temperature for sampling during KD
    kd_beta: float | None = None  # beta coefficient for ratio of fwd and reverse KL
    kd_normalize_topk: float | None = (
        None  # whether to normalize student logits during KD
    )
--- a/src/axolotl/integrations/kd/callbacks.py
+++ b/src/axolotl/integrations/kd/callbacks.py
@@ -1,36 +0,0 @@
 """
 Transformers trainer callbacks to schedule the KD temperature during training
 """
 import math
 from transformers.trainer_callback import TrainerCallback
 class KDTemperatureSchedulerCallback(TrainerCallback):
    """
    KD temperature scheduler callback for the trainer.

    Moves the temperature from `temperature_start` to `temperature_min` along a
    half-cosine over `state.max_steps`, and writes each new value to
    `trainer.data_collator.kd_temperature` when the collator has that attribute.
    """

    def __init__(self, temperature_start, temperature_min, trainer):
        """
        Args:
            temperature_start: Temperature used at progress 0.
            temperature_min: Temperature reached at progress 1.
            trainer: Trainer whose `data_collator` receives the updated value.
        """
        self.temperature_start = temperature_start
        self.temperature_min = temperature_min
        self.temperature = temperature_start
        self.trainer = trainer

    def on_step_end(
        self, args, state, control, **kwargs
    ):  # pylint: disable=unused-argument
        """
        Recompute the temperature from `state.global_step / state.max_steps`.

        Does nothing when `state.max_steps` is missing or not positive.
        """
        max_steps = state.max_steps
        if not max_steps or max_steps <= 0:
            # No positive step budget means there is no schedule to follow, and
            # dividing by it would raise ZeroDivisionError.
            return
        # cosine decay temperature over the max steps; progress is clamped to 1
        # because the cosine is periodic and an overshoot of global_step would
        # otherwise swing the temperature back toward temperature_start.
        progress = min(state.global_step / max_steps, 1.0)
        # Cosine decay factor: 0.5 * (1 + cos(pi * progress))
        # This factor goes from 1 (at progress=0) to 0 (at progress=1)
        decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
        self.temperature = self.temperature_start - (
            (self.temperature_start - self.temperature_min) * (1.0 - decay_factor)
        )
        if hasattr(self.trainer.data_collator, "kd_temperature"):
            self.trainer.data_collator.kd_temperature = self.temperature
--- a/src/axolotl/integrations/kd/chat_template.py
+++ b/src/axolotl/integrations/kd/chat_template.py
@@ -15,15 +15,12 @@
 """
 Chat template prompt strategy loader with KD support
 """
 import logging
 from typing import Any, Dict
 import torch
 from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader
 LOG = logging.getLogger(__name__)
 class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
    """
@@ -104,8 +101,10 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
        # fill with -inf for padding_len tokens for top_k tokens
        # extend target_logprobs with a padding_len x top_k 2D list filled with -inf
-        # we shift for causal models in the trainer, so start the range from 0
+        # for causal models, if we start the range at 1, then we don't need to shift in the trainer
-        for _ in range(0, input_padding_len):
+        # otherwise, we need to shift in the trainer
        shift = 0
        for _ in range(shift, input_padding_len):
            target_logprobs.append([-float("inf")] * top_k)
            target_token_ids.append(list(range(top_k)))
            target_mask.append([0] * top_k)
@@ -144,10 +143,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
            #
            # Convert from log to probability
            teacher_probs_t1 = position_logprobs_tensor.exp()
            # normalize probabilities to sum to 1 in case they aren't already
            teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
            if teacher_probs_t1_sum > 1e-9:
                teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
            if self.kd_temperature != self.gen_temperature:
                # Exponentiate by factor (T1 / T2)
                exponent = self.gen_temperature / self.kd_temperature
@@ -167,115 +162,12 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
            target_logprobs.append(position_logprobs_scaled)
            target_token_ids.append(position_token_ids)
-        # Update sample with transformed logprobs
+        if shift == 1:
-        sample["target_logprobs"] = target_logprobs
+            # since we started at index 1 for causal, we need one more padding token
        sample["target_token_ids"] = target_token_ids
        sample["target_mask"] = target_mask
        return sample
    def _tokenize_single_prompt(self, prompt):
        """
        Tokenize one prompt while carrying its logprobs through to `transform_logprobs`.

        The logprobs field is popped before the parent tokenizer runs, re-attached
        to the tokenized result, and the result is then passed through
        `transform_logprobs`.
        """
        teacher_logprobs = prompt.pop(self.logprobs_field)
        tokenized = super()._tokenize_single_prompt(prompt)
        tokenized[self.logprobs_field] = teacher_logprobs
        return self.transform_logprobs(tokenized)
 class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
    """
    Strat for datasets with complete structured KD logprob data
    """
    def transform_logprobs(self, sample):
        """
        Transform logprobs to target format for KD training
        """
        # pylint: disable=duplicate-code
        logprobs = sample.pop(self.logprobs_field)
        target_seq_len = len(logprobs)
        input_seq_len = len(sample["input_ids"])
        input_padding_len = input_seq_len - target_seq_len
        # get non-zero top-k (prune None logprobs from vllm data step)
        top_k_vals = [
            len(logprobs[i])
            for i in range(len(logprobs))
            if logprobs[i] is not None and len(logprobs[i])
        ]
        max_top_k = max(set(top_k_vals), key=top_k_vals.count)
        min_top_k = min(set(top_k_vals), key=top_k_vals.count)
        top_k = min(max_top_k, min_top_k)
        if top_k == 0:
            raise ValueError("No non-zero top-k logprobs found.")
        target_logprobs = []
        target_token_ids = []
        target_mask = []
        if input_padding_len < 0:
            # logprobs is longer than target_seq_len,
            # so we need to slice from the left/beginning of logprobs
            logprobs = logprobs[:-input_seq_len]
            input_padding_len = 0
            # target_seq_len = input_seq_len
        # truncate the second dimension of the logprobs to top_k
        logprobs = [row[:top_k] for row in logprobs]
        # fill with -inf for padding_len tokens for top_k tokens
        # extend target_logprobs with a padding_len x top_k 2D list filled with -inf
        # we shift for causal models in the trainer, so start the range from 0
        for _ in range(0, input_padding_len):
            target_logprobs.append([-float("inf")] * top_k)
            target_token_ids.append(list(range(top_k)))
            target_mask.append([0] * top_k)
        for position in range(input_padding_len, input_seq_len):
            if sample["labels"][position] == -100:
                target_mask.append([0] * top_k)
            else:
                target_mask.append([1] * top_k)
        for token_pos_logprobs, pos_target_token_ids in zip(
            logprobs, sample["target_token_ids"]
        ):
            # Convert to a tensor for easier manipulation
            position_logprobs_tensor = torch.tensor(
                token_pos_logprobs, dtype=torch.float
            )
            # Now we have distribution at T1 in log form, i.e. log p_{T1}(k).
            # Next, re-scale to T2 = self.kd_temperature via exponent-based trick
            # p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z
            #
            # Convert from log to probability
            teacher_probs_t1 = position_logprobs_tensor.exp()
            # normalize probabilities to sum to 1 in case they aren't already
            teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
            if teacher_probs_t1_sum > 1e-9:
                teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
            if self.kd_temperature != self.gen_temperature:
                # Exponentiate by factor (T1 / T2)
                exponent = self.gen_temperature / self.kd_temperature
                teacher_probs_t2 = teacher_probs_t1**exponent
            else:
                teacher_probs_t2 = teacher_probs_t1
            # Re-normalize
            teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum(
                dim=0, keepdim=True
            )
            # Convert back to log
            position_logprobs_tensor = torch.log(teacher_probs_t2)
            # Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor
            position_logprobs_scaled = position_logprobs_tensor.tolist()
            target_logprobs.append(position_logprobs_scaled)
            target_token_ids.append(pos_target_token_ids)
        # Update sample with transformed logprobs
        sample["target_logprobs"] = target_logprobs
        sample["target_token_ids"] = target_token_ids
@@ -285,10 +177,8 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
    def _tokenize_single_prompt(self, prompt):
        """
        Tokenize one prompt while carrying its logprobs and target token ids along.

        Both fields are popped before the parent tokenizer runs, re-attached to the
        tokenized result, and the result is then passed through
        `transform_logprobs`.
        """
        teacher_logprobs = prompt.pop(self.logprobs_field)
        teacher_token_ids = prompt.pop("target_token_ids")
        tokenized = super()._tokenize_single_prompt(prompt)
        tokenized[self.logprobs_field] = teacher_logprobs
        tokenized["target_token_ids"] = teacher_token_ids
        return self.transform_logprobs(tokenized)
@@ -299,7 +189,7 @@ class KDStrategyLoader(StrategyLoader):
    Load ChatTemplateStrategy with KD support using StrategyLoader.
    """
-    def _get_strategy_cls(self, cfg):  # pylint: disable=unused-argument
+    def _get_strategy_cls(self):
        return ChatTemplateStrategyWithKD
    def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
@@ -314,14 +204,4 @@ class KDStrategyLoader(StrategyLoader):
        return strategy_params
-class KDStrategyLoaderV2(KDStrategyLoader):
+load = KDStrategyLoader()
    """
    Load KD chat template datasets with pre-tokenized logprob data
    """
    def _get_strategy_cls(self, cfg):  # pylint: disable=unused-argument
        return ChatTemplateStrategyWithKDv2
 load_legacy = KDStrategyLoader()
 load = KDStrategyLoaderV2()
--- a/src/axolotl/integrations/kd/collator.py
+++ b/src/axolotl/integrations/kd/collator.py
@@ -47,16 +47,11 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
    position_pad_token_id: int = 0
    return_tensors: str = "pt"
    def __init__(self, *args, **kwargs):
        """
        Forward all arguments to the parent collator, then set the tokenizer's
        "Asking-to-pad-a-fast-tokenizer" deprecation flag to True
        (presumably to silence that warning; confirm against the tokenizer).
        """
        super().__init__(*args, **kwargs)
        warning_flags = self.tokenizer.deprecation_warnings
        warning_flags["Asking-to-pad-a-fast-tokenizer"] = True
    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors
        padding_side = self.tokenizer.padding_side
        max_len = 0
        # Pad labels and position_ids first
        for feature_name, pad_token_id in [
@@ -107,9 +102,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
                target_mask_list.append(f.pop("target_mask"))
            # Determine max lengths
-            max_teacher_seq_len = max_len or max(
+            max_teacher_seq_len = max(len(seq) for seq in target_logprobs_list)
                len(seq) for seq in target_logprobs_list
            )
            max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq)
            padded_target_logprobs = []
@@ -216,9 +209,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
        #    We want to produce a single "merged" feature dict for each sub-batch.
        out_features = [{} for _ in features]
-        for i, sub_features in enumerate(  # pylint: disable=too-many-nested-blocks
+        for i, sub_features in enumerate(features):
            features
        ):
            # sub_features is a list of dicts, each dict = one sequence’s features
            # We'll merge them into out_features[i].
            #
@@ -252,17 +243,10 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
                    # For example, input_ids or labels are often arrays.
                    arrays = []
                    for feat in sub_features:
-                        if field_name in feat and isinstance(
+                        if field_name in feat:
                            feat[field_name], (list, torch.Tensor)
                        ):
                            if isinstance(
                                feat[field_name][0], (dict, str)
                            ):  # pylint: disable=too-many-nested-blocks
                                continue
                            arr = np.array(feat[field_name])
                            arrays.append(arr)
-                    if arrays:
+                    out_features[i][field_name] = np.concatenate(arrays)
                        out_features[i][field_name] = np.concatenate(arrays)
        # 3) Now call the parent collator, which will do:
        #    - padding of labels/position_ids
--- a/src/axolotl/integrations/kd/collator_online_teacher.py
+++ b/src/axolotl/integrations/kd/collator_online_teacher.py
@@ -1,561 +0,0 @@
 """
 Packed data loader for online teacher training supporting vllm and sglang.
 """
 import hashlib
 import hmac
 import logging
 from typing import Any, Dict, List, Optional
 import requests
 import torch
 from orjson import orjson
 from axolotl.integrations.kd.collator import KDBatchSamplerDataCollatorForSeq2Seq
 from axolotl.integrations.kd.utils import normalize_logprobs
 from axolotl.utils.data.utils import retry_on_request_exceptions
 LOG = logging.getLogger(__name__)
 def hmac_sha_from_int_list(int_list, key, hash_func=hashlib.sha256):
    """
    Compute an HMAC over a list of integers and return it as a hex string.

    Args:
        int_list: List of integers; each one is encoded as 4 big-endian bytes
        key: Secret key (string or bytes); a string is encoded as UTF-8
        hash_func: Hash function (default: sha256)

    Returns:
        HMAC digest as hex string
    """
    secret = key.encode("utf-8") if isinstance(key, str) else key

    # Serialize the integers back to back, 4 bytes each.
    payload = bytearray()
    for value in int_list:
        payload += value.to_bytes(4, byteorder="big")

    return hmac.new(secret, bytes(payload), hash_func).hexdigest()
 class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
    """
    Collator for online teacher training.
    """
    DEFAULT_LABEL_PAD_TOKEN_ID: int = -100
    def __init__(
        self,
        *args: Any,
        kd_online_server_base_url: Optional[str] = None,
        kd_online_topk: Optional[int] = None,
        kd_temperature: Optional[float] = 1.0,
        kd_online_server: Optional[str] = "vllm",
        kd_online_timeout: Optional[int] = 120,
        kd_cache_dir: Optional[str] = None,
        kd_normalize_topk: Optional[bool] = True,
        **kwargs: Any,
    ):
        """
        Set up the collator and validate the online teacher settings.

        Args:
            *args: Positional arguments forwarded to the parent collator.
            kd_online_server_base_url: Base URL of the teacher server; required.
                Trailing slashes are stripped.
            kd_online_topk: Number of top logprobs per position; must be positive.
            kd_temperature: Stored as `self.kd_temperature`.
            kd_online_server: Stored as `self.kd_online_server`.
            kd_online_timeout: Stored as `self.kd_online_timeout`.
            kd_cache_dir: Stored as `self.kd_cache_dir`.
            kd_normalize_topk: Stored as `self.kd_normalize_topk`.
            **kwargs: Keyword arguments forwarded to the parent collator.

        Raises:
            ValueError: If the base URL is None, or `kd_online_topk` is None or
                not positive.
        """
        super().__init__(*args, **kwargs)

        # Validate before any state is stored on the instance.
        if kd_online_server_base_url is None:
            raise ValueError(
                "kd_online_server_base_url must be provided for OnlineTeacherDataloader"
            )
        if kd_online_topk is None or kd_online_topk <= 0:
            raise ValueError(
                "kd_online_topk must be a positive integer for OnlineTeacherDataloader"
            )

        # Server connection settings.
        base_url = kd_online_server_base_url.rstrip("/")
        self.kd_online_server_base_url = base_url
        self.kd_online_server = kd_online_server
        self.kd_online_timeout = kd_online_timeout
        self.http_session = requests.Session()

        # Distillation settings.
        self.kd_online_topk = kd_online_topk
        self.kd_temperature = kd_temperature
        self.kd_normalize_topk = kd_normalize_topk
        self.kd_cache_dir = kd_cache_dir
    def _normalize_logprobs(self, raw_logprobs: List[float]) -> List[float]:
        """
        Re-normalize top-k raw logprobs through `normalize_logprobs` and return a list.

        When `raw_logprobs` is empty or `kd_online_topk` is 0, no normalization is
        done: the result is `kd_online_topk` entries of -inf, or an empty list
        when `kd_online_topk` is not positive.
        """
        top_k = self.kd_online_topk
        if raw_logprobs and top_k != 0:
            as_tensor = torch.tensor(raw_logprobs, dtype=torch.float32)
            return normalize_logprobs(as_tensor, top_k).tolist()
        if top_k > 0:
            return [-float("inf")] * top_k
        return []
    @retry_on_request_exceptions(max_retries=10, delay=5)
    def fetch_online_logprobs_sglang(
        self, batch_input_ids: List[List[int]], labels: List[List[int]]
    ):
        """
        Fetches logprobs from an online teacher served by sglang for a batch of input_ids.

        Posts the token ids to `{kd_online_server_base_url}/generate` and reads
        `meta_info["input_top_logprobs"]` from every returned sequence. Each
        non-None position is expected to be a list of
        `[logprob, token_id, token_str]` rows.

        Args:
            batch_input_ids: Token ids, one list per sequence.
            labels: Labels zipped position by position with `batch_input_ids`;
                positions equal to `DEFAULT_LABEL_PAD_TOKEN_ID` get a zero mask.

        Returns:
            dict with the keys `target_token_ids`, `target_logprobs` and
            `target_mask`, each holding one list per sequence of per-position
            top-k lists. All three are empty when the response is not a list with
            one item per input sequence.

        Raises:
            requests.exceptions.RequestException: logged and re-raised from this body.
            Exception: any other error while processing the response (including the
                length assertion) is logged and re-raised.
        """
        api_endpoint = f"{self.kd_online_server_base_url}/generate"
        # `max_new_tokens` is 0 and `logprob_start_len` is 0; presumably the server
        # then only scores the supplied input ids (confirm against the sglang API).
        payload = {
            "input_ids": batch_input_ids,
            "return_logprob": True,
            "top_logprobs_num": self.kd_online_topk,
            "logprob_start_len": 0,
            "return_text_in_logprobs": True,
            "echo": True,
            "sampling_params": {
                "max_new_tokens": 0,
                "temperature": self.kd_temperature,
                "skip_special_tokens": False,
            },
        }
        # Accumulators for the result. They are returned still empty only on the
        # response-format mismatch below; request and processing errors are re-raised.
        ret_data_target_token_ids: List[List[List[int]]] = []
        ret_data_target_logprobs: List[List[List[float]]] = []
        ret_data_target_mask: List[List[List[int]]] = []
        try:
            response = self.http_session.post(
                api_endpoint, json=payload, timeout=self.kd_online_timeout
            )
            response.raise_for_status()
            api_data: list[dict] = response.json()
            # Ensure api_data is a list, and its length matches batch_input_ids
            if not isinstance(api_data, list) or len(api_data) != len(batch_input_ids):
                LOG.error(
                    f"API response format error. Expected a list of {len(batch_input_ids)} "
                    f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
                )
                # Return empty data; items processed later will get default empty KD fields
                return {
                    "target_token_ids": ret_data_target_token_ids,
                    "target_logprobs": ret_data_target_logprobs,
                    "target_mask": ret_data_target_mask,
                }
            for sequence_data, seq_input_ids, seq_labels in zip(
                api_data, batch_input_ids, labels
            ):
                # Per-sequence accumulators, one entry appended per position.
                current_target_logprobs = []
                current_target_token_ids = []
                current_target_mask = []
                meta_info = sequence_data.pop("meta_info", {})
                # Ensure input_top_logprobs is a list
                input_top_logprobs: Optional[list[None | list[tuple]]] = meta_info.pop(
                    "input_top_logprobs", []
                )
                if not isinstance(input_top_logprobs, list):
                    LOG.warning(
                        f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
                    )
                    input_top_logprobs = []  # Treat as empty
                # basic check that the logprob data len matches the input len, so no need to handle padding
                # NOTE(review): after the non-list fallback above this assertion fails for
                # any non-empty sequence, so the sequence is not actually skipped.
                assert len(seq_input_ids) == len(input_top_logprobs)
                # Only the index and the label are used; the token id itself is ignored.
                for i, _, label in zip(
                    range(len(seq_input_ids)), seq_input_ids, seq_labels
                ):
                    if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                        # this is always the case for the first token.
                        # there is never logprob data for the first token since that's a true input
                        # so we replace the None value with padding data
                        current_target_logprobs.append(
                            [-float("inf")] * self.kd_online_topk
                        )
                        current_target_token_ids.append([0] * self.kd_online_topk)
                        current_target_mask.append([0] * self.kd_online_topk)
                    elif (
                        i < len(input_top_logprobs)
                        and input_top_logprobs[i] is not None
                    ):
                        pos_top_logprobs_data = input_top_logprobs[i]
                        # Ensure pos_top_logprobs_data is a list of lists as expected
                        if not (
                            isinstance(pos_top_logprobs_data, list)
                            and all(
                                isinstance(item, list) for item in pos_top_logprobs_data
                            )
                            and len(pos_top_logprobs_data) > 0
                            and len(pos_top_logprobs_data[0]) == 3
                        ):  # [logprob, token_id, token_str]
                            LOG.warning(
                                f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
                            )
                            # Malformed rows are replaced by a fully masked padding position.
                            current_target_logprobs.append(
                                [-float("inf")] * self.kd_online_topk
                            )
                            current_target_token_ids.append([0] * self.kd_online_topk)
                            current_target_mask.append([0] * self.kd_online_topk)
                            continue
                        # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
                        # zip(*rows) transposes the rows into three columns; the token strings are dropped.
                        pos_logprobs_raw, pos_token_ids, _ = [
                            list(row) for row in zip(*pos_top_logprobs_data)
                        ]
                        # Ensure correct length (top_k)
                        if len(pos_logprobs_raw) < self.kd_online_topk:
                            pad_len = self.kd_online_topk - len(pos_logprobs_raw)
                            pos_logprobs_raw.extend([-float("inf")] * pad_len)
                            pos_token_ids.extend([0] * pad_len)  # Pad with 0 token_id
                        # truncate to top_k in case the response was longer
                        current_target_token_ids.append(
                            pos_token_ids[: self.kd_online_topk]
                        )
                        if self.kd_normalize_topk:
                            # Any -inf padding added above is included in what gets normalized.
                            normalized_logprobs_for_position = self._normalize_logprobs(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )
                            current_target_logprobs.append(
                                normalized_logprobs_for_position
                            )
                        else:
                            current_target_logprobs.append(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )
                        # Mask depends on the corresponding label for the student
                        if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
                            current_target_mask.append([0] * self.kd_online_topk)
                        else:
                            current_target_mask.append([1] * self.kd_online_topk)
                    else:
                        # Pad if there is no logprob entry at this index (i >= len(input_top_logprobs)).
                        # NOTE(review): the length assertion above rules this out while
                        # assertions are enabled, so this branch looks unreachable.
                        current_target_logprobs.append(
                            [-float("inf")] * self.kd_online_topk
                        )
                        current_target_token_ids.append([0] * self.kd_online_topk)
                        current_target_mask.append([0] * self.kd_online_topk)
                ret_data_target_token_ids.append(current_target_token_ids)
                ret_data_target_logprobs.append(current_target_logprobs)
                ret_data_target_mask.append(current_target_mask)
        except requests.exceptions.RequestException as e:
            LOG.error(f"Error fetching logprobs from online teacher: {e}")
            # Re-raised rather than swallowed; no empty result is returned on this path.
            raise e
        except Exception as e:  # Catch other potential errors during processing
            LOG.error(
                f"Unexpected error processing API response in fetch_online_logprobs: {e}",
                exc_info=True,
            )
            raise e
        return {
            "target_token_ids": ret_data_target_token_ids,
            "target_logprobs": ret_data_target_logprobs,
            "target_mask": ret_data_target_mask,
        }
    @retry_on_request_exceptions(max_retries=10, delay=5)
    def fetch_online_logprobs_vllm(
        self, batch_input_ids: List[List[int]], labels: List[List[int]]
    ):
        """
        Fetches logprobs from an online teacher served by vllm for a batch of input_ids.
        Assumes API returns token IDs as strings in logprob dictionary keys.
        """
        api_endpoint = f"{self.kd_online_server_base_url}/v1/completions"
        payload = {
            "prompt": batch_input_ids,
            "echo": True,
            "logprobs": True,
            "prompt_logprobs": self.kd_online_topk,
            "top_logprobs": self.kd_online_topk,
            "max_new_tokens": 0,
            "skip_special_tokens": False,
            "temperature": self.kd_temperature,
            "sampling_params": {
                "max_tokens": 0,
            },
        }
        # Initialize with empty lists, so if API call fails, these are returned.
        ret_data_target_token_ids: List[List[List[int]]] = []
        ret_data_target_logprobs: List[List[List[float]]] = []
        ret_data_target_mask: List[List[List[int]]] = []
        try:
            headers = {"Accept-Encoding": "deflate, gzip, br, zstd"}
            response = self.http_session.post(
                api_endpoint,
                json=payload,
                headers=headers,
                timeout=self.kd_online_timeout,
            )
            response.raise_for_status()
            api_data: dict = orjson.loads(response.content)
            choices: list[dict] = api_data["choices"]
            # Ensure api_data is a list, and its length matches batch_input_ids
            if not isinstance(choices, list) or len(choices) != len(batch_input_ids):
                LOG.error(
                    f"API response format error. Expected a list of {len(batch_input_ids)} "
                    f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
                )
                # Return empty data; items processed later will get default empty KD fields
                return {
                    "target_token_ids": ret_data_target_token_ids,
                    "target_logprobs": ret_data_target_logprobs,
                    "target_mask": ret_data_target_mask,
                }
            for sequence_data, seq_input_ids, seq_labels in zip(
                choices, batch_input_ids, labels
            ):
                # seq_input_ids: List[int]
                # seq_labels: List[int]
                current_target_logprobs = []
                current_target_token_ids = []
                current_target_mask = []
                # Ensure input_top_logprobs is a list
                input_top_logprobs: Optional[list[None | dict[str, dict]]] = (
                    sequence_data.pop("prompt_logprobs", [])
                )
                if not isinstance(input_top_logprobs, list):
                    LOG.warning(
                        f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
                    )
                    input_top_logprobs = []  # Treat as empty
                # basic check that the logprob data len matches the input len, so no need to handle padding
                assert len(seq_input_ids) == len(input_top_logprobs)
                seq_len = len(seq_input_ids)
                for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels):
                    if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                        # this is always the case for the first token.
                        # there is never logprob data for the first token since that's a true input
                        continue
                    if (
                        i < len(input_top_logprobs)
                        and input_top_logprobs[i] is not None
                    ):
                        pos_top_logprobs_data: dict[str, dict] = input_top_logprobs[i]  # type: ignore[assignment]
                        # Ensure pos_top_logprobs_data is a list of lists as expected
                        if not (
                            isinstance(pos_top_logprobs_data, dict)
                            and all(
                                isinstance(item, dict)
                                for item in pos_top_logprobs_data.values()
                            )
                            and len(pos_top_logprobs_data.keys()) > 0
                        ):  # [logprob, token_id, token_str]
                            LOG.warning(
                                f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
                            )
                            current_target_logprobs.append(
                                [-float("inf")] * self.kd_online_topk
                            )
                            current_target_token_ids.append(
                                list(range(self.kd_online_topk))
                            )
                            current_target_mask.append([0] * self.kd_online_topk)
                            continue
                        # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
                        pos_token_ids_str = list(pos_top_logprobs_data.keys())
                        pos_logprobs_dict = pos_top_logprobs_data.values()
                        pos_token_ids = [
                            int(token_id) for token_id in pos_token_ids_str
                        ]
                        pos_logprobs_raw = [
                            float(logprob.get("logprob", -float("inf")))
                            for logprob in pos_logprobs_dict
                        ]
                        # Ensure correct length (top_k)
                        if len(pos_logprobs_raw) < self.kd_online_topk:
                            pad_len = self.kd_online_topk - len(pos_logprobs_raw)
                            LOG.warning(
                                f"Padding position {i} with {pad_len} top-k tokens and logprobs."
                            )
                            pos_logprobs_raw.extend([-float("inf")] * pad_len)
                            pos_token_ids.extend([0] * pad_len)  # Pad with 0 token_id
                        # truncate to top_k in case the response was longer
                        current_target_token_ids.append(
                            pos_token_ids[: self.kd_online_topk]
                        )
                        if self.kd_normalize_topk:
                            normalized_logprobs_for_position = self._normalize_logprobs(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )
                            current_target_logprobs.append(
                                normalized_logprobs_for_position
                            )
                        else:
                            current_target_logprobs.append(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )
                        # Mask depends on the corresponding label for the student
                        if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
                            current_target_mask.append([0] * self.kd_online_topk)
                        else:
                            current_target_mask.append([1] * self.kd_online_topk)
                    else:
                        # Pad if no logprobs for this position (either due to length mismatch or None entry)
                        current_target_logprobs.append(
                            [-float("inf")] * self.kd_online_topk
                        )
                        current_target_token_ids.append(
                            list(range(self.kd_online_topk))
                        )
                        current_target_mask.append([0] * self.kd_online_topk)
                for i in range(max(0, seq_len - len(current_target_logprobs))):
                    current_target_logprobs.append(
                        [-float("inf")] * self.kd_online_topk
                    )
                    current_target_token_ids.append(list(range(self.kd_online_topk)))
                    current_target_mask.append([0] * self.kd_online_topk)
                ret_data_target_token_ids.append(current_target_token_ids)
                ret_data_target_logprobs.append(current_target_logprobs)
                ret_data_target_mask.append(current_target_mask)
                # TODO save and load targets to disk for caching for next epoch
                # generate a hmac SHA256 hash over the list seq_input_ids and convert it to an int
                # if self.kd_cache_dir:
                #     hash_input_ids = hmac_sha_from_int_list(
                #         seq_input_ids, f"{self.kd_online_server_base_url}:{self.kd_online_topk}"
                #     )
                #     with open(f"{self.kd_cache_dir}/{hash_input_ids}.parquet", "wb") as f:
                #         pd.DataFrame(ret_logprobs_data).to_parquet(f, index=False)
        except requests.exceptions.RequestException as e:
            LOG.error(f"Error fetching logprobs from online teacher: {e}")
            raise e
            # ret_logprobs_data will be returned with empty lists, handled by the caller.
        except Exception as e:  # Catch other potential errors during processing
            LOG.error(
                f"Unexpected error processing API response in fetch_online_logprobs: {e}",
                exc_info=True,
            )
            raise e
        return {
            "target_token_ids": ret_data_target_token_ids,
            "target_logprobs": ret_data_target_logprobs,
            "target_mask": ret_data_target_mask,
        }
    def __call__(
        self, features: List[List[Dict[str, Any]]], return_tensors: Optional[str] = None
    ) -> Dict[str, Any]:
        """
        Attach online-teacher top-k targets to each item, then delegate collation.

        For every sub-batch in ``features``, each dict item that has both
        ``input_ids`` and ``labels`` is sent to the configured teacher server
        (``fetch_online_logprobs_sglang`` when ``self.kd_online_server`` is
        ``"sglang"``, otherwise ``fetch_online_logprobs_vllm``). The returned
        ``target_token_ids``, ``target_logprobs`` and ``target_mask`` are
        written back onto the item dict in place. Items without ids/labels, or
        without a matching entry in the response, get empty lists for those
        three keys (existing values are kept).

        Args:
            features: List of sub-batches; each sub-batch is a list of item dicts.
            return_tensors: Forwarded unchanged to the parent collator.

        Returns:
            Whatever the parent collator's ``__call__`` returns for the updated
            ``features``.

        Raises:
            AssertionError: If a returned target sequence does not have the same
                length as the item's ``input_ids`` / ``labels``.
        """
        if not features:
            return super().__call__(features, return_tensors=return_tensors)

        for sub_batch_features in features:  # List[Dict[str, Any]]
            if not sub_batch_features:
                continue

            input_ids_for_api_call: List[List[int]] = []
            labels_for_api_call: List[List[int]] = []
            # References to the original item dicts so they can be updated in place
            items_for_api_call: List[Dict[str, Any]] = []

            for item_dict in sub_batch_features:
                if not isinstance(item_dict, dict):
                    LOG.warning(
                        f"Skipping non-dict item in sub_batch_features: {item_dict}"
                    )
                    continue

                current_input_ids = item_dict.get("input_ids")
                current_labels = item_dict.get("labels")

                if current_input_ids is not None and current_labels is not None:
                    # Ensure input_ids and labels are plain lists for JSON serialization
                    input_ids_list = (
                        current_input_ids.tolist()
                        if hasattr(current_input_ids, "tolist")
                        else list(current_input_ids)
                    )
                    labels_list = (
                        current_labels.tolist()
                        if hasattr(current_labels, "tolist")
                        else list(current_labels)
                    )
                    input_ids_for_api_call.append(input_ids_list)
                    labels_for_api_call.append(labels_list)
                    items_for_api_call.append(item_dict)
                else:
                    # This item will not get teacher logprobs from the API.
                    # Initialize KD fields to empty lists so downstream collators handle them uniformly.
                    item_dict.setdefault("target_token_ids", [])
                    item_dict.setdefault("target_logprobs", [])
                    item_dict.setdefault("target_mask", [])

            if not items_for_api_call:
                # Nothing to send to the teacher for this sub-batch
                continue

            if self.kd_online_server == "sglang":
                api_responses_for_sub_batch = self.fetch_online_logprobs_sglang(
                    input_ids_for_api_call, labels_for_api_call
                )
            else:
                api_responses_for_sub_batch = self.fetch_online_logprobs_vllm(
                    input_ids_for_api_call, labels_for_api_call
                )

            # api_responses_for_sub_batch has keys "target_token_ids", "target_logprobs",
            # "target_mask"; each value is a list aligned with items_for_api_call.
            for i, item_to_update in enumerate(items_for_api_call):
                # TODO make sure to figure out which input in sub_batch_features to update the batch in the original `features` object so the super class can handle it properly.
                if api_responses_for_sub_batch and i < len(
                    api_responses_for_sub_batch["target_token_ids"]
                ):
                    item_token_ids = api_responses_for_sub_batch["target_token_ids"][i]
                    item_logprobs = api_responses_for_sub_batch["target_logprobs"][i]
                    item_mask = api_responses_for_sub_batch["target_mask"][i]

                    # Targets must be position-aligned with the student sequence
                    assert len(item_token_ids) == len(
                        item_to_update["input_ids"]
                    ), "target_token_ids length does not match input_ids"
                    assert len(item_logprobs) == len(
                        item_to_update["input_ids"]
                    ), "target_logprobs length does not match input_ids"
                    assert len(item_mask) == len(
                        item_to_update["labels"]
                    ), "target_mask length does not match labels"

                    item_to_update["target_token_ids"] = item_token_ids
                    item_to_update["target_logprobs"] = item_logprobs
                    item_to_update["target_mask"] = item_mask
                else:
                    # Response was empty or shorter than the number of items sent.
                    # Ensure KD fields are initialized as empty lists.
                    LOG.warning(
                        f"Failed to get online teacher logprobs for item (index {i}), or API response was too short. "
                        f"API response keys: {list(api_responses_for_sub_batch.keys()) if api_responses_for_sub_batch else 'None'}"
                    )
                    item_to_update.setdefault("target_token_ids", [])
                    item_to_update.setdefault("target_logprobs", [])
                    item_to_update.setdefault("target_mask", [])

        return super().__call__(features, return_tensors=return_tensors)
--- a/src/axolotl/integrations/kd/kernels/__init__.py
+++ b/src/axolotl/integrations/kd/kernels/__init__.py
@@ -1,8 +0,0 @@
 """
 Liger Chunked loss optimizations module
 """
 from .liger import LigerFusedLinearKLTopKLogprobLoss
 from .models import apply_kernel
 __all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"]
--- a/src/axolotl/integrations/kd/kernels/liger.py
+++ b/src/axolotl/integrations/kd/kernels/liger.py
@@ -1,485 +0,0 @@
 """
 Liger Kernels for Chunked Top-K Log-Prob Distillation
 """
 import torch
 import torch.nn.functional as F
 from liger_kernel.chunked_loss.fused_linear_distillation import (
    LigerFusedLinearDistillationBase,
 )
 from axolotl.integrations.kd.utils import normalize_logprobs
 class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
    """
    Chunked kl-div loss for top-k logprobs
    """
    @staticmethod
    def distillation_loss_fn(
        student_logits_temp_scaled: torch.Tensor,  # [chunk_size, vocab_size], already temp-scaled
        target_token_ids_chunk: torch.Tensor,  # [chunk_size, top_k]
        target_logprobs_chunk: torch.Tensor,  # [chunk_size, top_k], already temp-scaled and normalized logprobs
        target_mask_chunk: torch.Tensor,  # [chunk_size, top_k]
        beta: float = 0.0,
        normalize_topk: bool = True,
    ) -> torch.Tensor:
        """
        Compute the summed top-k divergence between teacher and student for a chunk.

        Args:
            student_logits_temp_scaled: Student logits, scaled by temperature. Shape: (N, V).
            target_token_ids_chunk: Top-k teacher token IDs. Shape: (N, K).
            target_logprobs_chunk: Top-k teacher log probabilities (temp-scaled, normalized). Shape: (N, K).
            target_mask_chunk: Mask for valid top-k tokens. Shape: (N, K).
            beta: Selects the divergence.
                  0.0 -> forward KL (P_teacher || P_student).
                  1.0 -> reverse KL (P_student || P_teacher).
                  any other value -> generalized Jensen-Shannon divergence against
                  the mixture ``(1 - beta) * P_student + beta * P_teacher``.
            normalize_topk: Whether to renormalize the student log probabilities
                over the K gathered tokens.

        Returns:
            Scalar tensor: the divergence summed over all valid (position, k) entries.
        """
        topk = target_token_ids_chunk.shape[-1]
        # Do the math in float32 regardless of the incoming dtype
        student_logits_temp_scaled = student_logits_temp_scaled.float()
        target_logprobs_chunk = target_logprobs_chunk.float()

        # Student logits at the teacher's top-k token ids: [chunk_size, top_k]
        student_logits_topk = torch.gather(
            student_logits_temp_scaled, dim=-1, index=target_token_ids_chunk
        )

        # log-softmax over the full vocab, evaluated only at the gathered ids
        student_lse = torch.logsumexp(
            student_logits_temp_scaled, dim=-1, keepdim=True
        )  # [chunk_size, 1]
        student_logprobs_topk = student_logits_topk - student_lse

        if normalize_topk:
            student_logprobs_topk = normalize_logprobs(student_logprobs_topk, topk)

        # A teacher log-prob of -inf (e.g. a padded top-k slot) carries no information.
        # Left unmasked it produces 0 * -inf = NaN in forward KL / JSD and +inf in
        # reverse KL, so treat every non-finite teacher entry as invalid.
        valid_mask = target_mask_chunk.to(torch.bool) & torch.isfinite(
            target_logprobs_chunk
        )  # [chunk_size, top_k]

        student_logprobs_valid = student_logprobs_topk[valid_mask]
        teacher_logprobs_valid = target_logprobs_chunk[valid_mask]

        teacher_probs_valid = teacher_logprobs_valid.exp()
        student_probs_valid = student_logprobs_valid.exp()

        if beta == 0.0:
            # Forward KL: sum(P_teacher * (log P_teacher - log P_student))
            return (
                teacher_probs_valid * (teacher_logprobs_valid - student_logprobs_valid)
            ).sum()

        if beta == 1.0:
            # Reverse KL: sum(P_student * (log P_student - log P_teacher))
            return (
                student_probs_valid * (student_logprobs_valid - teacher_logprobs_valid)
            ).sum()

        # Generalized JSD: beta * KL(teacher || M) + (1 - beta) * KL(student || M)
        mean_probs = (1 - beta) * student_probs_valid + beta * teacher_probs_valid
        log_mean_probs = mean_probs.log()
        student_kl = F.kl_div(
            log_mean_probs,
            student_logprobs_valid,
            reduction="sum",
            log_target=True,
        )
        teacher_kl = F.kl_div(
            log_mean_probs, teacher_logprobs_valid, reduction="sum", log_target=True
        )
        return beta * teacher_kl + (1 - beta) * student_kl
    @staticmethod
    def _compute_loss_kl_topk(
        student_input_chunk: torch.Tensor,
        student_weight: torch.Tensor,
        target_token_ids_chunk: torch.Tensor,
        target_logprobs_chunk: torch.Tensor,
        target_mask_chunk: torch.Tensor,
        target_chunk: torch.Tensor,  # true labels, used for the hard (CE) loss
        student_bias: torch.Tensor = None,
        distillation_loss_fn=None,
        ignore_index: int = -100,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        compute_ce_loss: bool = True,
        temperature: float = 1.0,
        beta: float = 0.0,
        normalize_topk: bool = True,
    ):
        """
        Project one chunk of hidden states through the LM head and compute its losses.

        Returns:
            ``(soft_loss, ce_loss)``, both unweighted and sum-reduced. Each is a zero
            scalar (in the logits' dtype/device) when its term is disabled:
            the CE term needs ``compute_ce_loss`` and ``weight_hard_loss > 0``,
            the soft term needs ``weight_soft_loss > 0``.
        """
        # [chunk_size, hidden_dim] x [vocab_size, hidden_dim]^T -> [chunk_size, vocab_size]
        logits = F.linear(student_input_chunk, student_weight, student_bias)

        want_hard = compute_ce_loss and weight_hard_loss > 0.0
        if want_hard:
            vocab_size = logits.shape[-1]
            ce_loss = F.cross_entropy(
                logits.view(-1, vocab_size),
                target_chunk.view(-1),
                reduction="sum",
                ignore_index=ignore_index,
            )
        else:
            ce_loss = torch.tensor(0.0, device=logits.device, dtype=logits.dtype)

        want_soft = weight_soft_loss > 0.0
        if want_soft:
            # Vocab alignment between teacher token ids and student_weight is the
            # caller's responsibility; no padding is applied here.
            soft_loss = distillation_loss_fn(
                logits / temperature,
                target_token_ids_chunk,
                target_logprobs_chunk,
                target_mask_chunk,
                beta=beta,
                normalize_topk=normalize_topk,
            )
        else:
            soft_loss = torch.tensor(0.0, device=logits.device, dtype=logits.dtype)

        return soft_loss, ce_loss
    @classmethod
    def forward(
        cls,
        ctx,
        student_input: torch.Tensor,  # [batch_size, seq_len, dim]
        student_lm_head_weight: torch.Tensor,  # [vocab_size, dim] (F.linear layout)
        target_token_ids: torch.Tensor,  # [batch_size, seq_len, top_k]
        target_logprobs: torch.Tensor,  # [batch_size, seq_len, top_k]
        target_mask: torch.Tensor,  # [batch_size, seq_len, top_k]
        true_labels: torch.Tensor,  # [batch_size, seq_len]
        student_lm_head_bias: torch.Tensor = None,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        ignore_index: int = -100,
        temperature: float = 1.0,
        beta: float = 0.0,
        compiled: bool = False,
        chunk_size: int = 1024,
        compute_ce_loss: bool = True,
        normalize_topk: bool = True,
    ):
        """
        Chunked fused LM-head + top-k distillation loss.

        The flattened tokens are split into chunks; for each chunk the loss and
        its gradients w.r.t. the hidden states, LM-head weight and (optional)
        bias are computed with ``torch.func.grad_and_value`` and accumulated.
        The accumulated gradients are stashed on ``ctx`` for ``backward``.

        The differentiated per-chunk objective is
        ``weight_soft_loss * temperature**2 * soft + weight_hard_loss * ce``,
        i.e. exactly the per-chunk share of the returned loss, so the cached
        gradients correspond to the value this function returns.

        Returns:
            Scalar tensor:
            ``weight_soft_loss * temperature**2 * sum(soft) + weight_hard_loss * sum(ce)``.
        """
        grad_weight_acc = torch.zeros_like(student_lm_head_weight)
        grad_inputs_list = []
        grad_bias_acc = (
            torch.zeros_like(student_lm_head_bias)
            if student_lm_head_bias is not None
            else None
        )
        kd_loss_acc = torch.zeros(
            (), device=student_input.device, dtype=student_input.dtype
        )
        ce_loss_acc = torch.zeros(
            (), device=student_input.device, dtype=student_input.dtype
        )

        # Scale applied to the soft loss in the returned value; it must also be
        # applied to the objective that is differentiated.
        soft_scale = weight_soft_loss * (temperature**2)

        # Function differentiated by torch.func.grad_and_value. The first three
        # arguments are the primals; the rest are fixed per-chunk data.
        # Returns (objective, aux) where aux carries the raw, unweighted losses.
        def loss_fn_for_grad(
            _student_input_chunk,
            _student_lm_head_weight,  # full weight
            _student_lm_head_bias,  # full bias (or None)
            _target_token_ids_chunk,
            _target_logprobs_chunk,
            _target_mask_chunk,
            _true_labels_chunk,
        ):
            soft_loss, ce_loss = cls._compute_loss_kl_topk(
                student_input_chunk=_student_input_chunk,
                student_weight=_student_lm_head_weight,
                target_token_ids_chunk=_target_token_ids_chunk,
                target_logprobs_chunk=_target_logprobs_chunk,
                target_mask_chunk=_target_mask_chunk,
                target_chunk=_true_labels_chunk,
                student_bias=_student_lm_head_bias,
                distillation_loss_fn=cls.distillation_loss_fn,
                ignore_index=ignore_index,
                weight_hard_loss=weight_hard_loss,
                weight_soft_loss=weight_soft_loss,
                compute_ce_loss=compute_ce_loss,
                temperature=temperature,
                beta=beta,
                normalize_topk=normalize_topk,
            )
            objective = soft_scale * soft_loss + weight_hard_loss * ce_loss
            return objective, (soft_loss, ce_loss)

        def accumulate_chunk_grads(
            student_input_chunk_ac,
            target_token_ids_chunk_ac,
            target_logprobs_chunk_ac,
            target_mask_chunk_ac,
            true_labels_chunk_ac,
        ):
            # The full weight and bias tensors are closed over from the outer scope.
            if student_lm_head_bias is not None:
                (
                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
                    (_, (chunk_kd_loss, chunk_ce_loss)),
                ) = torch.func.grad_and_value(
                    loss_fn_for_grad, argnums=(0, 1, 2), has_aux=True
                )(
                    student_input_chunk_ac,
                    student_lm_head_weight,
                    student_lm_head_bias,
                    target_token_ids_chunk_ac,
                    target_logprobs_chunk_ac,
                    target_mask_chunk_ac,
                    true_labels_chunk_ac,
                )
                grad_bias_acc.add_(chunk_grad_bias)
            else:
                # No bias: differentiate w.r.t. the input chunk and weight only
                (
                    (chunk_grad_input, chunk_grad_weight),
                    (_, (chunk_kd_loss, chunk_ce_loss)),
                ) = torch.func.grad_and_value(
                    loss_fn_for_grad, argnums=(0, 1), has_aux=True
                )(
                    student_input_chunk_ac,
                    student_lm_head_weight,
                    None,
                    target_token_ids_chunk_ac,
                    target_logprobs_chunk_ac,
                    target_mask_chunk_ac,
                    true_labels_chunk_ac,
                )
            grad_weight_acc.add_(chunk_grad_weight)
            kd_loss_acc.add_(chunk_kd_loss)
            ce_loss_acc.add_(chunk_ce_loss)
            return chunk_grad_input

        if compiled:
            accumulate_chunk_grads_compiled = torch.compile(
                accumulate_chunk_grads, dynamic=True, backend="inductor"
            )
        else:
            accumulate_chunk_grads_compiled = accumulate_chunk_grads

        B, N, D = student_input.shape  # pylint: disable=invalid-name
        K = target_token_ids.shape[-1]  # pylint: disable=invalid-name

        student_input_flat = student_input.reshape(-1, student_input.shape[-1])
        target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])
        target_logprobs_flat = target_logprobs.reshape(-1, target_logprobs.shape[-1])
        target_mask_flat = target_mask.reshape(-1, target_mask.shape[-1])

        # pad and shift for cross entropy loss
        true_labels = torch.nn.functional.pad(true_labels, (0, 1), value=ignore_index)
        true_labels_flat = true_labels[:, 1:].contiguous().view(-1)

        num_chunks = max(1, student_input_flat.shape[0] // chunk_size)

        # torch.chunk may return fewer chunks than requested, so iterate over what
        # it actually produced instead of indexing with range(num_chunks).
        chunked_inputs = zip(
            torch.chunk(student_input_flat, chunks=num_chunks, dim=0),
            torch.chunk(target_token_ids_flat, chunks=num_chunks, dim=0),
            torch.chunk(target_logprobs_flat, chunks=num_chunks, dim=0),
            torch.chunk(target_mask_flat, chunks=num_chunks, dim=0),
            torch.chunk(true_labels_flat, chunks=num_chunks, dim=0),
        )
        for (
            input_chunk,
            token_ids_chunk,
            logprobs_chunk,
            mask_chunk,
            labels_chunk,
        ) in chunked_inputs:
            grad_inputs_list.append(
                accumulate_chunk_grads_compiled(
                    input_chunk,
                    token_ids_chunk,
                    logprobs_chunk,
                    mask_chunk,
                    labels_chunk,
                )
            )

        grad_inputs_combined = torch.cat(grad_inputs_list, dim=0)
        ctx.save_for_backward(grad_inputs_combined, grad_weight_acc, grad_bias_acc)

        # Number of non-tensor hyperparameters after student_lm_head_bias in this
        # signature; backward returns that many trailing Nones.
        ctx.hyperparams_count = 9
        ctx.bias_was_none = student_lm_head_bias is None
        ctx.orig_dims = (B, N, D, K)

        # since this is packed, there is simply a single batch, so batchmean reduction of kl-div is simply the accumulated sum
        # we still need to scale the kd_loss by the temp^2
        kd_loss_acc = kd_loss_acc * (temperature**2)
        final_loss = weight_soft_loss * kd_loss_acc + weight_hard_loss * ce_loss_acc

        return final_loss
    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradients cached by ``forward``, scaled by ``grad_output``.

        The input gradient is reshaped from ``(B*N, D)`` back to the original
        ``(B, N, D)``; an empty cached gradient yields zeros of that shape.
        Non-differentiable inputs (targets, labels, hyperparameters) get ``None``.
        """
        flat_input_grad, weight_grad, bias_grad = ctx.saved_tensors

        # Skip the multiplications when the upstream gradient is exactly 1.0
        unit = torch.tensor(1.0, device=grad_output.device, dtype=grad_output.dtype)
        if not torch.equal(grad_output, unit):
            flat_input_grad = flat_input_grad * grad_output
            weight_grad = weight_grad * grad_output
            if bias_grad is not None:
                bias_grad = bias_grad * grad_output

        # ctx.orig_dims is (B, N, D, K); only the first three describe student_input
        batch, seq_len, hidden = ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2]

        if flat_input_grad.numel() == 0:
            # Nothing cached (empty input, or a defensive fallback): emit zeros
            # of the original shape rather than attempting a view.
            input_grad = torch.zeros(
                batch,
                seq_len,
                hidden,
                dtype=flat_input_grad.dtype,
                device=flat_input_grad.device,
            )
        else:
            input_grad = flat_input_grad.view(batch, seq_len, hidden)

        bias_out = None if ctx.bias_was_none else bias_grad
        hyperparam_grads = [None] * ctx.hyperparams_count

        return (
            input_grad,  # student_input
            weight_grad,  # student_lm_head_weight
            None,  # target_token_ids
            None,  # target_logprobs
            None,  # target_mask
            None,  # true_labels
            bias_out,  # student_lm_head_bias
            *hyperparam_grads,  # weight_hard_loss, ..., normalize_topk
        )
 class LigerFusedLinearKLTopKLogprobLoss(torch.nn.Module):
    """
    wrapper for chunked top-k logprob kl-d
    """
    def __init__(
        self,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        temperature: float = 1.0,  # This is the kd_temperature
        beta: float = 1.0,
        ignore_index: int = -100,
        compiled: bool = True,
        chunk_size: int = 1024,
        compute_ce_loss: bool = True,
        normalize_topk: bool = True,
    ):
        super().__init__()
        if not (0.0 <= weight_hard_loss <= 1.0 and 0.0 <= weight_soft_loss <= 1.0):
            raise ValueError("Loss weights must be between 0.0 and 1.0.")
        if temperature <= 0:
            raise ValueError("Temperature must be positive.")
        self.weight_hard_loss = weight_hard_loss
        self.weight_soft_loss = weight_soft_loss
        self.temperature = temperature
        self.beta = beta
        self.ignore_index = ignore_index
        self.compiled = compiled
        self.chunk_size = chunk_size
        self.compute_ce_loss = compute_ce_loss
        self.normalize_topk = normalize_topk
        if not self.compute_ce_loss and self.weight_hard_loss > 0.0:
            print(
                f"Warning: compute_ce_loss is False, but weight_hard_loss ({self.weight_hard_loss}) > 0. Hard loss will effectively be zero."
            )
            # self.weight_hard_loss = 0.0 # Or let user manage this
        if self.weight_soft_loss == 0.0:
            print(
                "Warning: weight_soft_loss is 0.0. Soft (KD) loss will not be computed."
            )
    def forward(
        self,
        lm_head_weight: torch.Tensor,  # Weights of the linear layer in the LM head
        student_hidden_states: torch.Tensor,  # student_hidden_states before the lm_head
        target_token_ids: torch.Tensor,
        target_logprobs: torch.Tensor,
        target_mask: torch.Tensor,
        true_labels: torch.Tensor,
        student_bias: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        Run the chunked top-k distillation autograd function with this module's settings.

        Note the argument order differs from the autograd function: here the LM-head
        weight comes first, while the function takes the hidden states first.
        """
        # Positional order must match LigerFusedLinearKLTopKLogprobFunction.forward
        tensor_args = (
            student_hidden_states,
            lm_head_weight,
            target_token_ids,
            target_logprobs,
            target_mask,
            true_labels,
            student_bias,
        )
        hyperparams = (
            self.weight_hard_loss,
            self.weight_soft_loss,
            self.ignore_index,
            self.temperature,
            self.beta,
            self.compiled,
            self.chunk_size,
            self.compute_ce_loss,
            self.normalize_topk,
        )
        return LigerFusedLinearKLTopKLogprobFunction.apply(*tensor_args, *hyperparams)
--- a/src/axolotl/integrations/kd/kernels/models.py
+++ b/src/axolotl/integrations/kd/kernels/models.py
@@ -1,98 +0,0 @@
 """
 model patcher for chunked top-k kl-div
 """
 from types import MethodType
 from typing import Optional, Union, Unpack
 import torch
 from transformers import Cache
 from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
 from transformers.modeling_outputs import CausalLMOutputWithPast
 from transformers.utils import LossKwargs
 class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
    """
    Placeholder kwargs type for HF model classes.

    Combines ``FlashAttentionKwargs`` and ``LossKwargs`` without adding any
    fields of its own; used to annotate ``**kwargs`` of the patched forward.
    """
 def kldiv_forward_llama_like(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    target_logprobs: Optional[torch.Tensor] = None,
    target_token_ids: Optional[torch.LongTensor] = None,
    target_mask: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,  # pylint: disable=unused-argument
    **kwargs: Unpack[KwargsForCausalLM],  # type: ignore[misc]
 ) -> CausalLMOutputWithPast:
    """
    Replacement ``forward`` for llama-like ``*ForCausalLM`` classes that returns
    the top-k KD loss instead of logits.

    Runs the decoder (``self.model``), then hands the last hidden state and
    ``self.lm_head.weight`` to ``self.loss_function`` together with the teacher
    top-k tensors. The returned output always has ``logits=None``.

    Args:
        target_logprobs, target_token_ids, target_mask: teacher top-k data,
            forwarded unchanged to ``self.loss_function``.
        labels: forwarded to ``self.loss_function`` as ``true_labels``.
        logits_to_keep: accepted for signature compatibility; unused here.
        **kwargs: forwarded to ``self.model``; ``num_items_in_batch`` is also
            read from it to normalize the loss.
        All remaining arguments are passed straight through to ``self.model``.

    Returns:
        ``CausalLMOutputWithPast`` with ``loss`` set and ``logits=None``.
    """
    # pylint: disable=duplicate-code
    # Fall back to the model config when the caller does not specify these flags.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )
    hidden_states = outputs.last_hidden_state
    # No logits are computed in this function: the loss function receives the
    # LM-head weight and the hidden states directly.
    # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100
    # self.loss_function should be LigerFusedLinearKLTopKLogprobLoss
    loss = self.loss_function(
        self.lm_head.weight,
        hidden_states,
        target_token_ids,
        target_logprobs,
        target_mask,
        true_labels=labels,
    )
    # Popped only after the decoder call above, so self.model also received
    # num_items_in_batch through **kwargs. Missing key -> -1 -> no normalization.
    num_items_in_batch = kwargs.pop("num_items_in_batch", -1)
    if num_items_in_batch is not None and num_items_in_batch > 0:
        loss = loss / num_items_in_batch
    return CausalLMOutputWithPast(
        loss=loss,
        logits=None,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
 def apply_kernel(model_type):
    """
    Patch ``<ModelType>ForCausalLM.forward`` in transformers with the KD forward.

    Args:
        model_type: HF ``model_type`` string (e.g. ``"llama"``). It is used to
            build the ``transformers.models.<type>.modeling_<type>`` module path
            and the ``<CamelCase>ForCausalLM`` class name.
    """
    # Dynamically import the modeling module and resolve the CausalLM class
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    model_cls_prefix = "".join([part.capitalize() for part in model_type.split("_")])
    model_cls_name = f"{model_cls_prefix}ForCausalLM"
    module = __import__(module_path, fromlist=[model_cls_name])
    model_cls = getattr(module, model_cls_name)
    # Assign the plain function so normal attribute lookup binds it to each
    # model *instance*. Wrapping it in MethodType(fn, model_cls) would bind
    # ``self`` to the class object permanently, so ``self.model`` and
    # ``self.lm_head`` would be looked up on the class, not the instance.
    model_cls.forward = kldiv_forward_llama_like
--- a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py
+++ b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py
@@ -16,7 +16,40 @@
 loss for top_k KL divergence
 """
 import torch
-from torch import nn
+
 def zscore_standardize(
    logits: torch.Tensor,
    mask: torch.Tensor = None,
    base_temperature: float = 1.0,
    eps: float = 1e-9,
 ):
    """
    Standardize `logits` to zero mean and unit variance over the last dimension,
    then divide by `base_temperature`.

    Args:
        logits: values to standardize; statistics are taken over ``dim=-1``.
        mask: optional weights (cast to ``logits.dtype``). When given, the mean
            and variance only count positions where the mask is non-zero; the
            output is still computed for every position.
        base_temperature: extra divisor applied to the z-scores.
        eps: lower bound on the variance before the square root.

    Returns:
        Tensor of z-scores with the same shape as `logits`.
    """
    if mask is not None:
        # Masked statistics; the count is clamped so fully-masked rows do not
        # divide by zero.
        weights = mask.to(logits.dtype)
        n_valid = weights.sum(dim=-1, keepdim=True).clamp_min(1.0)
        mean = (logits * weights).sum(dim=-1, keepdim=True) / n_valid
        squared_dev = (logits - mean) ** 2
        var = (weights * squared_dev).sum(dim=-1, keepdim=True) / n_valid
    else:
        # Plain population statistics over the last dimension
        mean = logits.mean(dim=-1, keepdim=True)
        var = logits.var(dim=-1, unbiased=False, keepdim=True)
    centered = logits - mean
    scale = torch.sqrt(var.clamp_min(eps))
    return centered / scale / base_temperature
@torch.jit.script
@@ -27,6 +60,7 @@ def loss(
    target_mask: torch.Tensor,
    num_items_in_batch: int = -1,  # Use -1 to indicate "None"
    kd_temperature: float = 1.0,
    top_k_before_softmax: int = 0,
 ) -> torch.Tensor:
    """
    A KD loss function that is TorchScript-friendly.
@@ -43,6 +77,8 @@ def loss(
        num_items_in_batch (int, optional): The number of items in the batch.
        kd_temperature (float, optional): The temperature for KD.
            Default: 1.0
        top_k_before_softmax (int, optional): Flag of whether to apply softmax before gathering student top-k logits
            Default: 0
    """
    target_logprobs = target_logprobs.float()
@@ -52,24 +88,46 @@ def loss(
    # student_logits shape:   [B, student_seq_len, vocab_size]
    teacher_seq_len = target_token_ids.shape[1]
-    # Slice student logits to match teacher-provided sequence length
+    if top_k_before_softmax:
-    student_logits_for_kd = (
+        # Slice student logits to match teacher-provided sequence length
-        student_logits[:, :teacher_seq_len, :] / kd_temperature
+        student_logits_for_kd = student_logits[
-    )  # [B, teacher_seq_len, vocab_size]
+            :, :teacher_seq_len, :
        ]  # [B, teacher_seq_len, vocab_size]
-    # keep in full precision for numerical stability of loss
+        # Gather student logits for teacher's top-K tokens
-    student_logits_for_kd = student_logits_for_kd.float()
+        student_logits_topk = torch.gather(
            student_logits_for_kd, dim=-1, index=target_token_ids
        )  # [B, teacher_seq_len, K]
-    # Gather student logits for teacher's top-K tokens
+        student_logits_topk = student_logits_topk.float()
    student_logits_topk = torch.gather(
        student_logits_for_kd, dim=-1, index=target_token_ids
    )  # [B, teacher_seq_len, K]
-    # Compute logsumexp across full vocabulary
+        # Apply KD temperature to student’s logits
-    student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)
+        if kd_temperature != 1.0:
            student_logits_topk = student_logits_topk / kd_temperature
-    #  Convert just the top-k logits to logprobs
+        # Convert student top-k logits to logprobs
-    student_logprobs_topk = student_logits_topk - student_lse
+        student_logprobs_topk = student_logits_topk - torch.logsumexp(
            student_logits_topk, dim=-1, keepdim=True
        )  # [B, teacher_seq_len, K]
    else:
        # Slice student logits to match teacher-provided sequence length
        student_logits_for_kd = (
            student_logits[:, :teacher_seq_len, :] / kd_temperature
        )  # [B, teacher_seq_len, vocab_size]
        # keep in full precision for numerical stability of loss
        student_logits_for_kd = student_logits_for_kd.float()
        # Gather student logits for teacher's top-K tokens
        student_logits_topk = torch.gather(
            student_logits_for_kd, dim=-1, index=target_token_ids
        )  # [B, teacher_seq_len, K]
        # Compute logsumexp across full vocabulary
        student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)
        #  Convert just the top-k logits to logprobs
        student_logprobs_topk = student_logits_topk - student_lse
    # Convert teacher_mask to boolean for indexing
    # In TorchScript, .bool() is sometimes unsupported, so we do:
@@ -86,6 +144,10 @@ def loss(
    kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk)
    kd_loss = kd_loss_per_token.sum()
    # Multiply by T^2 (classical KD scaling)
    if kd_temperature != 1.0:
        kd_loss = kd_loss * (kd_temperature**2)
    # Normalize by number of items (if provided) or by valid tokens
    if num_items_in_batch > 0:
        kd_loss = kd_loss / float(num_items_in_batch)
@@ -96,74 +158,80 @@ def loss(
    return kd_loss
-class ChunkedTopKKDLoss(nn.Module):
+def topk_kd_loss_with_zscore(
    student_logits: torch.Tensor,  # [B, seq_len, vocab_size]
    target_token_ids: torch.Tensor,  # [B, seq_len, K]
    target_logprobs: torch.Tensor,  # [B, seq_len, K], sums to 1.0 in prob space
    target_mask: torch.Tensor,  # [B, seq_len, K] or [B, seq_len]
    kd_temperature: float = 1.0,  # classic KD temperature
    zscore_base_temp: float = 1.0,  # from the paper
    num_items_in_batch: int = -1,
 ):
    """
-    A wrapper that chunks (splits) the student and teacher outputs along the time dimension
+    A variant of top_k KL divergence with Z-score scaling
-    to reduce peak memory usage when upcasting from bf16 to fp32, especially for large vocabularies.
+    from "Logit Standardization in Knowledge Distillation".
    Usage is analogous to ForwardKLWithChunkedOutputLoss but adapted to top-K teacher logprobs.
    """
-    def __init__(self, num_output_chunks: int = 8, kd_temperature: float = 1.0):
+    target_logprobs = target_logprobs.float()
        super().__init__()
        self.num_output_chunks = num_output_chunks
        self.kd_temperature = kd_temperature
-    def forward(
+    B, teacher_seq_len, K = target_logprobs.shape  # pylint: disable=invalid-name
-        self,
+    # 1) Gather the student's top-k logits to match teacher
-        student_logits: torch.Tensor,  # [B, seq_len, vocab_size]
+    student_logits_for_kd = student_logits[
-        target_token_ids: torch.Tensor,  # [B, seq_len, K]
+        :, :teacher_seq_len, :
-        target_logprobs: torch.Tensor,  # [B, seq_len, K]
+    ]  # [B, seq_len, vocab]
-        target_mask: torch.Tensor,  # [B, seq_len, K]
+    student_topk_logits = torch.gather(
-        num_items_in_batch: int = -1,  # optional batch size for normalization
+        student_logits_for_kd, dim=-1, index=target_token_ids
-    ) -> torch.Tensor:
+    )  # [B, seq_len, K]
-        # 1. Split along the "token" dimension (dim=1).
+    student_topk_logits = student_topk_logits.float()
        student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
        token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
        logprobs_chunks = target_logprobs.chunk(self.num_output_chunks, dim=1)
        mask_chunks = target_mask.chunk(self.num_output_chunks, dim=1)
-        # We'll accumulate a global "sum of losses" and "sum of valid tokens"
+    # 2) If you want to keep the "classical" T scaling, apply it first
-        # so that our final average is consistent with the entire sequence/batch.
+    if kd_temperature != 1.0:
-        total_loss = 0.0
+        student_topk_logits = student_topk_logits / kd_temperature
        total_valid_tokens = 0
-        # 2. Loop over each chunk and compute a chunk-specific loss.
+    # 3) Convert teacher logprobs -> treat them as “logits” for z-score
-        for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
+    #    (They differ by +some_constant from real logits, but in z-score
-            student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks
+    #     that constant is subtracted out anyway.)
-        ):
+    teacher_logits_for_zscore = target_logprobs  # rename variable for clarity
            # We pass num_items_in_batch=-1 so that the kd_loss
            # will average over *this chunk's* valid tokens only.
            chunk_loss = loss(
                student_logits=st_chunk,
                target_token_ids=tid_chunk,
                target_logprobs=lp_chunk,
                target_mask=msk_chunk,
                num_items_in_batch=-1,  # ensure per-chunk averaging by valid tokens
                kd_temperature=self.kd_temperature,
            )
-            # kd_loss returns an average over the chunk's valid tokens.
+    # 4) Z-score teacher and student
-            # We want a global average in the end, so we need to re‐weight
+    #    If target_mask is 2D, expand to 3D for the K dimension
-            # by the number of valid tokens in this chunk and keep track of the total.
+    if target_mask.dim() == 2 and target_mask.shape[:2] == (B, teacher_seq_len):
-            chunk_valid_mask = msk_chunk.to(torch.bool)
+        target_mask = target_mask.unsqueeze(-1).expand(-1, -1, K)
            chunk_valid_count = chunk_valid_mask.sum()  # scalar tensor
-            # Re-scale "chunk average" back to "chunk sum"
+    teacher_z = zscore_standardize(
-            chunk_loss_sum = chunk_loss * chunk_valid_count
+        teacher_logits_for_zscore, mask=target_mask, base_temperature=zscore_base_temp
    )
    student_z = zscore_standardize(
        student_topk_logits, mask=target_mask, base_temperature=zscore_base_temp
    )
-            total_loss += chunk_loss_sum
+    # 5) Convert to log-probs for KL
-            total_valid_tokens += chunk_valid_count
+    teacher_logprobs_z = teacher_z - torch.logsumexp(teacher_z, dim=-1, keepdim=True)
    student_logprobs_z = student_z - torch.logsumexp(student_z, dim=-1, keepdim=True)
-        # 3. Normalize *once* at the end.
+    # 6) Restrict to valid tokens if needed
-        if num_items_in_batch > 0:
+    valid_mask = target_mask.bool()  # shape [B, seq_len, K]
-            # If the user gave us a manual denominator (e.g. total items in batch),
+    teacher_probs_z = teacher_logprobs_z.exp()
-            # we divide by it. Typically used if each item is of different length.
+    teacher_probs_z = teacher_probs_z[valid_mask]
-            final_loss = total_loss / float(num_items_in_batch)
+    teacher_logprobs_z = teacher_logprobs_z[valid_mask]
-        else:
+    student_logprobs_z = student_logprobs_z[valid_mask]
            # Otherwise, divide by total valid tokens across all chunks.
            # to get the same result as a non-chunked approach.
            final_loss = total_loss / float(total_valid_tokens)
-        return final_loss
+    # 7) forward KL:  sum( p_teacher * [log(p_teacher) - log(p_student)] )
    kd_loss_per_token = teacher_probs_z * (teacher_logprobs_z - student_logprobs_z)
    kd_loss = kd_loss_per_token.sum()
    # 8) If using classical KD scaling by T^2
    if kd_temperature != 1.0:
        kd_loss = kd_loss * (kd_temperature**2)
    # Optionally scale by zscore_base_temp**2 if you want (paper might differ).
    # kd_loss = kd_loss * (zscore_base_temp**2)
    # 9) Normalize
    if num_items_in_batch is not None and num_items_in_batch > 0:
        kd_loss = kd_loss / float(num_items_in_batch)
    else:
        kd_loss = kd_loss / float(kd_loss_per_token.size(0))
    return kd_loss
--- a/src/axolotl/integrations/kd/trainer.py
+++ b/src/axolotl/integrations/kd/trainer.py
@@ -18,7 +18,8 @@ KD trainer
 from axolotl.core.trainers.base import AxolotlTrainer
-from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
+from .topk_logprob.forward_kl import loss as topk_kd_loss
 from .topk_logprob.forward_kl import topk_kd_loss_with_zscore
 class AxolotlKDTrainer(AxolotlTrainer):
@@ -26,18 +27,6 @@ class AxolotlKDTrainer(AxolotlTrainer):
    Custom trainer subclass for Knowledge Distillation (KD)
    """
    def __init__(self, *args, **kwargs):
        """
        Build the trainer and attach the fused top-k KD loss to the model.

        All arguments are forwarded unchanged to the parent trainer's ``__init__``.
        """
        super().__init__(*args, **kwargs)
        # compute_loss merges loss kwargs (num_items_in_batch) into the model
        # inputs when this flag is set.
        self.model_accepts_loss_kwargs = True
        # Loss settings come from the training args; kd_beta falls back to 0.0
        # when unset/falsy, and CE is only computed for a non-zero kd_ce_alpha.
        self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss(
            self.args.kd_ce_alpha,  # hard label loss
            self.args.kd_alpha,  # kd loss
            self.args.kd_temperature,
            self.args.kd_beta or 0.0,
            compute_ce_loss=bool(self.args.kd_ce_alpha),
            normalize_topk=self.args.kd_normalize_topk,
        )
    def _set_signature_columns_if_needed(self):
        super()._set_signature_columns_if_needed()
        columns_to_add = []
@@ -63,12 +52,12 @@ class AxolotlKDTrainer(AxolotlTrainer):
        Subclass and override for custom behavior.
        """
-        if (
+
-            self.args.sample_packing
+        target_logprobs = inputs.pop("target_logprobs")
-            and hasattr(inputs, "attention_mask")
+        target_token_ids = inputs.pop("target_token_ids")
-            and hasattr(inputs, "position_ids")
+        target_mask = inputs.pop("target_mask")
-        ):
+
-            del inputs["attention_mask"]
+        seq_len = target_token_ids.shape[1]
        if self.model_accepts_loss_kwargs:
            loss_kwargs = {}
@@ -76,4 +65,49 @@ class AxolotlKDTrainer(AxolotlTrainer):
                loss_kwargs["num_items_in_batch"] = num_items_in_batch
            inputs = {**inputs, **loss_kwargs}
        outputs = model(**inputs)
-        return outputs[0]
+
        # FIXME: account for tokenizer.padding_side
        student_logits = outputs["logits"][:, : seq_len - 1, :].contiguous()
        shift_logits = student_logits.contiguous()
        target_logprobs_for_loss = target_logprobs[..., 1:, :].contiguous()
        target_token_ids_for_loss = target_token_ids[..., 1:, :].contiguous()
        target_mask_for_loss = target_mask[..., 1:, :].contiguous()
        if self.args.kd_zscore_base_temp:
            loss_kd = topk_kd_loss_with_zscore(
                shift_logits,
                target_token_ids_for_loss,
                target_logprobs_for_loss,
                target_mask_for_loss,
                kd_temperature=self.args.kd_temperature,
                zscore_base_temp=self.args.kd_zscore_base_temp,
                num_items_in_batch=num_items_in_batch,
            )
        else:
            loss_kd = topk_kd_loss(
                shift_logits,
                target_token_ids_for_loss,
                target_logprobs_for_loss,
                target_mask_for_loss,
                num_items_in_batch=num_items_in_batch,
                kd_temperature=self.args.kd_temperature,
                top_k_before_softmax=1 if self.args.kd_top_k_before_softmax else 0,
            )
        if self.args.kd_ce_alpha > 0:
            kd_alpha = self.args.kd_alpha
            loss = self.args.kd_ce_alpha * outputs["loss"] + kd_alpha * loss_kd
        else:
            loss = loss_kd
        # Save past state if it exists
        # TODO: this needs to be fixed and made cleaner later.
        if self.args.past_index >= 0:
            self._past = outputs[  # pylint: disable=attribute-defined-outside-init
                self.args.past_index
            ]
        if self.args.average_tokens_across_devices and self.model_accepts_loss_kwargs:
            loss *= self.accelerator.num_processes
        return (loss, outputs) if return_outputs else loss
--- a/src/axolotl/integrations/kd/utils.py
+++ b/src/axolotl/integrations/kd/utils.py
@@ -1,100 +0,0 @@
 """Helper KD utils"""
 import math
 from typing import List, Union
 import numpy as np
 import torch
 from torch import FloatTensor, Tensor
 def normalize_logprobs(logprobs: FloatTensor, topk: int) -> FloatTensor:
    """
    Re-normalize top-k raw logprobs into a proper distribution, in log space.

    Args:
        logprobs: tensor whose last dimension holds up to `topk` log-probabilities.
        topk: expected size of the last dimension; shorter inputs are
            right-padded with ``-inf`` (probability 0).

    Returns:
        Tensor with last dimension `topk` whose exponentials sum to 1 along
        that dimension. Padded positions stay at ``-inf``.
    """
    # Ensure logprobs matches topk length for tensor operations
    # This should ideally be handled by the caller ensuring correct padding/truncation first
    if logprobs.shape[-1] != topk:
        # pad last dimension of logprobs to match topk length with -inf
        padding_len = topk - logprobs.shape[-1]
        padding_tensor = torch.full(
            (
                *logprobs.shape[:-1],
                padding_len,
            ),  # all dimensions of logprobs except the last, then padding_len
            float("-inf"),
            dtype=logprobs.dtype,
            device=logprobs.device,
        )
        logprobs = torch.cat((logprobs, padding_tensor), dim=-1)
    # Normalize directly in log space: log(p_i / sum_j p_j) = logprob_i - logsumexp.
    # An exp -> divide -> log round trip would underflow very small
    # probabilities to 0 and turn their logprobs into -inf.
    position_logprobs_lse = torch.logsumexp(logprobs, dim=-1, keepdim=True)
    return logprobs - position_logprobs_lse
 def strided_chunk_views(
    tensor: Union[np.ndarray, torch.Tensor],
    chunks: int,
    dim: int = 0,
    stride: int = 1,
    chunk_size: int | None = None,
 ) -> List[Union[np.ndarray, torch.Tensor]]:
    """
    Split a tensor into chunks along a dimension with striding, prioritizing views over copies.
    Args:
        tensor: Input tensor (numpy array or torch tensor)
        chunks: Number of chunks to create
        dim: Dimension along which to chunk (default: 0)
        stride: Stride between chunk starting positions (default: 1)
        chunk_size: Size of each chunk. If None, calculated automatically (default: None)
    Returns:
        List of tensor chunks (views when possible, copies when necessary)
    """
    # Get the size of the specified dimension
    dim_size = tensor.shape[dim]
    # Calculate chunk size if not provided
    if chunk_size is None:
        chunk_size = (dim_size + chunks - 1) // chunks  # Ceiling division
    chunks_list = []
    for i in range(chunks):
        start_idx = i * stride
        end_idx = min(start_idx + chunk_size, dim_size)
        # Break if we've gone beyond the tensor
        if start_idx >= dim_size:
            break
        # Create slice objects for all dimensions
        slices = [slice(None)] * tensor.ndim
        slices[dim] = slice(start_idx, end_idx)
        chunk = tensor[tuple(slices)]
        chunks_list.append(chunk)
    return chunks_list
 def chunk_overlap(input_tensor: Tensor, chunks: int, dim: int = 0, overlap: int = 1):
    """
    Split `input_tensor` along `dim` into up to `chunks` windows, each extended
    by `overlap` elements into the following window.

    The start-to-start step is ``ceil(dim_size / chunks)`` and each window is
    ``step + overlap`` long (clipped at the end of the dimension by
    ``strided_chunk_views``).
    """
    step = math.ceil(input_tensor.shape[dim] / chunks)
    window = step + overlap
    return strided_chunk_views(
        input_tensor, chunks, dim, stride=step, chunk_size=window
    )
--- a/src/axolotl/loaders/adapter.py
+++ b/src/axolotl/loaders/adapter.py
@@ -19,6 +19,7 @@ from peft import (
 from transformers import PreTrainedModel
 from axolotl.loaders.utils import get_linear_embedding_layers
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
@@ -162,6 +163,7 @@ def load_lora(
    return model, lora_config
@send_errors
 def load_adapter(
    model: PreTrainedModel,
    cfg: DictDefault,
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -46,6 +46,7 @@ from axolotl.loaders.utils import (
    load_model_config,
 )
 from axolotl.models.mamba import fix_mamba_attn_for_loss
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import (
@@ -145,6 +146,7 @@ class ModelLoader:
        """Property that determines if FSDP with QLoRA is enabled."""
        return self.cfg.fsdp and self.cfg.adapter == "qlora"
    @send_errors
    def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | None]:
        """Load and prepare the model with all configurations and patches.
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -166,17 +166,6 @@ class PatchManager:
    def _apply_self_attention_lora_patch(self):
        """Apply self-attention LoRA patches if configured."""
        if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
            # Only patch if conditions are met
            can_patch = (
                self.cfg.lora_dropout == 0
                if hasattr(self.cfg, "lora_dropout")
                else True
            )  # default to True if lora_dropout is not set
            if not can_patch:
                LOG.warning("Cannot patch self-attention - requires no dropout")
                return
            from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
            patch_self_attn_lora(self.cfg)
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -8,12 +8,14 @@ from transformers import (
    PreTrainedTokenizerBase,
 )
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
@send_errors
 def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
    processor_kwargs: dict[str, Any] = {}  # Do we actually need this?
--- a/src/axolotl/loaders/tokenizer.py
+++ b/src/axolotl/loaders/tokenizer.py
@@ -7,14 +7,13 @@ import transformers
 from transformers import (
    AddedToken,
    AutoTokenizer,
    PreTrainedTokenizer,
 )
 from axolotl.integrations.base import PluginManager
 from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.telemetry.errors import send_errors
 from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import (
    barrier,
    is_local_main_process,
@@ -119,21 +118,9 @@ def modify_tokenizer_files(
    return tokenizer_dir
-def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
+@send_errors
 def load_tokenizer(cfg):
    """Load and configure the tokenizer based on the provided config."""
    def _load_mistral_common_tokenizer(cfg: DictDefault):
        """Load mistral-common tokenizer"""
        from axolotl.utils.mistral_tokenizer import HFMistralTokenizer
        # Load the HF-compatible wrapper around MistralTokenizer
        tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config)
        return tokenizer
    if cfg.tokenizer_use_mistral_common:
        return _load_mistral_common_tokenizer(cfg)
    model_config = load_model_config(cfg)
    tokenizer_kwargs = {}
    use_fast = True  # this is the default
@@ -222,12 +209,11 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
                )
                and k != "pad_token"
            ):
-                lora_modules_to_save_str = ", ".join(
+                lora_modules_to_save = ", ".join(
                    [f"`{x}`" for x in lora_modules_to_save]
                )
                raise ValueError(
-                    f"Please set lora_modules_to_save to [{lora_modules_to_save_str}] "
+                    f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
                    "when using an adapter and changing the special tokens."
                )
            tokenizer.add_special_tokens(
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -145,11 +145,6 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
        return Qwen2Attention
    if model_type == "mllama":
        from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention
        return MllamaTextSelfAttention
    try:
        # Dynamically import the module and attention class
        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
@@ -274,29 +269,6 @@ def find_mlp_in_layer(
                )
 def get_layers(model: PeftModelForCausalLM) -> list[nn.Module]:
    """
    Return the decoder layers of a PEFT-wrapped model.

    Looks at ``model.model`` and prefers its ``language_model`` attribute
    (multimodal models) over its ``model`` attribute (text-only models).

    Args:
        model: A PEFT model.

    Returns:
        The ``layers`` attribute of the first matching sub-model.

    Raises:
        NotImplementedError: if neither attribute is present.
    """
    base = model.model
    # Order matters: check for multimodal models first
    for attr_name in ("language_model", "model"):
        if hasattr(base, attr_name):
            return getattr(base, attr_name).layers
    raise NotImplementedError(
        f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
    )
 def apply_lora_kernel_patches(
    model: PeftModelForCausalLM, cfg: DictDefault
 ) -> PeftModelForCausalLM:
@@ -368,7 +340,17 @@ def apply_lora_kernel_patches(
    if activation not in SUPPORTED_ACTIVATIONS:
        raise NotImplementedError(f"Activation {activation} is not supported")
-    layers = get_layers(model)
+    layers = []
    # check for multimodal models first
    pretrained_model = model.model
    if hasattr(pretrained_model, "language_model"):
        layers = pretrained_model.language_model.layers
    elif hasattr(pretrained_model, "model"):
        layers = pretrained_model.model.layers
    else:
        raise NotImplementedError(
            f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
        )
    # Patch each layer
    for layer in layers:
--- a/src/axolotl/prompt_strategies/init.py
+++ b/src/axolotl/prompt_strategies/init.py
@@ -17,10 +17,7 @@ def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
            return messages_load(tokenizer, cfg, ds_cfg, processor=processor)
        load_fn = "load"
        package = "axolotl.prompt_strategies"
-        if (
+        if strategy.split(".")[-1].startswith("load_"):
            strategy.split(".")[-1].startswith("load_")
            or strategy.split(".")[-1] == "load"
        ):
            load_fn = strategy.split(".")[-1]
            strategy = ".".join(strategy.split(".")[:-1])
        elif len(strategy.split(".")) > 1:
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -2,10 +2,8 @@
 HF Chat Templates prompt strategy
 """
 # pylint: disable=too-many-lines
 from collections import defaultdict
-from typing import TYPE_CHECKING, Any, Dict, List, Set, Union
+from typing import Any, Dict, List, Set, Union
 from pydantic import BaseModel
 from transformers import ProcessorMixin
@@ -17,9 +15,6 @@ from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.datasets import DatasetConfig
 if TYPE_CHECKING:
    from axolotl.utils.mistral_tokenizer import HFMistralTokenizer
 # Configure the logger
 LOG = get_logger(__name__)
 LOG.setLevel("INFO")
@@ -39,7 +34,6 @@ class ChatTemplatePrompter(Prompter):
        message_field_training_detail: str | None = None,
        field_messages: str = "messages",
        field_system: str = "system",
        field_tools: str = "tools",
        roles: dict[str, list[str]] | None = None,
        chat_template_kwargs: dict[str, Any] | None = None,
        drop_system_message: bool = False,
@@ -72,7 +66,6 @@ class ChatTemplatePrompter(Prompter):
        self.message_field_training_detail = message_field_training_detail
        self.field_messages = field_messages
        self.field_system = field_system
        self.field_tools = field_tools
        self.tokenizer = tokenizer
        self.processor: ProcessorMixin | None = processor
        self.chat_template = chat_template
@@ -84,38 +77,17 @@ class ChatTemplatePrompter(Prompter):
    def chat_template_msg_variables(self) -> Set[str]:
        return self._chat_template_msg_variables
-    def build_prompt(
+    def build_prompt(self, conversation, add_generation_prompt=False, images=None):
        self,
        conversation: list[dict],
        add_generation_prompt=False,
        images=None,
        tools=None,
    ):
        """
        Build a prompt from a conversation.
        Args:
            conversation: A list of messages.
            add_generation_prompt: Whether to add a generation prompt.
            images: A list of images. (optional)
            tools: A list of tools. (optional)
        """
        chat_template_kwargs = {
            "chat_template": self.chat_template,
            "add_generation_prompt": add_generation_prompt,
        }
        if tools:
            chat_template_kwargs["tools"] = tools
        if self.processor:
            if not callable(self.processor):
                raise TypeError("Processor must be callable")
            text = self.processor.apply_chat_template(
                conversation,
                chat_template=self.chat_template,
                tokenize=False,
-                **chat_template_kwargs,
+                add_generation_prompt=add_generation_prompt,
                **self.chat_template_kwargs,
            )
            batch = self.processor(
                text=text,
@@ -132,7 +104,9 @@ class ChatTemplatePrompter(Prompter):
        return self.tokenizer.apply_chat_template(
            conversation,
-            **chat_template_kwargs,
+            add_generation_prompt=add_generation_prompt,
            chat_template=self.chat_template,
            **self.chat_template_kwargs,
        )
    def get_offsets_for_train_detail(
@@ -276,15 +250,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
        # Default to eos_token if eot_tokens not provided
-        self.eot_tokens = []
+        self.eot_tokens = (
-        if eot_tokens is not None:
+            eot_tokens if eot_tokens is not None else [self.tokenizer.eos_token]
-            self.eot_tokens = eot_tokens
+        )
        elif (
            hasattr(self.tokenizer, "eos_token")
            and self.tokenizer.eos_token is not None
        ):
            self.eot_tokens = [self.tokenizer.eos_token]
        self.split_thinking = split_thinking
        self.images = "images"
@@ -408,7 +376,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
            and not self.prompter.message_field_training_detail  # type: ignore
        ):
            turns = self.get_conversation_thread(prompt)
-            images = self._get_images(prompt)
+            images = self.get_images(prompt)
            prompt_ids = self.prompter.build_prompt(  # type: ignore
                turns[:-1],
                add_generation_prompt=True,
@@ -437,8 +405,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
            return tokenized_prompt
        turns = self.get_conversation_thread(prompt)
-        tools = self._get_tools(prompt)
+        input_ids = self.prompter.build_prompt(turns)  # type: ignore
        input_ids = self.prompter.build_prompt(turns, tools=tools)  # type: ignore
        labels = [IGNORE_TOKEN_ID] * len(input_ids)
        last_eos_idx = -1
@@ -477,9 +444,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                continue
-            turn_start_idx, turn_end_idx = self.find_turn(
+            turn_start_idx, turn_end_idx = self.find_turn(turns=turns, turn_idx=index)
                turns=turns, turn_idx=index, tools=tools
            )
            LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")
@@ -581,9 +546,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                return i
        return -1
-    def find_turn(
+    def find_turn(self, turns: list[dict], turn_idx: int):
        self, turns: list[dict], turn_idx: int, tools: list[dict] | None = None
    ):
        """
        Locate the starting and ending indices of the specified turn in a conversation.
        """
@@ -614,10 +577,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        turns_with_content = turns[: turn_idx + 1]
        # Generate the conversation up to the turn, with final turn replaced with dummy content
-        dummy_ids = self.prompter.build_prompt(turns_with_empty, tools=tools)  # type: ignore
+        dummy_ids = self.prompter.build_prompt(turns_with_empty)  # type: ignore
        # Generate the conversation up to the turn, with final turn included
-        full_ids = self.prompter.build_prompt(turns_with_content, tools=tools)  # type: ignore
+        full_ids = self.prompter.build_prompt(turns_with_content)  # type: ignore
        if not full_ids or not dummy_ids:
            LOG.warning(f"Empty template generated for turn {turn_idx}")
@@ -670,10 +633,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
    def get_conversation_thread(self, prompt):
        turns = []
-        messages = self._get_messages(prompt)
+        possible_sys_turn = self.transform_message(
-
+            prompt[self.prompter.field_messages][0]
-        possible_sys_turn = self.transform_message(messages[0])
+        )
        if (
            possible_sys_turn["role"] != "system"
            and self.prompter.field_system in prompt
@@ -681,7 +643,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
            turn = {"role": "system", "content": prompt[self.prompter.field_system]}
            turns.append(turn)
-        for message in messages:
+        for message in prompt[self.prompter.field_messages]:
            transformed_message = self.transform_message(message)
            turn = {
@@ -699,7 +661,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        return turns
-    def transform_message(self, message: dict) -> dict:
+    def transform_message(self, message):
        # Build the initial transformed message from the mappings
        transformed_message = {}
        for key, value in self.prompter.message_property_mappings.items():
@@ -776,135 +738,18 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        return transformed_message
-    def _get_images(self, prompt):
+    def get_images(self, prompt):
        return prompt.get(self.images, None)
    def _get_tools(self, prompt) -> list[dict] | None:
        """Get tools from prompt if available."""
        tools = prompt.get(self.prompter.field_tools, None)
        if tools is None:
            return None
        if isinstance(tools, list):
            return tools
        raise ValueError(
            "Unknown tools format. Please convert it into a list[dict].\n"
            f"Current format: {type(tools)}"
        )
    def _get_messages(self, prompt):
        messages = prompt.get(self.prompter.field_messages, None)
        if messages is None:
            raise ValueError("Messages is null. Please check `field_messages`.")
        if isinstance(messages, list):
            return messages
        raise ValueError(
            "Unknown messages format. Please convert it into a list[dict].\n"
            f"Current format: {type(messages)}"
        )
 class MistralStrategy(ChatTemplateStrategy):
    """
    Chat template strategy variant for mistral-common tokenizers.
    """
    def __init__(
        self,
        prompter: "ChatTemplatePrompter",
        tokenizer: "HFMistralTokenizer",
        train_on_inputs: bool,
        sequence_len: int,
        roles_to_train: list[str] | None = None,
        train_on_eos: str | None = None,
        train_on_eot: str | None = None,
        eot_tokens: list[str] | None = None,
        split_thinking: bool | None = False,
    ):
        """
        Store the training options without running ChatTemplateStrategy's own
        initialiser (and therefore without its EOT/EOS validation).
        """
        # Initialise through PromptTokenizingStrategy directly so that the
        # validation performed by ChatTemplateStrategy.__init__ is bypassed.
        # pylint: disable=non-parent-init-called,super-init-not-called
        PromptTokenizingStrategy.__init__(
            self, prompter, tokenizer, train_on_inputs, sequence_len
        )
        self.prompter: ChatTemplatePrompter = prompter
        # Translate each role through prompter.roles; unknown roles pass through as-is.
        self.roles_to_train = (
            [prompter.roles.get(name, name) for name in roles_to_train]
            if roles_to_train
            else []
        )
        self.train_on_eos = train_on_eos
        # Backward compatibility: train_on_eot falls back to train_on_eos.
        self.train_on_eot = train_on_eos if train_on_eot is None else train_on_eot
        # Without explicit eot_tokens the tokenizer's eos_token is the only one.
        self.eot_tokens = (
            [self.tokenizer.eos_token] if eot_tokens is None else eot_tokens
        )
        self.split_thinking = split_thinking
        self.images = "images"
        LOG.debug(
            f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}"
        )
        # Skip the validation that ChatTemplateStrategy calls
        # TODO: address this in the future with mistral-specific checks
        # self._validate_eot_and_eos_tokens()
    @property
    def supports_multiprocessing(self) -> bool:
        """
        Always False: mistral_common tokenizers cannot be pickled, which
        multiprocessing would require.
        """
        return False
    def find_first_eot_token(self, input_ids, start_idx):
        """Return the position of the first EOS token at or after `start_idx`."""
        # mistral-common tokenizer does not support eot_tokens, so the EOS
        # lookup is used in its place.
        return self.find_first_eos_token(input_ids, start_idx)
 class MistralPrompter(ChatTemplatePrompter):
    """
    Chat template prompter variant paired with the Mistral strategy.
    """
    def __init__(self, *args, **kwargs):
        """Run the parent initialiser, then pin the message variable set."""
        super().__init__(*args, **kwargs)
        # Overrides whatever the parent initialiser stored for this attribute.
        self._chat_template_msg_variables = {"tool_call_id", "name", "tool_calls"}
 class StrategyLoader:
    """
    Load chat template strategy based on configuration.
    """
-    def _get_strategy_cls(self, cfg):
+    def _get_strategy_cls(self):
        if cfg.tokenizer_use_mistral_common:
            return MistralStrategy
        return ChatTemplateStrategy
    def _get_prompter_cls(self, cfg):
        if cfg.tokenizer_use_mistral_common:
            return MistralPrompter
        return ChatTemplatePrompter
    def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
        return {
            "train_on_inputs": cfg.train_on_inputs,
@@ -930,14 +775,9 @@ class StrategyLoader:
        else:
            dataset_config = ds_cfg
-        if cfg.tokenizer_use_mistral_common:
+        chat_template_string = get_chat_template_from_config(
-            # mistral-common does not use this, so we pass an empty string
+            cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
-            chat_template_string = ""
+        )
        else:
            chat_template_string = get_chat_template_from_config(
                cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
            )
        LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---")
        prompter_params = {
@@ -963,11 +803,10 @@ class StrategyLoader:
        }
        strategy_params = self._get_strategy_params(cfg, dataset_config)
-        strategy_cls = self._get_strategy_cls(cfg)
+        strategy_cls = self._get_strategy_cls()
        prompter_cls = self._get_prompter_cls(cfg)
        strategy = strategy_cls(
-            prompter_cls(**prompter_params),
+            ChatTemplatePrompter(**prompter_params),
            tokenizer=tokenizer,
            **strategy_params,
        )
--- a/src/axolotl/prompt_strategies/dpo/chat_template.py
+++ b/src/axolotl/prompt_strategies/dpo/chat_template.py
@@ -46,14 +46,6 @@ def default(
        )
        messages = sample[field_messages]
        if isinstance(messages, str):
            messages = [
                {
                    message_property_mappings["role"]: "user",
                    message_property_mappings["content"]: messages,
                }
            ]
        messages = [
            {
                "role": role_map[m[message_property_mappings["role"]]],
@@ -61,35 +53,13 @@ def default(
            }
            for m in messages
        ]
        chosen_raw = sample[field_chosen]
        if isinstance(chosen_raw, str):
            chosen_msg = {
                message_property_mappings["role"]: "assistant",
                message_property_mappings["content"]: chosen_raw,
            }
        elif isinstance(chosen_raw, dict):
            chosen_msg = chosen_raw
        else:
            chosen_msg = chosen_raw[-1]
        chosen = {
-            "role": role_map[chosen_msg[message_property_mappings["role"]]],
+            "role": role_map[sample[field_chosen][message_property_mappings["role"]]],
-            "content": chosen_msg[message_property_mappings["content"]],
+            "content": sample[field_chosen][message_property_mappings["content"]],
        }
        rejected_raw = sample[field_rejected]
        if isinstance(rejected_raw, str):
            rejected_msg = {
                message_property_mappings["role"]: "assistant",
                message_property_mappings["content"]: rejected_raw,
            }
        elif isinstance(rejected_raw, dict):
            rejected_msg = rejected_raw
        else:
            rejected_msg = rejected_raw[-1]
        rejected = {
-            "role": role_map[rejected_msg[message_property_mappings["role"]]],
+            "role": role_map[sample[field_rejected][message_property_mappings["role"]]],
-            "content": rejected_msg[message_property_mappings["content"]],
+            "content": sample[field_rejected][message_property_mappings["content"]],
        }
        dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}
--- a/src/axolotl/prompt_strategies/messages/init.py
+++ b/src/axolotl/prompt_strategies/messages/init.py
@@ -32,3 +32,4 @@ def load(tokenizer, cfg, ds_cfg, processor=None):
    except Exception as exc:  # pylint: disable=broad-exception-caught
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        raise exc
    return None
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -3,7 +3,6 @@
 import abc
 from typing import Callable, Dict, List, Optional, Tuple, Union
 from datasets import Dataset
 from transformers import BatchEncoding, PreTrainedTokenizer
 from axolotl.prompters import Prompter
@@ -29,16 +28,6 @@ class DatasetWrappingStrategy(abc.ABC):
    Abstract class for wrapping datasets for Chat Messages
    """
    @abc.abstractmethod
    def wrap_dataset(
        self,
        dataset,
        process_count: int | None = None,
        keep_in_memory: bool | None = False,
        **kwargs,
    ) -> Dataset:
        """
        Abstract hook that concrete strategies must override; annotated to
        return a `Dataset`.
        Args:
            dataset: The input to wrap (its type is not constrained here).
            process_count: Optional integer; presumably a worker/process
                count for the wrapping step -- TODO confirm in implementations.
            keep_in_memory: Optional flag, `False` by default; presumably
                controls in-memory vs. cached results -- TODO confirm.
            **kwargs: Additional implementation-specific options.
        """
        pass
 class PromptTokenizingStrategy(abc.ABC):
    """
@@ -70,14 +59,6 @@ class PromptTokenizingStrategy(abc.ABC):
    def supports_batched(self):
        return False
    @property
    def supports_multiprocessing(self) -> bool:
        """
        Whether this tokenizing strategy supports multiprocessing.
        Should return False if the tokenizer has unpicklable objects.
        This base implementation always returns True.
        """
        return True
    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
    ) -> BatchEncoding:
--- a/src/axolotl/telemetry/init.py
+++ b/src/axolotl/telemetry/init.py
--- a/src/axolotl/telemetry/callbacks.py
+++ b/src/axolotl/telemetry/callbacks.py
@@ -0,0 +1,164 @@
 """Trainer callbacks for reporting runtime metrics at regular intervals."""
 import logging
 import time
 from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
 )
 from axolotl.telemetry.manager import TelemetryManager
 from axolotl.telemetry.runtime_metrics import RuntimeMetricsTracker
 LOG = logging.getLogger(__name__)
 TIME_SINCE_LAST = 30
 class TelemetryCallback(TrainerCallback):
    """
    Trainer callback that relays lifecycle events and periodic runtime metrics
    to the shared `TelemetryManager`.
    Epoch, step and memory bookkeeping is delegated to a
    `RuntimeMetricsTracker`; progress events are emitted from `on_step_end`.
    """
    # Step cadence consulted by `on_step_end` when deciding whether to report.
    report_interval_steps: int = 100
    def __init__(self):
        """Create the tracker, grab the telemetry manager and reset report state."""
        self.tracker = RuntimeMetricsTracker()
        self.telemetry_manager = TelemetryManager.get_instance()
        self.start_time = time.time()
        # Starts at -1 so the first `on_epoch_begin` moves it to 0.
        self.current_epoch = -1
        self.last_report_step = 0
        self.last_report_time = None
    # pylint: disable=unused-argument
    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Emit the `train-start` event."""
        self.telemetry_manager.send_event(event_type="train-start")
    # pylint: disable=unused-argument
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Emit the `train-end` event with the last logged and tracked metrics."""
        # Tracker metrics win over the log-history values on key collisions.
        final_metrics = self._extract_last_metrics(state)
        final_metrics = final_metrics | self.tracker.metrics.to_dict()
        self.telemetry_manager.send_event(
            event_type="train-end",
            properties=final_metrics,
        )
    # pylint: disable=unused-argument
    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Advance the epoch counter and notify the tracker."""
        self.current_epoch += 1
        self.tracker.start_epoch(self.current_epoch)
    # pylint: disable=unused-argument
    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Tell the tracker that the current epoch finished."""
        self.tracker.end_epoch(self.current_epoch)
    # pylint: disable=unused-argument
    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """
        Record the step and, when the step/time gates allow it, emit a
        `train-progress` event.
        """
        step = state.global_step
        self.tracker.update_step(step)
        interval = self.report_interval_steps
        is_first_step = step == 1
        # Gate 1: interval boundary, very first step, or a full interval of
        # steps since the previous report.
        if not (
            step % interval == 0
            or is_first_step
            or step - self.last_report_step >= interval
        ):
            return
        now = time.time()
        # Before the first report, elapsed time is measured from construction.
        baseline = (
            self.start_time if self.last_report_time is None else self.last_report_time
        )
        seconds_elapsed = now - baseline
        steps_elapsed = step - self.last_report_step
        # Gate 2: first step, enough wall-clock time, or enough steps.
        if not (
            is_first_step
            or seconds_elapsed >= TIME_SINCE_LAST
            or steps_elapsed >= interval
        ):
            return
        # Throughput over this reporting window; 0 when it cannot be computed.
        if seconds_elapsed > 0 and steps_elapsed > 0:
            throughput = steps_elapsed / seconds_elapsed
        else:
            throughput = 0
        self.tracker.update_memory_metrics()
        payload = self._extract_last_metrics(state) | {
            "step": step,
            "epoch": self.current_epoch,
            "progress": state.epoch,  # Fractional epoch progress
            "steps_per_second": throughput,
            "elapsed_time": now - self.start_time,
            "time_since_last_report": seconds_elapsed,
        }
        payload["memory"] = self.tracker.get_memory_metrics()
        self.telemetry_manager.send_event(
            event_type="train-progress", properties=payload
        )
        # Remember when and where this report happened for the next gate check.
        self.last_report_time = now
        self.last_report_step = step
    def _extract_last_metrics(self, state: TrainerState) -> dict:
        """
        Return `loss` and `learning_rate` from the newest log-history entry,
        using 0 for anything missing (including an empty history).
        """
        history = state.log_history
        if not history:
            return {"loss": 0, "learning_rate": 0}
        newest = history[-1]
        return {name: newest.get(name, 0) for name in ("loss", "learning_rate")}
--- a/src/axolotl/telemetry/errors.py
+++ b/src/axolotl/telemetry/errors.py
@@ -0,0 +1,160 @@
 """Telemetry utilities for exception and traceback information."""
 import logging
 import os
 import re
 import traceback
 from functools import wraps
 from inspect import getmodule
 from typing import Any, Callable
 from axolotl.telemetry.manager import TelemetryManager
 LOG = logging.getLogger(__name__)
 ERROR_HANDLED = False
 def sanitize_stack_trace(stack_trace: str) -> str:
    """
    Strip user-specific directory prefixes from the file paths in a stack trace.
    Only the path inside each `File "<path>"` frame header is rewritten; all
    other text (source lines, the exception message) is left untouched.
    Each path is reduced using the first rule that applies:
    1. Path contains a `site-packages` / `dist-packages` directory followed
       by a package name: keep everything from that directory onward.
    2. Path contains `axolotl`: keep everything from its last occurrence.
    3. Otherwise: keep only the final path component (POSIX or Windows
       separators).
    Args:
        stack_trace: The original stack trace string.
    Returns:
        The stack trace with the frame paths shortened as described above.
    """
    # Captures the path of a `File "<path>"` frame header.
    path_pattern = re.compile(r'(?:File ")(.*?)(?:")')
    # A site-packages / dist-packages directory followed by a package name.
    # This also covers "lib/pythonX.Y/site-packages/..." virtualenv layouts, so
    # no separate virtualenv pattern is required.
    site_packages_pattern = re.compile(
        r"(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
    )
    def _shorten(full_path: str) -> str:
        """Return the privacy-preserving form of a single frame path."""
        package_match = site_packages_pattern.search(full_path)
        if package_match:
            # Cut at the directory the regex actually matched. Searching for
            # the literal text instead could hit an earlier, user-specific
            # directory that merely contains "site-packages" in its name.
            return full_path[package_match.start() :]
        if "axolotl" in full_path:
            return full_path[full_path.rfind("axolotl") :]
        # Normalise Windows separators first so the fallback also works when a
        # backslash-separated path is processed on a POSIX host.
        return os.path.basename(full_path.replace("\\", "/"))
    sanitized_lines = []
    for line in stack_trace.split("\n"):
        path_match = path_pattern.search(line)
        if path_match:
            full_path = path_match.group(1)
            # An empty capture (`File ""`) has nothing to rewrite.
            if full_path:
                line = line.replace(full_path, _shorten(full_path))
        sanitized_lines.append(line)
    return "\n".join(sanitized_lines)
 def send_errors(func: Callable) -> Callable:
    """
    Decorator that reports an exception raised by `func` through telemetry and
    then re-raises it unchanged.
    The event carries the exception message and a sanitized stack trace. The
    module-level `ERROR_HANDLED` flag ensures that, when decorated functions
    call each other, only the innermost one sends the event.
    Args:
        func: Function to decorate.
    Returns:
        Decorated function.
    """
    @wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        global ERROR_HANDLED  # pylint: disable=global-statement
        manager = TelemetryManager.get_instance()
        # With telemetry switched off the call is a plain pass-through.
        if not manager.enabled:
            return func(*args, **kwargs)
        try:
            return func(*args, **kwargs)
        except Exception as exception:
            # An inner decorated call already reported: just propagate.
            if ERROR_HANDLED:
                raise
            ERROR_HANDLED = True
            # Event name is "<module>.<function>-error" when the module is known.
            owner = getmodule(func)
            if owner:
                qualified_name = f"{owner.__name__}.{func.__name__}"
            else:
                qualified_name = func.__name__
            formatted = traceback.format_exception(
                type(exception), exception, exception.__traceback__
            )
            clean_trace = sanitize_stack_trace("".join(formatted))
            manager.send_event(
                event_type=f"{qualified_name}-error",
                properties={
                    "exception": str(exception),
                    "stack_trace": clean_trace,
                },
            )
            raise
    return wrapper
--- a/src/axolotl/telemetry/manager.py
+++ b/src/axolotl/telemetry/manager.py
@@ -0,0 +1,417 @@
 """Telemetry manager and associated utilities."""
 import atexit
 import importlib
 import logging
 import os
 import platform
 import time
 import uuid
 from pathlib import Path
 from typing import Any
 import posthog
 import psutil
 import torch
 import yaml
 # Module-level logger
 LOG = logging.getLogger(__name__)
 # PostHog endpoint and project key assigned to the client in `_init_posthog`
 POSTHOG_HOST = "https://app.posthog.com"
 POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y"
 # Seconds slept in `_check_telemetry_enabled` when AXOLOTL_DO_NOT_TRACK is unset or
 # holds an unrecognized value
 OPT_IN_WARNING_SLEEP_SECONDS = 10
 # Message logged (main process only) in that same case
 OPT_IN_WARNING = (
    "\nTelemetry is currently disabled by default. If you'd like to help improve "
    "Axolotl, consider enabling it by setting AXOLOTL_DO_NOT_TRACK=0 in your environment.\n\n"
    "Telemetry data helps us understand:\n"
    "- Which features are most used\n"
    "- What hardware configurations to prioritize\n"
    "- Where users encounter errors\n\n"
    "Personally identifiable information (PII) is not collected.\n\n"
    "To remove this warning, explicitly set AXOLOTL_DO_NOT_TRACK=0 (enable telemetry) "
    "or AXOLOTL_DO_NOT_TRACK=1 (explicitly disable telemetry).\n\n"
    "Note: Telemetry will move to an opt-out in a later release.\n\n"
    "For details, see: https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html\n\n"
    f"Sleeping for {OPT_IN_WARNING_SLEEP_SECONDS}s..."
 )
 # Path of the `whitelist.yaml` file that sits next to this module
 WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
 # Exact key names whose string values are redacted by `_redact_paths`
 # NOTE: Need to keep these up to date with any config schema changes
 FIELDS_TO_REDACT = {
    "base_model",
    "tokenizer_config",
    "base_model_config",
    "pretraining_dataset",  # NOTE: this field may be a string or a dictionary
    "resume_from_checkpoint",
    "hub_model_id",
 }
 # Key fragments that trigger redaction; `_redact_paths` matches them as substrings
 # of the key (not strictly as prefixes)
 PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
 # Substrings of a lowercased key that mark its value as path-like
 PATH_INDICATORS = {"path", "dir"}
 # Distribution names whose installed versions are reported by `_get_system_info`
 # pylint: disable=duplicate-code
 RELEVANT_PACKAGES = {
    "torch",
    "transformers",
    "trl",
    "datasets",
    "peft",
    "bitsandbytes",
    "accelerate",
    "optimum",
    "deepspeed",
    "ray",
    "axolotl",
    "triton",
    "mamba-ssm",
    "flash-attn",
    "xformers",
    "autoawq",
    "tokenizers",
    "sentencepiece",
    "torchao",
    "lm_eval",
 }
 def is_main_process() -> bool:
    """
    Report whether the current process is the main (global rank 0) process.

    Note:
        We use this function instead of `torch.utils.distributed.is_main_process`
        because the latter causes issues with the DeepSpeed world size. This function
        avoids that issue by checking env vars that are set by various launchers.

    Returns:
        Whether we're running in the main process.
    """
    # An initialized PyTorch process group is authoritative
    if torch.distributed.is_initialized():
        return torch.distributed.get_rank() == 0

    # Otherwise take the first rank variable that is set, in priority order; fall
    # back to rank 0 when none of them is present.
    # NOTE: need to verify this in SLURM / OpenMPI environments
    rank_text = "0"
    for env_name in ("RANK", "GLOBAL_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"):
        if env_name in os.environ:
            rank_text = os.environ[env_name]
            break

    return int(rank_text) == 0
 class TelemetryManager:
    """Manages telemetry collection and transmission"""
    _instance = None
    _initialized = False
    def __new__(cls):
        """
        Telemetry manager constructor. Creates the singleton instance of this class if
        it doesn't already exist.
        """
        if cls._instance is None:
            cls._instance = super(TelemetryManager, cls).__new__(cls)
            cls._instance._initialized = False
        return cls._instance
    def __init__(self):
        """Telemetry manager initializer"""
        if self._initialized:
            return
        self.enabled = self._check_telemetry_enabled()
        if self.enabled:
            self.run_id = str(uuid.uuid4())
            self.whitelist = self._load_whitelist()
            try:
                self.system_info = self._get_system_info()
            except Exception as e:  # pylint: disable=broad-exception-caught
                LOG.warning(f"Error during system info collection: {e}")
                self.system_info = None
            self._init_posthog()
            # Register shutdown method to flush posthog telemetry
            atexit.register(self.shutdown)
        self._initialized = True
    @classmethod
    def get_instance(cls) -> "TelemetryManager":
        if cls._instance is None:
            cls._instance = TelemetryManager()
        return cls._instance
    def _check_telemetry_enabled(self) -> bool:
        """
        Check if telemetry is enabled based on environment variables. We also check
        whether this is the main process (for the distributed setting and to avoid
        sending duplicate PostHog events per GPU).
        Note: This is disabled by default on an opt-in basis. Set
        `AXOLOTL_DO_NOT_TRACK=0` to enable telemetry. We plan to move to an opt-out
        model in a later release. For more details, see
        https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html.
        Returns:
            Boolean denoting whether telemetry is enabled or not.
        """
        # Parse relevant env vars
        axolotl_do_not_track = os.getenv("AXOLOTL_DO_NOT_TRACK")
        do_not_track = os.getenv("DO_NOT_TRACK")
        # Default to disabled (opt-in model for initial release)
        if axolotl_do_not_track is None or axolotl_do_not_track.lower() not in (
            "0",
            "1",
            "false",
            "true",
        ):
            # Print opt-in info message for main process only
            if is_main_process():
                LOG.warning(OPT_IN_WARNING)
            time.sleep(OPT_IN_WARNING_SLEEP_SECONDS)
            return False
        # Only rank 0 will send telemetry
        if not is_main_process():
            return False
        if do_not_track is None:
            do_not_track = "0"
        # Respect AXOLOTL_DO_NOT_TRACK, DO_NOT_TRACK if enabled
        enabled = axolotl_do_not_track.lower() not in (
            "1",
            "true",
        ) and do_not_track.lower() not in ("1", "true")
        return enabled
    def _load_whitelist(self) -> dict:
        """Load HuggingFace Hub organization whitelist"""
        with open(WHITELIST_PATH, encoding="utf-8") as f:
            whitelist = yaml.safe_load(f)
            # Send org strings to lowercase since model names are case insensitive
            whitelist["organizations"] = {
                org.lower() for org in whitelist["organizations"]
            }
            return whitelist
    def _is_whitelisted(self, value: str) -> bool:
        """
        Check if model / dataset / etc. org is in whitelist.
        Args:
            value: Value for one of `axolotl.telemetry.manager.FIELDS_WITH_ORGS`
                ("base_model", etc.).
        Returns:
            Boolean indicating whitelist membership.
        """
        # NOTE: This membership-checking logic can be improved.
        # What happens when a local model path matches a whitelisted org?
        parts = value.split("/")
        if len(parts) < 2:
            return False
        org = parts[0]
        whitelisted = org.lower() in self.whitelist["organizations"]
        return whitelisted
    def _init_posthog(self):
        """Initialize PostHog client"""
        posthog.host = POSTHOG_HOST
        posthog.project_api_key = POSTHOG_WRITE_KEY
    def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]:
        """
        Redact properties to remove any paths, so as to avoid inadvertently collecting
        private or personally identifiable information (PII). We also remove
        information related to Wandb, MLflow, etc. configuration.
        Args:
            properties: Dictionary of properties to redact.
        Returns:
            Properties dictionary with redaction applied.
        """
        if not properties:
            return {}
        def redact_value(value: Any, key: str = "") -> Any:
            """Recursively sanitize values, redacting those with path-like keys"""
            if isinstance(key, str) and isinstance(value, str):
                # Other redaction special cases
                if (
                    key in FIELDS_TO_REDACT
                    or any(prefix in key for prefix in PREFIXES_TO_REDACT)
                    or any(indicator in key.lower() for indicator in PATH_INDICATORS)
                ):
                    # Fields with whitelisted orgs don't need to be redacted
                    if not self._is_whitelisted(value):
                        return "[REDACTED]"
            # Handle nested values
            if isinstance(value, dict):
                return {k: redact_value(v, k) for k, v in value.items()}
            if isinstance(value, list):
                return [redact_value(item) for item in value]
            return value
        # Create new dict with redacted values
        redacted = {k: redact_value(v, k) for k, v in properties.items()}
        return redacted
    def _get_system_info(self) -> dict[str, Any]:
        """Collect system information for various hardware accelerators"""
        gpu_info = []
        accelerator_type = "none"
        # NVIDIA GPUs
        if torch.cuda.is_available():
            accelerator_type = "cuda"
            for i in range(torch.cuda.device_count()):
                gpu_info.append(
                    {
                        "name": torch.cuda.get_device_name(i),
                        "memory": torch.cuda.get_device_properties(i).total_memory,
                    }
                )
        # AMD GPUs
        elif hasattr(torch, "hip") and torch.hip.is_available():
            accelerator_type = "hip"
            for i in range(torch.hip.device_count()):
                gpu_info.append(
                    {
                        "name": torch.hip.get_device_name(i),
                        "memory": (
                            torch.hip.get_device_properties(i).total_memory
                            if hasattr(torch.hip, "get_device_properties")
                            else None
                        ),
                    }
                )
        # Apple Silicon
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            accelerator_type = "mps"
            gpu_info.append(
                {
                    "name": "Apple Silicon",
                    # NOTE: this is memory allocated to this process, not total memory
                    "memory": torch.mps.driver_allocated_memory(),
                }
            )
        # Intel GPUs
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            accelerator_type = "xpu"
            for i in range(torch.xpu.device_count()):
                memory = None
                if hasattr(torch.xpu, "get_device_properties"):
                    memory = torch.xpu.get_device_properties(i).total_memory
                gpu_info.append(
                    {
                        "name": torch.xpu.get_device_name(i),
                        "memory": memory,
                    }
                )
        # NPUs
        elif hasattr(torch, "npu") and torch.npu.is_available():
            accelerator_type = "npu"
            for i in range(torch.npu.device_count()):
                memory = None
                if hasattr(torch.npu, "get_device_properties"):
                    memory = torch.npu.get_device_properties(i).total_memory
                gpu_info.append(
                    {
                        "name": torch.npu.get_device_name(i),
                        "memory": memory,
                    }
                )
        # Get relevant package versions
        installed_packages = {}
        for package in RELEVANT_PACKAGES:
            try:
                version = importlib.metadata.version(package)
                installed_packages[f"{package}_version"] = version
            except importlib.metadata.PackageNotFoundError:
                pass
        return {
            "os": platform.system(),
            "python_version": platform.python_version(),
            "cpu_count": psutil.cpu_count(),
            "memory_total": psutil.virtual_memory().total,
            "accelerator_type": accelerator_type,
            "accelerator_count": len(gpu_info),
            "accelerator_info": gpu_info,
            **installed_packages,
        }
    def send_event(self, event_type: str, properties: dict[str, Any] | None = None):
        """Send a telemetry event"""
        if not self.enabled:
            return
        if properties is None:
            properties = {}
        # Sanitize properties to remove PII
        properties = self._redact_paths(properties)
        # Wrap PostHog errors in try / except to not raise errors during Axolotl usage
        try:
            # Send event via PostHog
            posthog.capture(
                distinct_id=self.run_id,
                event=event_type,
                properties=properties,
                disable_geoip=True,
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            LOG.warning(f"Failed to send telemetry event: {e}")
        # Additionally, send system info telemetry when loading config.
        # NOTE: Is this the best place for this?
        if event_type == "config-loaded":
            self.send_system_info()
    def send_system_info(self):
        """Helper method for sending system info"""
        if self.system_info is not None:
            self.send_event(event_type="system-info", properties=self.system_info)
    def shutdown(self):
        """Ensure all queued events are processed before shutdown"""
        if self.enabled:
            posthog.flush()
--- a/src/axolotl/telemetry/runtime_metrics.py
+++ b/src/axolotl/telemetry/runtime_metrics.py
@@ -0,0 +1,209 @@
 """Telemetry utilities for runtime and memory metrics."""
 import logging
 import time
 from dataclasses import dataclass, field
 from typing import Any
 import psutil
 import torch
 from axolotl.telemetry.manager import TelemetryManager
 LOG = logging.getLogger(__name__)
@dataclass
 class RuntimeMetrics:
    """Container for runtime metrics to be tracked throughout training."""
    # Timing metrics
    start_time: float
    epoch_start_times: dict[int, float] = field(init=False)
    epoch_end_times: dict[int, float] = field(init=False)
    # Memory metrics
    peak_cpu_memory: int = 0
    peak_gpu_memory: dict[int, int] = field(init=False)
    # Progress metrics
    total_steps: int = 0
    current_epoch: int = 0
    current_step: int = 0
    def __post_init__(self):
        """Initialize empty metric mappings."""
        self.epoch_start_times = {}
        self.epoch_end_times = {}
        self.peak_gpu_memory = {}
    @property
    def elapsed_time(self) -> float:
        """Calculate total elapsed time in seconds."""
        return time.time() - self.start_time
    def epoch_time(self, epoch: int) -> float | None:
        """Calculate time taken for a specific epoch in seconds."""
        if epoch in self.epoch_start_times and epoch in self.epoch_end_times:
            return self.epoch_end_times[epoch] - self.epoch_start_times[epoch]
        return None
    def average_epoch_time(self) -> float | None:
        """Calculate average time per epoch in seconds."""
        completed_epochs = [
            epoch for epoch in self.epoch_start_times if epoch in self.epoch_end_times
        ]
        if not completed_epochs:
            return None
        total_time = 0.0
        for epoch in completed_epochs:
            epoch_time = self.epoch_time(epoch)
            if epoch_time is not None:  # Check to avoid mypy warning
                total_time += epoch_time
        return total_time / len(completed_epochs)
    def steps_per_second(self) -> float | None:
        """Calculate average steps per second across all training."""
        if self.total_steps == 0 or self.elapsed_time == 0:
            return None
        return self.total_steps / self.elapsed_time
    def to_dict(self) -> dict[str, Any]:
        """Convert metrics to a dictionary for telemetry reporting."""
        metrics = {
            "total_time_seconds": self.elapsed_time,
            "total_steps": self.total_steps,
            "steps_per_second": self.steps_per_second(),
            "epochs_completed": len(
                [
                    epoch
                    for epoch in self.epoch_start_times
                    if epoch in self.epoch_end_times
                ]
            ),
            "peak_cpu_memory_bytes": self.peak_cpu_memory,
        }
        # Add per-epoch timing if available
        epoch_times: dict[str, float] = {}
        for epoch in sorted(self.epoch_end_times.keys()):
            time_taken = self.epoch_time(epoch)
            if time_taken is not None:
                epoch_times[f"epoch_{epoch}_seconds"] = time_taken
        if epoch_times:
            metrics["epoch_times"] = epoch_times  # type: ignore
            metrics["average_epoch_time_seconds"] = self.average_epoch_time()
        # Add GPU memory metrics if available
        if self.peak_gpu_memory:
            gpu_metrics: dict[str, int] = {}
            for gpu_id, memory in self.peak_gpu_memory.items():
                gpu_metrics[f"gpu_{gpu_id}_peak_memory_bytes"] = memory
            metrics["gpu_memory"] = gpu_metrics  # type: ignore
        return metrics
 class RuntimeMetricsTracker:
    """Records epoch timing, step counts, and peak memory usage while training runs."""

    # Memory is re-sampled whenever the step number is a multiple of this value
    update_interval = 100

    def __init__(self):
        """Start the clock and fetch the telemetry manager singleton."""
        self.metrics = RuntimeMetrics(start_time=time.time())
        self.telemetry_manager = TelemetryManager.get_instance()

    def start_epoch(self, epoch: int):
        """Mark `epoch` as current, stamp its start time, and sample memory."""
        metrics = self.metrics
        metrics.current_epoch = epoch
        metrics.epoch_start_times[epoch] = time.time()
        self.update_memory_metrics()

    def end_epoch(self, epoch: int):
        """Stamp the end time of `epoch`."""
        self.metrics.epoch_end_times[epoch] = time.time()

    def update_step(self, step: int):
        """Record `step` as current and count one more completed step."""
        metrics = self.metrics
        metrics.current_step = step
        metrics.total_steps += 1

        # Periodically update memory metrics
        if step % self.update_interval != 0:
            return
        self.update_memory_metrics()

    def _get_allocated_memory(self) -> dict[int, int]:
        """
        Helper function for getting accelerator-agnostic allocated memory.

        Only the first available backend is queried, in the order CUDA, HIP, MPS,
        XPU, NPU.

        Returns:
            A dictionary mapping device IDs to allocated memory in bytes
        """
        # NVIDIA GPUs
        if torch.cuda.is_available():
            return {
                index: torch.cuda.memory_allocated(index)
                for index in range(torch.cuda.device_count())
            }

        # AMD GPUs
        if hasattr(torch, "hip") and torch.hip.is_available():
            return {
                index: torch.hip.memory_allocated(index)
                for index in range(torch.hip.device_count())
                if hasattr(torch.hip, "memory_allocated")
            }

        # Apple Silicon
        if hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            # MPS doesn't have per-device memory stats since there's only one device
            if hasattr(torch.mps, "current_allocated_memory"):
                return {0: torch.mps.current_allocated_memory()}
            return {}

        # Intel GPUs
        if hasattr(torch, "xpu") and torch.xpu.is_available():
            return {
                index: torch.xpu.memory_allocated(index)
                for index in range(torch.xpu.device_count())
                if hasattr(torch.xpu, "memory_allocated")
            }

        # NPUs
        if hasattr(torch, "npu") and torch.npu.is_available():
            return {
                index: torch.npu.memory_allocated(index)
                for index in range(torch.npu.device_count())
                if hasattr(torch.npu, "memory_allocated")
            }

        return {}

    def update_memory_metrics(self):
        """Raise the recorded CPU / accelerator peaks to the current usage if it is higher."""
        metrics = self.metrics

        # CPU memory (resident set size of this process)
        rss = psutil.Process().memory_info().rss
        metrics.peak_cpu_memory = max(metrics.peak_cpu_memory, rss)

        # GPU memory (if available)
        peaks = metrics.peak_gpu_memory
        for device, allocated in self._get_allocated_memory().items():
            previous = peaks.get(device, 0)
            peaks[device] = max(previous, allocated)

    def get_memory_metrics(self) -> dict[str, Any]:
        """Return current and peak memory figures as a flat dictionary."""
        memory_metrics = {
            "cpu_memory_bytes": psutil.Process().memory_info().rss,
            "peak_cpu_memory_bytes": self.metrics.peak_cpu_memory,
        }

        # GPU memory (if available)
        for device, allocated in self._get_allocated_memory().items():
            memory_metrics.update(
                {
                    f"gpu_{device}_memory_bytes": allocated,
                    f"gpu_{device}_peak_memory_bytes": (
                        self.metrics.peak_gpu_memory.get(device, 0)
                    ),
                }
            )

        return memory_metrics
--- a/src/axolotl/telemetry/whitelist.yaml
+++ b/src/axolotl/telemetry/whitelist.yaml
@@ -0,0 +1,17 @@
 organizations:
  - "axolotl-ai-co"
  - "meta-llama"
  - "huggingface"
  - "nvidia"
  - "facebook"
  - "google"
  - "microsoft"
  - "deepseek-ai"
  - "HuggingFaceTB"
  - "mistralai"
  - "Qwen"
  - "unsloth"
  - "NousResearch"
  - "allenai"
  - "amd"
  - "tiiuae"
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -1,13 +1,10 @@
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
 from __future__ import annotations
 import importlib
 import inspect
 import os
 import signal
 import sys
 import typing
 import weakref
 from contextlib import ExitStack
 from pathlib import Path
@@ -28,12 +25,15 @@ from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
 )
 from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.integrations.base import PluginManager
 from axolotl.loaders import (
    ModelLoader,
    load_processor,
    load_tokenizer,
 )
 from axolotl.telemetry.errors import send_errors
 from axolotl.telemetry.manager import TelemetryManager
 from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
@@ -47,19 +47,19 @@ try:
 except ImportError:
    BetterTransformer = None
 if typing.TYPE_CHECKING:
    from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
 LOG = get_logger(__name__)
 TELEMETRY_MANAGER = TelemetryManager.get_instance()
 PLUGIN_MANAGER = PluginManager.get_instance()
 def setup_model_and_tokenizer(
    cfg: DictDefault,
 ) -> tuple[
    PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
 ]:
-    """Load the tokenizer, processor (for multimodal models), and model based on
+    """
-    configuration.
+    Load the tokenizer, processor (for multimodal models), and model based on configuration.
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
@@ -69,7 +69,10 @@ def setup_model_and_tokenizer(
            `None`), and processor (if multimodal, else `None`).
    """
    # Load tokenizer
-    LOG.debug(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
+    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
        main_process_only=True,
    )
    tokenizer = load_tokenizer(cfg)
    # Load processor for multimodal models if needed
@@ -88,6 +91,14 @@ def setup_model_and_tokenizer(
    if model.generation_config is not None:
        model.generation_config.do_sample = True
    TELEMETRY_MANAGER.send_event(
        event_type="model-load", properties=model.config.to_dict()
    )
    if peft_config:
        TELEMETRY_MANAGER.send_event(
            event_type="peft-config-load", properties=peft_config.to_dict()
        )
    # Apply freezing if specified
    if cfg.unfrozen_parameters:
        freeze_layers_except(model, cfg.unfrozen_parameters)
@@ -477,7 +488,7 @@ def handle_untrained_tokens_fix(
 def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
-    "HFRLTrainerBuilder" | "HFCausalTrainerBuilder",
+    HFRLTrainerBuilder | HFCausalTrainerBuilder,
    PeftModel | PreTrainedModel,
    PreTrainedTokenizer,
    PeftConfig | None,
@@ -522,6 +533,7 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
        model_ref=model_ref,
        peft_config=peft_config,
    )
    PLUGIN_MANAGER.post_trainer_create(cfg, trainer)
    return (
        trainer,
@@ -532,6 +544,7 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
    )
@send_errors
 def train(
    cfg: DictDefault, dataset_meta: TrainDatasetMeta
 ) -> tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]:
@@ -556,8 +569,11 @@ def train(
        processor,
    ) = setup_model_and_trainer(cfg, dataset_meta)
-    plugin_manager = PluginManager.get_instance()
+    # Determine if we need to resume from a checkpoint
-    plugin_manager.post_trainer_create(cfg, trainer)
+    resume_from_checkpoint = determine_resume_checkpoint(cfg)
    # Configuration for saving
    safe_serialization = cfg.save_safetensors is True
    # Handle untrained tokens if configured
    safe_serialization = cfg.save_safetensors is True
@@ -572,7 +588,6 @@ def train(
    setup_model_card(cfg)
    # Execute the training
    resume_from_checkpoint = determine_resume_checkpoint(cfg)
    execute_training(cfg, trainer, resume_from_checkpoint)
    # Save the trained model and cleanup
@@ -580,7 +595,6 @@ def train(
    create_model_card(cfg, trainer)
    if not cfg.use_ray:
        cleanup_distributed()
-
+    PLUGIN_MANAGER.post_train(cfg, model)
    plugin_manager.post_train(cfg, model)
    return model, tokenizer, trainer
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -52,10 +52,3 @@ def patch_optimized_env():
    if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
        os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
    set_pytorch_cuda_alloc_conf()
 def get_not_null(value, default=None):
    """
    Return `value` unless it is `None`, in which case return `default`.

    Only `None` triggers the fallback; other falsy values (0, "", [], False) are
    returned unchanged.
    """
    if value is None:
        return default
    return value
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -53,6 +53,25 @@ IGNORE_INDEX = -100
 LOG = get_logger(__name__)
 class EvalFirstStepCallback(
    TrainerCallback
 ):  # pylint: disable=too-few-public-methods disable=unused-argument
    """
    Callback that requests an evaluation right after the first training step.
    """

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """
        Set `control.should_evaluate` when evaluation is step-based and the step that
        just finished was global step 1; always return `control`.
        """
        step_based_eval = args.eval_strategy == IntervalStrategy.STEPS
        if step_based_eval and state.global_step == 1:
            control.should_evaluate = True
        return control
 class SaveBetterTransformerModelCallback(
    TrainerCallback
 ):  # pylint: disable=too-few-public-methods
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
--- a/src/axolotl/utils/collators/batching.py
+++ b/src/axolotl/utils/collators/batching.py
@@ -1,7 +1,7 @@
 """Data collators for axolotl to pad labels and position_ids for packed sequences"""
 from dataclasses import dataclass
-from typing import Any, List
+from typing import Any
 import numpy as np
 from transformers import PreTrainedTokenizerBase
@@ -81,11 +81,9 @@ class DataCollatorForSeq2Seq:
                padding_side = self.tokenizer.padding_side
                for feature in features:
-                    remainder_len = max_feature_length - len(feature[feature_name])
+                    remainder = [pad_token_id] * (
-                    if feature_name == "position_ids":
+                        max_feature_length - len(feature[feature_name])
-                        remainder = list(range(remainder_len))
+                    )
                    else:
                        remainder = [pad_token_id] * remainder_len
                    if isinstance(feature[feature_name], list):
                        feature[feature_name] = (
                            feature[feature_name] + remainder
@@ -163,7 +161,7 @@ class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    def __call__(self, features, return_tensors=None):
        if not isinstance(features[0], list):
-            features: List[List[dict]] = [features]
+            features = [features]
        out_features = [{} for _ in features]
        for i, features_ in enumerate(features):
            for feature in features_[0].keys():
--- a/src/axolotl/utils/data/init.py
+++ b/src/axolotl/utils/data/init.py
@@ -1,21 +1,16 @@
-"""Init for `axolotl.utils.data` module."""
+"""
 Data processing modules
 """
-from axolotl.utils.data.pretraining import (
+from axolotl.utils.data.pretraining import (  # noqa: F401
    encode_pretraining,
    wrap_pretraining_dataset,
 )
-from axolotl.utils.data.rl import prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_preference_datasets  # noqa: F401
-from axolotl.utils.data.sft import (
+from axolotl.utils.data.sft import (  # noqa: F401
    get_dataset_wrapper,
-    prepare_datasets,
+    load_prepare_datasets,
    load_tokenized_prepared_datasets,
    prepare_dataset,
 )
-from axolotl.utils.data.utils import md5
+from axolotl.utils.data.utils import md5  # noqa: F401
 __all__ = [
    "encode_pretraining",
    "wrap_pretraining_dataset",
    "prepare_preference_datasets",
    "get_dataset_wrapper",
    "prepare_datasets",
    "md5",
 ]
--- a/src/axolotl/utils/data/lock.py
+++ b/src/axolotl/utils/data/lock.py
@@ -1,66 +0,0 @@
 """Logic for loading / preparing a dataset once over all processes."""
 import time
 from pathlib import Path
 from typing import Any, Callable
 from filelock import FileLock
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.utils.dict import DictDefault
 LOCK_FILE_NAME = "datasets_prep.lock"
 READY_FILE_NAME = "datasets_ready.flag"
 PROCESS_COUNTER_FILE_NAME = "process_counter.txt"
 class FileLockLoader:
    """
    Simple class for abstracting single process data loading / processing. The first
    process that creates a lock file does the work; the remaining processes simply load
    the preprocessed dataset once the first process is done.

    Three files are kept in the prepared-dataset directory: the lock file, a "ready"
    flag touched once the first load has finished, and a counter of the processes
    that called `load` and have not yet called `cleanup`.
    """

    def __init__(self, cfg: DictDefault):
        """
        Args:
            cfg: Axolotl config; `cfg.dataset_prepared_path` is used when set,
                otherwise `DEFAULT_DATASET_PREPARED_PATH`.
        """
        self.cfg = cfg
        self.dataset_prepared_path = (
            cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
        )
        self.lock_file_path = Path(self.dataset_prepared_path) / LOCK_FILE_NAME
        self.ready_flag_path = Path(self.dataset_prepared_path) / READY_FILE_NAME
        self.counter_path = Path(self.dataset_prepared_path) / PROCESS_COUNTER_FILE_NAME

    def load(self, load_fn: Callable[[], Any]) -> Any:
        """
        Run `load_fn` under the file lock and return its result.

        Args:
            load_fn: Zero-argument callable that loads / prepares the dataset.

        Returns:
            Whatever `load_fn` returns.
        """
        with FileLock(str(self.lock_file_path)):
            self._increment_counter()

            if not self.ready_flag_path.exists():
                # First process does the work and then marks the data as ready
                result = load_fn()
                self.ready_flag_path.touch()
                return result

            # Other processes load once the flag is present. The lock is still held
            # here, so this wait only matters if the flag is removed externally.
            while not self.ready_flag_path.exists():
                time.sleep(1)
            return load_fn()

    def _increment_counter(self):
        """Safely increment the process counter."""
        if self.counter_path.exists():
            count = int(self.counter_path.read_text().strip())
        else:
            count = 0
        self.counter_path.write_text(str(count + 1))

    def cleanup(self):
        """
        Clean up ready flag when last process is done.

        Safe to call when no counter file exists (for example a repeated call after
        the last process already cleaned up): the remaining count is then treated as
        zero instead of raising `FileNotFoundError`.
        """
        with FileLock(str(self.lock_file_path)):
            if self.counter_path.exists():
                count = int(self.counter_path.read_text().strip()) - 1
            else:
                count = 0

            if count <= 0:
                # Last process cleans everything up; `<=` also keeps a negative
                # count from ever being written back.
                self.ready_flag_path.unlink(missing_ok=True)
                self.counter_path.unlink(missing_ok=True)
            else:
                # Still have active processes
                self.counter_path.write_text(str(count))
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -250,7 +250,7 @@ def encode_packed_pretraining(
    # pylint: disable=duplicate-code
    # tokenize all the examples
    # rows get split with stride (overlap)
-    train_dataset = ds_wrapper(dataset=Dataset.from_dict(examples))[0]
+    train_dataset = ds_wrapper(Dataset.from_dict(examples))[0]
    train_dataset = process_pretraining_datasets_for_packing(
        train_dataset,
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -1,117 +1,75 @@
-"""Data handling specific to RL trainers."""
+"""data handling specific to DPO"""
 import inspect
 from functools import partial
-from typing import Any, Callable, Literal
+from pathlib import Path
 from typing import Any, List, Union
-from datasets import Dataset, DatasetDict
+import yaml
-from transformers import PreTrainedTokenizer
+from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.loaders import load_tokenizer
 from axolotl.prompt_strategies.dpo import load as load_dpo
 from axolotl.prompt_strategies.kto import load as load_kto
 from axolotl.prompt_strategies.orpo import load as load_orpo
-from axolotl.utils.data.lock import FileLockLoader
+from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_config
-from axolotl.utils.data.shared import (
+from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
    create_train_validation_split,
    datasets_with_name_generator,
    generate_dataset_hash_from_config,
    load_dataset_with_config,
    load_preprocessed_dataset,
    merge_datasets,
    save_preprocessed_dataset,
    try_load_from_hub,
 )
 from axolotl.utils.data.utils import (
    deduplicate_and_log_datasets,
    retry_on_request_exceptions,
 )
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process, zero_first
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import RLType
 LOG = get_logger(__name__)
-@retry_on_request_exceptions(max_retries=3, delay=5)
+def _get_path(ds_hash, cfg):
-def prepare_preference_datasets(
+    prepared_ds_path = (
-    cfg: DictDefault, tokenizer: PreTrainedTokenizer
+        Path(cfg.dataset_prepared_path) / ds_hash
-) -> tuple[Dataset, Dataset | None]:
+        if cfg.dataset_prepared_path
-    """Load and prepare preference datasets for RL training.
+        else Path(DEFAULT_DATASET_PREPARED_PATH) / ds_hash
    )
-    Loads training and evaluation datasets, handling preprocessing, caching, and
+    return prepared_ds_path
    deduplication as configured. Uses FileLock for distributed coordination.
    Args:
        cfg: Configuration object containing dataset and training settings.
        tokenizer: Tokenizer to use for processing text.
    Returns:
        Tuple of (train_dataset, eval_dataset). eval_dataset may be None
            if no evaluation dataset is configured.
    """
    def _load_datasets():
        # Load training dataset
        train_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="train")
        # Load or create evaluation dataset
        eval_dataset: Dataset | None = None
        if cfg.test_datasets:
            eval_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="test")
        elif cfg.val_set_size:
            # Create validation split from training data
            train_dataset, eval_dataset = create_train_validation_split(
                train_dataset, cfg, cfg.val_set_size
            )
        return train_dataset, eval_dataset
    # Prepare datasets (with file locking logic for multiple ranks)
    loader = FileLockLoader(cfg)
    try:
        train_dataset, eval_dataset = loader.load(_load_datasets)
    finally:
        loader.cleanup()
    # Apply deduplication if configured
    if cfg.dataset_exact_deduplication:
        train_dataset, eval_dataset = deduplicate_and_log_datasets(
            dataset=train_dataset, other_dataset=eval_dataset
        )
    return train_dataset, eval_dataset
-def _map_dataset(
+def _load_preprocessed_ds(cfg, sub_cfg):
-    cfg: DictDefault,
+    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
-    dataset: Dataset | DatasetDict,
+    prepared_ds_path = _get_path(ds_hash, cfg)
-    ds_transform_fn: Callable[..., Any],
+    dataset = None
    tokenizer: Any | None = None,
    **map_kwargs: Any,
 ) -> Dataset:
    """Apply transformation function to dataset.
-    Args:
+    # pylint: disable=duplicate-code
-        cfg: Configuration object.
+    if (
-        dataset: Dataset to transform.
+        cfg.dataset_prepared_path
-        ds_transform_fn: Transformation function to apply.
+        and any(prepared_ds_path.glob("*"))
-        tokenizer: Optional tokenizer for transformation.
+        and not cfg.is_preprocess
-        **map_kwargs: Additional arguments for dataset mapping.
+    ):
        LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
        dataset = load_from_disk(str(prepared_ds_path))
-    Returns:
+    return dataset
-        Transformed dataset.
+
-    """
+
 def _save_preprocessed_ds(cfg, sub_cfg, dataset):
    """Save `dataset` under its config-derived hash directory during preprocessing.
    Args:
        cfg: Configuration object; `is_preprocess` gates the save and
            `dataset_prepared_path` (via `_get_path`) selects the base directory.
        sub_cfg: Dataset configuration whose YAML dump is hashed to name the
            target directory.
        dataset: Dataset to write with `save_to_disk`.
    """
    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
    prepared_ds_path = _get_path(ds_hash, cfg)
    # Only the main process writes, and only when running in preprocess mode.
    if cfg.is_preprocess and is_main_process():
        LOG.info(f"Saving prepared dataset to disk at {prepared_ds_path}...")
        dataset.save_to_disk(str(prepared_ds_path))
 def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
    sig = inspect.signature(ds_transform_fn)
    if "tokenizer" in sig.parameters:
        if not tokenizer:
            tokenizer = load_tokenizer(cfg)
        ds_transform_fn = partial(ds_transform_fn, tokenizer=tokenizer)
-    if isinstance(dataset, DatasetDict):
+    if isinstance(data_set, DatasetDict):
-        dataset = dataset["train"]
+        data_set = data_set["train"]
-    dataset = dataset.map(
+    data_set = data_set.map(
        ds_transform_fn,
        num_proc=cfg.dataset_processes,
        load_from_cache_file=not cfg.is_preprocess,
@@ -119,27 +77,13 @@ def _map_dataset(
        **map_kwargs,
    )
-    return dataset
+    return data_set
-def _drop_long_sequences(
+def drop_long_rl_seq(
-    sample: dict[str, Any], rl: RLType, tokenizer: Any, sequence_len: int
+    sample, rl, tokenizer, sequence_len  # pylint: disable=invalid-name
-) -> bool:
+):
-    """Filter out samples that exceed maximum sequence length.
+    if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO):
    Args:
        sample: Dataset sample to check.
        rl: Reinforcement learning type.
        tokenizer: Tokenizer for length calculation.
        sequence_len: Maximum allowed sequence length.
    Returns:
        True if sample should be kept, False if it should be dropped.
    Raises:
        ValueError: If required keys are missing or RL type is unknown.
    """
    if rl in {RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO}:
        if not (
            sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
        ):
@@ -179,115 +123,132 @@ def _drop_long_sequences(
    raise ValueError("Unknown RL type")
-def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
+def load_prepare_preference_datasets(cfg):
-    """Load and process dataset split for RL training.
+    def load_split(dataset_cfgs, _cfg):
        split_datasets: List[Any] = []
        use_auth_token = _cfg.hf_use_auth_token
        for config_dataset in datasets_w_name_generator(dataset_cfgs):
            ds: Union[Dataset, DatasetDict] = load_dataset_w_config(
                config_dataset, use_auth_token, streaming=False
            )
            split_datasets.append(ds)
-    Args:
+        tokenizer = load_tokenizer(cfg)
        cfg: Configuration object containing dataset settings.
        split: Dataset split to load ("train" or "test").
-    Returns:
+        for i, data_set in enumerate(split_datasets):
-        Combined and processed dataset for the specified split.
+            _type = dataset_cfgs[i]["type"]
-    """
+            if _type:
-    datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets
+                if isinstance(_type, DictDefault):
-    split_datasets: list[Dataset | DatasetDict] = []
+                    _type = "user_defined.default"
                if _cfg.rl is RLType.ORPO:
                    ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i)
                elif _cfg.rl is RLType.KTO:
                    ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
                else:
                    ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
-    for dataset_config in datasets_with_name_generator(datasets_configs):
+                map_kwargs = {}
-        dataset: Dataset | DatasetDict = load_dataset_with_config(
+                if isinstance(ds_transform_fn, tuple):
-            dataset_config, cfg.hf_use_auth_token, streaming=False
+                    ds_transform_fn, map_kwargs = ds_transform_fn
-        )
+                split_datasets[i] = map_dataset(
-        split_datasets.append(dataset)
+                    cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
-
+                )
-    tokenizer = load_tokenizer(cfg)
+            elif _cfg.rl is RLType.KTO:
-
+                ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
-    for i, dataset in enumerate(split_datasets):
+                map_kwargs = {}
-        _type = datasets_configs[i]["type"]
+                if isinstance(ds_transform_fn, tuple):
-        if _type:
+                    ds_transform_fn, map_kwargs = ds_transform_fn
-            if isinstance(_type, DictDefault):
+                split_datasets[i] = map_dataset(
-                _type = "user_defined.default"
+                    cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
-            if cfg.rl is RLType.ORPO:
+                )
                ds_transform_fn = load_orpo(_type, cfg, dataset_idx=i)
            elif cfg.rl is RLType.KTO:
                ds_transform_fn = load_kto(_type, cfg, dataset_idx=i)
            else:
-                ds_transform_fn = load_dpo(_type, cfg, dataset_idx=i)
+                # If no `type` is provided, assume the dataset is already in the expected format with
                # "prompt", "chosen" and "rejected" already preprocessed
                split_datasets[i] = data_set
-            map_kwargs: dict[str, Any] = {}
+            if not cfg.skip_prepare_dataset:
-            if isinstance(ds_transform_fn, tuple):
+                drop_long = partial(
-                ds_transform_fn, map_kwargs = ds_transform_fn
+                    drop_long_rl_seq,
-            split_datasets[i] = _map_dataset(
+                    rl=_cfg.rl,
-                cfg, dataset, ds_transform_fn, tokenizer, **map_kwargs
+                    tokenizer=tokenizer,
-            )
+                    sequence_len=cfg.sequence_len,
                )
                prior_len = len(split_datasets[i])
                split_datasets[i] = split_datasets[i].filter(
                    drop_long,
                    num_proc=cfg.dataset_processes,
                    load_from_cache_file=not cfg.is_preprocess,
                    desc="Dropping Long Sequences",
                )
                dropped = prior_len - len(split_datasets[i])
                if dropped:
                    LOG.warning(
                        f"Dropped {dropped} long samples from dataset index {i}"
                    )
        combined_datasets = concatenate_datasets(split_datasets)
        combined_datasets = combined_datasets.shuffle(seed=cfg.seed or 42)
        return combined_datasets
    with zero_first(is_main_process()):
        train_is_preprocessed = False
        eval_is_preprocessed = False
        if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
            train_is_preprocessed = True
        else:
-            # If no `type` is provided, assume the dataset is already in the expected format with
+            train_dataset = load_split(cfg.datasets, cfg)
            # "prompt", "chosen", and "rejected" already preprocessed
            split_datasets[i] = dataset
-        if not cfg.skip_prepare_dataset:
+        eval_dataset = None
-            drop_long = partial(
+        if cfg.test_datasets:
-                _drop_long_sequences,
+            if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
-                rl=cfg.rl,
+                eval_is_preprocessed = True
-                tokenizer=tokenizer,
+            else:
-                sequence_len=cfg.sequence_len,
+                eval_dataset = load_split(cfg.test_datasets, cfg)
-            )
+        if not eval_dataset:
            if cfg.val_set_size:
                seed = cfg.seed if cfg.seed is not None else 42
-            prior_len = len(split_datasets[i])
+                # ensure we end up with the same fingerprint by doing rank0 first and being able to cache
-            split_datasets[i] = split_datasets[i].filter(
+                to_hash_train = (
-                drop_long,
+                    train_dataset._fingerprint  # pylint: disable=protected-access
-                num_proc=cfg.dataset_processes,
+                    + "|"
-                load_from_cache_file=not cfg.is_preprocess,
+                    + str(cfg.val_set_size)
-                desc="Dropping Long Sequences",
+                    + "|"
-            )
+                    + "train"
-            dropped = prior_len - len(split_datasets[i])
+                    + "|"
-            if dropped:
+                    + str(cfg.seed or 42)
-                LOG.warning(f"Dropped {dropped} long samples from dataset index {i}")
+                )
                to_hash_test = (
                    train_dataset._fingerprint  # pylint: disable=protected-access
                    + "|"
                    + str(cfg.val_set_size)
                    + "|"
                    + "test"
                    + "|"
                    + str(cfg.seed or 42)
                )
                train_fingerprint = md5(to_hash_train)
                test_fingerprint = md5(to_hash_test)
                ds_w_test_split = train_dataset.train_test_split(
                    test_size=cfg.val_set_size,
                    seed=seed,
                    shuffle=False,
                    train_new_fingerprint=train_fingerprint,
                    test_new_fingerprint=test_fingerprint,
                )
                eval_dataset = ds_w_test_split["test"]
                train_dataset = ds_w_test_split["train"]
-    # Merge datasets
+        if not train_is_preprocessed:
-    dataset = merge_datasets(split_datasets, cfg)
+            _save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
        if eval_dataset and not eval_is_preprocessed:
            _save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
-    if not cfg.skip_prepare_dataset:
+    if cfg.dataset_exact_deduplication:
-        # Save preprocessed dataset
+        train_dataset, eval_dataset, _ = deduplicate_and_log_datasets(
-        dataset_hash = generate_dataset_hash_from_config(
+            train_dataset=train_dataset, eval_dataset=eval_dataset
            cfg, datasets_configs, tokenizer.name_or_path
        )
        save_preprocessed_dataset(cfg, dataset, dataset_hash, split)
-    return dataset
+    return train_dataset, eval_dataset
 # pylint: disable=duplicate-code
 def _load_or_create_dataset_split(
    cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"]
 ) -> Dataset:
    """Return the dataset for `split`, reusing an existing copy when one is found.
    Sources are tried in order: `try_load_from_hub` (only when
    `cfg.push_dataset_to_hub` is set), then `load_preprocessed_dataset`, and
    finally a fresh load through `_load_split`.
    Args:
        cfg: Configuration object.
        tokenizer: Tokenizer whose `name_or_path` feeds the dataset hash.
        split: Dataset split to load; "train" uses `cfg.datasets`, anything else
            uses `cfg.test_datasets`.
    Returns:
        The dataset for the requested split.
    """
    configs_for_split = cfg.test_datasets if split != "train" else cfg.datasets
    # The hash identifies this dataset configuration for the lookups below.
    cache_key = generate_dataset_hash_from_config(
        cfg, configs_for_split, tokenizer.name_or_path
    )
    found = try_load_from_hub(cfg, cache_key, split) if cfg.push_dataset_to_hub else None
    if found is None:
        found = load_preprocessed_dataset(cfg, cache_key)
    if found is not None:
        return found
    # Nothing reusable was found: load the split from its sources.
    return _load_split(cfg, split=split)
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -1,21 +1,11 @@
-"""Dataset loading shared utils."""
+"""
 dataset loading shared utils
 """
 from __future__ import annotations
 import functools
 import os
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Generator
+from typing import Optional, Union
-from datasets import (
+from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    concatenate_datasets,
    load_dataset,
    load_from_disk,
 )
 from huggingface_hub import hf_hub_download, snapshot_download
 from huggingface_hub.errors import (
    HFValidationError,
@@ -23,141 +13,78 @@ from huggingface_hub.errors import (
    RevisionNotFoundError,
 )
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
 from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 if TYPE_CHECKING:
    from adlfs import AzureBlobFileSystem
    from gcsfs import GCSFileSystem
    from ocifs import OCIFileSystem
    from s3fs import S3FileSystem
 LOG = get_logger(__name__)
 EXTENSIONS_TO_DATASET_TYPES = {
    ".parquet": "parquet",
    ".arrow": "arrow",
    ".csv": "csv",
    ".txt": "text",
 }
-def get_dataset_type(dataset_config: DictDefault) -> str:
+def get_ds_type(config_dataset: DictDefault):
-    """Get the dataset type from the path if it's not specified."""
+    """
-    if dataset_config.ds_type:
+    Get the dataset type from the path if it's not specified
-        return dataset_config.ds_type
+    """
-
+    ds_type = "json"
-    for extension, dataset_type in EXTENSIONS_TO_DATASET_TYPES.items():
+    if config_dataset.ds_type:
-        if extension in dataset_config.path:
+        ds_type = config_dataset.ds_type
-            return dataset_type
+    elif ".parquet" in config_dataset.path:
-
+        ds_type = "parquet"
-    return "json"
+    elif ".arrow" in config_dataset.path:
        ds_type = "arrow"
    elif ".csv" in config_dataset.path:
        ds_type = "csv"
    elif ".txt" in config_dataset.path:
        ds_type = "text"
    return ds_type
-def datasets_with_name_generator(
+def datasets_w_name_generator(dataset_configs: list[DictDefault]):
-    dataset_configs: list[DictDefault],
+    """
-) -> Generator[DictDefault, None, None]:
+    Yields dataset configs handling multiple names or preprocess_shards
    """Yields expanded dataset configurations based on multiple names or preprocessing
    shards.
    When a dataset config has a list of names, it yields separate configs for each
    name. When a dataset config specifies preprocessing shards, it yields configs for
    each shard.
    Args:
-        dataset_configs: List of dataset configuration objects.
+        dataset_configs: list of dataset configs (equivalent to cfg.datasets)
    Yields:
        Individual dataset configurations, expanded as needed for names or shards.
    """
-    for config in dataset_configs:
+    for dataset in dataset_configs:
-        if config.name and isinstance(config.name, list):
+        if dataset.name and isinstance(dataset.name, list):
-            for name in config.name:
+            # load_dataset doesn't properly handle multiple named configurations
-                yield DictDefault({**config, "name": name})
+            # at the same time for a given dataset
-        elif config.preprocess_shards and not config.shards:
+            for name in dataset.name:
-            for shard_idx in range(config.preprocess_shards):
+                yield DictDefault({**dataset, "name": name})
        elif dataset.preprocess_shards and not dataset.shards:
            for shard in range(dataset.preprocess_shards):
                yield DictDefault(
                    {
-                        **config,
+                        **dataset,
-                        "shards": config.preprocess_shards,
+                        "shards": dataset.preprocess_shards,
-                        "shards_idx": shard_idx,
+                        "shards_idx": shard,
                    }
                )
        else:
-            yield config
+            yield dataset
-def load_dataset_with_config(
+def load_dataset_w_config(
-    dataset_config: DictDefault, use_auth_token: bool, streaming=False
+    config_dataset: DictDefault, use_auth_token: bool, streaming=False
-) -> Dataset | IterableDataset:
+) -> Union[Dataset, DatasetDict]:
-    """Load a dataset from a config. Handles datasets that are stored locally, in the
+    """
-    HuggingFace Hub, in a remote filesystem (S3, GCS, Azure, OCI), a URL, or
+    Load a dataset from a config
    `data_files`.
    Args:
-        dataset_config: Single dataset config.
+        config_dataset: single dataset config
-        use_auth_token: Whether to use HF auth token.
+        use_auth_token: whether to use HF auth token
-        streaming: Whether to stream the dataset.
+        streaming: whether to stream the dataset
    Returns:
        Loaded dataset.
    """
-    # Set up common kwargs for dataset loading
+    # pylint: disable=invalid-name
-    load_dataset_kwargs = {
+    ds: Optional[Union[Dataset, DatasetDict]] = None  # pylint: disable=invalid-name
-        "split": dataset_config.split if dataset_config.split else None,
+    ds_from_hub = False
        "name": dataset_config.name,
        "streaming": streaming,
        "trust_remote_code": dataset_config.trust_remote_code,
    }
    # First check if it's a local path
    if Path(dataset_config.path).exists():
        return _load_from_local_path(dataset_config, load_dataset_kwargs)
    # Check if it's a HuggingFace dataset
    is_hub_dataset = _check_if_hub_dataset(dataset_config, use_auth_token)
    # Check if it's a cloud storage path and get appropriate filesystem
    remote_fs, storage_options = _get_remote_filesystem(dataset_config.path)
    is_cloud_dataset = False
    if remote_fs:
        try:
            is_cloud_dataset = remote_fs.exists(dataset_config.path)
        except (FileNotFoundError, ConnectionError):
            pass
    # Load from appropriate source
    if is_hub_dataset:
        return _load_from_hub(dataset_config, use_auth_token, load_dataset_kwargs)
    if is_cloud_dataset:
        return _load_from_cloud(
            dataset_config, remote_fs, storage_options, load_dataset_kwargs
        )
    if dataset_config.path.startswith("https://"):
        return _load_from_url(dataset_config, load_dataset_kwargs)
    if dataset_config.data_files:
        return _load_from_data_files(dataset_config, load_dataset_kwargs)
    raise ValueError(
        f"The dataset could not be loaded. This could be due to a misconfigured dataset path "
        f"({dataset_config.path}). Try double-check your path / name / data_files. "
        f"This is not caused by the dataset type."
    )
 def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) -> bool:
    """Check if a dataset exists on the HuggingFace Hub."""
    try:
        # this is just a basic check to see if the path is a
        # valid HF dataset that's loadable
        snapshot_download(
-            repo_id=dataset_config.path,
+            repo_id=config_dataset.path,
            repo_type="dataset",
            token=use_auth_token,
-            revision=dataset_config.revision,
+            revision=config_dataset.revision,
            ignore_patterns=["*"],
        )
-        return True
+        ds_from_hub = True
    except (
        RepositoryNotFoundError,
        RevisionNotFoundError,
@@ -166,373 +93,198 @@ def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) ->
        HFValidationError,
        ValueError,
    ):
-        return False
+        pass
-
+    ds_from_cloud = False
-def _get_remote_filesystem(
+    storage_options: dict = {}
-    path: str,
+    remote_file_system = None
-) -> tuple[
+    if config_dataset.path.startswith("s3://"):
    S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem | None, dict
 ]:
    """Get the appropriate filesystem for a remote path."""
    if path.startswith("s3://"):
        try:
-            import s3fs
+            import s3fs  # type: ignore
            storage_options = {"anon": False}
            return s3fs.S3FileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError("s3:// paths require s3fs to be installed") from exc
-    elif path.startswith(("gs://", "gcs://")):
+        # Reads env, credentials from ~/.aws/credentials, or IAM metadata provider
        # https://s3fs.readthedocs.io/en/latest/index.html?highlight=storage_options#credentials
        storage_options = {"anon": False}
        remote_file_system = s3fs.S3FileSystem(**storage_options)
    elif config_dataset.path.startswith("gs://") or config_dataset.path.startswith(
        "gcs://"
    ):
        try:
-            import gcsfs
+            import gcsfs  # type: ignore
            storage_options = {"token": None}  # type: ignore
            return gcsfs.GCSFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError(
                "gs:// or gcs:// paths require gcsfs to be installed"
            ) from exc
-    elif path.startswith(("adl://", "abfs://", "az://")):
+        # gcsfs will use default credentials from the environment else anon
        # https://gcsfs.readthedocs.io/en/latest/#credentials
        storage_options = {"token": None}
        remote_file_system = gcsfs.GCSFileSystem(**storage_options)
    elif (
        config_dataset.path.startswith("adl://")
        or config_dataset.path.startswith("abfs://")
        or config_dataset.path.startswith("az://")
    ):
        try:
            import adlfs
            storage_options = {"anon": False}
            return adlfs.AzureBlobFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError(
                "adl:// or abfs:// paths require adlfs to be installed"
            ) from exc
-    elif path.startswith("oci://"):
+        # # Ensure you have the following environment variables set:
        # # Gen 1
        # storage_options = {
        #     "tenant_id": AZURE_STORAGE_TENANT_ID,
        #     "client_id": AZURE_STORAGE_CLIENT_ID,
        #     "client_secret": AZURE_STORAGE_CLIENT_SECRET,
        # }
        # # Gen 2
        # storage_options = {
        #     "account_name": AZURE_STORAGE_ACCOUNT_NAME,
        #     "account_key": AZURE_STORAGE_ACCOUNT_KEY,
        # }
        # Reads env
        # https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials
        storage_options = {"anon": False}
        remote_file_system = adlfs.AzureBlobFileSystem(**storage_options)
    elif config_dataset.path.startswith("oci://"):
        try:
            import ocifs
            storage_options = {}
            return ocifs.OCIFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError("oci:// paths require ocifs to be installed") from exc
-    return None, {}
+        # https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables
        remote_file_system = ocifs.OCIFileSystem(**storage_options)
 def _load_from_local_path(
    dataset_config: DictDefault, load_dataset_kwargs: dict
 ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from a local path.
    A directory is loaded through `data_files` when they are configured,
    otherwise with `load_from_disk`, falling back to `load_dataset` on the
    directory when `load_from_disk` raises `FileNotFoundError`. A single file is
    loaded with the type reported by `get_dataset_type`.
    Args:
        dataset_config: Single dataset config; `path` must exist locally.
        load_dataset_kwargs: Keyword arguments forwarded to `load_dataset`. The
            dict is not modified.
    Returns:
        The loaded dataset.
    Raises:
        ValueError: If the path is neither a directory nor a file.
    """
    local_path = Path(dataset_config.path)
    # Streaming is switched off for the directory fallback and for single files.
    # Build a separate dict rather than assigning into the caller's kwargs.
    non_streaming_kwargs = {**load_dataset_kwargs, "streaming": False}
    if local_path.is_dir():
        if dataset_config.data_files:
            dataset_type = get_dataset_type(dataset_config)
            return load_dataset(
                dataset_type,
                data_files=dataset_config.data_files,
                **load_dataset_kwargs,
            )
        try:
            return load_from_disk(dataset_config.path)
        except FileNotFoundError:
            return load_dataset(dataset_config.path, **non_streaming_kwargs)
    if local_path.is_file():
        dataset_type = get_dataset_type(dataset_config)
        return load_dataset(
            dataset_type,
            data_files=dataset_config.path,
            **non_streaming_kwargs,
        )
    raise ValueError(
        "Unhandled dataset load: local path exists, but is neither a directory or a file"
    )
 def _load_from_hub(
    dataset_config: DictDefault, use_auth_token: bool, load_dataset_kwargs: dict
 ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from the HuggingFace Hub via `load_dataset`.
    Args:
        dataset_config: Single dataset config supplying `path`, `data_files`
            and `revision`.
        use_auth_token: Passed to `load_dataset` as `token`.
        load_dataset_kwargs: Extra keyword arguments forwarded to `load_dataset`.
    Returns:
        The loaded dataset.
    """
    repo_id = dataset_config.path
    selected_files = dataset_config.data_files
    pinned_revision = dataset_config.revision
    hub_dataset = load_dataset(
        repo_id,
        data_files=selected_files,
        token=use_auth_token,
        revision=pinned_revision,
        **load_dataset_kwargs,
    )
    return hub_dataset
 def _load_from_cloud(
    dataset_config: DictDefault,
    remote_fs: S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem,
    storage_options: dict,
    load_dataset_kwargs: dict,
 ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from cloud storage.
    Args:
        dataset_config: Single dataset config; `path` is the remote location.
        remote_fs: Filesystem object used to probe whether the path is a
            directory or a file.
        storage_options: Passed through to `load_from_disk` / `load_dataset`.
        load_dataset_kwargs: Extra keyword arguments for `load_dataset` (file
            case only).
    Returns:
        The loaded dataset.
    Raises:
        ValueError: If the remote path is neither a directory nor a file.
    """
    remote_path = dataset_config.path
    # A remote directory is loaded with `load_from_disk`.
    if remote_fs.isdir(remote_path):
        return load_from_disk(remote_path, storage_options=storage_options)
    if not remote_fs.isfile(remote_path):
        raise ValueError(
            f"Cloud path {dataset_config.path} is neither a directory nor a file"
        )
    # A single remote file is loaded by its detected dataset type.
    file_type = get_dataset_type(dataset_config)
    return load_dataset(
        file_type,
        data_files=remote_path,
        storage_options=storage_options,
        **load_dataset_kwargs,
    )
 def _load_from_url(
    dataset_config: DictDefault, load_dataset_kwargs: dict
 ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset whose `path` is a URL.
    Args:
        dataset_config: Single dataset config; `path` is passed to
            `load_dataset` as `data_files`.
        load_dataset_kwargs: Extra keyword arguments forwarded to `load_dataset`.
    Returns:
        The loaded dataset.
    """
    url = dataset_config.path
    # The builder type comes from `get_dataset_type`.
    builder = get_dataset_type(dataset_config)
    return load_dataset(builder, data_files=url, **load_dataset_kwargs)
 def _load_from_data_files(
    dataset_config: DictDefault, load_dataset_kwargs: dict
 ) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Download the configured `data_files` from a Hub dataset repo and load them.
    Args:
        dataset_config: Single dataset config; `path` is the repo id,
            `data_files` a file name or list of file names, `revision` the
            revision to download from.
        load_dataset_kwargs: Extra keyword arguments forwarded to `load_dataset`.
    Returns:
        The loaded dataset.
    Raises:
        ValueError: If `data_files` is neither a string nor a list.
    """

    def _download(filename):
        # Fetch one file from the dataset repo and return its local path.
        return hf_hub_download(
            repo_id=dataset_config.path,
            repo_type="dataset",
            filename=filename,
            revision=dataset_config.revision,
        )

    requested = dataset_config.data_files
    if isinstance(requested, str):
        local_files = _download(requested)
    elif isinstance(requested, list):
        local_files = [_download(name) for name in requested]
    else:
        raise ValueError("data_files must be either a string or list of strings")
    # NOTE(review): files are always loaded with the "json" builder here,
    # regardless of `ds_type` or file extension - confirm this is intended.
    return load_dataset("json", data_files=local_files, **load_dataset_kwargs)
 def generate_split_fingerprints(
    dataset: Dataset, val_set_size: int | float, seed: int
 ) -> tuple[str, str]:
    """Derive deterministic fingerprints for the train and test splits.
    Each fingerprint is the `md5` of the dataset's current fingerprint, the
    validation set size, the split name and the seed joined with "|".
    Args:
        dataset: Dataset about to be split.
        val_set_size: Validation set size used for the split.
        seed: Seed used for the split.
    Returns:
        Tuple of (train_fingerprint, test_fingerprint).
    """
    base = dataset._fingerprint  # pylint: disable=protected-access
    return (
        md5(f"{base}|{val_set_size}|train|{seed}"),
        md5(f"{base}|{val_set_size}|test|{seed}"),
    )
 def get_prepared_dataset_path(cfg: DictDefault, dataset_hash: str) -> Path:
     """Build the directory path used for a prepared dataset.
     Args:
         cfg: Configuration object; `dataset_prepared_path` is used when truthy,
             otherwise `DEFAULT_DATASET_PREPARED_PATH`.
         dataset_hash: Hash identifying the specific dataset configuration; it
             becomes the final path component.
     Returns:
         Path where the prepared dataset should be stored.
     """
     root = Path(cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH)
     return root.joinpath(dataset_hash)
 def create_train_validation_split(
     dataset: Dataset, cfg: DictDefault, val_set_size: int | float
 ) -> tuple[Dataset, Dataset]:
     """Create train/validation split with consistent fingerprinting.
     Args:
         dataset: Dataset to split.
         cfg: Configuration object; `seed` and `dataset_exact_deduplication`
             are read here.
         val_set_size: Size of validation set (absolute number or fraction).
     Returns:
         Tuple of (train_dataset, eval_dataset).
     """
     # Apply deduplication before splitting if configured
     if cfg.dataset_exact_deduplication:
         dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
     # Fingerprint the dataset that is actually split. Computing the fingerprints
     # before deduplication would give the deduplicated and non-deduplicated
     # splits identical fingerprints even though their rows differ.
     train_fingerprint, test_fingerprint = generate_split_fingerprints(
         dataset, val_set_size, cfg.seed
     )
     split_dataset = dataset.train_test_split(
         test_size=val_set_size,
         shuffle=False,
         seed=cfg.seed,
         train_new_fingerprint=train_fingerprint,
         test_new_fingerprint=test_fingerprint,
     )
     return split_dataset["train"], split_dataset["test"]
 def _generate_from_iterable_dataset(
    dataset: IterableDataset, worker_id: list[int], num_workers: list[int]
 ) -> Generator[Any, None, None]:
    """Generator function to correctly split the dataset for each worker"""
    for i, item in enumerate(dataset):
        if i % num_workers[0] == worker_id[0]:
            yield item
 def save_preprocessed_dataset(
     cfg: DictDefault,
     dataset: Dataset,
     dataset_hash: str,
     split: str,
 ) -> None:
     """Save preprocessed dataset to disk and optionally push to the HF Hub.
     Args:
         cfg: Configuration object; `dataset_processes` and `push_dataset_to_hub`
             are read here, and the save location is derived from it.
         dataset: Dataset to persist. An `IterableDataset` is first materialized
             into a regular `Dataset`.
         dataset_hash: Hash identifying the dataset configuration; used as the
             on-disk directory name and as the Hub config name.
         split: Split name given to the materialized dataset (only used when
             `dataset` is an `IterableDataset`).
     """
     prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)
     if isinstance(dataset, IterableDataset):
         num_workers = cfg.dataset_processes
         # Rebind `dataset` to the materialized copy so the Hub push below
         # uploads the same rows that were written to disk, not the stream.
         dataset = Dataset.from_generator(
             functools.partial(_generate_from_iterable_dataset, dataset),
             features=dataset.features,
             num_proc=num_workers,
             split=split,
             # One list entry per worker so each process receives its own shard.
             gen_kwargs={
                 "worker_id": list(range(num_workers)),
                 "num_workers": [num_workers] * num_workers,
             },
         )
     else:
         os.makedirs(prepared_ds_path, exist_ok=True)
     dataset.save_to_disk(str(prepared_ds_path))
     if cfg.push_dataset_to_hub:
         LOG.info(
             "Pushing merged prepared dataset to Huggingface hub at "
             f"{cfg.push_dataset_to_hub} (version {dataset_hash})...",
             main_process_only=False,
         )
         dataset.push_to_hub(
             cfg.push_dataset_to_hub,
             dataset_hash,
             private=True,
         )
 def load_preprocessed_dataset(cfg: DictDefault, dataset_hash: str) -> Dataset | None:
     """Return the prepared dataset stored on disk, if it may be reused.
     The cached copy is used only when `cfg.dataset_prepared_path` is set, the
     prepared directory contains at least one entry, and neither
     `cfg.skip_prepare_dataset` nor `cfg.is_preprocess` is truthy.
     Args:
         cfg: Configuration object.
         dataset_hash: Hash identifying the dataset configuration.
     Returns:
         Loaded dataset if found and conditions are met, None otherwise.
     """
     cache_dir = get_prepared_dataset_path(cfg, dataset_hash)
     cache_is_usable = (
         cfg.dataset_prepared_path
         and any(cache_dir.glob("*"))
         and not cfg.skip_prepare_dataset
         and not cfg.is_preprocess
     )
     if not cache_is_usable:
         LOG.info(
             f"Unable to find prepared dataset in {cache_dir}",
             main_process_only=False,
         )
         return None
     LOG.info(
         f"Loading prepared dataset from disk at {cache_dir}...",
         main_process_only=False,
     )
     return load_from_disk(str(cache_dir))
 def try_load_from_hub(
    cfg: DictDefault, dataset_hash: str, split: str
 ) -> Dataset | None:
    """Try to load the prepared dataset from HuggingFace Hub."""
    try:
-        LOG.info(
+        if remote_file_system and remote_file_system.exists(config_dataset.path):
-            "Attempting to load prepared dataset from HuggingFace Hub at "
+            ds_from_cloud = True
-            f"{cfg.push_dataset_to_hub} (version {dataset_hash})..."
+    except (FileNotFoundError, ConnectionError):
-        )
+        pass
        dataset = load_dataset(
            cfg.push_dataset_to_hub,
            dataset_hash,
            token=cfg.hf_use_auth_token,
        )
        return dataset[split]
    except Exception:  # pylint: disable=broad-except # nosec
        LOG.info("Unable to find prepared dataset in HuggingFace Hub")
        return None
-
+    # gather extra args from the config
-def generate_dataset_hash_from_config(
+    load_ds_kwargs = {}
-    cfg: DictDefault, cfg_datasets: list, tokenizer_name: str
+    if config_dataset.split:
-) -> str:
+        load_ds_kwargs["split"] = config_dataset.split
    """Generate a hash to uniquely identify a dataset configuration for SFT.
    Args:
        cfg: Main configuration object.
        cfg_datasets: List of dataset configurations.
        tokenizer_name: Name of the tokenizer being used.
    Returns:
        MD5 hash string representing the configuration.
    """
    config_str = (
        f"{cfg.sequence_len}@{cfg.sample_packing}@{cfg.eval_sample_packing}@"
        f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}|"
        f"{'|'.join(sorted([f'{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}' for d in cfg_datasets]))}"
        f"|{tokenizer_name}"
    )
    return str(md5(config_str))
 def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
    """Merge multiple datasets into one with optional shuffling.
    Args:
        datasets: List of datasets to merge.
        cfg: Configuration object containing shuffle settings.
    Returns:
        Merged dataset.
    """
    if len(datasets) == 1:
        return datasets[0]
    LOG.info("Merging datasets...")
    merged_dataset = concatenate_datasets(datasets)
    if cfg.shuffle_merged_datasets:
        LOG.debug("Shuffling merged datasets...")
        merged_dataset = merged_dataset.shuffle(seed=cfg.seed)
    else:
-        LOG.debug("Not shuffling merged datasets.")
+        load_ds_kwargs["split"] = None
-    return merged_dataset
+    # prefer local dataset, even if hub exists
    local_path = Path(config_dataset.path)
    if local_path.exists():
        if local_path.is_dir():
            if config_dataset.data_files:
                ds_type = get_ds_type(config_dataset)
                ds = load_dataset(  # pylint: disable=invalid-name
                    ds_type,
                    name=config_dataset.name,
                    data_files=config_dataset.data_files,
                    streaming=streaming,
                    **load_ds_kwargs,
                )
            else:
                try:
                    ds = load_from_disk(
                        config_dataset.path
                    )  # pylint: disable=invalid-name
                except FileNotFoundError:
                    ds = load_dataset(
                        config_dataset.path,
                        name=config_dataset.name,
                        streaming=False,
                        **load_ds_kwargs,
                    )
        elif local_path.is_file():
            ds_type = get_ds_type(config_dataset)
            ds = load_dataset(  # pylint: disable=invalid-name
                ds_type,
                name=config_dataset.name,
                data_files=config_dataset.path,
                streaming=False,
                **load_ds_kwargs,
            )
        else:
            raise ValueError(
                "unhandled dataset load: local path exists, but is neither a directory or a file"
            )
    elif ds_from_hub:
        ds = load_dataset(
            config_dataset.path,
            name=config_dataset.name,
            streaming=streaming,
            data_files=config_dataset.data_files,
            token=use_auth_token,
            revision=config_dataset.revision,
            trust_remote_code=config_dataset.trust_remote_code,
            **load_ds_kwargs,
        )
    elif ds_from_cloud and remote_file_system:
        if remote_file_system.isdir(config_dataset.path):
            ds = load_from_disk(
                config_dataset.path,
                storage_options=storage_options,
            )
        elif remote_file_system.isfile(config_dataset.path):
            ds_type = get_ds_type(config_dataset)
            ds = load_dataset(
                ds_type,
                name=config_dataset.name,
                data_files=config_dataset.path,
                streaming=streaming,
                storage_options=storage_options,
                trust_remote_code=config_dataset.trust_remote_code,
                **load_ds_kwargs,
            )
    elif config_dataset.path.startswith("https://"):
        ds_type = get_ds_type(config_dataset)
        ds = load_dataset(
            ds_type,
            name=config_dataset.name,
            data_files=config_dataset.path,
            streaming=streaming,
            storage_options=storage_options,
            trust_remote_code=config_dataset.trust_remote_code,
            **load_ds_kwargs,
        )
    elif config_dataset.data_files:
        fp: str | list[str] | None = None
        if isinstance(config_dataset.data_files, str):
            fp = hf_hub_download(
                repo_id=config_dataset.path,
                repo_type="dataset",
                filename=config_dataset.data_files,
                revision=config_dataset.revision,
            )
        elif isinstance(config_dataset.data_files, list):
            fp = []
            for file in config_dataset.data_files:
                fp.append(
                    hf_hub_download(
                        repo_id=config_dataset.path,
                        repo_type="dataset",
                        filename=file,
                        revision=config_dataset.revision,
                    )
                )
        else:
            raise ValueError("data_files must be either a string or list of strings")
        ds = load_dataset(
            "json",
            name=config_dataset.name,
            data_files=fp,
            streaming=streaming,
            **load_ds_kwargs,
        )
    if not ds:
        raise ValueError(
            "The dataset could not be loaded. This could be due to a misconfigured dataset path "
            f"({config_dataset.path}). Try double-check your path / name / data_files. "
            "This is not caused by the dataset type."
        )
    return ds
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -1,11 +1,9 @@
-"""Data handling helpers"""
+"""data handling helpers"""
 import contextlib
 import functools
 import hashlib
 import time
 from enum import Enum
 from typing import Callable
 import huggingface_hub
 import numpy as np
@@ -21,7 +19,9 @@ LOG = get_logger(__name__)
 class RetryStrategy(Enum):
-    """Enum for retry strategies."""
+    """
    Enum for retry strategies.
    """
    CONSTANT = 1
    LINEAR = 2
@@ -30,18 +30,7 @@ class RetryStrategy(Enum):
 def retry_on_request_exceptions(
    max_retries=3, delay=1, retry_strategy: RetryStrategy = RetryStrategy.LINEAR
-) -> Callable:
+):
    """Decorator that retries function calls on specific request exceptions.
    Args:
        max_retries: Maximum number of retry attempts.
        delay: Base delay between retries in seconds.
        retry_strategy: Strategy for calculating retry delays.
    Returns:
        Decorated function with retry logic.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):  # pylint: disable=inconsistent-return-statements
@@ -51,7 +40,6 @@ def retry_on_request_exceptions(
                except (
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.HTTPError,
                    huggingface_hub.errors.HfHubHTTPError,
                ) as exc:
                    if attempt < max_retries - 1:
@@ -71,7 +59,6 @@ def retry_on_request_exceptions(
 def md5(to_hash: str, encoding: str = "utf-8") -> str:
    """Generate MD5 hash of a string."""
    try:
        return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
    except TypeError:
@@ -79,89 +66,102 @@ def md5(to_hash: str, encoding: str = "utf-8") -> str:
 def sha256(to_hash: str, encoding: str = "utf-8") -> str:
     """Return the hexadecimal SHA256 digest of `to_hash` encoded with `encoding`."""
     digest = hashlib.sha256()
     digest.update(to_hash.encode(encoding))
     return digest.hexdigest()
-def _deduplicate_dataset(
+def deduplicate_dataset(
-    dataset: Dataset,
+    dataset: Dataset, seen_hashes: dict[str, list[int]], other_dataset: Dataset = None
-    seen_hashes: set[str] | None = None,
+) -> Dataset:
 ) -> tuple[Dataset, set[str]]:
    """Remove duplicate rows from a dataset using SHA256 hashes.
    Args:
        dataset: Dataset to deduplicate.
        seen_hashes: Set of previously seen row hashes (for cross-deduplication).
    Returns:
        Tuple of deduplicated dataset and the set of seen hashes.
    """
    if seen_hashes is None:
        seen_hashes = set()
    unique_indices = []
    for idx, row in enumerate(dataset):
        row_hash = sha256(str(row))  # Using SHA256 for collision resistance
        if row_hash not in seen_hashes:
            seen_hashes.add(row_hash)
            unique_indices.append(idx)
-    return dataset.select(unique_indices), seen_hashes
+    for idx, row in enumerate(dataset):
        row_hash = sha256(str(row))  # Using SHA256 for collision resistance.
        if row_hash not in seen_hashes:
            seen_hashes[row_hash] = [idx]
            unique_indices.append(idx)
        else:
            # Check for collision by looking up the original dataset indices
            original_indices = seen_hashes[row_hash]
            is_duplicate = False
            for original_idx in original_indices:
                if (
                    not idx == original_idx
                    and original_idx < len(dataset)
                    and str(dataset[original_idx]) == str(row)
                ):
                    is_duplicate = True
                    break
                # Check in the other dataset if provided
                if other_dataset is not None:
                    if original_idx < len(other_dataset) and str(
                        other_dataset[original_idx]
                    ) == str(row):
                        is_duplicate = True
                        break
            if not is_duplicate:
                seen_hashes[row_hash].append(idx)
                unique_indices.append(idx)
                continue
    return dataset.select(unique_indices)
 def deduplicate_and_log_datasets(
-    dataset: Dataset,
+    *,
-    other_dataset: Dataset | None = None,
+    train_dataset: Dataset = None,
-    dataset_name: str | None = "train",
+    eval_dataset: Dataset = None,
-    other_name: str | None = "eval",
+    dataset: Dataset = None,
-) -> tuple[Dataset, Dataset | None]:
+) -> tuple[Dataset, Dataset, Dataset]:
-    """Deduplicate datasets, with optional cross-dataset deduplication.
+    """
-
+    Deduplicates train, eval, and an optional dataset if provided, logging original and new sizes.
    Args:
        dataset: Primary dataset to deduplicate.
        other_dataset: Optional second dataset to deduplicate against the first.
        dataset_name: Name for the primary dataset (for logging).
        other_name: Name for the second dataset (for logging).
    Returns:
-        Tuple of (deduplicated_dataset, deduplicated_other_dataset).
+        tuple: Deduplicated train, eval, and additional datasets.
    """
-    # Deduplicate primary dataset
+    seen_hashes: dict[str, list[int]] = {}
    LOG.info(
        f"Starting deduplication for {dataset_name} dataset. Original size: {len(dataset)}"
    )
    dataset, seen_rows = _deduplicate_dataset(dataset)
    LOG.info(
        f"Deduplication complete for {dataset_name} dataset. New size: {len(dataset)}"
    )
-    # Deduplicate second dataset if provided
+    # Handle cases where datasets are None
-    if other_dataset is not None:
+    if train_dataset is not None:
        LOG.info(
-            f"Starting deduplication for {other_name} dataset. Original size: {len(other_dataset)}"
+            f"Starting deduplication for train dataset. Original size: {len(train_dataset)}"
        )
        train_dataset = deduplicate_dataset(
            dataset=train_dataset, seen_hashes=seen_hashes
        )
        other_dataset, _ = _deduplicate_dataset(other_dataset, seen_rows)
        LOG.info(
-            f"Deduplication complete for {other_name} dataset. New size: {len(other_dataset)}"
+            f"Deduplication complete for train dataset. New size: {len(train_dataset)}"
        )
    else:
        LOG.info("Train dataset is None. Skipping deduplication.")
    if eval_dataset is not None:
        LOG.info(
            f"Starting deduplication for eval dataset. Original size: {len(eval_dataset)}"
        )
        eval_dataset = deduplicate_dataset(
            dataset=eval_dataset, seen_hashes=seen_hashes, other_dataset=train_dataset
        )
        LOG.info(
            f"Deduplication complete for eval dataset. New size: {len(eval_dataset)}"
        )
    else:
        LOG.info("Eval dataset is None. Skipping deduplication.")
    if dataset is not None and (eval_dataset is None and train_dataset is None):
        LOG.info(
            f"Starting deduplication for combined dataset. Original size: {len(dataset)}"
        )
        dataset = deduplicate_dataset(dataset=dataset, seen_hashes=seen_hashes)
        LOG.info(
            f"Deduplication complete for combined dataset. New size: {len(dataset)}"
        )
-    return dataset, other_dataset
+    return train_dataset, eval_dataset, dataset
-def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault) -> Dataset:
+def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
    """Remove sequences longer than configured maximum from dataset.
    Args:
        dataset: Dataset to filter.
        cfg: Dictionary mapping `axolotl` config keys to values.
    Returns:
        Filtered dataset with long sequences removed.
    """
    if "input_ids" not in dataset.column_names:
        LOG.warning(
-            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
+            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
            "expected for reward modeling."
        )
        return dataset
@@ -171,14 +171,20 @@ def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault) -> Dataset:
        min_sequence_len=cfg.min_sample_len,
    )
-    with contextlib.suppress(AttributeError):
+    try:
        ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
        min_input_len = np.min(ds_lengths)
        LOG.info(f"min_input_len: {min_input_len}")
        max_input_len = np.max(ds_lengths)
        LOG.info(f"max_input_len: {max_input_len}")
    except AttributeError:
        pass
-    prior_len = len(dataset) if hasattr(dataset, "__len__") else None
+    try:
        prior_len = len(dataset)
    except TypeError:
        # handle iterable datasets case
        prior_len = None
    filter_map_kwargs = {}
    if not isinstance(dataset, IterableDataset):
--- a/src/axolotl/utils/data/wrappers.py
+++ b/src/axolotl/utils/data/wrappers.py
@@ -1,425 +0,0 @@
 """Data handling specific to SFT."""
 import logging
 from typing import Any, NoReturn, cast
 from datasets import (
    Dataset,
    IterableDataset,
    Sequence,
    Value,
 )
 from transformers import PreTrainedTokenizer
 from transformers.processing_utils import ProcessorMixin
 from axolotl.datasets import TokenizedPromptDataset, wrap_dataset_for_tokenized_prompt
 from axolotl.prompt_strategies import load
 from axolotl.prompt_strategies.bradley_terry import load as bradley_terry_load
 from axolotl.prompt_tokenizers import (
    AlpacaMultipleChoicePromptTokenizingStrategy,
    AlpacaPromptTokenizingStrategy,
    AlpacaReflectionPTStrategy,
    DatasetWrappingStrategy,
    GPTeacherPromptTokenizingStrategy,
    JeopardyPromptTokenizingStrategy,
    OpenAssistantPromptTokenizingStrategy,
    PromptTokenizingStrategy,
    SummarizeTLDRPromptTokenizingStrategy,
 )
 from axolotl.prompters import (
    AlpacaPrompter,
    GPTeacherPrompter,
    JeopardyPrompter,
    MultipleChoiceConcisePrompter,
    MultipleChoiceExplainPrompter,
    Prompter,
    ReflectAlpacaPrompter,
    SummarizeTLDRPrompter,
    UnsupportedPrompter,
 )
 from axolotl.utils.dict import DictDefault
 LOG = logging.getLogger(__name__)
 def handle_unknown_dataset_strategy(dataset_config: DictDefault) -> NoReturn:
     """Log and raise a `ValueError` for an unrecognized dataset strategy.
     When the configured type contains `:load_`, the message also suggests the
     `.load_` spelling.
     """
     ds_type = dataset_config.type
     hint = (
         f"Did you mean {ds_type.replace(':load_', '.load_')}?"
         if ":load_" in ds_type
         else ""
     )
     error_message = f"unhandled prompt tokenization strategy: {ds_type}. {hint}"
     LOG.error(error_message)
     raise ValueError(error_message)
 # pylint: disable=too-many-return-statements
 def get_dataset_wrapper(
     dataset_config: DictDefault,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset_base_type: str | None,
     dataset: Dataset | IterableDataset,
     dataset_prompt_style: str | None = None,
     processor: ProcessorMixin | None = None,  # pylint: disable=unused-argument
 ) -> tuple[Dataset | IterableDataset, Prompter | None]:
     """Pick the wrapper and prompter matching a dataset's configuration.
     The checks run in a fixed order and the first match wins: already
     tokenized dataset, custom (dict) type, `cfg.skip_prepare_dataset`,
     `bradley_terry*` type, `stepwise_supervised*` type, a strategy found by
     `load`, and finally the `DATASET_HANDLERS` table keyed by base type.
     Args:
         dataset_config: Configuration for the dataset.
         tokenizer: Tokenizer to use for processing text.
         cfg: Global configuration object.
         dataset_base_type: The base type of the dataset.
         dataset: The actual dataset object.
         dataset_prompt_style: Optional prompt style specification.
         processor: Optional processor, forwarded to the strategy loader.
     Returns:
         tuple of (dataset_wrapper, dataset_prompter).
     Raises:
         ValueError: If no strategy handles the configured dataset type.
     """
     # Keyword arguments shared by every wrapping call below
     dataset_kwargs: dict[str, Any] = dict(
         process_count=cfg.dataset_processes,
         keep_in_memory=cfg.dataset_keep_in_memory is True,
     )
     LOG.info(
         f"Loading dataset: {dataset_config['path']} with base_type: "
         f"{dataset_base_type} and prompt_style: {dataset_prompt_style}"
     )
     # Nothing to do when the dataset already carries tokenized columns
     if _is_dataset_already_tokenized(dataset):
         return dataset, UnsupportedPrompter()
     ds_type = dataset_config.type
     # A dict-valued type is a user-defined strategy
     if isinstance(ds_type, DictDefault):
         return _handle_custom_dataset_type(
             dataset_config, tokenizer, cfg, dataset, dataset_kwargs
         )
     # Preparation explicitly disabled: hand the dataset back untouched
     if cfg.skip_prepare_dataset:
         return dataset, None
     # Type prefixes with dedicated handlers, tried in this order
     prefix_handlers = (
         ("bradley_terry", _handle_bradley_terry_dataset),
         ("stepwise_supervised", _handle_stepwise_supervised_dataset),
     )
     for prefix, prefix_handler in prefix_handlers:
         if ds_type.startswith(prefix):
             return prefix_handler(
                 dataset_config, tokenizer, cfg, dataset, dataset_kwargs
             )
     # Look for a prompt tokenizer / dataset wrapper strategy in the registry
     loaded_strategy = load(
         ds_type, tokenizer, cfg, dataset_config, processor=processor
     )
     if loaded_strategy:
         return _handle_loaded_strategy(loaded_strategy, dataset, dataset_kwargs)
     # Fall back to the handlers keyed by base type
     base_handler = DATASET_HANDLERS.get(dataset_base_type)
     if base_handler is not None:
         return base_handler(
             dataset_prompt_style, tokenizer, cfg, dataset, dataset_kwargs
         )
     # Nothing matched
     handle_unknown_dataset_strategy(dataset_config)
 def _is_dataset_already_tokenized(dataset: Dataset | IterableDataset) -> bool:
     """Tell whether `dataset` is a `Dataset` with all tokenized columns present.
     The columns checked are `input_ids`, `attention_mask` and `labels`.
     """
     if not isinstance(dataset, Dataset):
         return False
     columns = dataset.features
     return all(
         column in columns for column in ("input_ids", "attention_mask", "labels")
     )
 def _handle_custom_dataset_type(
     dataset_config: DictDefault,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Wrap a dataset whose type is a custom definition given in the config.
     The definition (`dataset_config.type`) is converted to a plain dict and
     loaded through the `user_defined` strategy.
     """
     strategy_spec = dataset_config.type.to_dict()
     tokenizing_strategy = cast(
         PromptTokenizingStrategy,
         load("user_defined", tokenizer, cfg, strategy_spec),
     )
     prompter = UnsupportedPrompter()
     wrapped = wrap_dataset_for_tokenized_prompt(
         tokenizing_strategy, dataset, **dataset_kwargs
     )
     return wrapped, prompter
 def _handle_bradley_terry_dataset(
     dataset_config: DictDefault,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter | None]:
     """Handle a Bradley-Terry dataset.
     The strategy name is the part of `dataset_config.type` after the first
     `.`.
     Args:
         dataset_config: Configuration for the dataset; `type` is parsed here.
         tokenizer: Tokenizer passed to the strategy loader.
         cfg: Global configuration object.
         dataset: Dataset to wrap.
         dataset_kwargs: Keyword arguments forwarded to the dataset wrapper.
     Returns:
         tuple of (dataset_wrapper, dataset_prompter).
     Raises:
         ValueError: If the type has no `.suffix` part or no strategy could be
             loaded for it.
     """
     _, separator, bt_type = dataset_config.type.partition(".")
     if not separator:
         # A type without a `.suffix` used to fail with a bare IndexError;
         # report it through the standard unknown-strategy error instead.
         handle_unknown_dataset_strategy(dataset_config)
     dataset_strategy = bradley_terry_load(bt_type, tokenizer, cfg, dataset_config)
     if not dataset_strategy:
         handle_unknown_dataset_strategy(dataset_config)
     dataset_prompter = UnsupportedPrompter()
     dataset_wrapper = wrap_dataset_for_tokenized_prompt(
         dataset_strategy,
         dataset,
         **dataset_kwargs,
     )
     return dataset_wrapper, dataset_prompter
 def _handle_stepwise_supervised_dataset(
     dataset_config: DictDefault,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Wrap a stepwise supervised dataset in a `TokenizedPromptDataset`."""
     prompter = UnsupportedPrompter()
     strategy = load(dataset_config.type, tokenizer, cfg, dataset_config)
     # Boolean labels are cast to int64 explicitly for compatibility with
     # how trl's PRMTrainer works; only a regular `Dataset` is cast.
     source = (
         dataset.cast_column("labels", Sequence(Value("int64")))
         if isinstance(dataset, Dataset)
         else dataset
     )
     return TokenizedPromptDataset(strategy, source, **dataset_kwargs), prompter
 def _handle_loaded_strategy(
     dataset_strategy: PromptTokenizingStrategy | DatasetWrappingStrategy,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter | None]:
     """Wrap a dataset using a strategy that was loaded from the registry.
     A `DatasetWrappingStrategy` wraps the dataset itself and yields no
     prompter; any other strategy goes through
     `wrap_dataset_for_tokenized_prompt` with an `UnsupportedPrompter`.
     """
     if not isinstance(dataset_strategy, DatasetWrappingStrategy):
         prompter = UnsupportedPrompter()
         wrapped = wrap_dataset_for_tokenized_prompt(
             dataset_strategy, dataset, **dataset_kwargs
         )
         return wrapped, prompter
     return dataset_strategy.wrap_dataset(dataset, **dataset_kwargs), None
 def _wrap_with_prompter_strategy(
     dataset_prompter: Prompter,
     strategy_cls: type[PromptTokenizingStrategy],
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Build `strategy_cls` around `dataset_prompter` and wrap `dataset` with it.
     Shared implementation of the prompt-style handlers below, which differ only
     in the prompter and tokenizing strategy classes they use.
     Args:
         dataset_prompter: Prompter instance handed to the strategy.
         strategy_cls: Tokenizing strategy class to instantiate.
         tokenizer: Tokenizer handed to the strategy.
         cfg: Global configuration; `train_on_inputs` and `sequence_len` are read.
         dataset: Dataset to wrap.
         dataset_kwargs: Keyword arguments forwarded to the dataset wrapper.
     Returns:
         tuple of (dataset_wrapper, dataset_prompter).
     """
     dataset_strategy = strategy_cls(
         dataset_prompter,
         tokenizer,
         cfg.train_on_inputs,
         cfg.sequence_len,
     )
     dataset_wrapper = wrap_dataset_for_tokenized_prompt(
         dataset_strategy,
         dataset,
         **dataset_kwargs,
     )
     return dataset_wrapper, dataset_prompter
 def _handle_alpaca_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle an Alpaca dataset."""
     return _wrap_with_prompter_strategy(
         AlpacaPrompter(dataset_prompt_style),
         AlpacaPromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_explainchoice_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle an ExplainChoice dataset."""
     return _wrap_with_prompter_strategy(
         MultipleChoiceExplainPrompter(dataset_prompt_style),
         AlpacaMultipleChoicePromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_concisechoice_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle a ConciseChoice dataset."""
     return _wrap_with_prompter_strategy(
         MultipleChoiceConcisePrompter(dataset_prompt_style),
         AlpacaMultipleChoicePromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_summarizetldr_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle a SummarizeTLDR dataset."""
     return _wrap_with_prompter_strategy(
         SummarizeTLDRPrompter(dataset_prompt_style),
         SummarizeTLDRPromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_jeopardy_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle a Jeopardy dataset."""
     return _wrap_with_prompter_strategy(
         JeopardyPrompter(dataset_prompt_style),
         JeopardyPromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_oasst_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle an OpenAssistant dataset."""
     # OpenAssistant data is tokenized with its own strategy but uses the
     # Alpaca prompter.
     return _wrap_with_prompter_strategy(
         AlpacaPrompter(dataset_prompt_style),
         OpenAssistantPromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_gpteacher_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle a GPTeacher dataset."""
     return _wrap_with_prompter_strategy(
         GPTeacherPrompter(dataset_prompt_style),
         GPTeacherPromptTokenizingStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 def _handle_reflection_dataset(
     dataset_prompt_style: str | None,
     tokenizer: PreTrainedTokenizer,
     cfg: DictDefault,
     dataset: Dataset | IterableDataset,
     dataset_kwargs: dict[str, Any],
 ) -> tuple[Dataset | IterableDataset, Prompter]:
     """Handle a Reflection dataset."""
     return _wrap_with_prompter_strategy(
         ReflectAlpacaPrompter(dataset_prompt_style),
         AlpacaReflectionPTStrategy,
         tokenizer,
         cfg,
         dataset,
         dataset_kwargs,
     )
 # Maps a dataset base type name to the handler that builds its wrapper and
 # prompter. `get_dataset_wrapper` consults this table only after the earlier
 # checks and the registry lookup found no strategy.
 DATASET_HANDLERS = {
     "alpaca": _handle_alpaca_dataset,
     "explainchoice": _handle_explainchoice_dataset,
     "concisechoice": _handle_concisechoice_dataset,
     "summarizetldr": _handle_summarizetldr_dataset,
     "jeopardy": _handle_jeopardy_dataset,
     "oasst": _handle_oasst_dataset,
     "gpteacher": _handle_gpteacher_dataset,
     "reflection": _handle_reflection_dataset,
 }
--- a/src/axolotl/utils/mistral_tokenizer.py
+++ b/src/axolotl/utils/mistral_tokenizer.py
@@ -1,567 +0,0 @@
 """Wrapper for MistralTokenizer from mistral-common"""
 import math
 import os
 from shutil import copyfile
 from typing import TYPE_CHECKING, Optional
 import numpy as np
 from huggingface_hub import hf_hub_download
 from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
 from mistral_common.tokens.tokenizers.tekken import Tekkenizer
 from torch import Tensor
 from transformers.utils import PaddingStrategy
 from axolotl.utils.collators.core import IGNORE_INDEX
 if TYPE_CHECKING:
    from mistral_common.protocol.instruct.request import ChatCompletionRequest
 def _get_file_path(path_or_repo_id: str, filename: str) -> str:
    """Get the file path from local or HF Hub"""
    if os.path.exists(path_or_repo_id):
        maybe_file_path = os.path.join(path_or_repo_id, filename)
        if os.path.exists(maybe_file_path):
            return maybe_file_path
        raise FileNotFoundError(f"File not found at {path_or_repo_id}")
    return hf_hub_download(repo_id=path_or_repo_id, filename=filename)
 class HFMistralTokenizer:
    """
    Wraps mistral_common.tokens.tokenizers.mistral.MistralTokenizer
    and exposes HuggingFace API for special tokens.
    """
    def __init__(
        self, mistral: MistralTokenizer, name_or_path: str, tokenizer_path: str
    ):
        """
        Args:
            mistral: The mistral-common tokenizer to wrap.
            name_or_path: The name or path to the tokenizer files or the repo id.
        """
        self._mistral = mistral
        self._padding_side = "right"
        self._name_or_path = name_or_path
        self._tokenizer_path = tokenizer_path
        # Manual set to training mode
        from mistral_common.protocol.instruct.validator import (
            MistralRequestValidator,
            ValidationMode,
        )
        # Check if MistralRequestValidator has a _mode attribute.
        # This is a private API and may change in the future.
        # pylint: disable=protected-access
        if not (
            hasattr(self._mistral, "_chat_completion_request_validator")
            and isinstance(
                self._mistral._chat_completion_request_validator,
                MistralRequestValidator,
            )
            and hasattr(self._mistral._chat_completion_request_validator, "_mode")
        ):
            raise RuntimeError(
                "Unable to switch mistral tokenizer to finetuning mode – "
                "private API `_chat_completion_request_validator._mode` missing."
            )
        self._mistral._chat_completion_request_validator._mode = (
            ValidationMode.finetuning
        )
    def _load_system_prompt(self, path_or_repo_id: str) -> str:
        """Load system prompt from local or HF Hub.
        Note: Unused for now as we don't want to explicitly set the system prompt if a user does
        not provide one.
        Args:
            path_or_repo_id: The path to the tokenizer files or the repo id.
        Returns:
            The system prompt.
        """
        file_path = _get_file_path(path_or_repo_id, "SYSTEM_PROMPT.txt")
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"System prompt file not found at {file_path}")
        with open(file_path, "r", encoding="utf-8") as file:
            return file.read()
    @property
    def bos_token_id(self) -> int:
        return self._mistral.instruct_tokenizer.tokenizer.bos_id
    @property
    def eos_token_id(self) -> int:
        return self._mistral.instruct_tokenizer.tokenizer.eos_id
    @property
    def pad_token_id(self) -> int:
        return self._mistral.instruct_tokenizer.tokenizer.pad_id
    @property
    def unk_token_id(self) -> int:
        return self._mistral.instruct_tokenizer.tokenizer.unk_id
    @property
    def bos_token(self) -> str:
        return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.bos_token_id)
    @property
    def eos_token(self) -> str:
        return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.eos_token_id)
    @property
    def pad_token(self) -> str:
        return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.pad_token_id)
    @property
    def unk_token(self) -> str:
        return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.unk_token_id)
    @property
    def padding_side(self) -> str:
        """Side on which `pad` adds padding; `__init__` sets it to "right"."""
        return self._padding_side
    @property
    def name_or_path(self) -> str:
        """The `name_or_path` value this wrapper was constructed with."""
        return self._name_or_path
    @property
    def chat_template(self) -> str | None:
        """Chat template is not supported. Dummy property to satisfy HuggingFace API.

        Returns:
            Always None.
        """
        return None
    def __len__(self) -> int:
        return self._mistral.instruct_tokenizer.tokenizer.n_words
    @classmethod
    def from_pretrained(
        cls,
        name_or_path: str,
        *,
        revision: Optional[str] = None,
        **kwargs,  # pylint: disable=unused-argument
    ) -> "HFMistralTokenizer":
        """
        Load a mistral tekken tokenizer from a local file or HF Hub and wrap it.
        Args:
            path_or_repo_id: The path to the tokenizer files or the repo id.
            revision: The revision of the tokenizer to download.
            kwargs: Additional keyword arguments.
        Returns:
            A HFMistralTokenizer instance.
        """
        if revision:
            raise NotImplementedError(
                "Revision not supported yet for mistral-common tokenizer"
            )
        # only support Tekken tokenizer for now
        # downloads from HF Hub if not local
        tokenizer_path = _get_file_path(name_or_path, "tekken.json")
        base = MistralTokenizer.from_file(tokenizer_path)
        return cls(
            base,
            name_or_path=name_or_path,
            tokenizer_path=tokenizer_path,
        )
    def save_pretrained(self, save_directory: str) -> None:
        """
        Save the Tekken/SentencePiece model file so that from_pretrained can pick it up again.
        Only Tekken models are supported.
        Args:
            save_directory: The directory to save the tokenizer files.
        """
        inner = self._mistral.instruct_tokenizer.tokenizer
        if isinstance(inner, Tekkenizer):
            # Create the directory and save the model
            try:
                os.makedirs(save_directory, exist_ok=True)
                # Verify directory was created
                if not os.path.exists(save_directory):
                    raise RuntimeError(f"Failed to create directory: {save_directory}")
                # Verify source file exists
                if not os.path.exists(self._tokenizer_path):
                    raise FileNotFoundError(
                        f"Source tokenizer file not found: {self._tokenizer_path}"
                    )
                destination_path = os.path.join(save_directory, "tekken.json")
                copyfile(self._tokenizer_path, destination_path)
            except Exception as e:
                raise RuntimeError(
                    f"Failed to save tokenizer to {save_directory}: {e}. "
                    f"Source path: {self._tokenizer_path}, "
                    f"Directory exists: {os.path.exists(save_directory)}"
                ) from e
        else:
            raise RuntimeError(f"Unknown tokenizer type: {type(inner)}")
    def encode(self, text: str, add_special_tokens: bool = True) -> list[int]:
        """
        Encode a text string into a list of token IDs.
        Args:
            text: The text string to encode.
            add_special_tokens: Whether to add special tokens to the encoded tokens.
        Returns:
            A list of token IDs.
        """
        return self._mistral.instruct_tokenizer.tokenizer.encode(
            text,
            bos=add_special_tokens,
            eos=add_special_tokens,
        )
    def decode(
        self, token_ids: int | list[int], skip_special_tokens: bool = False
    ) -> str:
        """
        Decode a list of token IDs into a text string.
        Args:
            token_ids: The int or list of token IDs to decode.
            skip_special_tokens: Whether to skip special tokens in the decoded text.
        Returns:
            The decoded text string.
        """
        if isinstance(token_ids, int):
            token_ids = [token_ids]
        if skip_special_tokens:
            return self._mistral.instruct_tokenizer.tokenizer.decode(token_ids)
        # to_string returns a string with special tokens
        return self._mistral.instruct_tokenizer.tokenizer.to_string(token_ids)
    def _create_mistral_chat_completion_request(
        self, conversation: list[dict], tools: list[dict] | None = None
    ) -> "ChatCompletionRequest":
        from mistral_common.protocol.instruct.messages import (
            AssistantMessage,
            SystemMessage,
            ToolMessage,
            UserMessage,
        )
        from mistral_common.protocol.instruct.request import ChatCompletionRequest
        from mistral_common.protocol.instruct.tool_calls import Function, Tool
        messages: list[UserMessage | AssistantMessage | ToolMessage | SystemMessage] = (
            []
        )
        for turn in conversation:
            role = turn.get("role")
            if role == "user":
                messages.append(UserMessage(content=turn["content"]))
            elif role == "assistant":
                messages.append(
                    AssistantMessage(
                        content=turn.get("content"),
                        tool_calls=turn.get("tool_calls"),
                    )
                )
            elif role == "tool":
                messages.append(
                    ToolMessage(
                        content=turn.get("content"),
                        tool_call_id=turn.get("tool_call_id"),
                        name=turn.get("name"),
                    )
                )
            elif role == "system":
                messages.append(SystemMessage(content=turn["content"]))
            else:
                raise ValueError(
                    f"Unknown role for use with mistral-common tokenizer: {turn['role']}"
                )
        tool_calls: list[Tool] = []
        if tools:
            # convert to Tool
            for tool in tools:
                if tool["type"] != "function":
                    continue
                function = tool["function"]
                tool_calls.append(
                    Tool(
                        function=Function(
                            name=function["name"],
                            description=function["description"],
                            # set parameters to empty dict if not provided
                            parameters=function.get("parameters", {}),
                        )
                    )
                )
        chat_completion: ChatCompletionRequest = ChatCompletionRequest(
            messages=messages,
            tools=tool_calls,
        )
        return chat_completion
    def apply_chat_template(
        self,
        messages: list[dict],
        tokenize: bool = True,
        tools: list[dict] | None = None,
        chat_template: str | None = None,  # pylint: disable=unused-argument
        add_generation_prompt: bool = False,  # pylint: disable=unused-argument
    ) -> list[int] | str:
        if chat_template:
            raise NotImplementedError("chat_template not supported yet")
        if add_generation_prompt:
            raise NotImplementedError("add_generation_prompt not supported yet")
        chat_completion: ChatCompletionRequest = (
            self._create_mistral_chat_completion_request(messages, tools)
        )
        tokens: list[int] = self._mistral.encode_chat_completion(chat_completion).tokens
        if tokenize:
            return tokens
        return self.decode(tokens)
    def pad(
        self,
        features: list[dict[str, list[int] | np.ndarray]],
        *,
        padding: bool | str | PaddingStrategy = True,
        max_length: int | None = None,
        pad_to_multiple_of: int | None = None,
        return_tensors: str | None = None,  # "np", "pt", or "tf"
    ) -> dict[str, np.ndarray | Tensor]:
        """
        HF-style pad method that properly handles all sequence-related features:
        - pad 'input_ids' & 'labels' to the longest (or to max_length)
        """
        import torch
        from torch.nn import functional as F
        # Check for unsupported fields
        if any("token_type_ids" in f for f in features):
            raise ValueError("token_type_ids is not supported by this tokenizer")
        # Determine desired sequence length
        lengths = [len(f["input_ids"]) for f in features]
        if padding in (True, "longest", PaddingStrategy.LONGEST):
            target_length = max(lengths)
        elif padding in ("max_length", PaddingStrategy.MAX_LENGTH):
            if max_length is None:
                raise ValueError("max_length must be set for 'max_length' padding")
            target_length = max_length
        elif padding in (False, "do_not_pad", PaddingStrategy.DO_NOT_PAD):
            target_length = None
        else:
            raise ValueError(f"Unknown padding strategy: {padding}")
        # Apply pad_to_multiple_of
        if target_length is not None and pad_to_multiple_of is not None:
            target_length = (
                math.ceil(target_length / pad_to_multiple_of) * pad_to_multiple_of
            )
        # If no padding requested, just stack tensors
        do_pad = target_length is not None
        # Pad sequences using torch.nn.utils.rnn.pad_sequence
        input_ids = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(x["input_ids"], dtype=torch.long) for x in features],
            batch_first=True,
            padding_value=self.pad_token_id if self.pad_token_id is not None else 0,
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(x["labels"], dtype=torch.long) for x in features],
            batch_first=True,
            padding_value=IGNORE_INDEX,
        )
        attention_mask = torch.nn.utils.rnn.pad_sequence(
            [torch.tensor(x["attention_mask"], dtype=torch.long) for x in features],
            batch_first=True,
            padding_value=0,
        )
        # Handle position_ids - pad with sequential values for right padding, 0s for left padding
        if "position_ids" in features[0]:
            if self.padding_side == "left":
                # Likely not needed, but keeping for now
                # For left padding, we'll pad with 0s using pad_sequence, then handle manually
                position_ids = torch.nn.utils.rnn.pad_sequence(
                    [
                        torch.tensor(x["position_ids"], dtype=torch.long)
                        for x in features
                    ],
                    batch_first=True,
                    padding_value=0,
                )
            else:
                # For right padding, continue the sequence
                max_pos_len = max(len(f["position_ids"]) for f in features)
                position_ids_list = []
                for f in features:
                    pos_seq = torch.tensor(f["position_ids"], dtype=torch.long)
                    if len(pos_seq) < max_pos_len:
                        # Continue the sequence
                        last_pos = pos_seq[-1].item() if len(pos_seq) > 0 else -1
                        pad_len = max_pos_len - len(pos_seq)
                        pad_positions = torch.arange(
                            last_pos + 1, last_pos + 1 + pad_len, dtype=torch.long
                        )
                        pos_seq = torch.cat([pos_seq, pad_positions])
                    position_ids_list.append(pos_seq)
                position_ids = torch.stack(position_ids_list)
        else:
            # Create position_ids if not present
            seq_len = input_ids.size(1)
            position_ids = (
                torch.arange(seq_len, dtype=torch.long)
                .unsqueeze(0)
                .expand(input_ids.size(0), -1)
            )
        # Ensure all tensors have the same sequence length
        max_seq_len = max(
            input_ids.size(1),
            labels.size(1),
            attention_mask.size(1),
            position_ids.size(1),
        )
        # TODO: check if trimming is needed? and correct.
        if do_pad and target_length is not None:
            max_seq_len = target_length
        # Pad all tensors to the same length
        if input_ids.size(1) < max_seq_len:
            pad_len = max_seq_len - input_ids.size(1)
            if self.padding_side == "right":
                input_ids = F.pad(
                    input_ids,
                    (0, pad_len),
                    value=self.pad_token_id if self.pad_token_id is not None else 0,
                )
            else:
                input_ids = F.pad(
                    input_ids,
                    (pad_len, 0),
                    value=self.pad_token_id if self.pad_token_id is not None else 0,
                )
        elif input_ids.size(1) > max_seq_len:
            input_ids = input_ids[:, :max_seq_len]
        if labels.size(1) < max_seq_len:
            pad_len = max_seq_len - labels.size(1)
            if self.padding_side == "right":
                labels = F.pad(labels, (0, pad_len), value=IGNORE_INDEX)
            else:
                labels = F.pad(labels, (pad_len, 0), value=IGNORE_INDEX)
        elif labels.size(1) > max_seq_len:
            labels = labels[:, :max_seq_len]
        if attention_mask.size(1) < max_seq_len:
            pad_len = max_seq_len - attention_mask.size(1)
            if self.padding_side == "right":
                attention_mask = F.pad(attention_mask, (0, pad_len), value=0)
            else:
                attention_mask = F.pad(attention_mask, (pad_len, 0), value=0)
        elif attention_mask.size(1) > max_seq_len:
            attention_mask = attention_mask[:, :max_seq_len]
        if position_ids.size(1) < max_seq_len:
            pad_len = max_seq_len - position_ids.size(1)
            if self.padding_side == "right":
                batch_size = position_ids.size(0)
                new_position_ids = []
                for i in range(batch_size):
                    seq = position_ids[i]
                    if len(seq) > 0:
                        # get last position and pad with sequential values
                        last_pos = seq[-1].item()
                        pad_positions = torch.arange(
                            last_pos + 1, last_pos + 1 + pad_len, dtype=torch.long
                        )
                        new_seq = torch.cat([seq, pad_positions])
                    else:
                        new_seq = torch.arange(pad_len, dtype=torch.long)
                    new_position_ids.append(new_seq)
                position_ids = torch.stack(new_position_ids)
            else:
                position_ids = F.pad(position_ids, (pad_len, 0), value=0)
        elif position_ids.size(1) > max_seq_len:
            position_ids = position_ids[:, :max_seq_len]
        final_batch = {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
            "position_ids": position_ids,
        }
        # Handle non-sequence fields (raise error)
        sequence_fields = {"input_ids", "labels", "attention_mask", "position_ids"}
        for f in features:
            for key in f.keys():
                if key not in sequence_fields:
                    raise NotImplementedError(
                        f"Non-sequence field {key} not handled yet"
                    )
        # Convert to requested tensor type
        if return_tensors is None or return_tensors == "np":
            result = {}
            for k, v in final_batch.items():
                if isinstance(v, torch.Tensor):
                    result[k] = v.numpy().astype(np.long)
                else:
                    result[k] = v
            return result
        if return_tensors == "pt":
            return final_batch
        raise ValueError(f"Unsupported return_tensors='{return_tensors}'")
    def convert_ids_to_tokens(self, ids: list[int]) -> list[str]:
        """
        Convert a list of token IDs to a list of tokens.
        Args:
            ids: The list of token IDs to convert.
        Returns:
            The list of tokens.
        """
        return [
            self._mistral.instruct_tokenizer.tokenizer.id_to_piece(id) for id in ids
        ]
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -3,7 +3,6 @@ Multipack Batch Sampler - An efficient batch sampler for packing variable-length
 into fixed-capacity batches to optimize memory usage and training throughput.
 """
 import gc
 import math
 from concurrent.futures import ProcessPoolExecutor
 from multiprocessing import cpu_count, get_context
@@ -146,7 +145,7 @@ def pack_parallel(
    """
    num_items = len(sequence_lengths)
    if num_processes is None:
-        num_processes = max(1, min(num_items // group_size, cpu_count(), 16))
+        num_processes = max(1, min(num_items // group_size, cpu_count()))
    # Create tasks for parallel processing
    tasks = []
@@ -259,8 +258,8 @@ class MultipackBatchSampler(BatchSampler):
        batch_max_len: int,  # Maximum sequence length (bin capacity)
        lengths: np.ndarray,  # Sequence lengths
        packing_efficiency_estimate: float = 1.0,  # Initial efficiency estimate
-        drop_last: bool = True,  # Whether to drop final batches (might be incomplete)
+        drop_last: bool = False,  # Whether to drop final batches (might be incomplete)
-        num_count_samples: int = 8,  # Number of times to estimate batch count
+        num_count_samples: int = 16,  # Number of times to estimate batch count
        sequential: bool = False,  # Whether to use sequential packing
        group_size: int = 100_000,  # Size of groups for parallel packing
        bin_size: int = 200,  # The max number of samples that can be packed in a single bin
@@ -350,7 +349,6 @@ class MultipackBatchSampler(BatchSampler):
            # Calculate efficiency statistics
            total_used = lengths.sum()
            total_slots = len(all_bins) * self.batch_max_len
            del all_bins
        # Group bins into batches (each batch contains batch_size bins)
        batches = [
@@ -370,7 +368,6 @@ class MultipackBatchSampler(BatchSampler):
            self.total_token_slots += total_slots
        self._batches = batches
        gc.collect()
        return batches
    def __iter__(self) -> Iterator[list[list[int]]]:
@@ -446,18 +443,10 @@ class MultipackBatchSampler(BatchSampler):
        if self._len_across_ranks is None:
            # Sample multiple times to get stable estimate
-            _sampled_lens = []
+            len_batches = min(  # pylint: disable=consider-using-generator
-            for _ in range(self.num_count_samples):
+                [len(self._batches) for _ in range(self.num_count_samples)]
-                self._batches = None  # Reset cached batches
+            )
                _sampled_lens.append(len(self.generate_batches(set_stats=False)))
            len_batches = min(_sampled_lens)
            # Gather minimum across all ranks
-            if self._len_across_ranks is None:
+            self._len_across_ranks = self.gather_len_batches(len_batches)
                self._len_across_ranks = self.gather_len_batches(len_batches)
            else:
                self._len_across_ranks = min(
                    self._len_across_ranks, self.gather_len_batches(len_batches)
                )
        return self._len_across_ranks
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -102,8 +102,6 @@ class AxolotlInputConfig(
    dpo_use_weighting: bool | None = None
    dpo_use_logits_to_keep: bool | None = None
    dpo_label_smoothing: float | None = None
    dpo_norm_loss: bool | None = None
    dpo_padding_free: bool | None = None
    datasets: (
        Annotated[
@@ -338,14 +336,6 @@ class AxolotlInputConfig(
    plugins: list[str] | None = Field(default=None)
    @field_validator("seed", mode="after")
    @classmethod
    def set_default_seed(cls, seed):
        """Use 42 as the seed, and log that choice, when the config leaves `seed` unset."""
        if seed is not None:
            return seed
        LOG.info("`seed` not set in config; setting to 42")
        return 42
    @field_validator("datasets", mode="before")
    @classmethod
    def deprecate_sharegpt_datasets(cls, datasets):
@@ -1209,7 +1199,7 @@ class AxolotlInputConfig(
                    "flash_attention: true must be set with sequence_parallel_degree > 1"
                )
-            if self.sample_packing and getattr(self, "micro_batch_size", 1) > 1:
+            if self.sample_packing and self.micro_batch_size > 1:
                raise ValueError(
                    "micro_batch_size must be set to 1 when sample_packing is enabled "
                    "due to a `ring-flash-attn` requirement"
@@ -1267,71 +1257,9 @@ class AxolotlInputConfig(
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_tokenizer_use_mistral_common(cls, data):
        """Default `tokenizer_use_mistral_common` to True for Magistral models.

        When the option is unset and "magistral" appears (case-insensitively) in
        `base_model`, `base_model_config` or `tokenizer_config`, a warning is
        logged and the option is set to True on `data`.

        Returns:
            The (possibly updated) `data` mapping.
        """
        if data.get("tokenizer_use_mistral_common") is None:
            candidate_names = (
                data.get("base_model"),
                data.get("base_model_config"),
                data.get("tokenizer_config"),
            )
            # A key can be present with an explicit null value, in which case a
            # `.get(key, "")` default does not apply; coerce None to "" before
            # lowercasing.
            if any("magistral" in (name or "").lower() for name in candidate_names):
                LOG.warning(
                    "tokenizer_use_mistral_common auto inferred to True for Magistral models. Please set it to True explicitly if you want to use mistral-common tokenizer."
                )
                data["tokenizer_use_mistral_common"] = True
        return data
    @field_validator("tokenizer_use_mistral_common", mode="after")
    @classmethod
    def check_mistral_common_import(cls, tokenizer_use_mistral_common):
        """Fail early when the mistral-common tokenizer is requested but not installed.

        Raises:
            ImportError: If the option is truthy and `mistral_common` cannot be
                imported.
        """
        if not tokenizer_use_mistral_common:
            return tokenizer_use_mistral_common

        try:
            import mistral_common  # noqa: F401 # pylint:disable=unused-import
        except ImportError as exception:
            raise ImportError(
                "mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`."
            ) from exception
        return tokenizer_use_mistral_common
    @model_validator(mode="before")
    @classmethod
    def check_mistral_common_incompatible_options(cls, data):
        """Reject options that edit the tokenizer when mistral-common is in use.

        Raises:
            ValueError: If `tokenizer_use_mistral_common` is truthy and any of
                `added_tokens_overrides`, `special_tokens`, `tokens` or
                `chat_template` is set.
        """
        if not data.get("tokenizer_use_mistral_common"):
            return data

        # NOTE: mistral-common tokenizer is not compatible with editing tokenizer at the moment
        # Checked in this order; the first option that is set raises.
        unsupported_options = (
            (
                "added_tokens_overrides",
                "added_tokens_overrides is not supported with mistral-common tokenizer",
            ),
            (
                "special_tokens",
                "special_tokens override is not supported with mistral-common tokenizer",
            ),
            (
                "tokens",
                "tokens override is not supported with mistral-common tokenizer",
            ),
            (
                "chat_template",
                "Setting chat_template is not supported with mistral-common tokenizer",
            ),
        )
        for option, message in unsupported_options:
            if data.get(option):
                raise ValueError(message)

        return data
 class AxolotlConfigWCapabilities(AxolotlInputConfig):
-    """wrapper to valdiate gpu capabilities with the configured options"""
+    """Wrapper to validate GPU capabilities with the config options"""
    capabilities: GPUCapabilities
    env_capabilities: EnvCapabilities
--- a/src/axolotl/utils/schemas/datasets.py
+++ b/src/axolotl/utils/schemas/datasets.py
@@ -43,7 +43,6 @@ class SFTDataset(BaseModel):
    field_human: str | None = None
    field_model: str | None = None
    field_messages: str | None = None
    field_tools: str | None = None
    # deprecated, use message_property_mappings
    message_field_role: str | None = None
    # deprecated, use message_property_mappings
--- a/src/axolotl/utils/schemas/model.py
+++ b/src/axolotl/utils/schemas/model.py
@@ -18,7 +18,6 @@ class ModelInputConfig(BaseModel):
    tokenizer_config: str | None = None
    tokenizer_use_fast: bool | None = None
    tokenizer_legacy: bool | None = None
    tokenizer_use_mistral_common: bool | None = None
    tokenizer_type: str | None = Field(
        default=None, json_schema_extra={"description": "transformers tokenizer class"}
    )
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -16,6 +16,7 @@ from datasets import IterableDataset, disable_caching, enable_caching
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from transformers.utils import is_torch_bf16_gpu_available
 from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.monkeypatch.trainer_eval_guard import patch_evaluation_loop_for_fsdp2
 from axolotl.utils.distributed import reduce_and_broadcast
 from axolotl.utils.environment import check_cuda_p2p_ib_support
@@ -466,7 +467,6 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                bin_size=cfg.sample_packing_bin_size,
                sequential=cfg.sample_packing_sequentially,
                drop_last=True,
                num_processes=cfg.dataset_processes,
            )
            data_loader = DataLoader(
@@ -482,9 +482,6 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                    data_loader_len * cfg.num_epochs * cfg.sequence_parallel_degree
                )
            )
            if cfg.dataloader_drop_last:
                # drop the last batch for each epoch
                total_num_steps -= int(math.ceil(cfg.num_epochs))
            def calc_sample_packing_eff_est(estimates: List[float]):
                LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}")
@@ -632,8 +629,6 @@ def setup_trainer(
        A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based
            on the provided parameters.
    """
    from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
    if (
        cfg.torch_compile
        and cfg.fsdp_config
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,6 +1,4 @@
-"""
+"""Shared pytest fixtures"""
 shared pytest fixtures
 """
 import functools
 import importlib
@@ -559,3 +557,9 @@ def test_load_fixtures(
    download_llama2_model_fixture,
 ):
    pass
@pytest.fixture(autouse=True)
 def disable_telemetry(monkeypatch):
    """Autouse fixture that sets `AXOLOTL_DO_NOT_TRACK=1` for the duration of each test."""
    monkeypatch.setenv("AXOLOTL_DO_NOT_TRACK", "1")
    yield
--- a/tests/core/test_builders.py
+++ b/tests/core/test_builders.py
@@ -12,7 +12,7 @@ from axolotl.common.datasets import load_datasets
 from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.loaders import ModelLoader, load_tokenizer
 from axolotl.utils.config import normalize_config
-from axolotl.utils.data import prepare_preference_datasets
+from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.schemas.enums import RLType
@@ -451,19 +451,15 @@ def rand_reward_func(prompts, completions) -> list[float]:
            # Only use mock for the commented out configs
            if dataset_name is not None:
                with patch(
-                    "axolotl.utils.data.rl.load_dataset_with_config"
+                    "axolotl.utils.data.rl.load_dataset_w_config"
                ) as mock_load_dataset:
                    mock_load_dataset.return_value = request.getfixturevalue(
                        dataset_name
                    )
-                    train_dataset, eval_dataset = prepare_preference_datasets(
+                    train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
                        cfg, tokenizer
                    )
            else:
                # Load actual datasets for orpo_cfg and kto_cfg
-                train_dataset, eval_dataset = prepare_preference_datasets(
+                train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
                    cfg, tokenizer
                )
            builder.train_dataset = train_dataset
            builder.eval_dataset = eval_dataset
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -4,6 +4,7 @@ Simple end-to-end test for Cut Cross Entropy integration
 import pytest
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils import get_pytorch_version
@@ -58,7 +59,8 @@ class TestCutCrossEntropyIntegration:
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
-        dataset_meta = load_datasets(cfg=cfg)
+        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
@@ -103,7 +105,8 @@ class TestCutCrossEntropyIntegration:
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
-        dataset_meta = load_datasets(cfg=cfg)
+        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
@@ -131,7 +134,8 @@ class TestCutCrossEntropyIntegration:
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
-        dataset_meta = load_datasets(cfg=cfg)
+        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -5,6 +5,7 @@ e2e tests to make sure all the hooks are fired on the plugin
 import os
 from pathlib import Path
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.integrations.base import BasePlugin
 from axolotl.train import train
@@ -159,7 +160,8 @@ class TestPluginHooks:
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
-        dataset_meta = load_datasets(cfg=cfg)
+        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -5,9 +5,11 @@ e2e tests for kd trainer support in Axolotl
 from pathlib import Path
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import check_tensorboard, require_torch_2_5_1
@@ -16,8 +18,8 @@ from tests.e2e.utils import check_tensorboard, require_torch_2_5_1
@pytest.fixture(name="kd_min_cfg")
 def min_cfg(temp_dir):
    return {
-        "base_model": "Qwen/Qwen3-0.6B",
+        "base_model": "osllmai-community/Llama-3.2-1B",
-        "tokenizer_config": "winglian/qwen3-14b-math",
+        "tokenizer_config": "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer",
        "plugins": [
            "axolotl.integrations.kd.KDPlugin",
            "axolotl.integrations.liger.LigerPlugin",
@@ -30,22 +32,20 @@ def min_cfg(temp_dir):
        "kd_ce_alpha": 0.1,
        "kd_alpha": 0.9,
        "kd_temperature": 1.0,
        "kd_beta": 0.0,
        "kd_normalize_topk": True,
        "dataloader_prefetch_factor": 8,
        "dataloader_num_workers": 4,
        "dataloader_pin_memory": True,
        "datasets": [
            {
-                "path": "winglian/OpenThoughts-114k-math-correct-qwen3-14b-math-prepared-topk128-normalized",
+                "path": "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample",
-                "type": "chat_template",
+                "type": "axolotl.integrations.kd.chat_template",
                "field_messages": "messages_combined",
                "split": "train",
-                "split_thinking": True,
+                "logprobs_field": "llm_text_generation_vllm_logprobs",
-                "eot_tokens": ["<|im_end|>"],
+                "temperature": 1.0,
-                "data_files": ["train/batch-000000.parquet"],
+                "preprocess_shards": 2,
            },
        ],
        "skip_prepare_dataset": True,
        "val_set_size": 0.0,
        "sequence_len": 2048,
        "sample_packing": True,
@@ -81,29 +81,18 @@ class TestKnowledgeDistillation:
    def test_llama_kd(self, temp_dir, kd_min_cfg):
        cfg = DictDefault(kd_min_cfg)
        # pylint: disable=duplicate-code
-        # write cfg to yaml file
+        cfg = validate_config(cfg)
-        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        prepare_plugins(cfg)
-        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        normalize_config(cfg)
-            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+        cli_args = TrainerCliArgs()
-
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "1",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )
        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "model.safetensors").exists()
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high"
        )
    @pytest.mark.skip(reason="Chunked KD loss doesn't support PEFT/LoRA")
    @pytest.mark.parametrize(
        "load_in_8bit",
        [True, False],
@@ -123,22 +112,13 @@ class TestKnowledgeDistillation:
            | kd_min_cfg
        )
        # pylint: disable=duplicate-code
-        # write cfg to yaml file
+        cfg = validate_config(cfg)
-        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        prepare_plugins(cfg)
-        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        normalize_config(cfg)
-            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-        execute_subprocess_async(
+        train(cfg=cfg, dataset_meta=dataset_meta)
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "1",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high"
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Dan Saunders	345a159796	coderabbit comments	2025-06-07 04:50:29 +00:00
Dan Saunders	657bffd85f	update posthog dep	2025-06-05 23:46:20 +00:00
Dan Saunders	f0dde8e2d5	lint	2025-06-05 23:41:46 +00:00
Dan Saunders	25fa4df70f	fix	2025-06-05 23:33:46 +00:00
Dan Saunders	e735f4270b	slight changes	2025-06-05 23:33:46 +00:00
Dan Saunders	035e7a2f4c	simplifying	2025-06-05 23:33:46 +00:00
Dan Saunders	2d36c11264	minor fixes	2025-06-05 23:33:46 +00:00
Dan Saunders	b8ec5bdccf	doc update	2025-06-05 23:33:44 +00:00
Dan Saunders	249405b46e	docs fix	2025-06-05 23:31:44 +00:00
Dan Saunders	d3be84fec2	enable / disable logic update	2025-06-05 23:31:44 +00:00
Dan Saunders	1c74ab175f	opt-in version of telemetry	2025-06-05 23:31:44 +00:00
Dan Saunders	b2f1fc109a	distributed fix	2025-06-05 23:31:44 +00:00
Dan Saunders	5a2a80cc48	fix issue with tests in ci	2025-06-05 23:31:44 +00:00
Dan Saunders	4033fe74f8	fixes	2025-06-05 23:31:44 +00:00
Dan Saunders	e9df4444be	remove duplicate info	2025-06-05 23:31:44 +00:00
Dan Saunders	ffd2985750	adding runtime metrics / system info additional accelerator support, etc.	2025-06-05 23:31:44 +00:00
Dan Saunders	17310f9acc	adding runtime metrics / system info additional accelerator support, etc.	2025-06-05 23:31:44 +00:00
Dan Saunders	71ae6f9f87	improved redaction, send system info during model config load telemetry, etc.	2025-06-05 23:31:08 +00:00
Dan Saunders	9dd1092f8f	doc update	2025-06-05 23:27:29 +00:00
Dan Saunders	2c2f2647a9	fix	2025-06-05 23:27:29 +00:00
Dan Saunders	98313a6b3f	adding back in base_model redaction w/ whitelist	2025-06-05 23:27:29 +00:00
Dan Saunders	8b75205d3b	sleep on all ranks in distributed setting	2025-06-05 23:27:29 +00:00
Dan Saunders	ef4990f304	simplifying path redaction	2025-06-05 23:27:29 +00:00
Dan Saunders	db3297b090	small update / fix	2025-06-05 23:27:27 +00:00
Dan Saunders	86ed554bda	tests for runtime metrics telemetry and assoc. callback	2025-06-05 23:26:07 +00:00
Dan Saunders	f254d7d5a2	adding runtime metrics (cpu + gpu memory, steps/s, etc.)	2025-06-05 23:26:05 +00:00
Dan Saunders	d8b0522ea0	updated sanitization logic, tests	2025-06-05 23:20:51 +00:00
Dan Saunders	1edd6b9524	update error file path sanitization function; adding more error tracking	2025-06-05 23:20:49 +00:00
Dan Saunders	66c6fb56cb	progress on telemetry: config load, process, model load, train start / end, error tracking	2025-06-05 22:59:50 +00:00
Dan Saunders	90b39ce112	updates	2025-06-05 22:49:15 +00:00
Dan Saunders	5afab46cc6	updates	2025-06-05 22:49:15 +00:00
Dan Saunders	bd152c6115	adding todo	2025-06-05 22:49:15 +00:00
Dan Saunders	76336743ff	initial telemetry manager impl	2025-06-05 22:49:14 +00:00
`@@ -4,4 +4,4 @@ import pkgutil`

	`__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package`	`__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package`

	`__version__ = "0.10.0"`	`__version__ = "0.10.0.dev0"`