feat: add baseten to lmeval

2025-08-29 18:02:26 +07:00
102 changed files with 1097 additions and 4779 deletions
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -44,7 +44,7 @@ jobs:
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
-            axolotl_extras: fbgemm-gpu
+            axolotl_extras:
            num_gpus: 2
            nightly_build: "true"
    runs-on: [self-hosted, modal]
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -303,8 +303,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.8.0
            num_gpus: 1
-            gpu_type: "B200"
+            axolotl_extras:
            axolotl_extras: fbgemm-gpu
    steps:
      - name: Checkout
        uses: actions/checkout@v4
@@ -325,7 +324,6 @@ jobs:
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,10 +11,10 @@ repos:
    -   id: no-commit-to-branch
        args: ['--branch', 'main']
 -   repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.12
+    rev: v0.12.9
    hooks:
    -   id: ruff
-        args: [--fix, --select, I]
+        args: [--fix]
    -   id: ruff-format
 -   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.17.1
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -1,6 +1,6 @@
 cff-version: 1.2.0
 type: software
-title: "Axolotl: Open Source LLM Post-Training"
+title: "Axolotl: Post-Training for AI Models"
 message: "If you use this software, please cite it as below."
 authors:
  - name: "Axolotl maintainers and contributors"
--- a/README.md
+++ b/README.md
@@ -5,9 +5,6 @@
        <img alt="Axolotl" src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
    </picture>
 </p>
  <p align="center">
      <strong>A Free and Open Source LLM Fine-tuning Framework</strong><br>
  </p>
 <p align="center">
    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
@@ -20,7 +17,6 @@
    <br/>
    <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
    <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
    <a href="https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="google-colab" style="height: 20px;"></a>
    <br/>
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
@@ -53,21 +49,20 @@
 ## ✨ Overview
-Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs).
+Axolotl is a tool designed to streamline post-training for various AI models.
 Features:
- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
+- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, and audio models like Voxtral with image, video, and audio support.
+- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
+- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
 - **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference.
 - **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
 - **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
 - **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
-## 🚀 Quick Start - LLM Fine-tuning in Minutes
+## 🚀 Quick Start
 **Requirements**:
@@ -75,10 +70,6 @@ Features:
 - Python 3.11
 - PyTorch ≥2.6.0
 ### Google Colab
 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)
 ### Installation
 #### Using pip
@@ -164,7 +155,7 @@ If you use Axolotl in your research or projects, please cite it as follows:
 ```bibtex
@software{axolotl,
-  title = {Axolotl: Open Source LLM Post-Training},
+  title = {Axolotl: Post-Training for AI Models},
  author = {{Axolotl maintainers and contributors}},
  url = {https://github.com/axolotl-ai-cloud/axolotl},
  license = {Apache-2.0},
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -153,7 +153,7 @@ quartodoc:
        - utils.distributed
        - utils.dict
        - utils.optimizers.adopt
-        - utils.data.streaming
+        - utils.data.pretraining
        - utils.data.sft
        - utils.quantization
    - title: Schemas
@@ -272,7 +272,6 @@ website:
          contents:
            - docs/batch_vs_grad.qmd
            - docs/dataset_preprocessing.qmd
            - docs/streaming.qmd
            - docs/multipack.qmd
            - docs/mixed_precision.qmd
            - docs/optimizers.qmd
--- a/cicd/single_gpu.py
+++ b/cicd/single_gpu.py
@@ -57,8 +57,7 @@ VOLUME_CONFIG = {
 }
 N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
+GPU_CONFIG = f"L40S:{N_GPUS}"
 GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"
 def run_cmd(cmd: str, run_folder: str):
--- a/codecov.yml
+++ b/codecov.yml
@@ -12,7 +12,7 @@ coverage:
      default:
        # basic
        target: auto
-        threshold: 1%
+        threshold: 0%
        base: auto
        # advanced
        branches: null
@@ -27,7 +27,7 @@ coverage:
      default:
        # basic
        target: auto
-        threshold: 1%
+        threshold: 0%
        base: auto
        # advanced
        branches: null
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -134,7 +134,7 @@ For providers supporting Docker:
 ### Google Colab {#sec-colab}
-[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)
+Use our [example notebook](../examples/colab-notebooks/colab-axolotl-example.ipynb).
 ## Platform-Specific Instructions {#sec-platform-specific}
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -63,6 +63,15 @@ Start from Stage 1 -> Stage 2 -> Stage 3.
 :::
 ::: {.callout-tip}
 Using ZeRO Stage 3 with Single-GPU training
 ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
 `WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
 :::
 ## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}
 ::: {.callout-note}
--- a/docs/quantize.qmd
+++ b/docs/quantize.qmd
@@ -51,11 +51,3 @@ axolotl quantize qat.yml
 ```
 This ensures that an identical quantization configuration is used to quantize the model as was used to train it.
 ::: {.callout-note}
 If you have configured pushing to hub with `hub_model_id`, your model hub name will have the quantization schema appended to it,
 e.g. `axolotl-ai-cloud/qat-nvfp4-llama3B` will become `axolotl-ai-cloud/qat-nvfp4-llama3B-nvfp4w`
 :::
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -11,7 +11,6 @@ We support the reward modelling techniques supported by `trl`.
 ### (Outcome) Reward Models
 Outcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).
 For improved training stability, you can use the `center_rewards_coefficient` parameter to encourage mean-zero reward outputs ([see TRL docs](https://huggingface.co/docs/trl/v0.10.1/en/reward_trainer#centering-rewards)).
 ```yaml
 base_model: google/gemma-2-2b
--- a/docs/streaming.qmd
+++ b/docs/streaming.qmd
@@ -1,120 +0,0 @@
 ---
 title: Streaming Datasets
 description: How to use streaming mode for large-scale datasets and memory-efficient training
 order: 10
 ---
 Streaming enables memory-efficient training with large datasets by loading data
 incrementally rather than loading the entire dataset into memory at once.
 Use streaming when:
 - Your dataset is too large to fit in memory (e.g. when you're doing pretraining with massive text corpora)
 - You want to start training immediately without preprocessing the entire dataset
 Streaming works with both remote and locally stored datasets!
 ::: {.callout-note}
 Streaming currently only supports a single dataset. Multi-dataset support will be added soon.
 :::
 ## Configuration
 ### Basic Streaming
 Enable streaming mode by setting the `streaming` flag:
 ```yaml
 streaming: true
 ```
 ### Pretraining with Streaming
 For pretraining tasks, streaming is automatically enabled when using `pretraining_dataset`:
 ```yaml
 pretraining_dataset:
  - path: HuggingFaceFW/fineweb-edu
    type: pretrain
    text_column: text
    split: train
 # Optionally, enable sample packing
 streaming_multipack_buffer_size: 10000
 sample_packing: true
 ```
 ### SFT with Streaming
 For supervised fine-tuning with streaming:
 ```yaml
 streaming: true
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train
 # Optionally, enable sample packing
 streaming_multipack_buffer_size: 10000
 sample_packing: true
 ```
 ## Configuration Options
 ### `streaming_multipack_buffer_size`
 Controls the buffer size for multipack streaming (default: 10,000). This determines how
 many samples are buffered before packing. Larger buffers can improve packing efficiency
 but use more memory.
 ### `shuffle_merged_datasets`
 When enabled, shuffles the streaming dataset using the buffer. This requires additional
 memory for the shuffle buffer.
 ## Sample Packing with Streaming
 Sample packing is supported for streaming datasets. When enabled, multiple samples are
 packed into a single sequence to maximize GPU utilization:
 ```yaml
 sample_packing: true
 streaming_multipack_buffer_size: 10000
 # For SFT: attention is automatically isolated between packed samples
 # For pretraining: control with pretrain_multipack_attn
 pretrain_multipack_attn: true  # prevent cross-attention between packed samples
 ```
 For more information, see our [documentation](multipack.qmd) on multipacking.
 ## Important Considerations
 ### Memory Usage
 While streaming reduces memory usage compared to loading entire datasets, you still need
 to consider:
 - You can control the memory usage by adjusting `streaming_multipack_buffer_size`
 - Sample packing requires buffering multiple samples
 - Shuffling requires additional memory for the shuffle buffer
 ### Performance
 - Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on-the-fly
 - Network speed and disk read speed are important when streaming from remote sources or a local dataset, respectively
 - Consider using `axolotl preprocess` for smaller or more frequently used datasets
 ### Evaluation Datasets
 Evaluation datasets are not streamed to ensure consistent evaluation metrics. They're
 loaded normally even when training uses streaming.
 ## Examples
 See the `examples/streaming/` directory for complete configuration examples:
 - `pretrain.yaml`: Pretraining with streaming dataset
 - `sft.yaml`: Supervised fine-tuning with streaming
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -176,8 +176,8 @@
    }
   ],
   "source": [
    "from axolotl.cli.config import load_cfg\n",
    "from axolotl.utils.dict import DictDefault\n",
    "from axolotl.cli.config import load_cfg\n",
    "\n",
    "# Axolotl provides full control and transparency over model and training configuration\n",
    "config = DictDefault(\n",
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -20,13 +20,7 @@ pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
+2. Run the finetuning example:
 ```bash
 python scripts/cutcrossentropy_install.py | sh
 ```
 3. Run the finetuning example:
 ```bash
 axolotl train examples/devstral/devstral-small-qlora.yml
--- a/examples/gemma3/270m-qlora.yml
+++ b/examples/gemma3/270m-qlora.yml
@@ -1,68 +0,0 @@
 base_model: google/gemma-3-270m-it
 # optionally might have model_type or tokenizer_type
 model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 # gemma3 doesn't seem to play nice with ddp
 ddp_find_unused_parameters: true
 load_in_8bit: false
 load_in_4bit: true
 # huggingface repo
 chat_template: gemma3
 eot_tokens:
  - <end_of_turn>
 datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
 val_set_size: 0.0
 output_dir: ./outputs/out
 adapter: qlora
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 sequence_len: 2048
 sample_packing: true
 eval_sample_packing: false
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch:
 saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -106,16 +106,6 @@ See [Nanobit/text-tools-2k-test](https://huggingface.co/datasets/Nanobit/text-to
 Refer to [our docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use) for more info.
 ### Thinking and chat_template masking conflict
 OpenAI’s Harmony template hides `thinking` in all non-final turns, which conflicts with Axolotl’s `chat_template` masking.
 If your dataset has `thinking` content mid-turn, there are two paths we recommend:
 - Train only on the last turn. This can be accomplished via chat_template's [training on last message](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#training-on-last-message) setting.
 - Adjust your dataset to only have `thinking` content in the last turn.
 ### TIPS
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
--- a/examples/hunyuan/README.md
+++ b/examples/hunyuan/README.md
@@ -1,85 +0,0 @@
 # Finetune HunYuan with Axolotl
 Tencent released HunYuan, a family of open-source models at 0.5B, 1.8B, 4B, and 7B parameter scales, each available in Pre-trained and Instruct variants. The models can be found on [HuggingFace](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7). This guide shows how to fine-tune them with Axolotl using multi-turn conversations and proper masking.
 ## Getting started
 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). HunYuan support is currently only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
    Here is an example of how to install from main for pip:
 ```bash
 # Ensure you have PyTorch installed (PyTorch 2.6.0 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
 ```
 2. Run the finetuning example:
 ```bash
 axolotl train examples/hunyuan/hunyuan-v1-dense-qlora.yaml
 ```
 This config uses about 4.7 GB VRAM.
 Let us know how it goes. Happy finetuning! 🚀
 ### Dataset
 HunYuan Instruct models can choose to enter a slow think or fast think pattern. For best performance on fine-tuning their Instruct models, your dataset should be adjusted to match their pattern.
 ```python
 # fast think pattern
 messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "/no_think What color is the sun?" },
    {"role": "assistant", "content": "<think>\n\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}
 ]
 # slow think pattern
 messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "/no_think What color is the sun?" },
    {"role": "assistant", "content": "<think>\nThe user is asking about the color of the sun. I need to ...\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}
 ]
 ```
 ### TIPS
 - For inference, the official Tencent team recommends
 ```json
 {
  "do_sample": true,
  "top_k": 20,
  "top_p": 0.8,
  "repetition_penalty": 1.05,
  "temperature": 0.7
 }
 ```
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 ## Optimization Guides
 - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
 - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
 - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
 ## Related Resources
 - [Tencent HunYuan Blog](https://hunyuan.tencent.com/)
 - [Axolotl Docs](https://docs.axolotl.ai)
 - [Axolotl Website](https://axolotl.ai)
 - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
 - [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/hunyuan/hunyuan-v1-dense-qlora.yaml
+++ b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml
@@ -1,64 +0,0 @@
 base_model: tencent/Hunyuan-0.5B-Instruct
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 load_in_8bit: false
 load_in_4bit: true
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml
+++ b/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml
@@ -1,64 +0,0 @@
 base_model: meta-llama/Llama-3.2-3B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 plugins:
  - axolotl.integrations.liger.LigerPlugin
 liger_rope: true
 liger_rms_norm: true
 liger_glu_activation: true
 liger_layer_norm: true
 liger_fused_linear_cross_entropy: true
 datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]
 output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/dataset_prepared
 sequence_len: 8192
 flash_attention: true
 qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_checkpointing: true
 gradient_accumulation_steps: 1
 micro_batch_size: 64
 num_epochs: 1
 optimizer: adamw_torch_fused
 cosine_constant_lr_ratio: 0
 cosine_min_lr_ratio: 1.0
 learning_rate: 2e-5
 save_only_model: true
 bf16: true
 resume_from_checkpoint:
 logging_steps: 1
 evals_per_epoch: 1
 saves_per_epoch: 1
 warmup_ratio: 0.1
 weight_decay: 0.0
 special_tokens:
  pad_token: <|finetune_right_pad_id|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -15,18 +15,20 @@ liger_glu_activation: true
 liger_layer_norm: true
 liger_fused_linear_cross_entropy: true
 datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]
 output_dir: ./outputs/qat_out/
 dataset_prepared_path: ./outputs/qat_out/dataset_prepared
-sample_packing: false
+sample_packing: true
-sequence_len: 8192
+
-flash_attention: true
+sequence_len: 512
 flex_attention: true
 flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs
 qat:
  activation_dtype: int8
@@ -65,7 +67,7 @@ fsdp:
 fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: false
+  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
@@ -74,6 +76,6 @@ fsdp_config:
  fsdp_activation_checkpointing: true
 special_tokens:
-  pad_token: <|finetune_right_pad_id|>
+  pad_token: <|end_of_text|>
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/diffusion/pretrain-1b.yaml
+++ b/examples/llama-3/diffusion/pretrain-1b.yaml
@@ -1,56 +0,0 @@
 base_model: meta-llama/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 pretraining_dataset:
  - path: wikitext
    name: wikitext-103-raw-v1
    type: completion
    field: text
 plugins:
  - axolotl.integrations.diffusion.DiffusionPlugin
 diffusion:
  noise_schedule: cosine
  min_mask_ratio: 0.15
  max_mask_ratio: 0.85
  num_diffusion_steps: 128
  eps: 5e-4
  importance_weighting: true
  mask_token_id: 128002
  generate_samples: true
  generation_interval: 250
 output_dir: ./outputs/model-out
 sequence_len: 512
 sample_packing: true
 gradient_accumulation_steps: 8
 micro_batch_size: 4
 max_steps: 10000
 warmup_ratio: 0.1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 3e-4
 sdp_attention: true
 bf16: auto
 tf32: true
 logging_steps: 1
 save_strategy: steps
 save_steps: 1000
 special_tokens:
  pad_token: "<|end_of_text|>"
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/llama-3/diffusion/sft-1b.yaml
+++ b/examples/llama-3/diffusion/sft-1b.yaml
@@ -1,59 +0,0 @@
 base_model: meta-llama/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
 val_set_size: 0.05
 plugins:
  - axolotl.integrations.diffusion.DiffusionPlugin
 diffusion:
  noise_schedule: cosine
  min_mask_ratio: 0.1
  max_mask_ratio: 0.9
  num_diffusion_steps: 128
  eps: 1e-3
  importance_weighting: true
  mask_token_id: 128002
  generate_samples: true
  generation_interval: 250
 output_dir: ./outputs/model-out
 sequence_len: 512
 sample_packing: true
 eval_sample_packing: true
 gradient_accumulation_steps: 4
 micro_batch_size: 4
 num_epochs: 1
 warmup_steps: 0.1
 optimizer: adamw_8bit
 lr_scheduler: cosine
 learning_rate: 1e-5
 bf16: auto
 tf32: true
 gradient_checkpointing: true
 resume_from_checkpoint:
 sdp_attention: true
 logging_steps: 1
 save_strategy: best
 eval_strategy: epoch
 special_tokens:
  pad_token: "<|end_of_text|>"
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -18,13 +18,7 @@ pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```
-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
+2. Run the finetuning example:
 ```bash
 python scripts/cutcrossentropy_install.py | sh
 ```
 3. Run the finetuning example:
 ```bash
 axolotl train examples/magistral/magistral-small-qlora.yaml
--- a/examples/qwen3/reward-model.yaml
+++ b/examples/qwen3/reward-model.yaml
@@ -1,44 +0,0 @@
 base_model: Skywork/Skywork-Reward-V2-Qwen3-8B
 model_type: AutoModelForSequenceClassification
 num_labels: 1
 reward_model: true
 center_rewards_coefficient: 0.01  # Incentivize mean-zero rewards for improved stability
 chat_template: qwen3
 datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs
    type: bradley_terry.chat_template
 val_set_size: 0.0
 output_dir: ./outputs/out
 sequence_len: 8192
 sample_packing: false
 eval_sample_packing: false
 pad_to_sequence_len: true
 deepspeed: deepspeed_configs/zero1.json
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 eval_batch_size: 1
 num_epochs: 3
 optimizer: adamw_bnb_8bit
 lr_scheduler: linear
 learning_rate: 0.00002
 bf16: true
 tf32: true
 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
  use_reentrant: false
 warmup_ratio: 0.1
 logging_steps: 1
 weight_decay: 0.01
--- a/examples/seed-oss/README.md
+++ b/examples/seed-oss/README.md
@@ -1,54 +0,0 @@
 # Finetune ByteDance's Seed-OSS with Axolotl
 [Seed-OSS](https://huggingface.co/collections/ByteDance-Seed/seed-oss-68a609f4201e788db05b5dcd) is a series of 36B-parameter open-source models trained by ByteDance's Seed Team.
 This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.
 ## Getting started
 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). Seed-OSS support is currently only available on nightly, so you need to either install from main or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
    Here is an example of how to install from main for pip:
 ```bash
 # Ensure you have PyTorch installed (PyTorch 2.6.0 min)
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
 pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn]'
 # Install Cut Cross Entropy
 python scripts/cutcrossentropy_install.py | sh
 ```
 2. Run the finetuning example:
 ```bash
 axolotl train examples/seed-oss/seed-oss-36b-qlora.yaml
 ```
 This config uses about 27.7 GiB VRAM.
 Let us know how it goes. Happy finetuning! 🚀
 ### TIPS
 - For inference, the official Seed Team recommends `top_p=0.95` and `temperature=1.1`.
 - You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
 - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
 - The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
 ## Optimization Guides
 - [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
 - [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
 - [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
 ## Related Resources
 - [ByteDance Seed Website](https://seed.bytedance.com/)
 - [Axolotl Docs](https://docs.axolotl.ai)
 - [Axolotl Website](https://axolotl.ai)
 - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
 - [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/seed-oss/seed-oss-36b-qlora.yaml
+++ b/examples/seed-oss/seed-oss-36b-qlora.yaml
@@ -1,56 +0,0 @@
 base_model: ByteDance-Seed/Seed-OSS-36B-Instruct
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
 load_in_8bit: false
 load_in_4bit: true
 datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 sample_packing: true
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 1
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 bf16: auto
 tf32: false
 gradient_checkpointing: true
 resume_from_checkpoint:
 logging_steps: 1
 flash_attention: true
 warmup_ratio: 0.1
 evals_per_epoch: 1
 saves_per_epoch: 1
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/streaming/README.md
+++ b/examples/streaming/README.md
@@ -1,50 +0,0 @@
 # Streaming Dataset Examples
 This directory contains example configurations for using Axolotl's streaming dataset
 functionality, which enables memory-efficient training with large datasets.
 ## Examples
 Run the following examples with e.g. `axolotl train examples/streaming/sft.yaml`; no
 `axolotl preprocess` required!
 ### Pretraining (`pretrain.yaml`)
 Demonstrates streaming configuration for pretraining tasks using the fineweb-edu dataset
 with SmolLM2-135M.
 - Uses `pretraining_dataset` configuration for automatic streaming
 - Multipack attention control to prevent cross-attention between packed sequences
 - Buffer size configuration for memory management
 ### SFT (`sft.yaml`)
 Shows how to use streaming for supervised fine-tuning with the Alpaca dataset.
 - Explicit `streaming: true` flag for SFT datasets
 - Memory-efficient training on instruction datasets
 - Evaluation datasets are currently not streamed
 ## Key Configuration Options
 ### `streaming`
 - Enables streaming mode for standard datasets
 - Automatically enabled for `pretraining_dataset`
 ### `streaming_multipack_buffer_size`
 - Controls buffer size for sample packing (default: 10,000)
 - Larger values improve packing efficiency but use more memory
 - Adjust based on available memory
 ### `shuffle_merged_datasets`
 - Enables shuffling of streaming datasets
 - Requires additional memory for shuffle buffer
 ### `sample_packing`
 - Packs multiple samples into single sequences
 - Minimizes per-step padding tokens
 ## Performance Tips
 - Download small / frequently-used datasets locally for better performance
 - Larger buffer sizes improve packing efficiency
--- a/examples/streaming/pretrain.yaml
+++ b/examples/streaming/pretrain.yaml
@@ -1,57 +0,0 @@
 # Example config: streaming pretraining of SmolLM2-135M on the fineweb-edu
 # `sample-10BT` subset.
 base_model: HuggingFaceTB/SmolLM2-135M
 # Streaming pretraining configuration
 pretraining_dataset:
  - path: HuggingFaceFW/fineweb-edu
    name: sample-10BT
    type: pretrain
    text_column: text
    split: train
 # Streaming-specific settings
 streaming_multipack_buffer_size: 10000
 shuffle_merged_datasets: true
 # Training configuration
 max_steps: 1000
 output_dir: ./outputs/smollm2-135m-pretrain-streaming
 # Sequence and packing settings
 sequence_len: 1024
 sample_packing: true
 pretrain_multipack_attn: true  # Prevent cross-attention between packed sequences
 flash_attention: true
 # Batch size settings
 gradient_accumulation_steps: 8
 micro_batch_size: 1
 # Optimizer and scheduler
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 5e-4
 warmup_ratio: 0.1
 weight_decay: 0.01
 # Precision and performance
 bf16: auto
 tf32: true
 # Logging and checkpointing
 logging_steps: 10
 save_strategy: steps
 save_steps: 250
 save_total_limit: 3
 # Weights & Biases (optional)
 # Keys left without a value parse as null.
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 # Special tokens
 special_tokens:
  pad_token: "<|endoftext|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/streaming/sft.yaml
+++ b/examples/streaming/sft.yaml
@@ -1,55 +0,0 @@
 # Example config: streaming supervised fine-tuning of SmolLM2-135M on the
 # tatsu-lab/alpaca dataset.
 base_model: HuggingFaceTB/SmolLM2-135M
 # Dataset configuration
 datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train
 # Streaming-specific settings
 streaming: true
 streaming_multipack_buffer_size: 10000
 shuffle_merged_datasets: true
 # Training configuration
 max_steps: 1000
 output_dir: ./outputs/smollm2-135m-sft-streaming
 # Sequence and packing settings
 sequence_len: 1024
 sample_packing: true
 flash_attention: true
 # Batch size settings
 gradient_accumulation_steps: 4
 micro_batch_size: 1
 # Optimizer and scheduler
 optimizer: adamw_torch
 lr_scheduler: cosine
 learning_rate: 2e-4
 warmup_ratio: 0.1
 weight_decay: 0.0
 # Precision and performance
 bf16: auto
 tf32: true
 # Logging and checkpointing
 logging_steps: 10
 save_strategy: steps
 save_steps: 100
 save_total_limit: 3
 # Weights & Biases (optional)
 # Keys left without a value parse as null.
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 # Special tokens
 special_tokens:
  pad_token: "<|endoftext|>"
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
--- a/examples/voxtral/README.md
+++ b/examples/voxtral/README.md
@@ -22,9 +22,6 @@ pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 # audio
 pip3 install librosa==0.11.0
 pip3 install 'mistral_common[audio]==1.8.3'
 # Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
 python scripts/cutcrossentropy_install.py | sh
 ```
 3. Run the finetuning example:
--- a/requirements.txt
+++ b/requirements.txt
@@ -13,7 +13,7 @@ packaging==23.2
 huggingface_hub>=0.33.0
 peft>=0.17.0
-transformers==4.56.1
+transformers==4.55.3
 tokenizers>=0.21.1
 accelerate==1.10.0
 datasets==4.0.0
@@ -64,7 +64,7 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
-torchao==0.13.0
+torchao==0.12.0
 schedulefree==1.4.1
 axolotl-contribs-lgpl==0.0.6
--- a/setup.py
+++ b/setup.py
@@ -127,7 +127,7 @@ extras_require = {
        "yunchang==0.6.0",
    ],
    "deepspeed": [
-        "deepspeed==0.17.5",
+        "deepspeed==0.17.2",
        "deepspeed-kernels",
    ],
    "mamba-ssm": [
@@ -162,7 +162,6 @@ extras_require = {
    "llmcompressor": [
        "llmcompressor==0.5.1",
    ],
    "fbgemm-gpu": ["fbgemm-gpu-genai>=1.2.0"],
 }
 install_requires, dependency_links, extras_require_build = parse_requirements(
    extras_require
--- a/src/axolotl/cli/args.py
+++ b/src/axolotl/cli/args.py
@@ -14,13 +14,9 @@ class PreprocessCliArgs:
    prompter: Optional[str] = field(default=None)
    download: Optional[bool] = field(default=True)
    iterable: Optional[bool] = field(
-        default=False,
+        default=None,
        metadata={
-            "help": (
+            "help": "Use IterableDataset for streaming processing of large datasets"
                "Deprecated in v0.13.0, will be removed in v0.14.0. For streaming "
                "datasets, use 'axolotl train' and set 'streaming: true' in your YAML "
                "config, or pass --streaming instead in the CLI."
            )
        },
    )
@@ -115,7 +111,6 @@ class QuantizeCliArgs:
    quantize_embedding: Optional[bool] = field(default=None)
    group_size: Optional[int] = field(default=None)
    output_dir: Optional[str] = field(default=None)
    hub_model_id: Optional[str] = field(default=None)
@dataclass
--- a/src/axolotl/cli/cloud/init.py
+++ b/src/axolotl/cli/cloud/init.py
@@ -67,8 +67,16 @@ def do_cli_lm_eval(
    cloud_config: Path | str,
    config: Path | str,
 ) -> None:
-    cloud_cfg = load_cloud_cfg(cloud_config)
+    cloud_cfg: DictDefault = load_cloud_cfg(cloud_config)
-    cloud = ModalCloud(cloud_cfg)
+    provider = cloud_cfg.provider or "modal"
    cloud: Cloud | None
    if provider == "modal":
        cloud = ModalCloud(cloud_cfg)
    elif provider == "baseten":
        cloud = BasetenCloud(cloud_cfg.to_dict())
    else:
        raise ValueError(f"Unsupported cloud provider: {provider}")
    with open(config, "r", encoding="utf-8") as file:
        config_yaml = file.read()
    cloud.lm_eval(config_yaml)
--- a/src/axolotl/cli/cloud/baseten/init.py
+++ b/src/axolotl/cli/cloud/baseten/init.py
@@ -46,3 +46,23 @@ class BasetenCloud(Cloud):
            subprocess.run(  # nosec B603 B607
                ["truss", "train", "push", "train_sft.py"], cwd=tmp_dir, check=False
            )
    def lm_eval(
        self,
        config_yaml: str,
    ):
        """
        Stage an lm-eval job in a temporary directory and push it with the truss CLI.

        The staging directory receives a copy of this cloud config as `cloud.yaml`,
        the given eval config text as `eval.yaml`, and the `eval.sh` / `eval_sft.py`
        templates shipped next to this module.

        Args:
            config_yaml: Contents of the eval config, written out verbatim.
        """
        template_root = dirname(__file__) + "/template"
        with tempfile.TemporaryDirectory() as tmp_dir:
            cloud_settings = self.config.copy()
            with open(tmp_dir + "/cloud.yaml", "w", encoding="utf-8") as cloud_fout:
                yaml.dump(cloud_settings, cloud_fout)
            with open(tmp_dir + "/eval.yaml", "w", encoding="utf-8") as config_fout:
                config_fout.write(config_yaml)
            # Copy the launcher script and the job definition into the staging dir.
            for template_file in ("/eval.sh", "/eval_sft.py"):
                shutil.copyfile(template_root + template_file, tmp_dir + template_file)
            # check=False: a non-zero exit status from truss does not raise here.
            subprocess.run(  # nosec B603 B607
                ["truss", "train", "push", "eval_sft.py"], cwd=tmp_dir, check=False
            )
--- a/src/axolotl/cli/cloud/baseten/template/eval.sh
+++ b/src/axolotl/cli/cloud/baseten/template/eval.sh
@@ -0,0 +1,8 @@
 #!/bin/bash
 # Runs `axolotl lm-eval` against eval.yaml in the current directory.
 # -e: abort on the first failing command; -u: error on unset variables; -x: trace commands.
 set -eux
 # NCCL networking settings.
 # NOTE(review): per the NCCL docs a leading `^` excludes the listed interfaces
 # (docker0, lo) from socket selection — confirm this matches the target network.
 export NCCL_SOCKET_IFNAME="^docker0,lo"
 export NCCL_IB_DISABLE=0
 # NOTE(review): the consumer and units of NCCL_TIMEOUT are not shown here — confirm.
 export NCCL_TIMEOUT=1800000
 axolotl lm-eval eval.yaml
--- a/src/axolotl/cli/cloud/baseten/template/eval_sft.py
+++ b/src/axolotl/cli/cloud/baseten/template/eval_sft.py
@@ -0,0 +1,81 @@
 """
 Baseten Training Script for Axolotl
 """
 # pylint: skip-file
 import yaml
 from truss.base import truss_config
 # Import necessary classes from the Baseten Training SDK
 from truss_train import definitions
 # Load the job settings from cloud.yaml in the current working directory.
 # The file is closed deterministically; an empty file parses to None, so fall
 # back to an empty mapping to keep the `.get` lookups below working.
 with open("cloud.yaml", "r", encoding="utf-8") as cloud_config_file:
    cloud_config = yaml.safe_load(cloud_config_file) or {}
 # NOTE(review): `gpu` is read but not used below; the accelerator is fixed to H100.
 gpu = cloud_config.get("gpu", "h100")
 # int(cloud_config.get("gpu_count", 1)) # only single GPU supported at the moment
 gpu_count = 1
 # int(cloud_config.get("node_count", 1)) # only single node support for lmeval
 node_count = 1
 project_name = cloud_config.get("project_name", "axolotl-project") or "axolotl-project"
 # A null `secrets:` entry is treated the same as an empty list.
 secrets = cloud_config.get("secrets", []) or []
 # launcher = cloud_config.get("launcher", "accelerate")
 # launcher_args = cloud_config.get("launcher_args", [])
 script_name = "eval.sh"
 # launcher_args_str = ""
 # if launcher_args:
 #     launcher_args_str = "-- " + " ".join(launcher_args)
 # 1. Define a base image for your training job
 # NOTE(review): the original note said "must use torch 2.7.0 for vllm", while the
 # image tag below ends in 2.7.1 — confirm which version is actually required.
 BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu126-2.7.1"
 # 2. Define the Runtime Environment for the Training Job
 # This includes start commands and environment variables.
 # Secrets from the baseten workspace like API keys are referenced using
 # `SecretReference`.
 env_vars = {
    # "AXOLOTL_LAUNCHER": launcher,
    # "AXOLOTL_LAUNCHER_ARGS": launcher_args_str,
 }
 # Expose every configured secret as an environment variable of the same name.
 for secret_name in secrets:
    env_vars[secret_name] = definitions.SecretReference(name=secret_name)
 training_runtime = definitions.Runtime(
    start_commands=[  # Example: list of commands to run your training script
        f"/bin/sh -c 'chmod +x ./{script_name} && ./{script_name}'"
    ],
    environment_variables=env_vars,
    cache_config=definitions.CacheConfig(
        enabled=True,
    ),
    checkpointing_config=definitions.CheckpointingConfig(
        enabled=True,
    ),
 )
 # 3. Define the Compute Resources for the Training Job
 training_compute = definitions.Compute(
    node_count=node_count,
    accelerator=truss_config.AcceleratorSpec(
        accelerator=truss_config.Accelerator.H100,
        count=gpu_count,
    ),
 )
 # 4. Define the Training Job
 # This brings together the image, compute, and runtime configurations.
 my_training_job = definitions.TrainingJob(
    image=definitions.Image(base_image=BASE_IMAGE),
    compute=training_compute,
    runtime=training_runtime,
 )
 # This config will be pushed using the Truss CLI.
 # The association of the job to the project happens at the time of push.
 first_project_with_job = definitions.TrainingProject(
    name=project_name, job=my_training_job
 )
--- a/src/axolotl/cli/cloud/baseten/template/train_sft.py
+++ b/src/axolotl/cli/cloud/baseten/template/train_sft.py
@@ -44,6 +44,12 @@ training_runtime = definitions.Runtime(
        f"/bin/sh -c 'chmod +x ./{script_name} && ./{script_name}'"
    ],
    environment_variables=env_vars,
    cache_config=definitions.CacheConfig(
        enabled=True,
    ),
    checkpointing_config=definitions.CheckpointingConfig(
        enabled=True,
    ),
 )
 # 3. Define the Compute Resources for the Training Job
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -14,14 +14,10 @@ from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
 from axolotl.cli.args import InferenceCliArgs
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
-from axolotl.cli.utils.diffusion import (
+from axolotl.utils.chat_templates import (
-    diffusion_inference,
+    get_chat_template,
-    launch_diffusion_gradio_ui,
+    get_chat_template_from_config,
    render_html,
    run_diffusion,
 )
 from axolotl.integrations.base import PluginManager
 from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
@@ -36,7 +32,6 @@ def get_multi_line_input() -> str:
        Possibly multi-line, possibly empty stdin input as a string.
    """
    print("Give me an instruction (Ctrl + D to submit): ")
    print("=" * 80)
    instruction = ""
    for line in sys.stdin:
@@ -51,9 +46,9 @@ def do_inference(
    cli_args: InferenceCliArgs,
 ):
    """
-    Runs inference on the command line in a loop. User input is accepted, a chat
+    Runs inference on the command line in a loop. User input is accepted, a chat template
-    template is (optionally) applied, and the model specified in the `axolotl` config is
+    is (optionally) applied, and the model specified in the `axolotl` config is used to
-    used to generate completions according to a default generation config.
+    generate completions according to a default generation config.
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
@@ -69,31 +64,17 @@ def do_inference(
            importlib.import_module("axolotl.prompters"), prompter
        )
    elif cfg.chat_template:
-        chat_template_str = get_chat_template_from_config(
+        chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer)
-            cfg, ds_cfg=None, tokenizer=tokenizer
+    elif cfg.datasets[0].type == "chat_template":
        )
    elif cfg.datasets and cfg.datasets[0].type == "chat_template":
        chat_template_str = get_chat_template_from_config(
            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
        )
    model = model.to(cfg.device, dtype=cfg.torch_dtype)
    # Detect diffusion mode
    plugin_manager = PluginManager.get_instance()
    is_diffusion = any(
        plugin.__class__.__name__ == "DiffusionPlugin"
        for plugin in plugin_manager.plugins.values()
    )
    if is_diffusion:
        print("=" * 80)
        print("Commands:")
        print(":complete N -> completion mode with N tokens (default 64)")
        print(":mask R     -> random masking with ratio R (0.0–1.0)")
    while True:
        print("=" * 80)
        # support for multiline inputs
        instruction = get_multi_line_input()
        if not instruction:
            return
@@ -123,19 +104,9 @@ def do_inference(
        else:
            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
-        print("=" * 80)
+        print("=" * 40)
        model.eval()
        with torch.no_grad():
            if is_diffusion:
                diffusion_inference(
                    model=model,
                    tokenizer=tokenizer,
                    cfg=cfg,
                    prompt=prompt,
                    chat_template_str=chat_template_str,
                )
                continue
            generation_config = GenerationConfig(
                repetition_penalty=1.1,
                max_new_tokens=1024,
@@ -158,7 +129,7 @@ def do_inference(
                generation_config=generation_config,
                streamer=streamer,
            )
-        print("=" * 80)
+        print("=" * 40)
        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
@@ -188,33 +159,10 @@ def do_inference_gradio(
            importlib.import_module("axolotl.prompters"), prompter
        )
    elif cfg.chat_template:
-        chat_template_str = get_chat_template_from_config(
+        chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer)
            cfg, ds_cfg=None, tokenizer=tokenizer
        )
    elif cfg.datasets and cfg.datasets[0].type == "chat_template":
        chat_template_str = get_chat_template_from_config(
            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
        )
    model = model.to(cfg.device, dtype=cfg.torch_dtype)
    # Detect diffusion mode
    plugin_manager = PluginManager.get_instance()
    is_diffusion = any(
        plugin.__class__.__name__ == "DiffusionPlugin"
        for plugin in plugin_manager.plugins.values()
    )
    if is_diffusion:
        launch_diffusion_gradio_ui(
            model=model,
            tokenizer=tokenizer,
            cfg=cfg,
            prompter_module=prompter_module,
            chat_template_str=chat_template_str,
        )
        return
    def generate(instruction):
        if not instruction:
            return
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -35,20 +35,10 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    check_accelerate_default_config()
    check_user_token()
    if cli_args.iterable:
        LOG.error(
            "The --iterable CLI argument for 'axolotl preprocess' is no longer "
            "supported. For training, set 'streaming: true' in your YAML config or "
            "pass '--streaming' in your 'axolotl train' command for on-the-fly "
            "preprocessing."
        )
        return
    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
        if cfg.get(key):
            LOG.error(
-                f"You have set `{key}:`. `preprocess` is not needed. Run the 'axolotl "
+                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
                "train' CLI directly instead."
            )
            return
--- a/src/axolotl/cli/quantize.py
+++ b/src/axolotl/cli/quantize.py
@@ -5,17 +5,12 @@ CLI to post-training quantize a model using torchao
 from pathlib import Path
 from typing import Union
-from transformers import AutoConfig, AutoModelForCausalLM, TorchAoConfig
+from transformers import AutoModelForCausalLM
 from axolotl.cli.config import load_cfg
 from axolotl.loaders import load_tokenizer
 from axolotl.utils.logging import get_logger
-from axolotl.utils.quantization import (
+from axolotl.utils.quantization import TorchIntDType, quantize_model_for_ptq
    TorchAOQuantDType,
    get_quantization_config,
    quantization_config_to_str,
    quantize_model,
 )
 LOG = get_logger(__name__)
@@ -48,13 +43,13 @@ def do_quantize(
            "No quantization configuration found. Please specify either qat or quantization in your config file."
        )
-    model_path = cli_args.get("base_model") or cfg.output_dir
+    model_path = cli_args.get("model_path") or cfg.output_dir
    if weight_dtype := cli_args.get("weight_dtype"):
-        weight_dtype = TorchAOQuantDType.from_string(weight_dtype)
+        weight_dtype = TorchIntDType[weight_dtype]
    else:
        weight_dtype = quantize_cfg.weight_dtype
    if activation_dtype := cli_args.get("activation_dtype"):
-        activation_dtype = TorchAOQuantDType.from_string(activation_dtype)
+        activation_dtype = TorchIntDType[activation_dtype]
    else:
        activation_dtype = quantize_cfg.activation_dtype
    group_size = cli_args.get("group_size") or quantize_cfg.group_size
@@ -62,15 +57,10 @@ def do_quantize(
        cli_args.get("quantize_embedding") or quantize_cfg.quantize_embedding
    )
    output_dir = cli_args.get("output_dir") or cfg.output_dir
    hub_model_id = cli_args.get("hub_model_id") or cfg.hub_model_id
-    LOG.info(f"Loading model from {model_path}.")
+    LOG.info(f"Loading model from {model_path}...")
    tokenizer = load_tokenizer(cfg)
-    config = AutoConfig.from_pretrained(model_path)
+    model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto")
    torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else None
    model = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="auto", torch_dtype=torch_dtype
    )
    LOG.info(
        f"Quantizing model with configuration: \n"
@@ -80,21 +70,11 @@ def do_quantize(
        f"\tquantize_embedding: {quantize_embedding}"
    )
-    quantize_model(
+    quantize_model_for_ptq(
        model, weight_dtype, group_size, activation_dtype, quantize_embedding
    )
-    quantization_config = get_quantization_config(
+    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}...")
        weight_dtype, activation_dtype, group_size
    )
    ao_config = TorchAoConfig(
        quant_type=quantization_config,
        include_input_output_embeddings=quantize_embedding,
    )
    model.config.quantization_config = ao_config
    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}.")
    model.save_pretrained(
        str(Path(output_dir) / "quantized"),
        safe_serialization=False,
@@ -106,14 +86,4 @@ def do_quantize(
        progressbar=True,
        save_jinja_files=cfg.tokenizer_save_jinja_files,
    )
-
+    LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}...")
    if hub_model_id:
        hub_model_id = (
            hub_model_id.rstrip("-")
            + f"-{quantization_config_to_str[type(quantization_config)]}"
        )
        model.push_to_hub(hub_model_id, safe_serialization=False)
        tokenizer.push_to_hub(hub_model_id)
        LOG.info(f"Quantized model pushed to: {hub_model_id}.")
    LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}.")
--- a/src/axolotl/cli/utils/diffusion.py
+++ b/src/axolotl/cli/utils/diffusion.py
@@ -1,375 +0,0 @@
 """Helpers for diffusion-mode inference in CLI and Gradio."""
 from __future__ import annotations
 import gradio as gr
 import torch
 from colorama import Fore, Style
 from axolotl.integrations.diffusion import generate, resolve_mask_token_id
 from axolotl.utils.dict import DictDefault
 def diffusion_inference(
    model,
    tokenizer,
    cfg,
    prompt: str,
    chat_template_str: str | None = None,
 ):
    """
    Run one diffusion generation for a CLI prompt and print a colorized result.

    Leading `:complete` / `:mask` commands are parsed off the prompt first. The
    masked preview is printed when available, followed by the generated text:
    masked positions filled with the original token are green, masked positions
    filled with a different token are red, and unmasked tokens equal to the
    original are dimmed.
    """
    mode, completion_tokens, target_mask_ratio, cleaned = _parse_commands(prompt)
    if cleaned:
        prompt = cleaned
    info = run_diffusion(
        model=model,
        tokenizer=tokenizer,
        cfg=cfg,
        prompt=prompt,
        chat_template_str=chat_template_str,
        mode=mode,
        target_mask_ratio=target_mask_ratio,
        completion_tokens=completion_tokens,
    )
    masked_text = info["masked_text"]
    mask_ratio = info["mask_ratio"]
    generated_ids = info["generated_ids"]
    masked_positions = info["masked_positions"]
    orig_ids = info["orig_ids"]
    # Masked preview, shown only when both pieces are present
    if masked_text is not None and mask_ratio is not None:
        print(f"Masked ({mask_ratio:.1%}):\n{masked_text}\n")
    if generated_ids is None:
        print("Generated:\n(no output)")
        return

    def _classify(pos, token_id):
        """Name the display style for the token at `pos`."""
        if pos in masked_positions:
            if pos < len(orig_ids) and token_id == orig_ids[pos]:
                return "green"  # correct fill
            if pos < len(orig_ids):
                return "red"  # incorrect fill
            return "normal"  # appended
        unchanged = pos < len(orig_ids) and token_id == orig_ids[pos]
        return "dim" if unchanged else "normal"

    styles: list[str] = [
        _classify(pos, token_id) for pos, token_id in enumerate(generated_ids)
    ]
    # Merge neighbouring tokens that share a style into (style, start, end) spans
    styled_spans: list[tuple[str, int, int]] = []
    if generated_ids:
        span_start = 0
        for pos in range(1, len(generated_ids)):
            if styles[pos] != styles[span_start]:
                styled_spans.append((styles[span_start], span_start, pos))
                span_start = pos
        styled_spans.append((styles[span_start], span_start, len(generated_ids)))
    ansi_prefix = {"green": Fore.GREEN, "red": Fore.RED, "dim": Style.DIM}
    out_parts = []
    for style_name, begin, end in styled_spans:
        chunk_text = tokenizer.decode(
            generated_ids[begin:end], skip_special_tokens=False
        )
        prefix = ansi_prefix.get(style_name)
        if prefix is None:
            out_parts.append(chunk_text)
        else:
            out_parts.append(prefix + chunk_text + Style.RESET_ALL)
    print("Generated:\n" + "".join(out_parts))
 def _parse_commands(text: str):
    """
    Parse leading diffusion commands.
    Supported at start of input (can be chained):
      :complete N  -> completion mode with N tokens (default 64)
      :mask R      -> random masking with ratio R in [0, 1]
    """
    tokens = text.strip().split()
    i = 0
    mode = "random"
    completion_tokens = 0
    target_mask_ratio = None
    consumed = 0
    while i < len(tokens) and tokens[i].startswith(":"):
        cmd = tokens[i]
        i += 1
        consumed = i
        if cmd == ":complete":
            mode = "completion"
            if i < len(tokens):
                try:
                    completion_tokens = int(tokens[i])
                    i += 1
                    consumed = i
                except Exception:
                    completion_tokens = 64
            else:
                completion_tokens = 64
        elif cmd == ":mask":
            mode = "random"
            if i < len(tokens):
                try:
                    target_mask_ratio = float(tokens[i])
                    i += 1
                    consumed = i
                except Exception:
                    target_mask_ratio = None
        else:
            i -= 1
            consumed = i
            break
    cleaned = " ".join(tokens[consumed:])
    return mode, completion_tokens, target_mask_ratio, cleaned
 def run_diffusion(
    *,
    model,
    tokenizer,
    cfg: DictDefault,
    prompt: str,
    chat_template_str: str | None,
    mode: str = "random",
    target_mask_ratio: float | None = None,
    completion_tokens: int = 0,
 ):
    """
    Run a single diffusion generation and return a structured result dict.

    The prompt is tokenized (through the chat template when one is given), passed
    to `generate`, and the pieces needed for display are collected.

    Returns:
        Dict with keys `masked_text`, `mask_ratio`, `generated_ids`,
        `masked_positions` (a set) and `orig_ids` (token ids of the input).
    """
    if not chat_template_str:
        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
    else:
        batch = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            return_tensors="pt",
            add_special_tokens=True,
            add_generation_prompt=True,
            chat_template=chat_template_str,
            tokenize=True,
            return_dict=True,
        )
    mask_token_id = resolve_mask_token_id(tokenizer, cfg, allow_add=False)
    seq = batch["input_ids"].to(cfg.device)
    # Any mode other than "completion" falls back to random masking.
    if mode == "completion":
        gen_mode = "completion"
        comp_tokens = int(completion_tokens)
    else:
        gen_mode = "random"
        comp_tokens = 0
    result = generate(
        model,
        tokenizer,
        original_sequence=seq[:1],
        num_diffusion_steps=cfg.diffusion.num_diffusion_steps,
        temperature=cfg.diffusion.generation_temperature,
        mask_token_id=int(mask_token_id),
        mode=gen_mode,  # type: ignore[arg-type]
        completion_tokens=comp_tokens,
        target_mask_ratio=target_mask_ratio,
    )
    # A non-dict result yields None / empty values for every field.
    fields = result if isinstance(result, dict) else {}
    masked_text = fields.get("masked")
    mask_ratio = fields.get("mask_ratio")
    generated_ids = fields.get("generated_ids")
    masked_positions = set(fields.get("masked_positions") or [])
    orig_ids = seq[0].detach().cpu().tolist()
    return {
        "masked_text": masked_text,
        "mask_ratio": mask_ratio,
        "generated_ids": generated_ids,
        "masked_positions": masked_positions,
        "orig_ids": orig_ids,
    }
 def render_html(
    *,
    generated_ids: list[int] | None,
    orig_ids: list[int],
    masked_positions: set[int],
    tokenizer,
 ) -> str:
    """Render HTML visualizing diffusion outputs."""
    if not generated_ids:
        return "<pre>Generated:\n(no output)</pre>"
    def _style_for(i: int, tid: int) -> str:
        if i in masked_positions:
            if i < len(orig_ids) and tid == orig_ids[i]:
                return "green"
            if i < len(orig_ids):
                return "red"
            return "normal"
        same = i < len(orig_ids) and tid == orig_ids[i]
        return "dim" if same else "normal"
    # Group contiguous spans by style to reduce HTML size
    spans: list[tuple[str, int, int]] = []
    if generated_ids:
        cur = _style_for(0, generated_ids[0])
        start = 0
        for i in range(1, len(generated_ids)):
            s = _style_for(i, generated_ids[i])
            if s != cur:
                spans.append((cur, start, i))
                cur, start = s, i
        spans.append((cur, start, len(generated_ids)))
    html_parts = []
    for style_name, a, b in spans:
        txt = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False)
        if style_name == "green":
            html_parts.append(f'<span style="color:#2e7d32">{txt}</span>')
        elif style_name == "red":
            html_parts.append(f'<span style="color:#c62828">{txt}</span>')
        elif style_name == "dim":
            html_parts.append(f'<span style="opacity:0.6">{txt}</span>')
        else:
            html_parts.append(txt)
    legend = (
        '<div style="font-size:0.9em;margin-bottom:4px">'
        '<span style="color:#2e7d32">correct</span>, '
        '<span style="color:#c62828">incorrect</span>, '
        '<span style="opacity:0.6">unchanged</span>'
        "</div>"
    )
    return (
        legend
        + '<pre style="white-space:pre-wrap">Generated:\n'
        + "".join(html_parts)
        + "</pre>"
    )
 def launch_diffusion_gradio_ui(
    *,
    model,
    tokenizer,
    cfg: DictDefault,
    prompter_module=None,
    chat_template_str: str | None = None,
 ):
    """
    Build and launch a simple Gradio UI for diffusion inference.

    The page offers a mode selector ("random" or "completion"), a mask-ratio
    slider, a completion-token input, an instruction box and a Generate button.
    Results are shown as a masked-text preview plus an HTML rendering of the
    generated tokens.

    Args:
        model: Model forwarded to `run_diffusion`.
        tokenizer: Tokenizer forwarded to `run_diffusion` and `render_html`.
        cfg: Config; read here for `gradio_title`, `gradio_share`,
            `gradio_server_name` and `gradio_server_port`.
        prompter_module: Optional prompter class; when given, its `build_prompt`
            output is used as the prompt instead of the raw instruction.
        chat_template_str: Optional chat template forwarded to `run_diffusion`.
    """
    with gr.Blocks(
        title=cfg.get("gradio_title", "Axolotl Diffusion Interface")
    ) as demo:
        gr.Markdown(
            """
            ## Axolotl Diffusion Inference
            - Mode "Random" masks tokens at a target ratio and fills them.
            - Mode "Completion" appends N masked tokens at the end and fills them.
            """
        )
        with gr.Row():
            mode = gr.Radio(
                choices=["random", "completion"],
                value="random",
                label="Mode",
            )
            mask_ratio = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.4,
                label="Mask ratio (random mode)",
                interactive=True,
            )
            # Starts hidden because the initial mode is "random".
            completion_tokens = gr.Number(
                value=64,
                precision=0,
                label="Completion tokens (completion mode)",
                interactive=True,
                visible=False,
            )
        instruction = gr.Textbox(label="Instruction", lines=6)
        run_btn = gr.Button("Generate")
        masked_preview = gr.Textbox(label="Masked preview", lines=6)
        html_out = gr.HTML(label="Generated")
        def _toggle_controls(selected_mode: str):
            # Returns visibility updates in the order (mask_ratio, completion_tokens),
            # matching the `outputs` list wired up below.
            return (
                gr.update(visible=(selected_mode == "random")),
                gr.update(visible=(selected_mode == "completion")),
            )
        mode.change(
            _toggle_controls,
            inputs=[mode],
            outputs=[mask_ratio, completion_tokens],
        )
        def _gen(instruction_text: str, selected_mode: str, mratio: float, ctoks: int):
            # Click handler: returns (masked preview text, generated HTML).
            if not instruction_text:
                return "", "<pre>Generated:\n(no output)</pre>"
            if prompter_module:
                prompt: str = next(
                    prompter_module().build_prompt(
                        instruction=instruction_text.strip("\n")
                    )
                )
            else:
                prompt = instruction_text.strip()
            # Only the control that belongs to the selected mode is passed through.
            info = run_diffusion(
                model=model,
                tokenizer=tokenizer,
                cfg=cfg,
                prompt=prompt,
                chat_template_str=chat_template_str,
                mode=selected_mode,
                target_mask_ratio=mratio if selected_mode == "random" else None,
                completion_tokens=int(ctoks) if selected_mode == "completion" else 0,
            )
            masked_text = info.get("masked_text")
            mask_ratio_val = info.get("mask_ratio")
            generated_ids = info.get("generated_ids")
            masked_positions = info.get("masked_positions") or set()
            orig_ids = info.get("orig_ids") or []
            # The preview is left empty unless both the text and the ratio are present.
            preview = (
                f"Masked ({mask_ratio_val:.1%}):\n{masked_text}"
                if masked_text is not None and mask_ratio_val is not None
                else ""
            )
            html = render_html(
                generated_ids=generated_ids,
                orig_ids=orig_ids,
                masked_positions=masked_positions,
                tokenizer=tokenizer,
            )
            return preview, html
        run_btn.click(
            _gen,
            inputs=[instruction, mode, mask_ratio, completion_tokens],
            outputs=[masked_preview, html_out],
        )
        # `share` defaults to True when `gradio_share` is not set in the config.
        demo.queue().launch(
            show_api=False,
            share=cfg.get("gradio_share", True),
            server_name=cfg.get("gradio_server_name", "127.0.0.1"),
            server_port=cfg.get("gradio_server_port", None),
        )
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -55,11 +55,13 @@ def load_datasets(
    """
    tokenizer = load_tokenizer(cfg)
    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
    preprocess_iterable = getattr(cli_args, "iterable", False)
    train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
        cfg,
        tokenizer,
        processor=processor,
        preprocess_iterable=preprocess_iterable,
    )
    if (
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -36,6 +36,7 @@ from axolotl.utils.callbacks import (
    SaveModelOnFirstStepCallback,
 )
 from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
 from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
 from axolotl.utils.distributed import build_parallelism_config
 from axolotl.utils.schemas.enums import CustomSupportedOptimizers
@@ -144,6 +145,12 @@ class TrainerBuilderBase(abc.ABC):
                    profiler_steps_start=self.cfg.profiler_steps_start,
                )
            )
        if self.cfg.include_tkps:
            callbacks.append(
                TokensPerSecondCallback(
                    self.cfg.tensor_parallel_size, self.cfg.context_parallel_size
                )
            )
        return callbacks
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -10,7 +10,6 @@ import transformers
 from transformers import (
    DataCollatorWithFlattening,
    EarlyStoppingCallback,
    Trainer,
 )
 from trl.trainer.utils import RewardDataCollatorWithPadding
@@ -36,7 +35,6 @@ from axolotl.utils.callbacks import (
 )
 from axolotl.utils.callbacks.lisa import lisa_callback_factory
 from axolotl.utils.callbacks.qat import QATCallback
 from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
 from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
@@ -76,12 +74,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.qat:
            callbacks.append(QATCallback(self.cfg.qat))
        if self.cfg.include_tkps:
            callbacks.append(
                TokensPerSecondCallback(
                    self.cfg.tensor_parallel_size, self.cfg.context_parallel_size
                )
            )
        return callbacks
    def get_post_trainer_create_callbacks(self, trainer):
@@ -348,10 +340,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.reward_model:
            training_args_cls = AxolotlRewardConfig
            if self.cfg.center_rewards_coefficient is not None:
                training_arguments_kwargs["center_rewards_coefficient"] = (
                    self.cfg.center_rewards_coefficient
                )
        elif self.cfg.process_reward_model:
            training_args_cls = AxolotlPRMConfig
        else:
@@ -395,11 +383,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                **data_collator_kwargs,
            )
        sig = inspect.signature(trainer_cls)
-        if "processing_class" in sig.parameters or issubclass(trainer_cls, Trainer):
+        if "processing_class" in sig.parameters:
            trainer_kwargs["processing_class"] = self.tokenizer
        elif "tokenizer" in sig.parameters:
            trainer_kwargs["tokenizer"] = self.tokenizer
        if (
            trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
            and self.cfg.datasets is not None
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -49,13 +49,6 @@ from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 LOG = get_logger(__name__)
 REDUCTION_FNS = {
    "mean": torch.mean,
    "min": torch.min,
    "max": torch.max,
    "sum": torch.sum,
 }
 class AxolotlTrainer(
    PackingMixin,
@@ -96,9 +89,7 @@ class AxolotlTrainer(
        super().__init__(*_args, **kwargs)
        self.train_data_collator = self.data_collator
-        self._stored_metrics = defaultdict(
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
            lambda: defaultdict(lambda: {"values": [], "reduction": "mean"})
        )
        if self.args.orpo_alpha:
            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
@@ -351,10 +342,10 @@ class AxolotlTrainer(
            inputs_key = "labels" if "labels" in inputs else "input_ids"
            if hasattr(self.state, "num_tokens"):
                self.state.num_tokens = (
-                    self.state.num_tokens + (inputs[inputs_key] != -100).sum().cpu()
+                    self.state.num_tokens + (inputs[inputs_key] != -100).sum()
                )
            else:
-                self.state.num_tokens = (inputs[inputs_key] != -100).sum().cpu()
+                self.state.num_tokens = (inputs[inputs_key] != -100).sum()
        if self.args.orpo_alpha:
            return self.orpo_compute_loss(
@@ -371,11 +362,6 @@ class AxolotlTrainer(
            num_items_in_batch=num_items_in_batch,
        )
    @override
    def evaluate(self, *args, **kwargs):
        """Log that an evaluation pass is starting, then defer to the parent class.

        All positional and keyword arguments are forwarded unchanged, and the
        parent's return value is handed back as-is.
        """
        LOG.info("Running evaluation step...")
        eval_output = super().evaluate(*args, **kwargs)
        return eval_output
    @staticmethod
    def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
        concatenated_batch = {}
@@ -599,17 +585,9 @@ class AxolotlTrainer(
        """
        # logs either has 'loss' or 'eval_loss'
        train_eval = "train" if "loss" in logs else "eval"
-
+        # Add averaged stored metrics to logs
-        for key, metric_data in self._stored_metrics[train_eval].items():
+        for key, metrics in self._stored_metrics[train_eval].items():
-            values = torch.tensor(metric_data["values"])  # type: ignore[arg-type]
+            logs[key] = torch.tensor(metrics).mean().item()
            reduction_type = metric_data["reduction"]
            fn = REDUCTION_FNS.get(reduction_type)
            if fn is None:
                raise NotImplementedError(
                    "Metric reduction must be one of [mean, min, max, sum]"
                )
            logs[key] = round(fn(values).item(), 4)
        if is_main_process():
            # Add memory usage
@@ -633,27 +611,10 @@ class AxolotlTrainer(
        return super().log(logs, start_time)
    def store_metrics(
-        self,
+        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
        metrics: dict[str, float] | dict[str, tuple[int | float, str]],
        train_eval: Literal["train", "eval"] = "train",
        reduction: Literal["mean", "min", "max", "sum"] = "mean",
    ) -> None:
        """
        Store metrics with specified reduction type.
        Args:
            metrics: Dictionary of metric names to values, or metric names to (value,
                reduction_type) tuples.
            train_eval: Whether this is for training or evaluation.
        """
        for key, value in metrics.items():
-            if isinstance(value, tuple):
+            self._stored_metrics[train_eval][key].append(value)
                value, _reduction = value  # type: ignore[assignment]
            else:
                value, _reduction = value, reduction
            self._stored_metrics[train_eval][key]["values"].append(value)
            self._stored_metrics[train_eval][key]["reduction"] = _reduction
    def _save_checkpoint(self, model, trial, **kwargs):
        # make sure the checkpoint dir exists, since trainer is flakey
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,17 +1,18 @@
-"""
+"""Module containing Dataset functionality"""
 Module containing dataset functionality.
 We want this to be a wrapper for an existing dataset that we have loaded. Lets use the
 concept of middlewares to wrap each dataset. We'll use the collators later on to pad the
 datasets.
 """
 import torch
 from datasets import Dataset, IterableDataset
 from axolotl.utils.logging import get_logger
 from .prompt_tokenizers import PromptTokenizingStrategy
 # We want this to be a wrapper for an existing dataset that we have loaded
 # lets use the concept of middlewares to wrap each dataset, for example
 # ConstantLengthDataset(ShuffledDataset([TokenizedPromptDataset(alpaca_dataset)]))
 # let's check to ensure we don't truncate an item in the middle, we'll use
 # the collators later on to pad the datasets
 LOG = get_logger(__name__)
@@ -85,3 +86,133 @@ def wrap_dataset_for_tokenized_prompt(
            **map_kwargs,
        )
    return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)
 # TODO this isn't the best since it can't interleave datasets
 class ConstantLengthDataset(IterableDataset):
    """Iterable dataset that packs tokenized examples into sequences of at most
    `seq_length` tokens.
    Examples are read from each dataset in turn and accumulated in a buffer. The
    buffer is emitted when the next example would not fit, or when the current
    dataset is exhausted. Emitted sequences are truncated to `seq_length`; no
    padding is applied here. Examples longer than `seq_length` are dropped.
    Args:
        tokenizer: Tokenizer whose `eos_token_id` is used as the separator between
            packed examples and whose vocabulary size selects the token dtype.
        datasets: Iterables of tokenized examples; each example is read through
            its `input_ids`, `attention_mask` and `labels` entries.
        seq_length: Maximum length of each yielded sequence.
    """
    def __init__(
        self,
        tokenizer,
        datasets,
        seq_length=2048,
    ):
        self.tokenizer = tokenizer
        # Token appended between examples that do not already end with it.
        self.concat_token_id = tokenizer.eos_token_id
        self.datasets: list[IterableDataset] = datasets
        self.seq_length = seq_length
        # Use the narrowest signed integer dtype that can represent the vocab size.
        # NOTE(review): the same dtype is reused below for labels and position_ids;
        # confirm it is wide enough for seq_length when the vocab is small.
        vocab_size = len(tokenizer.get_vocab())
        if vocab_size <= torch.iinfo(torch.int16).max:
            self.tokens_dtype = torch.int16
        elif vocab_size <= torch.iinfo(torch.int32).max:
            self.tokens_dtype = torch.int32
        else:
            self.tokens_dtype = torch.int64
    def __iter__(self):
        # Per-field lists of 1-D tensors, one tensor per buffered example.
        buffer = {
            "input_ids": [],
            "attention_mask": [],
            "labels": [],
            "position_ids": [],
        }
        buffer_len = 0
        for dataset in self.datasets:
            # `idx` counts examples and is multiplied into the attention mask below,
            # so each example in a packed sequence carries its own mask value.
            idx = 0
            iterator = iter(dataset)
            more_examples = True
            while more_examples:
                try:
                    example = next(iterator)
                    idx += 1
                except StopIteration:
                    # Run the loop body once more with no example so the remaining
                    # buffer is flushed.
                    more_examples = False
                    example = None
                add_concat_token = False
                if example:
                    example_len = len(example["input_ids"])
                    add_concat_token = example["input_ids"][-1] != self.concat_token_id
                else:
                    example_len = 0
                # Flush when there is nothing to add (end of dataset) or when adding
                # this example, plus its separator, would exceed seq_length.
                if not example_len or (
                    buffer_len + int(add_concat_token) + example_len > self.seq_length
                ):
                    if buffer["input_ids"]:
                        input_ids = torch.cat(buffer["input_ids"], dim=-1)[
                            : self.seq_length
                        ]
                        attention_mask = torch.cat(buffer["attention_mask"], dim=-1)[
                            : self.seq_length
                        ]
                        position_ids = torch.cat(buffer["position_ids"], dim=-1)[
                            : self.seq_length
                        ]
                        labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length]
                        # Only emit the packed sequence when the fields line up;
                        # position_ids is not part of this check.
                        if labels.size() == input_ids.size() and (
                            attention_mask.size() == input_ids.size()
                        ):
                            yield {
                                "input_ids": input_ids,
                                "labels": labels,
                                "attention_mask": attention_mask,
                                "position_ids": position_ids,
                            }
                        else:
                            LOG.warning(
                                "Dropping batch due to tensor size mismatch "
                                f"input_ids: {input_ids.size()}, "
                                f"labels: {labels.size()}, "
                                f"attention_mask: {attention_mask.size()}"
                            )
                    # Start a fresh buffer; the example that triggered the flush
                    # becomes its first entry with mask value 1.
                    buffer = {
                        "input_ids": [],
                        "attention_mask": [],
                        "labels": [],
                        "position_ids": [],
                    }
                    buffer_len = 0
                    idx = 1
                if example:
                    # FIXME
                    # just going to drop data points that are too long
                    if len(example["input_ids"]) <= self.seq_length:
                        # These names alias the example's own sequences, so the
                        # appends below modify the example in place (assumes they
                        # are Python lists - TODO confirm).
                        input_ids = example["input_ids"]
                        attention_mask = example["attention_mask"]
                        labels = example["labels"]
                        if add_concat_token:
                            input_ids.append(self.concat_token_id)
                            attention_mask.append(1)
                            # The separator is also used as its own label.
                            labels.append(self.concat_token_id)
                        input_ids_with_concat = torch.tensor(
                            input_ids, dtype=self.tokens_dtype
                        )
                        # Scale the mask by the example counter to tag this example.
                        attention_mask_with_concat = torch.tensor(
                            [idx * m for m in attention_mask], dtype=torch.int16
                        )
                        labels_with_concat = torch.tensor(
                            labels, dtype=self.tokens_dtype
                        )
                        # Positions restart at 0 for every example.
                        position_ids = torch.arange(
                            len(input_ids), dtype=self.tokens_dtype
                        )
                        buffer["input_ids"].append(input_ids_with_concat)
                        buffer["attention_mask"].append(attention_mask_with_concat)
                        buffer["labels"].append(labels_with_concat)
                        buffer["position_ids"].append(position_ids)
                        buffer_len += len(input_ids)
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -142,7 +142,7 @@ class BasePlugin:
            model: The loaded model.
        """
-    def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
+    def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
        """Returns a custom class for the trainer.
        Args:
--- a/src/axolotl/integrations/config.py
+++ b/src/axolotl/integrations/config.py
@@ -20,8 +20,8 @@ from typing import Any, Dict, List, Type
 from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
    AxolotlInputConfig as AxolotlInputConfigBase,
 )
 from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase
 def merge_input_args():
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -34,7 +34,6 @@ plugins:
 - arcee
 - cohere
 - cohere2
 - deepseek_v3
 - gemma
 - gemma2
 - gemma3
@@ -43,7 +42,6 @@ plugins:
 - gemma3n_text
 - glm
 - glm4
 - glm4_moe
 - gpt_oss
 - granite
 - granitemoe
@@ -66,7 +64,6 @@ plugins:
 - qwen3
 - qwen3_moe
 - smollm3
 - seed_oss
 - voxtral
 ## Citation
--- a/src/axolotl/integrations/diffusion/README.md
+++ b/src/axolotl/integrations/diffusion/README.md
@@ -1,154 +0,0 @@
 # Diffusion LM Training Plugin for Axolotl
 This plugin enables diffusion language model training using an approach inspired by
 LLaDA (Large Language Diffusion Models) within Axolotl.
 ## Overview
 LLaDA is a diffusion-based approach to language model training that uses:
 - **Random token masking** during training instead of next-token prediction
 - **Bidirectional attention** to allow the model to attend to the full context
 - **Importance weighting** based on masking probabilities for stable training
 This approach can lead to more robust language models with better understanding of
 bidirectional context.
 ## Installation
 The plugin is included with Axolotl. See our
 [installation docs](https://docs.axolotl.ai/docs/installation.html).
 ## Quickstart
 Train with an example config (Llama‑3.2 1B):
   - Pretrain: `axolotl train examples/llama-3/diffusion-3.2-1b-pretrain.yaml`
   - SFT: `axolotl train examples/llama-3/diffusion-3.2-1b-sft.yaml`
 ### Basic Configuration
 You can also modify your existing configs to enable / customize diffusion training.
 Add the following to your Axolotl config:
 ```yaml
 # Enable diffusion LM training plugin
 plugins:
  - axolotl.integrations.diffusion.DiffusionPlugin
 ```
 Then configure the nested `diffusion` block (defaults shown):
 ```yaml
 diffusion:
  noise_schedule: linear  # or "cosine"
  min_mask_ratio: 0.1
  max_mask_ratio: 0.9
  num_diffusion_steps: 128
  eps: 1e-3
  importance_weighting: true
  # Mask token (training auto-adds if missing, avoid pad/eos)
  mask_token_str: "<|diffusion_mask|>"
  # Or use an existing special token id (e.g., 128002 for Llama-3.x)
  # mask_token_id: 128002
  # Sample generation during training (optional)
  generate_samples: true
  generation_interval: 100
  num_generation_samples: 3
  generation_steps: 128
  generation_temperature: 0.0
  generation_max_length: 100
 ```
 ## Supported Models
 Any model that supports 4D attention masks should work out of the box. If not, please
 create an [issue](https://github.com/axolotl-ai-cloud/axolotl/issues) or open a
 [PR](https://github.com/axolotl-ai-cloud/axolotl/compare)!
 ## How It Works
 ### Random Masking
 During training, tokens are randomly masked:
 - Sample timestep `t` uniformly from [0, 1]
 - Calculate masking probability: `p = (1 - eps) * t + eps`
 - Randomly mask tokens with probability `p`
 ### Diffusion Loss
 Loss is computed only on masked tokens with (optional) importance weighting:
 ```python
 loss = sum(cross_entropy(pred, target) / p_mask) / total_tokens
 ```
 ## Sample Generation
 When `diffusion.generate_samples: true`, the plugin generates samples during training:
 ```
 Sample 1:
   Original (45 tokens): The quick brown fox jumps over the lazy dog...
   Masked (18/45 tokens, 40.0%): The [MASK] [MASK] fox [MASK] over [MASK] lazy [MASK]...
   Generated: The quick brown fox jumps over the lazy dog...
 ```
 Samples are logged to console and wandb (if enabled).
 ## Inference
 Diffusion inference is integrated into the standard Axolotl CLI. Use the same config
 you trained with and run:
 ```
 axolotl inference path/to/your-config.yaml
 ```
 Optionally, pass `--gradio` to use a simple web interface.
 Interactive controls (prefix the prompt with commands):
 - `:complete N` → completion mode with N new masked tokens appended (default 64)
 - `:mask R` → random masking mode with target mask ratio R in [0.0, 1.0]
 Example session:
 ```
 ================================================================================
 Commands:
 :complete N -> completion mode with N tokens (default 64)
 :mask R     -> random masking with ratio R (0.0–1.0)
 ================================================================================
 Give me an instruction (Ctrl + D to submit):
 :mask 0.4 The quick brown fox jumps over the lazy dog
 Masked (40.0%):
 The [MASK] brown [MASK] jumps over the [MASK] dog
 Generated:
 The quick brown fox jumps over the loud dog
 ```
 ## Metrics and Monitoring
 The plugin adds (or modifies) several metrics to track diffusion training:
 - `train/loss`: Weighted diffusion loss
 - `train/accuracy`: Accuracy on masked tokens
 - `train/mask_ratio`: Average fraction of tokens masked
 - `train/num_masked_tokens`: Number of tokens masked
 - `train/avg_p_mask`: Average masking probability
 - `train/ce_loss`: Unweighted cross-entropy loss
 - `train/importance_weight_avg`: Average importance weight
 ## Limitations
 - No flash attention support
 - No RL training support
 ## References
 - [LLaDA Paper](https://arxiv.org/abs/2502.09992)
 - [Axolotl Documentation](https://docs.axolotl.ai/)
 - [API reference for plugin](https://docs.axolotl.ai/docs/api/integrations.diffusion.args.html#axolotl.integrations.diffusion.args)
--- a/src/axolotl/integrations/diffusion/init.py
+++ b/src/axolotl/integrations/diffusion/init.py
@@ -1,19 +0,0 @@
 """Diffusion LM training plugin init."""
 from .args import DiffusionArgs, DiffusionConfig
 from .callbacks import DiffusionGenerationCallback
 from .generation import generate
 from .plugin import DiffusionPlugin
 from .trainer import DiffusionTrainer
 from .utils import create_bidirectional_attention_mask, resolve_mask_token_id
 __all__ = [
    "DiffusionArgs",
    "DiffusionPlugin",
    "DiffusionTrainer",
    "generate",
    "resolve_mask_token_id",
    "create_bidirectional_attention_mask",
    "DiffusionGenerationCallback",
    "DiffusionConfig",
 ]
--- a/src/axolotl/integrations/diffusion/args.py
+++ b/src/axolotl/integrations/diffusion/args.py
@@ -1,95 +0,0 @@
 """Config args for diffusion LM training (nested under `diffusion:`)."""
 from __future__ import annotations
 from typing import Literal
 from pydantic import BaseModel, Field, model_validator
 class DiffusionConfig(BaseModel):
    """Schema for the options accepted under the `diffusion` config key.

    The fields fall into three groups: the noise schedule, training-time loss and
    mask-token options, and periodic sample generation. A post-validation check
    rejects an inverted mask-ratio range.
    """

    # ---- Noise schedule ----
    noise_schedule: Literal["linear", "cosine"] = Field(
        default="linear",
        description="Type of noise schedule for diffusion training",
    )
    min_mask_ratio: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum masking ratio for diffusion noise schedule",
    )
    max_mask_ratio: float = Field(
        default=0.9,
        ge=0.0,
        le=1.0,
        description="Maximum masking ratio for diffusion noise schedule",
    )
    num_diffusion_steps: int = Field(
        default=128,
        ge=1,
        description="Number of diffusion timesteps",
    )
    eps: float = Field(
        default=1e-3,
        ge=0.0,
        le=1.0,
        description="Epsilon value for minimum masking probability in forward process",
    )

    # ---- Training ----
    importance_weighting: bool = Field(
        default=True,
        description="Apply importance weighting to loss based on masking probability",
    )
    mask_token_id: int | None = Field(
        default=None,
        description=(
            "Token ID to use for masking. Unset by default; can use one of the "
            "tokenizer's special tokens here."
        ),
    )
    mask_token_str: str | None = Field(
        default=None,
        description=(
            "Token string to use as a mask. If `mask_token_id` is invalid or unset, "
            "this token will be ensured to exist as an additional special token and "
            "used. If absent, a default '<|diffusion_mask|>' will be added."
        ),
    )

    # ---- Sample generation ----
    generate_samples: bool = Field(
        default=True,
        description="Enable sample generation during training",
    )
    generation_interval: int = Field(
        default=100,
        ge=1,
        description="Generate samples every N steps",
    )
    num_generation_samples: int = Field(
        default=3,
        ge=1,
        description="Number of samples to generate each time",
    )
    generation_steps: int = Field(
        default=128,
        ge=1,
        description="Number of diffusion steps for generation",
    )
    generation_temperature: float = Field(
        default=0.0,
        ge=0.0,
        description="Temperature for generation sampling (0.0 = deterministic)",
    )
    generation_max_length: int = Field(
        default=100,
        ge=1,
        description="Maximum sequence length for generation",
    )

    @model_validator(mode="after")
    def _validate_mask_ratios(self) -> "DiffusionConfig":
        """Raise `ValueError` when the minimum mask ratio exceeds the maximum."""
        lower, upper = self.min_mask_ratio, self.max_mask_ratio
        if lower > upper:
            raise ValueError("min_mask_ratio must be ≤ max_mask_ratio")
        return self
 class DiffusionArgs(BaseModel):
    """Plugin-level schema that contributes the single nested `diffusion` field."""

    # A default-constructed DiffusionConfig is used when no value is supplied.
    diffusion: DiffusionConfig = Field(
        description="Diffusion training configuration. Only nested block is supported.",
        default_factory=DiffusionConfig,
    )
--- a/src/axolotl/integrations/diffusion/callbacks.py
+++ b/src/axolotl/integrations/diffusion/callbacks.py
@@ -1,174 +0,0 @@
 """Callbacks for diffusion training."""
 import logging
 import sys
 import wandb
 from colorama import Fore, Style
 from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
 from transformers.training_args import TrainingArguments
 from .generation import generate_samples
 # Simpler logger for more readable sample generation
 logger = logging.getLogger(__name__)
 if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(handler)
    logger.propagate = False
 logger.setLevel(logging.INFO)
 class DiffusionGenerationCallback(TrainerCallback):
    """Trainer callback that periodically generates and logs diffusion samples."""

    def __init__(self, trainer):
        # Keep a handle on the trainer: its cfg, model, dataloaders and
        # processing_class are all read when samples are generated and logged.
        self.trainer = trainer

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Generate and log samples every `generation_interval` steps.

        Nothing happens on step 0, on steps that are not a multiple of the
        configured interval, or when the trainer state reports that this is not
        the world-zero process.
        """
        step = state.global_step
        is_due = (
            step > 0 and step % self.trainer.cfg.diffusion.generation_interval == 0
        )
        if not is_due:
            return
        if not self.trainer.state.is_world_process_zero:
            return

        dataloader = self._select_dataloader()
        diffusion_cfg = self.trainer.cfg.diffusion
        samples = generate_samples(
            model=self.trainer.model,
            tokenizer=self.trainer.processing_class,
            dataloader=dataloader,
            num_generation_samples=diffusion_cfg.num_generation_samples,
            max_length=diffusion_cfg.generation_max_length,
            num_diffusion_steps=diffusion_cfg.generation_steps,
            temperature=diffusion_cfg.generation_temperature,
            mask_token_id=diffusion_cfg.mask_token_id,
        )
        self._log_samples(samples, step)

    def _select_dataloader(self):
        """Return the eval dataloader when one can be built, else the train one."""
        chosen = None
        try:
            if getattr(self.trainer, "eval_dataset", None) is not None:
                chosen = self.trainer.get_eval_dataloader()
        except Exception:
            # Any failure building the eval dataloader falls through to training data.
            chosen = None
        if chosen is None:
            chosen = self.trainer.get_train_dataloader()
        return chosen

    def _colorize_generation(self, gen_ids, orig_ids, masked_positions) -> str:
        """Decode `gen_ids` into text with per-run terminal styling.

        Masked positions are green when the generated token equals the original
        and red when it differs; unmasked positions that equal the original are
        dimmed. Everything else (including positions beyond the original's
        length) is left unstyled.
        """
        orig_len = len(orig_ids)
        styles: list[str] = []
        for pos, token_id in enumerate(gen_ids):
            in_range = pos < orig_len
            unchanged = in_range and token_id == orig_ids[pos]
            if pos in masked_positions:
                if unchanged:
                    styles.append("green")
                elif in_range:
                    styles.append("red")
                else:
                    styles.append("normal")
            else:
                styles.append("dim" if unchanged else "normal")

        # Collapse consecutive positions sharing a style into (style, start, end).
        spans: list[tuple[str, int, int]] = []
        run_start = 0
        for pos in range(1, len(styles) + 1):
            if pos == len(styles) or styles[pos] != styles[run_start]:
                spans.append((styles[run_start], run_start, pos))
                run_start = pos

        prefixes = {"green": Fore.GREEN, "red": Fore.RED, "dim": Style.DIM}
        pieces = []
        for style_name, begin, end in spans:
            text = self.trainer.processing_class.decode(
                gen_ids[begin:end], skip_special_tokens=False
            )
            prefix = prefixes.get(style_name)
            if prefix is None:
                pieces.append(text)
            else:
                pieces.append(prefix + text + Style.RESET_ALL)
        return "".join(pieces)

    def _log_samples(self, samples: list, step: int):
        """Write the samples to the logger and, when enabled, to a wandb table."""
        if not samples:
            return

        banner = "=" * 60
        logger.info(banner)
        logger.info("GENERATED SAMPLES")
        logger.info(banner)

        for sample_no, sample_data in enumerate(samples, 1):
            original = sample_data["original"]
            masked = sample_data["masked"]
            generated = sample_data["generated"]
            mask_ratio = sample_data["mask_ratio"]
            masked_tokens = sample_data["masked_tokens"]
            total_tokens = sample_data["total_tokens"]

            logger.info(f"\nSample {sample_no}:")
            logger.info(f"\tOriginal ({total_tokens} tokens): {original}")
            logger.info(
                f"\tMasked ({masked_tokens}/{total_tokens} tokens, "
                f"{mask_ratio:.1%}): {masked}"
            )

            # Coloured output is best-effort: any failure falls back to plain text.
            try:
                gen_ids = sample_data.get("generated_ids")
                orig_ids = sample_data.get("orig_ids")
                masked_positions = set(sample_data.get("masked_positions") or [])
                if isinstance(gen_ids, list) and isinstance(orig_ids, list):
                    logger.info(
                        "\tGenerated:\n%s",
                        self._colorize_generation(gen_ids, orig_ids, masked_positions),
                    )
                else:
                    logger.info(f"\tGenerated: {generated}")
            except Exception:
                logger.info(f"\tGenerated: {generated}")

        logger.info(banner)

        if not self.trainer.cfg.use_wandb or wandb.run is None:
            return

        columns = [
            "step",
            "original",
            "masked",
            "generated",
            "mask_ratio",
            "masked_tokens",
            "total_tokens",
        ]
        rows = [
            [
                step,
                sample["original"],
                sample["masked"],
                sample["generated"],
                f"{sample['mask_ratio']:.1%}",
                sample["masked_tokens"],
                sample["total_tokens"],
            ]
            for sample in samples
        ]
        wandb.log(
            {"generated_samples": wandb.Table(columns=columns, data=rows)},
            step=step,
        )
--- a/src/axolotl/integrations/diffusion/generation.py
+++ b/src/axolotl/integrations/diffusion/generation.py
@@ -1,409 +0,0 @@
 """Sample generation utilities for diffusion training."""
 import re
 from typing import Any, List, Literal, Optional
 import torch
 from axolotl.utils.logging import get_logger
 from .utils import create_bidirectional_attention_mask
 LOG = get_logger(__name__)
 def generate_samples(
    model: torch.nn.Module,
    tokenizer: Any,
    dataloader: Optional[Any] = None,
    num_generation_samples: int = 3,
    max_length: int = 100,
    num_diffusion_steps: int = 128,
    temperature: float = 0.0,
    mask_token_id: int = 32000,
    mode: Literal["random", "completion"] = "random",
    completion_tokens: int = 0,
    target_mask_ratio: Optional[float] = None,
 ) -> List[dict]:
    """
    Generate text samples using the diffusion model by masking sequences drawn from
    the given dataloader and running the reverse diffusion process on each of them.

    The model is switched to eval mode for the duration of the call and its previous
    train/eval state is restored afterwards, even if generation raises.

    Args:
        model: The wrapped or unwrapped model (a `.module` attribute is unwrapped)
        tokenizer: Tokenizer for encoding/decoding
        dataloader: Validation dataloader (for sampling sequences)
        num_generation_samples: Number of samples to generate
        max_length: Maximum length of sequences to use
        num_diffusion_steps: Number of diffusion steps for generation
        temperature: Temperature for sampling (0.0 = deterministic)
        mask_token_id: Token ID used for masking
        mode: Masking mode forwarded to `generate` ("random" or "completion")
        completion_tokens: Number of mask tokens appended in "completion" mode
        target_mask_ratio: Optional fixed mask ratio for "random" mode; when None,
            `generate` draws a ratio at random

    Returns:
        List of dictionaries with original text, masked text, and generated text.
        Empty when no dataloader is provided.
    """
    if dataloader is None:
        LOG.warning("No validation dataloader provided, cannot generate samples")
        return []
    unwrapped_model = model.module if hasattr(model, "module") else model
    training = unwrapped_model.training
    unwrapped_model.eval()
    generations = []
    try:
        # Resolve device robustly (some modules don't expose `.device`)
        device = getattr(unwrapped_model, "device", None)
        if device is None:
            try:
                device = next(unwrapped_model.parameters()).device
            except StopIteration:
                device = torch.device("cpu")
        # Sample sequences from validation dataset
        sampled_sequences = _sample_sequences_from_dataloader(
            dataloader, num_generation_samples, max_length, device
        )
        LOG.info(f"Sampled {len(sampled_sequences)} sequences from validation dataset")
        # Generate samples using reverse diffusion process
        with torch.no_grad():
            for sample in sampled_sequences:
                # Samples are either dicts (with optional labels / attention mask)
                # or bare input_ids tensors.
                if isinstance(sample, dict):
                    original_sequence = sample.get("input_ids")
                    labels_seq = sample.get("labels")
                    attn_seq = sample.get("attention_mask")
                else:
                    original_sequence = sample
                    labels_seq = None
                    attn_seq = None
                generation_result = generate(
                    unwrapped_model,
                    tokenizer,
                    original_sequence,
                    num_diffusion_steps,
                    temperature,
                    mask_token_id,
                    mode=mode,
                    completion_tokens=completion_tokens,
                    target_mask_ratio=target_mask_ratio,
                    labels=labels_seq,
                    attention_mask=attn_seq,
                )
                generations.append(generation_result)
    finally:
        # Restore prior training state, including when an exception propagates,
        # so a failed generation pass cannot leave the model stuck in eval mode.
        if training:
            unwrapped_model.train()
        else:
            unwrapped_model.eval()
    return generations
 def _sample_sequences_from_dataloader(
    dataloader: Any, num_samples: int, max_length: int, device: torch.device
 ) -> List[Any]:
    """Sample sequences from validation dataloader."""
    sampled_sequences: list[dict[str, torch.Tensor] | torch.Tensor] = []
    sample_count = 0
    # Skip a random number of batches (we could be more clever about this)
    skip_batches = torch.randint(0, 10, (1,)).item()
    batch_count = 0
    for batch in dataloader:
        # Skip some batches for variety
        if batch_count < skip_batches:
            batch_count += 1
            continue
        if sample_count >= num_samples:
            break
        batch_count += 1
        input_ids = batch["input_ids"]
        attention_mask = batch.get("attention_mask")
        labels = batch.get("labels")
        # Randomly sample from sequences in this batch
        batch_indices = torch.randperm(input_ids.size(0)).tolist()
        for i in batch_indices:
            if sample_count >= num_samples:
                break
            # Get actual sequence length (non-padded)
            if attention_mask is not None:
                seq_len = attention_mask[i].sum().item()
            else:
                seq_len = input_ids.size(1)
            if seq_len < 10:
                continue
            # Determine truncation length
            max_total = min(seq_len, max_length)
            if labels is not None:
                labels_i = labels[i][:seq_len]
                answer_mask = labels_i != -100
                if not answer_mask.any():
                    # No answer tokens; skip for SFT masking
                    continue
                first_ans_idx = int(
                    torch.nonzero(answer_mask, as_tuple=False)[0].item()
                )
                prompt_len = first_ans_idx
                if prompt_len >= max_total:
                    # Prompt alone reaches cap; cannot include any answer
                    continue
                remaining_answer = int(answer_mask[prompt_len:].sum().item())
                allowed_answer = max_total - prompt_len
                take_answer = min(remaining_answer, allowed_answer)
                if take_answer <= 0:
                    continue
                actual_length = prompt_len + take_answer
            else:
                actual_length = max_total
            # Extract the (possibly truncated) sequence
            sequence = input_ids[i][:actual_length].unsqueeze(0).to(device)
            attn_seq = (
                attention_mask[i][:actual_length].unsqueeze(0).to(device)
                if attention_mask is not None
                else None
            )
            if labels is not None:
                labels_seq = labels[i][:actual_length].unsqueeze(0).to(device)
                sampled_sequences.append(
                    {
                        "input_ids": sequence,
                        "labels": labels_seq,
                        "attention_mask": attn_seq,
                    }
                )
            else:
                if attn_seq is not None:
                    sampled_sequences.append(
                        {"input_ids": sequence, "attention_mask": attn_seq}
                    )
                else:
                    sampled_sequences.append(sequence)
            sample_count += 1
    return sampled_sequences
 def generate(
    model: torch.nn.Module,
    tokenizer: Any,
    original_sequence: torch.Tensor,
    num_diffusion_steps: int,
    temperature: float,
    mask_token_id: int,
    *,
    mode: Literal["random", "completion"] = "random",
    completion_tokens: int = 0,
    target_mask_ratio: Optional[float] = None,
    labels: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
 ) -> dict:
    """
    Generate a single sample using reverse diffusion.

    The masked input is built in one of three ways, checked in this order:
      1. SFT: if `labels` contains both -100 and non -100 entries, every position
         with a label other than -100 is masked.
      2. Completion: if `mode == "completion"` and `completion_tokens > 0`, that many
         mask tokens are appended to the right of the sequence.
      3. Random: otherwise a random subset of positions is masked, using
         `target_mask_ratio` or a ratio drawn uniformly from [0.1, 0.7).

    Args:
        model: Model called by `_diffusion_step`.
        tokenizer: Tokenizer used to decode ids into text.
        original_sequence: Token ids; only row 0 is decoded and rendered.
        num_diffusion_steps: Number of reverse diffusion steps to run.
        temperature: Sampling temperature forwarded to `_diffusion_step`.
        mask_token_id: Token id written at masked positions.
        mode: Masking mode ("random" or "completion").
        completion_tokens: Number of mask tokens appended in completion mode.
        target_mask_ratio: Optional fixed mask ratio for random mode.
        labels: Optional labels used to detect the SFT case.
        attention_mask: Optional attention mask forwarded to
            `create_bidirectional_attention_mask`.

    Returns:
        Dict with the decoded original / masked / generated texts, mask statistics,
        the generated and original token ids, the masked positions and a
        one-line "formatted" summary.
    """
    # Get original text for comparison
    original_text = tokenizer.decode(
        original_sequence[0].cpu(), skip_special_tokens=True
    )
    # Build masked sequence
    if (
        labels is not None
        and labels.numel() > 0
        and (labels == -100).any()
        and (labels != -100).any()
    ):
        # SFT case: completely mask all answer tokens (labels != -100)
        total_tokens = original_sequence.size(1)
        masked_indices = (labels != -100).to(dtype=torch.bool)
        masked_sequence = original_sequence.clone()
        masked_sequence[masked_indices] = mask_token_id
        masked_tokens = int(masked_indices.sum().item())
        mask_ratio = masked_tokens / max(int(total_tokens), 1)
    elif mode == "completion" and completion_tokens > 0:
        # Append mask tokens to the right for completion
        num_new = int(completion_tokens)
        total_tokens = original_sequence.size(1) + num_new
        masked_indices = torch.zeros(
            1, total_tokens, dtype=torch.bool, device=original_sequence.device
        )
        masked_indices[0, -num_new:] = True
        append = torch.full(
            (1, num_new), mask_token_id, device=original_sequence.device
        )
        masked_sequence = torch.cat([original_sequence, append], dim=1)
        masked_tokens = num_new
        mask_ratio = masked_tokens / total_tokens
        # The sequence just grew by `num_new` tokens; grow the attention mask with
        # it so the bidirectional mask built below matches the sequence length.
        # The appended positions reuse the last mask value so they attend together
        # with the final tokens of the prompt.
        if attention_mask is not None and attention_mask.size(
            -1
        ) == original_sequence.size(1):
            tail = attention_mask[..., -1:].expand(
                *attention_mask.shape[:-1], num_new
            )
            attention_mask = torch.cat([attention_mask, tail], dim=-1)
    else:
        # Apply random masking with optional fixed ratio
        total_tokens = original_sequence.size(1)
        if target_mask_ratio is None:
            min_ratio, max_ratio = 0.1, 0.7
            target_mask_ratio = (
                torch.rand(1).item() * (max_ratio - min_ratio) + min_ratio
            )
        target_masked_tokens = max(1, int(total_tokens * float(target_mask_ratio)))
        # Create random mask indices
        mask_positions = torch.randperm(total_tokens)[:target_masked_tokens]
        masked_indices = torch.zeros(
            1, total_tokens, dtype=torch.bool, device=original_sequence.device
        )
        masked_indices[0, mask_positions] = True
        # Create masked sequence
        masked_sequence = original_sequence.clone()
        masked_sequence[masked_indices] = mask_token_id
        # Calculate actual mask ratio
        masked_tokens = masked_indices.sum().item()
        mask_ratio = masked_tokens / total_tokens
    # Get masked text for comparison
    masked_text = tokenizer.decode(masked_sequence[0].cpu(), skip_special_tokens=False)
    masked_text = _clean_masked_text(masked_text, tokenizer, mask_token_id)
    # Run reverse diffusion process
    sequence = masked_sequence.clone()
    attention_mask = create_bidirectional_attention_mask(
        sequence, attention_mask, sample_packing=attention_mask is not None
    )
    for step in range(num_diffusion_steps):
        sequence = _diffusion_step(
            model,
            sequence,
            step,
            num_diffusion_steps,
            temperature,
            mask_token_id,
            attention_mask,
        )
    generated_text = tokenizer.decode(sequence[0].cpu(), skip_special_tokens=True)
    # Collect diagnostic info
    final_ids = sequence[0].detach().cpu().tolist()
    orig_ids_for_render = original_sequence[0].detach().cpu().tolist()
    # `masked_indices` is always set by one of the branches above; positions are
    # only reported when it has the expected 2D layout.
    masked_positions = (
        torch.where(masked_indices[0])[0].detach().cpu().tolist()
        if masked_indices.ndim == 2
        else []
    )
    result = {
        "original": original_text,
        "masked": masked_text,
        "generated": generated_text,
        "mask_ratio": mask_ratio,
        "masked_tokens": masked_tokens,
        "total_tokens": total_tokens,
        "generated_ids": final_ids,
        "masked_positions": masked_positions,
        "orig_ids": orig_ids_for_render,
        "formatted": (
            f"Original: '{original_text}' → Masked: '{masked_text}' "
            f"({mask_ratio:.1%}) → Generated: '{generated_text}'"
        ),
    }
    return result
 def _clean_masked_text(masked_text: str, tokenizer: Any, mask_token_id: int) -> str:
    """Clean up masked text for display."""
    mask_token_repr = tokenizer.decode([mask_token_id], skip_special_tokens=False)
    cleaned = masked_text.replace(mask_token_repr, "[MASK]")
    # Remove literal special token strings
    if hasattr(tokenizer, "special_tokens_map"):
        for token_value in tokenizer.special_tokens_map.values():
            if token_value and isinstance(token_value, str):
                cleaned = cleaned.replace(token_value, "")
    # Normalize whitespace but preserve newlines
    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = "\n".join(line.rstrip() for line in cleaned.split("\n")).strip()
    return cleaned
 def _diffusion_step(
    model: torch.nn.Module,
    sequence: torch.Tensor,
    step: int,
    num_diffusion_steps: int,
    temperature: float,
    mask_token_id: int,
    attention_mask: torch.Tensor | None = None,
 ) -> torch.Tensor:
    """Perform a single diffusion step with remasking."""
    # Only process if there are masked tokens remaining
    current_mask = sequence == mask_token_id
    if not current_mask.any():
        return sequence
    # Create or use provided attention mask
    if attention_mask is None:
        batch_size, seq_len = sequence.shape
        attention_mask = torch.ones(
            batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=sequence.device
        )
    # Forward pass
    outputs = model(input_ids=sequence, attention_mask=attention_mask)
    logits = outputs.logits
    # Only sample at currently masked positions
    if current_mask.any():
        masked_logits = logits[current_mask]
        # Apply temperature scaling
        if temperature > 0:
            scaled_logits = masked_logits / temperature
        else:
            scaled_logits = masked_logits
        # Suppress mask token in outputs
        scaled_logits[:, mask_token_id] = -float("inf")
        if temperature > 0:
            # Add Gumbel noise for sampling
            gumbel_noise = -torch.log(
                -torch.log(torch.rand_like(scaled_logits, dtype=torch.float32))
            )
            gumbel_logits = scaled_logits + gumbel_noise
            predicted_tokens = torch.argmax(gumbel_logits, dim=-1)
        else:
            predicted_tokens = torch.argmax(scaled_logits, dim=-1)
        # Calculate probabilities for confidence scoring
        probs = torch.softmax(scaled_logits, dim=-1)
        predicted_token_probs = probs[range(len(predicted_tokens)), predicted_tokens]
        # Determine how many tokens to unmask this step
        remaining_masked = current_mask.sum().item()
        if step == num_diffusion_steps - 1:
            num_to_unmask = remaining_masked
        else:
            unmask_ratio = 1.0 / (num_diffusion_steps - step)
            num_to_unmask = max(1, int(remaining_masked * unmask_ratio))
        # Select highest confidence predictions to unmask
        if num_to_unmask >= remaining_masked:
            sequence[current_mask] = predicted_tokens
        else:
            _, top_indices = predicted_token_probs.topk(num_to_unmask)
            mask_positions = torch.where(current_mask)[1]
            positions_to_unmask = mask_positions[top_indices]
            sequence[0, positions_to_unmask] = predicted_tokens[top_indices]
    return sequence
--- a/src/axolotl/integrations/diffusion/plugin.py
+++ b/src/axolotl/integrations/diffusion/plugin.py
@@ -1,41 +0,0 @@
 """Diffusion LM training plugin for Axolotl."""
 from peft import PeftModel
 from transformers import PreTrainedModel
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 from .trainer import DiffusionTrainer
 LOG = get_logger(__name__)
 class DiffusionPlugin(BasePlugin):
    """
    Axolotl plugin that wires diffusion language model training into a run.

    It supplies `DiffusionTrainer` as the trainer class and hands the run config to
    the trainer once it has been created.
    """

    def __init__(self):
        super().__init__()
        # Populated in `post_model_load`.
        self.cfg = None

    def get_input_args(self) -> str:
        """Return the dotted path of the pydantic model holding the plugin args."""
        return "axolotl.integrations.diffusion.DiffusionArgs"

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Remember the run config once the model has been loaded."""
        self.cfg = cfg

    def get_trainer_cls(self, cfg: DictDefault) -> type[DiffusionTrainer] | None:
        """Return the trainer class used for diffusion training."""
        return DiffusionTrainer

    def post_trainer_create(self, cfg: DictDefault, trainer: DiffusionTrainer):
        """Pass the run config to the freshly created trainer."""
        trainer.set_config(cfg)
--- a/src/axolotl/integrations/diffusion/trainer.py
+++ b/src/axolotl/integrations/diffusion/trainer.py
@@ -1,301 +0,0 @@
 """Custom trainer for diffusion LM training."""
 from typing import Any, Literal
 import torch
 import torch.nn.functional as F
 from torch import nn
 from axolotl.core.trainers.base import AxolotlTrainer
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 from .callbacks import DiffusionGenerationCallback
 from .utils import create_bidirectional_attention_mask
 LOG = get_logger(__name__)
 class DiffusionTrainer(AxolotlTrainer):
    """Custom trainer for diffusion LM training that overrides loss computation."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Both are populated by `set_config`.
        self.cfg = None
        self._special_token_ids = None

    def set_config(self, config: DictDefault):
        """
        Set config for diffusion training.

        Caches the tokenizer's special token ids, resolves the mask token id and,
        unless `config.diffusion.generate_samples` is falsy, registers the sample
        generation callback.
        """
        self.cfg = config
        self._cache_special_token_ids()
        self._resolve_mask_token_id()
        token_id = int(getattr(self.cfg.diffusion, "mask_token_id", 0))
        LOG.info(f"Diffusion: using mask_token_id={token_id}")
        if getattr(config.diffusion, "generate_samples", True):
            generation_callback = DiffusionGenerationCallback(self)
            self.add_callback(generation_callback)

    def _resolve_mask_token_id(self) -> None:
        """Ensure mask_token_id is valid for the current tokenizer."""
        from .utils import resolve_mask_token_id

        tokenizer = getattr(self, "processing_class", None)
        if tokenizer is None:
            return
        mid = resolve_mask_token_id(
            tokenizer,
            self.cfg,
            allow_add=True,
            model=getattr(self, "model", None),
        )
        try:
            self.cfg.diffusion.mask_token_id = int(mid)
        except Exception as exc:
            # Best effort: keep going with whatever the config already holds, but
            # make the failure visible instead of swallowing it silently.
            LOG.warning(f"Diffusion: could not store resolved mask_token_id: {exc}")

    def compute_loss(
        self,
        model: nn.Module,
        inputs: dict[str, torch.Tensor],
        return_outputs: bool = False,
        num_items_in_batch: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
        """
        Override compute_loss to use diffusion loss.

        Raises:
            ValueError: If `inputs` has no "input_ids".
        """
        input_ids = inputs.get("input_ids")
        attention_mask = inputs.get("attention_mask")
        labels = inputs.get("labels")
        if input_ids is None:
            raise ValueError("input_ids is required for diffusion training")
        loss, outputs = self._compute_diffusion_loss(
            model, input_ids, attention_mask, labels
        )
        if return_outputs:
            return loss, outputs
        return loss

    def _cache_special_token_ids(self):
        """Cache special token IDs to avoid repeated tokenizer access."""
        if self.processing_class is None:
            self._special_token_ids = set()
            return
        tokenizer = self.processing_class
        special_tokens = set()
        # Only BOS / EOS / PAD are collected; missing or None ids are ignored.
        for attr_name in ("bos_token_id", "eos_token_id", "pad_token_id"):
            token_id = getattr(tokenizer, attr_name, None)
            if token_id is not None:
                special_tokens.add(token_id)
        self._special_token_ids = special_tokens

    def _forward_process(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        eps: float = 1e-3,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Forward noising process. A timestep is sampled along the process, and tokens are
        masked with probability determined by the configured noise schedule.

        Args:
            input_ids: Input token ids [batch_size, seq_len].
            attention_mask: Attention mask [batch_size, seq_len].
            labels: Labels for SFT training [batch_size, seq_len].
            eps: Small epsilon value for minimum masking probability.

        Returns:
            noisy_batch: Input with some tokens masked.
            masked_indices: Boolean mask indicating which tokens were masked.
            p_mask: Masking probabilities for each token [batch_size, seq_len].
        """
        batch_size, seq_len = input_ids.shape
        device = input_ids.device
        # Sample random timesteps for each sample in batch
        t = torch.rand(batch_size, device=device)
        p_mask = (1 - eps) * t + eps  # [batch_size]
        p_mask = p_mask[:, None].repeat(1, seq_len)  # [batch_size, seq_len]
        # Don't mask padding tokens if attention_mask is provided
        if attention_mask is not None:
            valid_mask = attention_mask.bool()
            p_mask = p_mask * valid_mask.float()
        # Create mask to exclude special tokens
        special_token_mask = torch.zeros_like(input_ids, dtype=torch.bool)
        if self._special_token_ids:
            for token_id in self._special_token_ids:
                special_token_mask |= input_ids == token_id
        # Create random mask based on p_mask
        masked_indices = torch.rand((batch_size, seq_len), device=device) < p_mask
        masked_indices = masked_indices & ~special_token_mask
        if attention_mask is not None:
            masked_indices = masked_indices & attention_mask.bool()
        # For SFT data, only mask answer tokens
        if labels is not None:
            answer_mask = labels != -100
            masked_indices = masked_indices & answer_mask
        # Create masked input
        mask_token_id = int(self.cfg.diffusion.mask_token_id)
        mask_value = torch.full_like(input_ids, mask_token_id)
        noisy_batch = torch.where(masked_indices, mask_value, input_ids)
        return noisy_batch, masked_indices, p_mask

    def _compute_diffusion_loss(
        self,
        model: nn.Module,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | Any]:
        """
        Compute diffusion loss and record training metrics.

        Args:
            model: The model to compute loss for.
            input_ids: Ground truth token ids [batch_size, seq_len].
            attention_mask: Attention mask [batch_size, seq_len].
            labels: Labels for SFT training [batch_size, seq_len].

        Returns:
            loss: Cross-entropy based diffusion loss.
            outputs: The model outputs, or an empty dict when the batch is skipped.
            Metrics are not returned; they are passed to `self.store_metrics`.
        """
        # Short-circuit empty sequences
        if input_ids is None or input_ids.numel() == 0 or input_ids.shape[1] == 0:
            zero = torch.tensor(
                0.0,
                device=(input_ids.device if input_ids is not None else None),
                requires_grad=True,
            )
            return zero, {}
        # If an attention_mask is provided and all positions are padding for every
        # sample in this batch, skip the step.
        if attention_mask is not None:
            if attention_mask.dim() == 2 and (attention_mask.sum(dim=1) == 0).all():
                zero = torch.tensor(0.0, device=input_ids.device, requires_grad=True)
                return zero, {}
        # Apply forward process
        noisy_batch, masked_indices, p_mask = self._forward_process(
            input_ids, attention_mask, labels, self.cfg.diffusion.eps
        )
        # Create bidirectional attention mask
        bidirectional_mask = create_bidirectional_attention_mask(
            input_ids, attention_mask, sample_packing=self.cfg.sample_packing
        )
        # Forward pass
        outputs = model(
            input_ids=noisy_batch.long(),
            attention_mask=bidirectional_mask,
        )
        logits = outputs.logits
        if masked_indices.sum() > 0:
            valid_indices = torch.where(masked_indices)
            batch_indices, seq_indices = valid_indices
            masked_logits = logits[batch_indices, seq_indices]
            masked_targets = input_ids[batch_indices, seq_indices]
            masked_p_mask = p_mask[batch_indices, seq_indices]
            # Compute cross-entropy loss without reduction
            token_loss = F.cross_entropy(
                masked_logits.float(), masked_targets, reduction="none"
            )
            if self.cfg.diffusion.importance_weighting:
                masked_p_mask = masked_p_mask.float()
                weighted_loss = token_loss / masked_p_mask
            else:
                weighted_loss = token_loss
            if labels is not None:
                # For SFT data: normalize by answer token count per sample
                answer_mask = labels != -100
                answer_lengths = answer_mask.sum(dim=1).float()  # [batch_size]
                # Get batch indices for masked tokens
                masked_batch_indices = batch_indices
                # Sum losses per sample and divide by answer length
                batch_size = input_ids.shape[0]
                loss_per_sample = torch.zeros(batch_size, device=input_ids.device)
                for i in range(batch_size):
                    sample_mask = masked_batch_indices == i
                    if sample_mask.sum() > 0:
                        sample_loss = weighted_loss[sample_mask].sum()
                        denom = answer_lengths[i].clamp(min=1.0)
                        loss_per_sample[i] = sample_loss / denom
                loss = loss_per_sample.mean()
            else:
                # Non-SFT: when importance weighting is enabled, use unbiased estimator
                # (sum(loss/p) / total_tokens). Otherwise, average over masked tokens
                # for stable scaling across varying mask ratios.
                if self.cfg.diffusion.importance_weighting:
                    loss = weighted_loss.sum() / (
                        input_ids.shape[0] * input_ids.shape[1]
                    )
                else:
                    loss = weighted_loss.mean()
            ce_loss = token_loss.mean()
            # Compute accuracy on masked tokens
            with torch.no_grad():
                pred_tokens = masked_logits.argmax(dim=-1)
                accuracy = (pred_tokens == masked_targets).float().mean()
        else:
            # Nothing was masked in this batch: report zeros and a neutral weight.
            loss = torch.tensor(0.0, device=input_ids.device, requires_grad=True)
            accuracy = torch.tensor(0.0, device=input_ids.device)
            ce_loss = torch.tensor(0.0, device=input_ids.device)
            masked_p_mask = torch.tensor(1.0, device=input_ids.device)
        avg_p_mask = (
            p_mask[masked_indices].mean().item() if masked_indices.any() else 0.0
        )
        metrics = {
            "loss": loss.item(),
            "accuracy": accuracy.item(),
            "mask_ratio": masked_indices.float().mean().item(),
            "num_masked_tokens": (masked_indices.sum().item(), "sum"),
            "avg_p_mask": avg_p_mask,
            "ce_loss": ce_loss.item(),
        }
        # If doing SFT training, log answer-specific metrics. Labels may be absent
        # even when datasets are configured, so both conditions are required.
        if self.cfg.datasets is not None and labels is not None:
            with torch.no_grad():
                answer_mask = labels != -100
                answer_lengths = answer_mask.sum(dim=1).float()
                total_answer_tokens = answer_mask.sum().item()
                total_tokens = labels.numel()
                metrics["answer_ratio"] = total_answer_tokens / max(total_tokens, 1)
                metrics["avg_answer_length"] = answer_lengths.mean().item()
        if self.cfg.diffusion.importance_weighting:
            metrics["importance_weight_avg"] = (1.0 / masked_p_mask).mean().item()
        train_eval: Literal["train", "eval"] = "train" if model.training else "eval"
        self.store_metrics(metrics, train_eval=train_eval)
        return loss, outputs
--- a/src/axolotl/integrations/diffusion/utils.py
+++ b/src/axolotl/integrations/diffusion/utils.py
@@ -1,159 +0,0 @@
 """Shared utilities for diffusion integration."""
 from __future__ import annotations
 from typing import Any, Optional
 import torch
 from axolotl.utils.dict import DictDefault
 def resolve_mask_token_id(
    tokenizer: Any,
    cfg: DictDefault,
    *,
    allow_add: bool,
    model: Any | None = None,
    default_token: str = "<|diffusion_mask|>",
 ) -> int:
    """
    Resolve mask token id. Training may add a new special token; inference won't.

    Resolution order:
      1. An explicit non-negative integer id from the config, if it lies within the
         tokenizer's size.
      2. An existing special token matching the configured mask token string, then
         `default_token`.
      3. When `allow_add` is True, a newly added special token (the model's
         embeddings are resized when possible).
      4. The tokenizer's `unk_token_id`, or 0.

    Steps 2 and 3 also write the resolved id back to the config (best effort).

    Args:
        tokenizer: Tokenizer to query (and, when `allow_add`, to extend).
        cfg: Config; `cfg.diffusion.mask_token_id` / `mask_token_str` are read, with
            `cfg.diffusion_mask_token_id` / `diffusion_mask_token_str` as fallbacks
            when `cfg.diffusion` is missing.
        allow_add: Whether a new special token may be added to the tokenizer.
        model: Optional model whose token embeddings are resized after adding.
        default_token: Token string used when the config provides none.

    Returns:
        The resolved mask token id.
    """
    # Determine the tokenizer size used for bounds checks. `len(tokenizer)` is
    # preferred because it counts added tokens, whereas `vocab_size` reports only
    # the base vocabulary; bounding by the latter would reject a mask token that
    # was added during an earlier training run.
    vocab_size = None
    if tokenizer is not None:
        if hasattr(tokenizer, "__len__"):
            try:
                vocab_size = int(len(tokenizer))
            except Exception:
                vocab_size = None
        if vocab_size is None and getattr(tokenizer, "vocab_size", None) is not None:
            try:
                vocab_size = int(tokenizer.vocab_size)  # type: ignore[arg-type]
            except Exception:
                vocab_size = None
    # Use explicit id from config if provided
    diffusion_cfg = getattr(cfg, "diffusion", None)
    # Fallback to top-level attr names only if nested missing (shouldn't happen)
    cfg_id = (
        getattr(diffusion_cfg, "mask_token_id", None)
        if diffusion_cfg is not None
        else getattr(cfg, "diffusion_mask_token_id", None)
    )
    if isinstance(cfg_id, int) and cfg_id >= 0:
        if vocab_size is None or cfg_id < vocab_size:
            return int(cfg_id)

    def _existing_special_token_id(token_str: str | None) -> int | None:
        """Attempt to resolve an existing special token string to a real ID."""
        if not token_str or not hasattr(tokenizer, "convert_tokens_to_ids"):
            return None
        try:
            token_id = tokenizer.convert_tokens_to_ids(token_str)
        except Exception:
            return None
        if not isinstance(token_id, int) or token_id < 0:
            return None
        # Ensure it's registered as special and not UNK, and within vocab
        unk_id = getattr(tokenizer, "unk_token_id", None)
        specials = set(getattr(tokenizer, "all_special_tokens", []) or [])
        addl = set(getattr(tokenizer, "additional_special_tokens", []) or [])
        is_special = token_str in specials or token_str in addl
        in_vocab = vocab_size is None or token_id < vocab_size
        if (
            (unk_id is not None and token_id == unk_id)
            or not is_special
            or not in_vocab
        ):
            return None
        return token_id

    # Try mask token string if provided
    token_str = (
        getattr(diffusion_cfg, "mask_token_str", None)
        if diffusion_cfg is not None
        else getattr(cfg, "diffusion_mask_token_str", None)
    )
    for candidate in (token_str, default_token):
        token_id = _existing_special_token_id(candidate)
        if isinstance(token_id, int):
            # Best-effort write-back; a read-only config must not abort resolution.
            try:
                if diffusion_cfg is None:
                    cfg.diffusion_mask_token_id = int(token_id)  # legacy fallback
                else:
                    diffusion_cfg.mask_token_id = int(token_id)
            except Exception:
                pass
            return int(token_id)
    # Optionally add and return a dedicated special token during training
    if allow_add and hasattr(tokenizer, "add_special_tokens"):
        token_to_add = token_str or default_token
        try:
            tokenizer.add_special_tokens({"additional_special_tokens": [token_to_add]})
            # Resize embeddings if possible
            if (
                model is not None
                and hasattr(tokenizer, "__len__")
                and hasattr(model, "resize_token_embeddings")
            ):
                try:
                    model.resize_token_embeddings(len(tokenizer))
                except Exception:
                    pass
            new_id = tokenizer.convert_tokens_to_ids(token_to_add)
            if isinstance(new_id, int) and new_id >= 0:
                try:
                    if diffusion_cfg is None:
                        cfg.diffusion_mask_token_id = int(new_id)  # legacy fallback
                    else:
                        diffusion_cfg.mask_token_id = int(new_id)
                except Exception:
                    pass
                return int(new_id)
        except Exception:
            pass
    # Fallback to unk or 0 (do not update cfg)
    fallback = getattr(tokenizer, "unk_token_id", 0) or 0
    return int(fallback)
 def create_bidirectional_attention_mask(
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    sample_packing: bool = False,
 ) -> torch.Tensor:
    """
    Build a 4D boolean attention mask that lets tokens attend in both directions.

    Without an attention mask, or when sample packing is disabled, every position
    may attend to every other position. With sample packing, two positions may
    attend to each other only if their attention mask values are equal and non-zero.

    Args:
        input_ids: Input token ids [batch_size, seq_len]
        attention_mask: Attention mask [batch_size, seq_len]
        sample_packing: Whether sample packing is enabled

    Returns:
        bidirectional_mask: 4D attention mask [batch_size, 1, seq_len, seq_len]
    """
    n_rows, n_tokens = input_ids.shape
    packed = sample_packing and attention_mask is not None
    if not packed:
        # Fully connected: no causal or per-sample restriction.
        full_shape = (n_rows, 1, n_tokens, n_tokens)
        return torch.ones(full_shape, dtype=torch.bool, device=input_ids.device)
    query_ids = attention_mask[:, :, None]  # [batch_size, seq_len, 1]
    key_ids = attention_mask[:, None, :]  # [batch_size, 1, seq_len]
    # Same sample on both sides, and the query side is not a zero (padding) entry.
    same_sample = query_ids == key_ids
    non_padding = query_ids > 0
    # Insert the head dimension: [batch_size, 1, seq_len, seq_len]
    return (same_sample & non_padding)[:, None]
--- a/src/axolotl/loaders/adapter.py
+++ b/src/axolotl/loaders/adapter.py
@@ -14,7 +14,6 @@ from peft import (
    PeftConfig,
    PeftMixedModel,
    PeftModel,
    TaskType,
    get_peft_model,
 )
 from transformers import PreTrainedModel
@@ -99,17 +98,6 @@ def load_lora(
        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
    if cfg.peft_layer_replication:
        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
    if cfg.peft_trainable_token_indices:
        lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices
    # Determine the correct PEFT task type
    model_cls = type(model).__name__
    if "SequenceClassification" in model_cls:
        task_type = TaskType.SEQ_CLS
    elif "TokenClassification" in model_cls:
        task_type = TaskType.TOKEN_CLS
    else:
        task_type = TaskType.CAUSAL_LM
    lora_config = LoraConfig(
        r=cfg.lora_r,
@@ -122,7 +110,7 @@ def load_lora(
        fan_in_fan_out=cfg.lora_fan_in_fan_out,
        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
        bias="none",
-        task_type=task_type,
+        task_type="CAUSAL_LM",
        **lora_config_kwargs,
    )
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -673,33 +673,6 @@ class ModelLoader:
        return hf_ds_cfg
    def _load_model_from_config(self, model_loader_class=None) -> PreTrainedModel:
        """
        Instantiate the model from its config only, i.e. with randomly initialized weights.
        The explicitly passed loader class is preferred; when none is given the
        instance's auto model loader is used instead.
        """
        loader = model_loader_class if model_loader_class else self.auto_model_loader
        has_from_config = loader in (AutoModelForCausalLM, AutoModelForVision2Seq)
        if not has_from_config:
            # Any other loader is called directly with the config
            return loader(config=self.model_config)
        # The two auto classes above are built through `from_config`
        return loader.from_config(
            config=self.model_config,
            trust_remote_code=self.cfg.trust_remote_code or False,
        )
    def _load_model_from_pretrained(self, model_loader_class=None) -> PreTrainedModel:
        """Load model from pretrained weights."""
        loader = model_loader_class or self.auto_model_loader
        kwargs = {
            "config": self.model_config,
            "trust_remote_code": self.cfg.trust_remote_code or False,
            **self.model_kwargs,
        }
        return loader.from_pretrained(self.base_model, **kwargs)
    def _build_model(self) -> bool:
        """Load model, with load strategy depending on config."""
        skip_move_to_device = False
@@ -714,8 +687,7 @@ class ModelLoader:
        if self.is_fsdp_enabled:
            if self.cfg.fsdp_config.cpu_ram_efficient_loading:
                skip_move_to_device = True
-                # Don't delete device_map for QLoRA + FSDP - it was set correctly in
+                # Don't delete device_map for QLoRA + FSDP - it was set correctly in _set_device_map
                # _set_device_map
                if (
                    "device_map" in self.model_kwargs
                    and not self.is_qlora_and_fsdp_enabled
@@ -744,11 +716,6 @@ class ModelLoader:
                or self.cfg.qlora_sharded_model_loading
            )
        ):
            if self.cfg.reinit_weights:
                LOG.warning(
                    "reinit_weights is not supported with sharded quantized loading. "
                    "Loading from pretrained weights instead."
                )
            quant_storage = self.cfg.torch_dtype
            quantization_config = getattr(
                self.model_config, "quantization_config", None
@@ -764,12 +731,33 @@ class ModelLoader:
                quantization_config=quantization_config,
            )
            skip_move_to_device = True
-        elif self.model_type == "MambaLMHeadModel":
+        elif (
-            if self.cfg.reinit_weights:
+            self.model_config.model_type in ["llama", "llama4"]
-                LOG.warning(
+            and not self.cfg.trust_remote_code
-                    "reinit_weights is not supported with MambaLMHeadModel. "
+            and not self.cfg.gptq
-                    "Loading from pretrained weights instead."
+        ):
            # Please don't remove underscore binding without reading the fn docstring.
            _ = self._configure_zero3_memory_efficient_loading()
            # Load model with random initialization if specified
            if self.cfg.random_init_weights:
                # AutoModel classes support the from_config method
                if self.auto_model_loader in [
                    AutoModelForCausalLM,
                    AutoModelForVision2Seq,
                ]:
                    self.model = self.auto_model_loader.from_config(
                        config=self.model_config,
                    )
                else:
                    self.model = self.auto_model_loader(config=self.model_config)
            else:
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    **self.model_kwargs,
                )
        elif self.model_type == "MambaLMHeadModel":
            # FIXME this is janky at best and hacked together to make it work
            MambaLMHeadModel = fix_mamba_attn_for_loss()
@@ -782,27 +770,41 @@ class ModelLoader:
                self.base_model,
                **self.model_kwargs,
            )
        elif (
            self.model_type
            and self.model_type != "AutoModelForCausalLM"
            and not self.cfg.trust_remote_code
        ):
            if self.cfg.gptq:
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                    **self.model_kwargs,
                )
            else:
                self.model = getattr(transformers, self.model_type).from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                    **self.model_kwargs,
                )
        elif self.cfg.gptq:
            self.model = self.auto_model_loader.from_pretrained(
                self.base_model,
                config=self.model_config,
                trust_remote_code=self.cfg.trust_remote_code or False,
                **self.model_kwargs,
            )
        else:
-            # Please don't remove underscore binding without reading the fn docstring
+            # Please don't remove underscore binding without reading the fn docstring.
            _ = self._configure_zero3_memory_efficient_loading()
-
+            self.model = self.auto_model_loader.from_pretrained(
-            if (
+                self.base_model,
-                self.model_type
+                config=self.model_config,
-                and self.model_type != "AutoModelForCausalLM"
+                trust_remote_code=self.cfg.trust_remote_code or False,
-                and not self.cfg.trust_remote_code
+                **self.model_kwargs,
-                and not self.cfg.gptq
+            )
            ):
                # Use model type from transformers
                model_loader_class = getattr(transformers, self.model_type)
            else:
                # Use auto model loader (handles gptq and default cases)
                model_loader_class = self.auto_model_loader
            if self.cfg.reinit_weights:
                self.model = self._load_model_from_config(model_loader_class)
            else:
                self.model = self._load_model_from_pretrained(model_loader_class)
        if is_deepspeed_zero3_enabled():
            skip_move_to_device = True
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -4,7 +4,6 @@ Applies pre- and post-model load patches for various fixes and optimizations.
 """
 import importlib.util
 import os
 from functools import cached_property
 import addict
@@ -67,7 +66,6 @@ class PatchManager:
        self._apply_mistral_cross_entropy_patch()
        self._apply_self_attention_lora_patch()
        self._apply_fsdp2_bnb_patches()
        self._apply_patch_deepspeed_zero3()
    def apply_post_plugin_pre_model_load_patches(self):
        """Apply post plugin-pre_model_load load patches based on config."""
@@ -80,7 +78,13 @@ class PatchManager:
            patch_maybe_log_save_evaluate,
        )
-        patch_evaluation_loop()
+        patch_fsdp2 = (
            self.cfg.torch_compile
            and self.cfg.fsdp_config
            and self.cfg.fsdp_version == 2
        )
        patch_evaluation_loop(patch_fsdp2)
        patch_maybe_log_save_evaluate()
    def apply_post_model_load_patches(self, model: PreTrainedModel):
@@ -143,12 +147,14 @@ class PatchManager:
    def _apply_flex_attention_patches(self):
        """Apply patches for flexible attention."""
        if self.cfg.flex_attention:
-            from axolotl.monkeypatch.attention.flex_attn import (
+            # from axolotl.monkeypatch.attention.flex_attn import (
-                patch_flex_wrapper,
+            #     patch_flex_make_mask,
-            )
+            #     patch_flex_wrapper,
-
+            # )
-            flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
+            #
-            patch_flex_wrapper(**flex_attn_compile_kwargs)
+            # flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
            # patch_flex_wrapper(**flex_attn_compile_kwargs)
            # patch_flex_make_mask()
            if self.cfg.sample_packing:
                from axolotl.core.attention.flex_block_mask import (
                    patch_create_causal_mask,
@@ -465,17 +471,3 @@ class PatchManager:
            from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches
            apply_lora_kernel_patches(model=model, cfg=self.cfg)
    def _apply_patch_deepspeed_zero3(self):
        """
        Apply axolotl's DeepSpeed patches when activation offloading is enabled under ZeRO-3.
        ZeRO-3 is considered active when transformers reports it or when the
        ACCELERATE_DEEPSPEED_ZERO_STAGE environment variable equals "3". An
        ImportError raised anywhere in the process is logged as a warning
        instead of propagating.
        """
        try:
            from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
            from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches
            # Only an explicit `True` enables the patch (truthy non-bool values do not)
            if self.cfg.activation_offloading is not True:
                return
            zero3_active = (
                is_deepspeed_zero3_enabled()
                or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3"
            )
            if zero3_active:
                apply_deepspeed_patches()
        except ImportError as e:
            LOG.warning(f"DeepSpeed patches not applied: {e}")
--- a/src/axolotl/loaders/tokenizer.py
+++ b/src/axolotl/loaders/tokenizer.py
@@ -296,7 +296,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
            )
        tokenizer.chat_template = chat_template_string
-    elif getattr(tokenizer, "chat_template", None) is None:
+    else:
        LOG.info(
            "No Chat template selected. Consider adding a chat template for easier inference."
        )
--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -160,11 +160,9 @@ def get_state_dict(self, model, unwrap=True):
                state_dict[param_name] = param.cpu()
            torch.distributed.barrier()
    elif self.distributed_type == DistributedType.FSDP:
-        from torch.distributed.fsdp import (
+        from torch.distributed.fsdp import FullStateDictConfig
-            FullStateDictConfig,
+        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-            FullyShardedDataParallel as FSDP,
+        from torch.distributed.fsdp import StateDictType
            StateDictType,
        )
        full_state_dict_config = FullStateDictConfig(
            offload_to_cpu=True, rank0_only=True
@@ -180,38 +178,6 @@ def get_state_dict(self, model, unwrap=True):
    return state_dict
 def cast_lora_module(module):
    """
    Cast a LoRA layer's bias and adapter weights to the dtype of its base layer weight.
    For every active adapter, each non-empty adapter container on the module
    (`lora_A`, `lora_B`, `lora_embedding_A`, `lora_embedding_B`,
    `lora_magnitude_vector`) has its entry for that adapter converted with
    `.to(dtype)`; if the entry exposes a non-None `bias`, the bias data is
    converted as well. Adapters that have no entry in a given container are
    skipped rather than raising `KeyError`.
    Args:
        module: LoRA layer exposing `base_layer`, `active_adapters` and the
            adapter containers listed above.
    Returns:
        None. The module is modified in place.
    """
    base_layer = module.base_layer
    base_layer_dtype = base_layer.weight.dtype
    # Linear4Bit will keep its bias term in fp32. If the weight dtype is in bf16 we are not able to
    # wrap this. Therefore we must ensure the bias has the same dtype as the weight
    base_bias = getattr(base_layer, "bias", None)
    if base_bias is not None and base_bias.dtype != base_layer_dtype:
        base_bias.data = base_bias.data.to(base_layer_dtype)
    adapter_container_names = (
        "lora_A",
        "lora_B",
        "lora_embedding_A",
        "lora_embedding_B",
        "lora_magnitude_vector",
    )
    for active_adapter in module.active_adapters:
        for container_name in adapter_container_names:
            container = getattr(module, container_name)
            # Skip empty containers, and containers that hold other adapters but
            # not this one (indexing them would raise KeyError)
            if not container or active_adapter not in container:
                continue
            container[active_adapter] = container[active_adapter].to(base_layer_dtype)
            adapter_bias = getattr(container[active_adapter], "bias", None)
            if adapter_bias is not None:
                adapter_bias.data = adapter_bias.data.to(base_layer_dtype)
 def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
    """Helper function to process LoRA modules for FSDP2."""
@@ -227,37 +193,18 @@ def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
            module.base_layer.bias.data = module.base_layer.bias.data.to(
                module.base_layer.weight.dtype
            )
-    fully_shard(module, **fsdp2_kwargs)
+
-    module.set_reshard_after_forward(False)
+    for active_adapter in module.active_adapters:
-    module.set_reshard_after_backward(False)
+        if module.lora_A:
-    # for active_adapter in module.active_adapters:
+            fully_shard(module.lora_A[active_adapter], **fsdp2_kwargs)
-    #     for adapter_name in [
+        if module.lora_B:
-    #         "lora_A",
+            fully_shard(module.lora_B[active_adapter], **fsdp2_kwargs)
-    #         "lora_B",
+        if module.lora_embedding_A:
-    #         "lora_embedding_A",
+            fully_shard(module.lora_embedding_A[active_adapter], **fsdp2_kwargs)
-    #         "lora_embedding_B",
+        if module.lora_embedding_B:
-    #         "lora_magnitude_vector",
+            fully_shard(module.lora_embedding_B[active_adapter], **fsdp2_kwargs)
-    #     ]:
+        if module.lora_magnitude_vector:
-    #         adapter_module = getattr(module, adapter_name, None)
+            fully_shard(module.lora_magnitude_vector[active_adapter], **fsdp2_kwargs)
    #         # print(adapter_module, adapter_name)
    #         # torch.distributed.breakpoint()
    #         if not adapter_module:
    #             continue
    #         fsdp_adapter_module = fully_shard(adapter_module[active_adapter], **fsdp2_kwargs)
    #         # fsdp_adapter_module.unshard()
    #         fsdp_adapter_module.set_reshard_after_backward(False)
    #         fsdp_adapter_module.set_reshard_after_forward(False)
            # torch.distributed.breakpoint()
        # if module.lora_A:
        #     fully_shard(module.lora_A[active_adapter], **fsdp2_kwargs)
        # if module.lora_B:
        #     fully_shard(module.lora_B[active_adapter], **fsdp2_kwargs)
        # if module.lora_embedding_A:
        #     fully_shard(module.lora_embedding_A[active_adapter], **fsdp2_kwargs)
        # if module.lora_embedding_B:
        #     fully_shard(module.lora_embedding_B[active_adapter], **fsdp2_kwargs)
        # if module.lora_magnitude_vector:
            # fully_shard(module.lora_magnitude_vector[active_adapter], **fsdp2_kwargs)
    return log_bias_dtype_mismatch
@@ -371,26 +318,16 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
            model.tie_weights()
    is_peft_model = isinstance(model, PeftModel)
-    # TODO - this doesn't actually do anything
+
    for name, module in model.named_children():
        if name == "experts":
            # torch.distributed.breakpoint()
            for expert in module.children():
                # torch.distributed.breakpoint()
                print(f"expert: {expert}")
                for lora_module in expert.children():
                    print(f"lora {lora_module}")
                    # torch.distributed.breakpoint()
                    cast_lora_module(lora_module)
                    _process_lora_module_for_fsdp(lora_module, fsdp2_kwargs)
    auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
    log_bias_dtype_mismatch = False
    if auto_wrap_policy is not None:
        for module in get_module_children_bottom_up(model)[:-1]:
-            if is_peft_model and isinstance(module, LoraLayer) and not isinstance(module, FSDPModule):
+            if is_peft_model and isinstance(module, LoraLayer):
-                # torch.distributed.breakpoint()
+                module_log_bias_mismatch = _process_lora_module_for_fsdp(
-                cast_lora_module(module)
+                    module, fsdp2_kwargs
-                # torch.distributed.breakpoint()
+                )
                log_bias_dtype_mismatch |= module_log_bias_mismatch
            if auto_wrap_policy(module) and not isinstance(module, FSDPModule):
                fully_shard(module, **fsdp2_kwargs)
@@ -407,9 +344,6 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
            accelerator, model, original_sd, offload_to_cpu=offload_to_cpu
        )
    # for module in model.named_modules():
    #     if "Lora" in 
    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # We re-register the buffers, as they may not be in the state_dict
        for fqn, buffer_tensor in original_non_persistent_buffers.items():
--- a/src/axolotl/monkeypatch/attention/flex_attn.py
+++ b/src/axolotl/monkeypatch/attention/flex_attn.py
@@ -1,11 +1,10 @@
 """Flex attention monkey patch"""
 import sys
 from typing import Optional, Tuple, Union
 import torch
 import transformers
 from packaging import version
 from transformers.utils.import_utils import _torch_version, is_torch_less_or_equal
 from axolotl.utils.logging import get_logger
@@ -47,33 +46,19 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
            """
            self.training = None
            if not self._is_flex_compiled or training != self.training:
                self.training = training
                if is_torch_less_or_equal("2.5.1"):
                    self._compiled_flex_attention = torch.compile(
                        flex_attention, dynamic=False
                    )
                # In PyTorch 2.6.0, there's a known issue with flex attention compilation which may
                # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs"
                # see https://github.com/pytorch/pytorch/issues/146260 for training
-                elif version.parse(_torch_version).base_version == "2.6.0" and training:
+                self.training = training
-                    self._compiled_flex_attention = torch.compile(
+                LOG.info(
-                        flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
+                    "Compiling flex attention with kwargs: %s. This may take a while...",
-                    )
+                    flex_attn_compile_kwargs,
-                # Fallback, usually the most recent torch 2.7.x+ versions
+                )
-                else:
+                self._compiled_flex_attention = torch.compile(
-                    LOG.info(
+                    flex_attention,
-                        "Compiling flex attention with kwargs: %s. This may take a while...",
+                    **flex_attn_compile_kwargs,
-                        flex_attn_compile_kwargs,
+                )
-                        main_process_only=True,
+                LOG.info("Flex attention compiled successfully.")
                    )
                    self._compiled_flex_attention = torch.compile(
                        flex_attention,
                        **flex_attn_compile_kwargs,
                    )
                    LOG.info(
                        "Flex attention compiled successfully.", main_process_only=True
                    )
                self._is_flex_compiled = True
        def __call__(self):
@@ -83,3 +68,139 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
    sys.modules[
        "transformers.integrations.flex_attention"
    ].WrappedFlexAttention = WrappedFlexAttention
 def patch_flex_make_mask():
    """
    Replace transformers' `make_flex_block_causal_mask` with a patched version on torch 2.6.x.
    This is a no-op on any torch version whose version string does not start
    with "2.6". Otherwise the patched function is installed on every already
    imported module whose name contains ".modeling_" and which exposes
    `make_flex_block_causal_mask`, and on `transformers.integrations.flex_attention`.
    """
    is_torch_2_6 = torch.__version__.startswith("2.6")
    if not is_torch_2_6:
        return
    from torch.nn.attention.flex_attention import (
        _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size,
    )
    from torch.nn.attention.flex_attention import (
        BlockMask,
    )
    from torch.nn.attention.flex_attention import (
        create_block_mask as create_block_causal_mask_flex,
    )
    Offset = Union[torch.Tensor, int]
    def patched_make_flex_block_causal_mask(
        attention_mask_2d: torch.Tensor,
        attention_chunk_size: Optional[int] = None,
        query_length=None,
        key_length=None,
        offsets: Optional[Tuple[Offset, Offset]] = None,
    ) -> "BlockMask":
        """
        Create a block causal document mask for a batch of sequences, both packed and unpacked.
        Create Block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`.
        The resultant BlockMask is a compressed representation of the full block causal
        mask. BlockMask is essential for performant computation of flex attention.
        See: https://pytorch.org/blog/flexattention/
        Args:
            attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
            of shape (batch_size, total_seq_len). e.g.
            For unpacked sequence:
            [[1, 1, 1, 1, 0, 0, 0],
             [1, 1, 1, 1, 1, 0, 0]]
            For packed sequence:
            [[1, 1, 1, 2, 2, 2, 0],
             [1, 1, 2, 2, 2, 3, 3]]
            attention_chunk_size (int, optional): When set, attention is additionally
            restricted to positions that fall in the same chunk of this many tokens.
            query_length: Query length for the block mask; defaults to total_seq_len when falsy.
            key_length: Key/value length for the block mask; defaults to total_seq_len when falsy.
            offsets: Optional (q_offset, kv_offset) pair added to the query and
            key/value indices before the mask logic is evaluated.
        Returns:
            BlockMask
        """
        batch_size, total_seq_len = attention_mask_2d.shape
        if not key_length:
            key_length = total_seq_len
        if not query_length:
            query_length = total_seq_len
        # Right-pad the mask with zeros; the pad width is the absolute difference
        # between total_seq_len and max(key_length, default sparse block size)
        attention_mask_2d = torch.nn.functional.pad(
            attention_mask_2d,
            value=0,
            pad=(0, abs(total_seq_len - max(key_length, flex_default_block_size))),
        )
        device = attention_mask_2d.device
        document_ids = attention_mask_2d.clone()
        if attention_chunk_size is not None:
            # we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
            chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // (
                attention_chunk_size
            )
        # Instead of passing a tensor mask, flex attention requires a mask_mod function
        # that determines which elements of QK^T should be included in the attention
        # computation prior to the softmax. For sample packing, we need both the
        # logic for both causal mask and document mask. See PyTorch's official
        # blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods
        def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            """
            Defines the logic of a block causal mask by combining both a standard causal mask
            and a block diagonal document mask.
            See :func:`~torchtune.modules.attention_utils.create_block_causal_mask`
            for an illustration.
            """
            causal_mask = q_idx >= kv_idx  # not valid when decoding
            document_mask = (
                document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
            )
            padding_mask = attention_mask_2d[batch_idx, q_idx] > 0
            final_mask = causal_mask & padding_mask & document_mask
            return final_mask
        def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
            """
            Combines the chunk mask with the causal mask for chunked attention.
            """
            chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx]
            causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx)
            return chunk_mask & causal_doc_mask
        # chunk_idxs only exists when attention_chunk_size is set, which is exactly
        # the case in which the chunked mask_mod is selected
        mask_mod_maybe_combined = (
            causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod
        )
        if offsets is not None:
            q_offset = offsets[0]
            kv_offset = offsets[1]
            def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
                offset_q = q_idx + q_offset
                offset_kv = kv_idx + kv_offset
                return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv)
        else:
            mask_mod = mask_mod_maybe_combined
        return create_block_causal_mask_flex(
            mask_mod=mask_mod,
            B=batch_size,
            H=None,  # attention head
            Q_LEN=query_length,
            KV_LEN=key_length,
            device=device,
            _compile=True,
        )
    # Snapshot the module names so the loop is unaffected by imports that happen meanwhile
    for n in tuple(sys.modules):
        if ".modeling_" in n and hasattr(
            sys.modules[n], "make_flex_block_causal_mask"
        ):
            sys.modules[
                n
            ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask
    transformers.integrations.flex_attention.make_flex_block_causal_mask = (
        patched_make_flex_block_causal_mask
    )
--- a/src/axolotl/monkeypatch/deepspeed_utils.py
+++ b/src/axolotl/monkeypatch/deepspeed_utils.py
@@ -1,67 +0,0 @@
 import importlib
 import importlib.util
 from axolotl.utils.logging import get_logger
 LOG = get_logger(__name__)
 def patch_checkpoint_wrapper_setattr():
    """
    Make CheckpointWrapper pass DeepSpeed attribute writes through to the module it wraps.
    CheckpointWrapper does not forward ds_* attributes (such as
    ds_grads_remaining) to the wrapped module, which makes DeepSpeed ZeRO-3
    fail when gradient checkpointing is enabled.
    The failure shows up specifically with:
    - QLoRA + DeepSpeed ZeRO-3
    - gradient_checkpointing: true
    - activation_offloading: true
    The patch is applied at most once; a marker attribute on the class records
    that it is already in place. Import failures are logged at debug level and
    any other failure at warning level; neither is raised.
    References:
    - https://github.com/deepspeedai/DeepSpeed/issues/7203
    - https://github.com/deepspeedai/DeepSpeed/blob/38d1a9eb64c9e01e32eccc50b25ba18925287441/deepspeed/runtime/zero/parameter_offload.py#L424-L458
    - https://github.com/axolotl-ai-cloud/axolotl/pull/3102
    """
    patched_marker = "_axolotl_setattr_patched"
    try:
        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
            CheckpointWrapper,
        )
        # Nothing to do when an earlier call already installed the patch
        if hasattr(CheckpointWrapper, patched_marker):
            LOG.debug("CheckpointWrapper already patched")
            return
        unpatched_setattr = CheckpointWrapper.__setattr__
        def forwarding_setattr(self, name: str, value) -> None:
            should_forward = name.startswith("ds_") and hasattr(
                self, "_checkpoint_wrapped_module"
            )
            if not should_forward:
                # Everything else keeps the class's original behavior
                unpatched_setattr(self, name, value)
                return
            inner_module = self._checkpoint_wrapped_module
            setattr(inner_module, name, value)
            LOG.debug(
                f"Forwarded {name} to wrapped module {type(inner_module).__name__}"
            )
        CheckpointWrapper.__setattr__ = forwarding_setattr
        setattr(CheckpointWrapper, patched_marker, True)
        LOG.info("CheckpointWrapper patched to forward DeepSpeed attributes")
    except ImportError as import_err:
        LOG.debug(f"CheckpointWrapper not available: {import_err}")
    except Exception as err:
        LOG.warning(f"Failed to patch CheckpointWrapper: {err}")
 def apply_deepspeed_patches():
    """
    Install the DeepSpeed-related patches, but only when the `deepspeed` package can be found.
    """
    deepspeed_spec = importlib.util.find_spec("deepspeed")
    if deepspeed_spec is None:
        LOG.debug("DeepSpeed not available, skipping patches")
        return
    patch_checkpoint_wrapper_setattr()
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -36,13 +36,8 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "glm",
    "glm4",
    "smollm3",
    "granite",
    "granitemoe",
    "hunyuan_v1_dense",
    "hunyuan_v1_moe",
    "gpt_oss",
    "arcee",
    "seed_oss",
 ]
--- a/src/axolotl/monkeypatch/tiled_mlp/base.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/base.py
@@ -8,94 +8,6 @@ from typing import List
 import torch
 class DeepSpeedTiledMLPMoE(torch.autograd.Function):
    """
    Tiled MLP autograd function that also accepts tuple-returning (MoE style) forward functions.
    The input is split into shards along dim 1. The forward pass runs `fn` on
    each shard without building an autograd graph; the backward pass recomputes
    `fn` shard by shard with gradients enabled and back-propagates the matching
    slice of the incoming gradient. Only the first element of a tuple output
    takes part in the backward pass.
    """
    @staticmethod
    def forward(
        ctx,
        fn,
        self,
        x,
        shards,
        compute_params,
    ) -> torch.Tensor:
        """
        Run `fn(self, shard)` over `shards` chunks of `x` and concatenate the results.
        Args:
            fn: callable invoked as `fn(self, x_shard)`
            self: object passed as the first argument of `fn`
            x: input tensor, chunked along dim 1
            shards: requested number of chunks (torch.chunk may produce fewer)
            compute_params: parameters used by `fn`; only those requiring grad are kept
        Returns:
            The per-shard outputs concatenated along dim 1, or, when `fn` returns
            a tuple, a tuple whose first element is concatenated along dim 1 and
            whose second element is concatenated along dim 0.
        """
        ctx.fn = fn
        ctx.self = self
        ctx.shards = shards
        ctx.compute_params = [p for p in compute_params if p.requires_grad]
        ctx.save_for_backward(x)
        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        with torch.no_grad():
            output_shards = [fn(self, x_shard) for x_shard in x_shards]
        ctx.is_tuple_output = isinstance(output_shards[0], tuple)
        if ctx.is_tuple_output:
            # Concatenation dim per tuple position; only two positions are defined
            tuple_dim_idx = [1, 0]
            output_unsharded = tuple(
                torch.cat(
                    [output_shard[i] for output_shard in output_shards],
                    dim=tuple_dim_idx[i],
                )
                for i in range(len(output_shards[0]))
            )
        else:
            output_unsharded = torch.cat(output_shards, dim=1)
        return output_unsharded
    @staticmethod
    def backward(ctx, *grads) -> tuple:
        """
        Recompute `fn` per shard and accumulate gradients for `x` and the compute params.
        Returns:
            A 5-tuple matching the forward inputs `(fn, self, x, shards, compute_params)`;
            only the `x` slot carries a gradient.
        """
        fn = ctx.fn
        (x,) = ctx.saved_tensors
        self = ctx.self
        shards = ctx.shards
        compute_params = ctx.compute_params
        is_tuple_output = ctx.is_tuple_output
        x_requires_grad = x.requires_grad
        x = x.detach()
        # detach() unsets `x.requires_grad`, so restore it
        x.requires_grad_(x_requires_grad)
        incoming_grad = grads[0]
        x_grad = torch.zeros_like(x)
        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        # torch.chunk may return fewer chunks than requested, so the last shard
        # must be detected from the actual chunk count, not from `shards`
        num_shards = len(x_shards)
        shard_step = x_shards[0].numel()
        for i, x_shard in enumerate(x_shards):
            # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run
            is_last_shard = i + 1 == num_shards
            for param in compute_params:
                param.ds_grad_is_ready = is_last_shard
            x_shard.requires_grad_(x_requires_grad)
            # NOTE(review): the flat offset arithmetic below presumably assumes each
            # shard is a contiguous slice of the flattened input (e.g. batch size 1) - confirm
            shard_offset = i * shard_step
            # Point the shard's grad at its slice of x_grad so accumulation lands there
            x_shard.grad = (
                x_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            incoming_grad_shard = (
                incoming_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            with torch.enable_grad():
                output = fn(self, x_shard)
            # For tuple outputs only the first element is back-propagated
            backward_target = output[0] if is_tuple_output else output
            torch.autograd.backward(backward_target, incoming_grad_shard)
        return (None, None, x_grad, None, None)
 class TiledMLP(torch.autograd.Function):
    """
    TiledMLP implementation using gradient hooks
@@ -119,18 +31,7 @@ class TiledMLP(torch.autograd.Function):
        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        with torch.no_grad():
            output_shards = [fn(self, x_shard) for x_shard in x_shards]
-        ctx.is_tuple_output = isinstance(output_shards[0], tuple)
+        output_unsharded = torch.cat(output_shards, dim=1)
        if isinstance(output_shards[0], tuple):
            tuple_dim_idx = [1, 0]
            output_unsharded = tuple(
                torch.cat(
                    [output_shard[i] for output_shard in output_shards],
                    dim=tuple_dim_idx[i],
                )
                for i in range(len(output_shards[0]))
            )
        else:
            output_unsharded = torch.cat(output_shards, dim=1)
        return output_unsharded
@@ -141,7 +42,6 @@ class TiledMLP(torch.autograd.Function):
        self = ctx.self
        shards = ctx.shards
        compute_params = ctx.compute_params
        is_tuple_output = ctx.is_tuple_output
        x_requires_grad = x.requires_grad
        x = x.detach()
@@ -176,10 +76,7 @@ class TiledMLP(torch.autograd.Function):
            with torch.enable_grad():
                output = fn(self, x_shard)
-            if is_tuple_output:
+            torch.autograd.backward(output, incoming_grad_shard)
                torch.autograd.backward(output[0], incoming_grad_shard)
            else:
                torch.autograd.backward(output, incoming_grad_shard)
        # Clean up hooks
        grad_accumulator.cleanup()
--- a/src/axolotl/monkeypatch/tiled_mlp/patch.py
+++ b/src/axolotl/monkeypatch/tiled_mlp/patch.py
@@ -17,7 +17,7 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
        TiledMLP as DeepSpeedTiledMLP,
    )
-    from axolotl.monkeypatch.tiled_mlp.base import DeepSpeedTiledMLPMoE, TiledMLP
+    from axolotl.monkeypatch.tiled_mlp.base import TiledMLP
    try:
        # Dynamically import the module and MLP class
@@ -64,10 +64,7 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
                        for p in self._compute_params
                    )
                ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
-                    if model_type == "gpt_oss":
+                    self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
                        self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE
                    else:
                        self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
                else:
                    self._tiled_mlp_dist_impl = TiledMLP
--- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
@@ -28,6 +28,15 @@ PATCHED_EVAL_CODE = {
    "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()',
 }
 ORIGINAL_FSDP2_CODE = """
    model.eval()
 """
 PATCHED_FSDP2_CODE = """
    if hasattr(model, "eval") and callable(model.eval):
        self.model.eval()
 """
 ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()"
 PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()"
@@ -37,7 +46,13 @@ def check_evaluation_loop_is_patchable() -> bool:
    return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values())
-def patch_evaluation_loop():
+def check_evaluation_loop_is_fsdp2_patchable() -> bool:
    evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
    evaluation_loop_source, _ = detab_code(evaluation_loop_source)
    return ORIGINAL_FSDP2_CODE in evaluation_loop_source
 def patch_evaluation_loop(patch_fsdp2: bool):
    """Patch the evaluation_loop method."""
    # Check if already patched
    if hasattr(Trainer, "_original_evaluation_loop"):
@@ -60,6 +75,13 @@ def patch_evaluation_loop():
        ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"]
    )
    # Apply FSDP2 eval guard patch if needed
    if patch_fsdp2 and ORIGINAL_FSDP2_CODE in evaluation_loop_source:
        evaluation_loop_source = evaluation_loop_source.replace(
            ORIGINAL_FSDP2_CODE, PATCHED_FSDP2_CODE
        )
        LOG.info("Applied FSDP2 eval guard patch to evaluation_loop")
    # Rename the function to avoid conflicts
    evaluation_loop_source = evaluation_loop_source.replace(
        "def evaluation_loop(",
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -75,7 +75,7 @@ class PromptTokenizingStrategy(abc.ABC):
    ) -> BatchEncoding:
        empty = BatchEncoding(data={"input_ids": [], "attention_mask": []})
        if not prompt:
-            LOG.warning_once("Empty text requested for tokenization.")
+            LOG.warning("Empty text requested for tokenization.")
            return empty
        result = self.tokenizer(
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -30,7 +30,11 @@ from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
 )
 from axolotl.integrations.base import PluginManager
-from axolotl.loaders import ModelLoader, load_processor, load_tokenizer
+from axolotl.loaders import (
    ModelLoader,
    load_processor,
    load_tokenizer,
 )
 from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
@@ -230,15 +234,16 @@ def save_trained_model(
    # handle QAT
    if cfg.qat:
-        from axolotl.utils.quantization import convert_qat_model
+        from axolotl.utils.quantization import convert_qat_model_for_ptq
-        convert_qat_model(
+        LOG.info("Processing QAT model for saving...")
        convert_qat_model_for_ptq(
            model,
            quantize_embedding=cfg.qat.quantize_embedding,
        )
        LOG.info(
-            "QAT usage note: please ensure you quantize your model fine-tuned using QAT by running `axolotl quantize`"
+            "QAT modules have been converted for PTQ. Please ensure you quantize "
-            " with the same config which you used for training."
+            "your model weights with `axolotl quantize`."
        )
    # Handle ReLoRA early return case
    if cfg.relora:
@@ -332,7 +337,9 @@ def save_trained_model(
    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
        # TODO: add integration support so this can be implemented completely within the plugin
-        from axolotl.integrations.llm_compressor.utils import save_compressed_model
+        from axolotl.integrations.llm_compressor.utils import (
            save_compressed_model,
        )
        save_compressed_model(
            model=model,
--- a/src/axolotl/utils/callbacks/tokens_per_second.py
+++ b/src/axolotl/utils/callbacks/tokens_per_second.py
@@ -43,12 +43,11 @@ class TokensPerSecondCallback(TrainerCallback):
        control: TrainerControl,
        **kwargs,
    ):  # pylint: disable=unused-argument
-        if hasattr(state, "num_tokens"):
+        step_time = time.perf_counter() - self.start_time
-            step_time = time.perf_counter() - self.start_time
+        num_tokens_per_device = state.num_tokens.clone()
-            num_tokens_per_device = state.num_tokens.clone()
+        # non data parallel groups have duplicated tokens, so we avoid double-counting
-            # non data parallel groups have duplicated tokens, so we avoid double-counting
+        num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size
-            num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size
+        state.last_tokens_per_second = num_tokens_per_device / step_time
            state.last_tokens_per_second = num_tokens_per_device / step_time
    def on_log(
        self,
@@ -59,6 +58,5 @@ class TokensPerSecondCallback(TrainerCallback):
        **kwargs,
    ):  # pylint: disable=unused-argument
        # after logging, clear the running metrics
-        if hasattr(state, "last_tokens_per_second"):
+        state.last_tokens_per_second.zero_()
-            state.last_tokens_per_second.zero_()
+        state.num_tokens = 0
            state.num_tokens = torch.zeros(1)
--- a/src/axolotl/utils/collators/init.py
+++ b/src/axolotl/utils/collators/init.py
@@ -1,17 +1,11 @@
-"""Shared axolotl collators for multipacking, mamba, multimodal."""
+"""
 shared axolotl collators for multipack, mamba, multimodal
 """
-from .batching import (
+from .batching import (  # noqa: F401
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
    PretrainingBatchSamplerDataCollatorForSeq2Seq,
    V2BatchSamplerDataCollatorForSeq2Seq,
 )
-from .mamba import MambaDataCollator
+from .mamba import MambaDataCollator  # noqa: F401
 __all__ = [
    "DataCollatorForSeq2Seq",
    "BatchSamplerDataCollatorForSeq2Seq",
    "V2BatchSamplerDataCollatorForSeq2Seq",
    "PretrainingBatchSamplerDataCollatorForSeq2Seq",
    "MambaDataCollator",
 ]
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -17,8 +17,8 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
    AxolotlInputConfig as AxolotlInputConfigBase,
 )
 from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase
 from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset
 LOG = get_logger(__name__)
@@ -273,9 +273,7 @@ def validate_config(
    # Convert datasets to proper format if needed
    if cfg.get("datasets"):
        for idx, ds_cfg in enumerate(cfg["datasets"]):
-            if cfg.get("rl") in ["dpo", "ipo", "simpo"] and not isinstance(
+            if cfg.get("rl") in ["dpo", "simpo"] and not isinstance(ds_cfg, DPODataset):
                ds_cfg, DPODataset
            ):
                cfg["datasets"][idx] = DPODataset(**ds_cfg)
            elif cfg.get("rl") == "kto" and not isinstance(ds_cfg, KTODataset):
                cfg["datasets"][idx] = KTODataset(**dict(ds_cfg))
--- a/src/axolotl/utils/ctx_managers/sequence_parallel.py
+++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py
@@ -48,10 +48,10 @@ def apply_sequence_parallelism(
            - The original sequence length before padding.
            - The number of padding tokens added.
    """
-    batch_size, original_seq_len = batch["input_ids"].shape
+    original_seq_len = batch["input_ids"].size(1)
    # Update ring attention params if needed
-    if batch.get("position_ids") is not None and batch_size == 1:
+    if batch.get("position_ids") is not None:
        update_ring_attn_params(position_ids=batch["position_ids"])
    else:
        # If position_ids aren't already in the batch, create them
--- a/src/axolotl/utils/data/init.py
+++ b/src/axolotl/utils/data/init.py
@@ -1,19 +1,19 @@
 """Init for `axolotl.utils.data` module."""
 from axolotl.utils.data.pretraining import (
    encode_pretraining,
    wrap_pretraining_dataset,
 )
 from axolotl.utils.data.rl import prepare_preference_datasets
 from axolotl.utils.data.sft import (
    get_dataset_wrapper,
    prepare_datasets,
 )
 from axolotl.utils.data.streaming import (
    encode_streaming,
    wrap_streaming_dataset,
 )
 from axolotl.utils.data.utils import md5
 __all__ = [
-    "encode_streaming",
+    "encode_pretraining",
-    "wrap_streaming_dataset",
+    "wrap_pretraining_dataset",
    "prepare_preference_datasets",
    "get_dataset_wrapper",
    "prepare_datasets",
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -1,4 +1,4 @@
-"""Data handling specific to streaming datasets."""
+"""data handling specific to pretraining"""
 import functools
 from collections import defaultdict
@@ -17,10 +17,10 @@ from axolotl.utils.trainer import process_pretraining_datasets_for_packing
 LOG = get_logger(__name__)
-def encode_streaming(
+def encode_pretraining(
    examples: Dict[str, List],
    tokenizer: PreTrainedTokenizerBase,
    max_tokens: int,
    examples: Dict[str, List],
    text_column: str = "text",
    concatenate: bool = True,
 ) -> Dict[str, List]:
@@ -176,57 +176,45 @@ def encode_streaming(
    return ret
-def wrap_streaming_dataset(
+def wrap_pretraining_dataset(
    dataset,
    tokenizer,
    cfg,
    ds_wrapper_fn,
    max_tokens=2048,
    batch_size=1,
    seed=42,
    buffer_size=10_000,
 ):
    if cfg.sample_packing:
        # For SFT (non-pretraining) datasets, always use multipack_attn=True to ensure
        # attention isolation between packed sequences
        multipack_attn = (
            True if not cfg.pretraining_dataset else cfg.pretrain_multipack_attn
        )
        collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
            tokenizer,
            return_tensors="pt",
            padding=True,
-            pad_to_multiple_of=cfg.sequence_len,
+            pad_to_multiple_of=max_tokens,
-            multipack_attn=multipack_attn,
+            multipack_attn=cfg.pretrain_multipack_attn,
        )
        encode = functools.partial(
-            encode_packed_streaming,
+            encode_packed_pretraining,
            collate_fn,
            ds_wrapper_fn,
-            max_seq_length=cfg.sequence_len,
+            max_seq_length=max_tokens,
-            batch_size=cfg.micro_batch_size,
+            batch_size=batch_size,
-            multipack_attn=multipack_attn,
+            multipack_attn=cfg.pretrain_multipack_attn,
        )
-
+        # set this to 1 so downstream data_loader doesn't try to increase the batch again
        # Set this to 1 so downstream data_loader doesn't try to increase the batch size
        # again
        cfg.micro_batch_size = 1
    else:
        # NOTE: This is not reachable for SFT datasets since we use the pre-existing
        # loading function for non-packed streaming datasets. Refer to
        # _prepare_streaming_datasets in sft.py for that code path.
        text_column = (
            getattr(cfg.pretraining_dataset[0], "text_column", "text") or "text"
        )
        encode = functools.partial(
-            encode_streaming,
+            encode_pretraining,
-            tokenizer=tokenizer,
+            tokenizer,
-            max_tokens=cfg.sequence_len,
+            max_tokens,
-            text_column=text_column,
+            text_column=cfg.pretraining_dataset[0].text_column or "text",
            concatenate=cfg.pretraining_sample_concatenation is True,
        )
    if cfg.shuffle_merged_datasets:
-        dataset = dataset.shuffle(
+        dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size)
            seed=cfg.seed, buffer_size=cfg.streaming_multipack_buffer_size
        )
    else:
        LOG.debug("NOT shuffling merged pretraining datasets")
@@ -244,13 +232,14 @@ def wrap_streaming_dataset(
    dataset = dataset.map(
        encode,
        batched=True,
-        batch_size=cfg.streaming_multipack_buffer_size,
+        batch_size=buffer_size,
        # input_columns="text",
        remove_columns=remove_columns,
    )
    return dataset
-def encode_packed_streaming(
+def encode_packed_pretraining(
    collate_fn,
    ds_wrapper: Callable,
    examples: Dict[str, List],
@@ -285,6 +274,8 @@ def encode_packed_streaming(
    for batch in sampler:
        for data in batch:
            features = train_dataset[data]
            if "num_truncated_tokens" in features:
                del features["num_truncated_tokens"]
            if "num_truncated_tokens" in features:
                del features["num_truncated_tokens"]
            if "overflow_to_sample_mapping" in features:
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -9,13 +9,13 @@ from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
 )
 from transformers import PreTrainedTokenizer, ProcessorMixin
 from axolotl.prompters import Prompter
 from axolotl.utils.data.lock import FileLockLoader
 from axolotl.utils.data.pretraining import wrap_pretraining_dataset
 from axolotl.utils.data.shared import (
    create_train_validation_split,
    datasets_with_name_generator,
@@ -26,7 +26,6 @@ from axolotl.utils.data.shared import (
    save_preprocessed_dataset,
    try_load_from_hub,
 )
 from axolotl.utils.data.streaming import wrap_streaming_dataset
 from axolotl.utils.data.utils import (
    deduplicate_and_log_datasets,
    handle_long_seq_in_dataset,
@@ -49,6 +48,7 @@ def prepare_datasets(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    processor: ProcessorMixin | None = None,
    preprocess_iterable: bool = False,
 ) -> tuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]:
    """Prepare training and evaluation datasets based on configuration.
@@ -56,19 +56,23 @@ def prepare_datasets(
        cfg: Dictionary mapping `axolotl` config keys to values.
        tokenizer: Tokenizer to use for processing text.
        processor: Optional processor for multimodal datasets.
        preprocess_iterable: Whether to use iterable preprocessing.
    Returns:
        Tuple of (train_dataset, eval_dataset, total_steps, prompters).
    """
-    if cfg.streaming or cfg.pretraining_dataset:
+    if cfg.pretraining_dataset:
-        return _prepare_streaming_dataset(cfg, tokenizer, processor)
+        return _prepare_pretraining_dataset(
-    return _prepare_standard_dataset(cfg, tokenizer, processor)
+            cfg, tokenizer, processor, preprocess_iterable
        )
    return _prepare_standard_dataset(cfg, tokenizer, processor, preprocess_iterable)
 def _prepare_standard_dataset(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    processor: ProcessorMixin | None,
    preprocess_iterable: bool,
 ) -> tuple[Dataset, Dataset | None, int, list[Prompter | None]]:
    """Prepare standard (non-pretraining) datasets."""
@@ -79,6 +83,7 @@ def _prepare_standard_dataset(
            cfg,
            split="train",
            processor=processor,
            preprocess_iterable=preprocess_iterable,
        )
        # Overwrite eval_dataset if test data exists
@@ -88,6 +93,7 @@ def _prepare_standard_dataset(
                cfg,
                split="test",
                processor=processor,
                preprocess_iterable=preprocess_iterable,
            )
        return train_dataset, eval_dataset, prompters
@@ -122,40 +128,22 @@ def _prepare_standard_dataset(
    return train_dataset, eval_dataset, total_num_steps, prompters
-def _prepare_streaming_dataset(
+def _prepare_pretraining_dataset(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    processor: ProcessorMixin | None,
    preprocess_iterable: bool,
 ) -> tuple[IterableDataset, Dataset | None, int, list[Prompter | None]]:
    """
-    Prepare dataset for streaming mode.
+    Prepare dataset for pretraining mode.
-    Note: Streaming datasets are loaded incrementally from the source.
+    Note: Pre-training datasets are streamed from the HuggingFace Hub.
    """
-    if cfg.pretraining_dataset:
+    # Extract pretraining dataset configuration
-        dataset_config = _extract_pretraining_config(cfg)
+    pretraining_config = _extract_pretraining_config(cfg)
        train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer)
    elif cfg.sample_packing:
        # TODO(djsaunde): Implement for multiple datasets
        dataset_config = DictDefault(cfg.datasets[0])
-        # Ensure we have a split set - default to 'train' if not specified
+    # Load streaming dataset for training
-        if not hasattr(dataset_config, "split") or not dataset_config.split:
+    train_dataset = _load_pretraining_dataset(pretraining_config, cfg, tokenizer)
            dataset_config.split = "train"
        train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer)
    else:
        # Use legacy loading function for non-packed streaming datasets
        train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
            tokenizer,
            cfg,
            split="train",
            processor=processor,
            streaming=True,
        )
        # Return early for non-packed streaming datasets
        total_num_steps = cfg.max_steps if cfg.max_steps else -1
        return train_dataset, eval_dataset, total_num_steps, prompters
    # Load evaluation dataset if specified
    eval_dataset = None
@@ -165,12 +153,14 @@ def _prepare_streaming_dataset(
            cfg,
            split="test",
            processor=processor,
-            streaming=False,
+            preprocess_iterable=preprocess_iterable,
        )
-    # For streaming, we return max_steps directly from config or -1 if not set
+    if cfg.dataset_exact_deduplication:
-    total_num_steps = cfg.max_steps if cfg.max_steps else -1
+        LOG.info("Deduplication not available for pretrained datasets")
-    return train_dataset, eval_dataset, total_num_steps, []
+
    # For pretraining, we return max_steps directly from config
    return train_dataset, eval_dataset, cfg.max_steps, []
 def _extract_pretraining_config(cfg: DictDefault) -> DictDefault:
@@ -202,7 +192,7 @@ def _extract_pretraining_config(cfg: DictDefault) -> DictDefault:
    )
-def _load_streaming_dataset(
+def _load_pretraining_dataset(
    pretraining_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer
 ) -> IterableDataset:
    """Load and prepare a streaming dataset for pretraining."""
@@ -237,11 +227,15 @@ def _load_streaming_dataset(
        iter_dataset = iter_dataset.skip(pretraining_config["skip"])
    # Wrap the dataset for pretraining
-    train_dataset = wrap_streaming_dataset(
+    train_dataset = wrap_pretraining_dataset(
        iter_dataset,
        tokenizer,
        cfg,
        dataset_wrapper_partial,
        max_tokens=cfg.sequence_len,
        batch_size=cfg.micro_batch_size,
        seed=cfg.seed,
        buffer_size=cfg.pretrain_multipack_buffer_size or 10_000,
    )
    # Format for PyTorch
@@ -262,7 +256,7 @@ def _load_tokenized_prepared_datasets(
    cfg: DictDefault,
    split: Literal["train", "test"] = "train",
    processor: ProcessorMixin | None = None,
-    streaming: bool = False,
+    preprocess_iterable: bool = False,
 ) -> tuple[Dataset | DatasetDict, list[Prompter | None]]:
    """Load or create tokenized and prepared datasets for training or testing.
@@ -271,7 +265,7 @@ def _load_tokenized_prepared_datasets(
        cfg: Configuration object.
        split: Dataset split to load ('train' or 'test').
        processor: Optional processor for multimodal datasets.
-        streaming: Whether to use iterable preprocessing.
+        preprocess_iterable: Whether to use iterable preprocessing.
    Returns:
        Tuple of (dataset, prompters list).
@@ -302,7 +296,7 @@ def _load_tokenized_prepared_datasets(
            tokenizer,
            split,
            processor,
-            streaming,
+            preprocess_iterable,
        )
    return dataset, prompters
@@ -314,7 +308,7 @@ def _load_raw_datasets(
    tokenizer: PreTrainedTokenizer,
    split: str,
    processor: ProcessorMixin | None = None,
-    streaming: bool = False,
+    preprocess_iterable: bool = False,
 ) -> tuple[Dataset, list[Prompter | None]]:
    """Load, process, merge, and save raw datasets."""
    LOG.info("Loading raw datasets...", main_process_only=False)
@@ -335,7 +329,7 @@ def _load_raw_datasets(
            split=split,
            seed=cfg.seed,
            processor=processor,
-            streaming=streaming,
+            preprocess_iterable=preprocess_iterable,
        )
        datasets.append(dataset_wrapper)
        prompters.append(dataset_prompter)
@@ -343,7 +337,7 @@ def _load_raw_datasets(
    # Merge datasets
    dataset = merge_datasets(datasets, cfg)
-    if not cfg.skip_prepare_dataset and not streaming:
+    if not cfg.skip_prepare_dataset:
        if split == "test" and cfg.eval_sequence_len:
            dataset = handle_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg)
        else:
@@ -367,19 +361,19 @@ def _load_and_process_single_dataset(
    split: str,
    seed: int,
    processor: ProcessorMixin | None = None,
-    streaming: bool = False,
+    preprocess_iterable: bool = False,
 ) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Load and process a single dataset based on the passed config."""
    # Load the dataset
    dataset = load_dataset_with_config(
-        dataset_config, cfg.hf_use_auth_token, streaming=streaming
+        dataset_config, cfg.hf_use_auth_token, streaming=preprocess_iterable
    )
    # Parse dataset type
    d_base_type, d_prompt_style = _parse_dataset_type(dataset_config.type)
    # Select the appropriate split
-    if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
+    if isinstance(dataset, DatasetDict):
        if dataset_config.split and dataset_config.split in dataset:
            dataset = dataset[dataset_config.split]
        elif split in dataset:
@@ -485,7 +479,7 @@ def _load_and_prepare_datasets(
    cfg: DictDefault,
    split: Literal["train", "test"] = "train",
    processor: ProcessorMixin | None = None,
-    streaming: bool = False,
+    preprocess_iterable: bool = False,
 ) -> tuple[Dataset | None, Dataset | None, list[Prompter | None]]:
    """Load and prepare datasets with optional validation split and sharding.
@@ -494,7 +488,7 @@ def _load_and_prepare_datasets(
        cfg: Configuration object.
        split: Dataset split to load ('train' or 'test').
        processor: Optional processor for multimodal datasets.
-        streaming: Whether to use iterable preprocessing.
+        preprocess_iterable: Whether to use iterable preprocessing.
    Returns:
        Tuple of (train_dataset, eval_dataset, prompters).
@@ -505,7 +499,7 @@ def _load_and_prepare_datasets(
        cfg,
        split=split,
        processor=processor,
-        streaming=streaming,
+        preprocess_iterable=preprocess_iterable,
    )
    # Apply dataset sharding if configured using shared function
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -236,9 +236,11 @@ def _load_from_local_path(
        try:
            return load_from_disk(dataset_config.path)
        except FileNotFoundError:
            load_dataset_kwargs["streaming"] = False
            return load_dataset(dataset_config.path, **load_dataset_kwargs)
    elif local_path.is_file():
        dataset_type = get_dataset_type(dataset_config)
        load_dataset_kwargs["streaming"] = False
        return load_dataset(
            dataset_type,
            data_files=dataset_config.path,
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -190,21 +190,12 @@ def handle_long_seq_in_dataset(
    Returns:
        Filtered dataset with long sequences removed.
    """
-    if (
+    if "input_ids" not in dataset.column_names:
        hasattr(dataset, "column_names")
        and dataset.column_names
        and "input_ids" not in dataset.column_names
    ):
        LOG.warning(
            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
            "expected for reward modeling."
        )
        return dataset
    elif not hasattr(dataset, "column_names") or dataset.column_names is None:
        LOG.info(
            "Dataset is streaming (IterableDataset), skipping long sequence handling"
        )
        return dataset
    drop_long = functools.partial(
        drop_long_seq,
--- a/src/axolotl/utils/environment.py
+++ b/src/axolotl/utils/environment.py
@@ -6,6 +6,8 @@ from importlib.metadata import version
 from accelerate.utils.environment import (
    check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support,
 )
 from accelerate.utils.environment import (
    get_gpu_info,
 )
 from packaging.version import Version, parse
--- a/src/axolotl/utils/quantization.py
+++ b/src/axolotl/utils/quantization.py
@@ -3,47 +3,30 @@ Utilities for quantization including QAT and PTQ using torchao.
 """
 import torch
-from packaging import version
+from torch import nn
 from torchao.core.config import AOBaseConfig
 from torchao.quantization import quantize_
 from torchao.quantization.qat import (
-    QATConfig,
+    FakeQuantizeConfig,
    FromIntXQuantizationAwareTrainingConfig,
    IntXQuantizationAwareTrainingConfig,
 )
 from torchao.quantization.quant_api import (
-    Float8DynamicActivationFloat8WeightConfig,
+    Int4DynamicActivationInt4WeightConfig,
-    Float8DynamicActivationInt4WeightConfig,
+    Int4WeightOnlyConfig,
    Int8DynamicActivationInt4WeightConfig,
    Int8DynamicActivationInt8WeightConfig,
    Int8WeightOnlyConfig,
    UIntXWeightOnlyConfig,
    _is_linear,
 )
-from axolotl.utils.schemas.enums import TorchAOQuantDType
+from axolotl.utils.schemas.enums import TorchIntDType
 quantization_config_to_str = {
    Int8DynamicActivationInt4WeightConfig: "int8int4",
    Float8DynamicActivationFloat8WeightConfig: "fp8fp8",
    Float8DynamicActivationInt4WeightConfig: "fp8int4",
 }
 if version.parse(torch.__version__) >= version.parse("2.8.0"):
    try:
        from torchao.prototype.mx_formats import NVFP4InferenceConfig
        quantization_config_to_str[NVFP4InferenceConfig] = "nvfp4"
    except:
        pass
    # int4 weight config imports will fail on machines with fbgemm-gpu installed
    # without a CUDA runtime available so we do this safely
    try:
        from torchao.quantization.quant_api import Int4WeightOnlyConfig
        quantization_config_to_str[Int4WeightOnlyConfig] = "int4"
    except:
        pass
-def get_quantization_config(
+def get_ptq_config(
-    weight_dtype: TorchAOQuantDType,
+    weight_dtype: TorchIntDType,
-    activation_dtype: TorchAOQuantDType | None = None,
+    activation_dtype: TorchIntDType | None = None,
    group_size: int | None = None,
 ) -> AOBaseConfig:
    """
@@ -62,101 +45,44 @@ def get_quantization_config(
            or if the group size is not specified for int8 or int4 weight only quantization.
    """
    if activation_dtype is None:
-        if weight_dtype == TorchAOQuantDType.int8:
+        if not weight_dtype.value.is_signed:  # type: ignore[attr-defined,union-attr]
-            raise ValueError("Int8WeightOnlyConfig is not supported by torchao QAT.")
+            return UIntXWeightOnlyConfig(
-        if weight_dtype == TorchAOQuantDType.int4:
+                dtype=weight_dtype.value,
-            from torchao.quantization.quant_api import Int4WeightOnlyConfig
+                group_size=group_size,
-
+                set_inductor_config=False,
-            if group_size is not None:
+            )
-                return Int4WeightOnlyConfig(group_size=group_size, version=2)
+        if weight_dtype == TorchIntDType.int8:
-            else:
+            if group_size is None:
-                return Int4WeightOnlyConfig(version=2)
+                raise ValueError(
-    if (
+                    "group_size must be specified for int8 weight only quantization"
-        activation_dtype == TorchAOQuantDType.int4
+                )
-        and weight_dtype == TorchAOQuantDType.int4
+            return Int8WeightOnlyConfig(
-    ):
+                group_size=group_size,
-        raise ValueError(
+            )
-            "Int4DynamicActivationInt4WeightConfig is not supported by torchao QAT."
+        if weight_dtype == TorchIntDType.int4:
-        )
+            if group_size is None:
-    if (
+                raise ValueError(
-        activation_dtype == TorchAOQuantDType.int8
+                    "group_size must be specified for int4 weight only quantization"
-        and weight_dtype == TorchAOQuantDType.int8
+                )
-    ):
+            return Int4WeightOnlyConfig(
-        raise ValueError(
+                group_size=group_size,
-            "Int8DynamicActivationInt8WeightConfig is not supported by torchao QAT."
+            )
-        )
+    if activation_dtype == TorchIntDType.int4 and weight_dtype == TorchIntDType.int4:
-    if (
+        return Int4DynamicActivationInt4WeightConfig()
-        activation_dtype == TorchAOQuantDType.int8
+    if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int8:
-        and weight_dtype == TorchAOQuantDType.int4
+        return Int8DynamicActivationInt8WeightConfig()
-    ):
+    if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int4:
-        if group_size is not None:
+        return Int8DynamicActivationInt4WeightConfig()
            return Int8DynamicActivationInt4WeightConfig(group_size=group_size)
        else:
            return Int8DynamicActivationInt4WeightConfig()
    if (
        activation_dtype == TorchAOQuantDType.float8_e4m3fn
        and weight_dtype == TorchAOQuantDType.float8_e4m3fn
    ):
        return Float8DynamicActivationFloat8WeightConfig()
    if (
        activation_dtype == TorchAOQuantDType.float8_e4m3fn
        and weight_dtype == TorchAOQuantDType.int4
    ):
        return Float8DynamicActivationInt4WeightConfig()
    if weight_dtype == TorchAOQuantDType.nvfp4:
        from torchao.prototype.mx_formats import NVFP4InferenceConfig
        if group_size is not None and group_size != 16:
            raise ValueError("NVFP4 quantization must use a group_size of 16")
        return NVFP4InferenceConfig()
    raise ValueError(
        f"Invalid activation/weight dtype combination: {activation_dtype}/{weight_dtype}"
    )
 def quantize_model(
    model,
    weight_dtype: TorchAOQuantDType,
    group_size: int | None = None,
    activation_dtype: TorchAOQuantDType | None = None,
    quantize_embedding: bool | None = None,
 ):
    """
    This function is used to quantize a model.
    Args:
        model: The model to quantize.
        weight_dtype: The dtype to use for weight quantization.
        group_size: The group size to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        quantize_embedding: Whether to quantize the model's embedding weights.
    """
    linear_ptq_config = get_quantization_config(
        weight_dtype=weight_dtype,
        activation_dtype=activation_dtype,
        group_size=group_size,
    )
    quantize_(model, linear_ptq_config)
    if quantize_embedding:
        # activation fake quantization is not supported for embedding layers
        embedding_quantize_config = get_quantization_config(
            weight_dtype=weight_dtype,
            activation_dtype=None,
            group_size=group_size,
        )
        quantize_(
            model,
            embedding_quantize_config,
            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
        )
 def prepare_model_for_qat(
    model,
-    weight_dtype: TorchAOQuantDType,
+    weight_dtype: TorchIntDType,
-    group_size: int | None = None,
+    group_size: int,
-    activation_dtype: TorchAOQuantDType | None = None,
+    activation_dtype: TorchIntDType | None = None,
    quantize_embedding: bool = False,
 ):
    """
@@ -174,40 +100,86 @@ def prepare_model_for_qat(
    Raises:
        ValueError: If the activation/weight dtype combination is invalid.
    """
-    base_config = get_quantization_config(
+    if activation_dtype:
        activation_config = FakeQuantizeConfig(
            dtype=activation_dtype.value, granularity="per_token", is_symmetric=False
        )
    weight_config = FakeQuantizeConfig(dtype=weight_dtype.value, group_size=group_size)
    linear_quantize_config = IntXQuantizationAwareTrainingConfig(
        activation_config=None if activation_dtype is None else activation_config,
        weight_config=weight_config,
    )
    quantize_(model, linear_quantize_config)
    if quantize_embedding:
        # activation fake quantization is not supported for embedding layers
        embedding_quantize_config = IntXQuantizationAwareTrainingConfig(
            activation_config=None,
            weight_config=weight_config,
        )
        quantize_(
            model,
            embedding_quantize_config,
            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
        )
 def quantize_model_for_ptq(
    model,
    weight_dtype: TorchIntDType,
    group_size: int | None = None,
    activation_dtype: TorchIntDType | None = None,
    quantize_embedding: bool | None = None,
 ):
    """
    This function is used to quantize a model for post-training quantization.
    It swaps the model's linear layers with fake quantized linear layers.
    If `quantize_embedding` is True, it will also swap the model's embedding weights with fake quantized embedding weights.
    Args:
        model: The model to quantize.
        weight_dtype: The dtype to use for weight quantization.
        group_size: The group size to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        quantize_embedding: Whether to quantize the model's embedding weights.
    """
    linear_ptq_config = get_ptq_config(
        weight_dtype=weight_dtype,
        activation_dtype=activation_dtype,
        group_size=group_size,
    )
-    qat_config = QATConfig(base_config)
+    quantize_(model, linear_ptq_config)
    quantize_(model, qat_config)
    if quantize_embedding:
-        # activation fake quantization is not supported for embedding layers
+        embedding_quantize_config = get_ptq_config(
        embedding_base_config = get_quantization_config(
            weight_dtype=weight_dtype,
            activation_dtype=None,
            group_size=group_size,
        )
        embedding_qat_config = QATConfig(embedding_base_config)
        quantize_(
            model,
-            embedding_qat_config,
+            embedding_quantize_config,
            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
        )
-def convert_qat_model(
+def convert_qat_model_for_ptq(
    model,
-    quantize_embedding: bool = False,
+    *,
    quantize_embedding: bool | None = None,
 ):
    """
-    This function converts a QAT model which has fake quantized layers back to the original model.
+    This function is used to convert a swap fake-quantized modules in a model
    which has been trained with QAT back to the original modules, ready for PTQ.
    Args:
        model: The model to convert.
        quantize_embedding: Whether to quantize the model's embedding weights.
    """
    config = QATConfig(step="convert")
    quantize_(model, config)
    if quantize_embedding:
-        quantize_(
+
-            model,
+        def filter_fn(m, _):
-            config,
+            return isinstance(m, nn.Embedding) or _is_linear(m)
-            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
+
-        )
+    else:
        filter_fn = _is_linear
    quantize_(model, FromIntXQuantizationAwareTrainingConfig(), filter_fn=filter_fn)
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -106,12 +106,6 @@ class AxolotlInputConfig(
            "description": "Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs"
        },
    )
    reinit_weights: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Reinitialize model weights randomly instead of loading pretrained weights"
        },
    )
    trainer_cls: str | None = Field(
        default=None,
@@ -144,12 +138,6 @@ class AxolotlInputConfig(
            "description": "Process reward modelling: `True` or `False`"
        },
    )
    center_rewards_coefficient: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`."
        },
    )
    num_labels: int | None = None
    # Whether to use weighting in DPO trainer.
    # If `None`, default is `False` in the trainer.
@@ -487,6 +475,12 @@ class AxolotlInputConfig(
        },
    )
    multipack_real_batches: bool | None = None
    pretraining_sample_concatenation: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "whether to concatenate samples during pretraining",
        },
    )
    batch_flattening: Literal["auto"] | bool | None = Field(
        default=None,
@@ -501,34 +495,13 @@ class AxolotlInputConfig(
    pose_max_context_len: int | None = None
    pose_num_chunks: int | None = None
-    # Deprecated: Use streaming_multipack_buffer_size instead
+    pretrain_multipack_buffer_size: int | None = 10_000
    pretrain_multipack_buffer_size: int | None = Field(
        default=None,
        deprecated="Deprecated in v0.13.0, will be removed in v0.14.0. Use streaming_multipack_buffer_size instead",
    )
    pretrain_multipack_attn: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "whether to prevent cross attention for packed sequences during pretraining",
        },
    )
    pretraining_sample_concatenation: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "whether to concatenate samples during pretraining",
        },
    )
    streaming: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Use streaming mode for loading datasets"},
    )
    streaming_multipack_buffer_size: int | None = Field(
        default=10_000,
        json_schema_extra={
            "description": "Buffer size for multipack streaming datasets"
        },
    )
    xformers_attention: bool | None = Field(
        default=None,
@@ -861,9 +834,9 @@ class AxolotlInputConfig(
        },
    )
    include_tkps: bool | None = Field(
-        default=True,
+        default=None,
        json_schema_extra={
-            "description": "bool of whether to report tokens per second per-gpu during training by measuring throughput of non-padding tokens."
+            "description": "bool of whether to report tokens per second during training by measuring throughput of non-padding tokens."
        },
    )
    neftune_noise_alpha: float | None = Field(
@@ -959,15 +932,7 @@ class AxolotlInputConfig(
        },
    )
-    fix_untrained_tokens: int | list[int] | None = Field(
+    fix_untrained_tokens: int | list[int] | None = None
        default=None,
        json_schema_extra={
            "description": (
                "Token index or indices to adjust embedding weights to the mean of the other tokens. "
                "This is useful when the model has untrained embeddings."
            )
        },
    )
    # INTERNALS - document for now, generally not set externally
    is_preprocess: bool | None = None
@@ -1026,26 +991,6 @@ class AxolotlInputConfig(
            return [ds_config.model_dump(exclude_none=True) for ds_config in ds_configs]
        return None
    @model_validator(mode="before")
    @classmethod
    def warn_peft_trainable_token_to_fix_untrained(cls, data):
        if (
            peft_trainable_token_indices := data.get("peft_trainable_token_indices")
        ) and (fix_untrained_tokens := data.get("fix_untrained_tokens")):
            if isinstance(fix_untrained_tokens, int):
                fix_untrained_tokens = (fix_untrained_tokens,)
            if isinstance(peft_trainable_token_indices, int):
                peft_trainable_token_indices = (peft_trainable_token_indices,)
            for untrained_token_id in fix_untrained_tokens:
                if untrained_token_id not in peft_trainable_token_indices:
                    LOG.warning_once(
                        f"Token {untrained_token_id} is fixed via `fix_untrained_tokens`, yet not in `peft_trainable_token_indices: ` list. "
                        "Please add it, otherwise the token won't be trained on."
                    )
        return data
 class AxolotlConfigWCapabilities(AxolotlInputConfig):
    """wrapper to valdiate GPU capabilities with the configured options"""
@@ -1319,14 +1264,3 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
            data["dataset_processes"] = get_default_process_count()
        return data
    @model_validator(mode="before")
    @classmethod
    def check_deduplication_with_streaming(cls, data):
        if data.get("dataset_exact_deduplication") and (
            data.get("streaming") or data.get("pretraining_dataset")
        ):
            raise NotImplementedError(
                "dataset_exact_deduplication is not available for streaming datasets. "
            )
        return data
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -5,21 +5,18 @@ from enum import Enum
 import torch
-class TorchAOQuantDType(Enum):
+class TorchIntDType(Enum):
-    int4 = torch.int4
+    """Torch integer data types - `getattr` guards against torch < 2.6 which does not support int4"""
    int8 = torch.int8
    float8_e4m3fn = torch.float8_e4m3fn
    nvfp4 = "nvfp4"
-    def from_string(str):
+    uint1 = getattr(torch, "uint1", None)
-        if str == "int4":
+    uint2 = getattr(torch, "uint2", None)
-            return TorchAOQuantDType.int4
+    uint3 = getattr(torch, "uint3", None)
-        if str == "int8":
+    uint4 = getattr(torch, "uint4", None)
-            return TorchAOQuantDType.int8
+    uint5 = getattr(torch, "uint5", None)
-        if str in ["float8_e4m3fn", "fp8", "float8"]:
+    uint6 = getattr(torch, "uint6", None)
-            return TorchAOQuantDType.float8_e4m3fn
+    uint7 = getattr(torch, "uint7", None)
-        if str == "nvfp4":
+    int4 = getattr(torch, "int4", None)
-            return TorchAOQuantDType.nvfp4
+    int8 = getattr(torch, "int8", None)
 class RLType(str, Enum):
--- a/src/axolotl/utils/schemas/peft.py
+++ b/src/axolotl/utils/schemas/peft.py
@@ -90,16 +90,6 @@ class LoraConfig(BaseModel):
            "description": "How to initialize LoRA weights. Default to True which is MS original implementation."
        },
    )
    peft_trainable_token_indices: list[int] | dict[str, list[int]] | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "A list of token indices to fine-tune on the `embed_tokens` layer.\n"
                "Otherwise, a dict mapping an embedding layer name to its trainable token indices.\n"
                "See https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-tokens-alongside-lora"
            )
        },
    )
    qlora_sharded_model_loading: bool | None = Field(
        default=False,
--- a/src/axolotl/utils/schemas/quantization.py
+++ b/src/axolotl/utils/schemas/quantization.py
@@ -6,23 +6,7 @@ from typing import Any
 from pydantic import BaseModel, Field, field_validator
-from axolotl.utils.schemas.enums import TorchAOQuantDType
+from axolotl.utils.schemas.enums import TorchIntDType
 def validate_ao_dtype(v: Any) -> TorchAOQuantDType | None:
    if v is None:
        return None
    if v == "int4":
        return TorchAOQuantDType.int4
    if v == "int8":
        return TorchAOQuantDType.int8
    if v in ["float8_e4m3fn", "fp8", "float8"]:
        return TorchAOQuantDType.float8_e4m3fn
    if v == "nvfp4":
        return TorchAOQuantDType.nvfp4
    raise ValueError(
        f"Invalid dtype: '{v}'. Must be one of: {[e.name for e in TorchAOQuantDType] + ['fp8', 'float8']}"
    )
 class QATConfig(BaseModel):
@@ -30,13 +14,13 @@ class QATConfig(BaseModel):
    QAT Config Schema
    """
-    activation_dtype: TorchAOQuantDType | None = Field(
+    activation_dtype: TorchIntDType | None = Field(
        default=None,
-        description="Fake quantization layout to use for activation quantization.",
+        description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"',
    )
-    weight_dtype: TorchAOQuantDType = Field(
+    weight_dtype: TorchIntDType = Field(
-        default=TorchAOQuantDType.int8,
+        default=TorchIntDType.int8,
-        description="Fake quantization layout to use for weight quantization.",
+        description='Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8"',
    )
    quantize_embedding: bool | None = Field(
        default=False, description="Quantize embedding"
@@ -51,8 +35,12 @@ class QATConfig(BaseModel):
    @field_validator("activation_dtype", "weight_dtype", mode="before")
    @classmethod
-    def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None:
+    def validate_dtype(cls, v: Any) -> TorchIntDType | None:
-        return validate_ao_dtype(v)
+        if v == "int4":
            return TorchIntDType.int4
        if v == "int8":
            return TorchIntDType.int8
        raise ValueError(f"Invalid dtype: '{v}'. Must be one of: ['int4', 'int8']")
 class PTQConfig(BaseModel):
@@ -60,13 +48,13 @@ class PTQConfig(BaseModel):
    PTQ Config Schema
    """
-    weight_dtype: TorchAOQuantDType = Field(
+    weight_dtype: TorchIntDType = Field(
-        default=TorchAOQuantDType.int8,
+        default=TorchIntDType.int8,
-        description="Fake quantization layout to use for weight quantization.",
+        description="Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8",
    )
-    activation_dtype: TorchAOQuantDType | None = Field(
+    activation_dtype: TorchIntDType | None = Field(
        default=None,
-        description="Fake quantization layout to use for activation quantization.",
+        description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"',
    )
    quantize_embedding: bool | None = Field(
        default=None, description="Whether to quantize the embedding layer."
@@ -78,5 +66,9 @@ class PTQConfig(BaseModel):
    @field_validator("activation_dtype", "weight_dtype", mode="before")
    @classmethod
-    def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None:
+    def validate_dtype(cls, v: Any) -> TorchIntDType | None:
-        return validate_ao_dtype(v)
+        if v == "int4":
            return TorchIntDType.int4
        if v == "int8":
            return TorchIntDType.int8
        raise ValueError(f"Invalid dtype: '{v}'. Must be one of: ['int4', 'int8']")
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -14,6 +14,7 @@ from transformers.utils.import_utils import is_torch_npu_available
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
 LOG = get_logger(__name__)
 SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"}
@@ -59,20 +60,6 @@ class DatasetValidationMixin:
            raise ValueError("either datasets or pretraining_dataset is required")
        return data
    @model_validator(mode="before")
    @classmethod
    def check_pretraining_streaming_deprecation(cls, data):
        # TODO(djsaunde): remove this check + implement change for 0.13.0 release
        if data.get("pretraining_dataset") and not data.get("streaming"):
            LOG.warning(
                "Setting `pretraining_dataset` without explicitly setting `streaming: "
                "true` is deprecated. In a future release, streaming will not be "
                "automatically enabled when using pretraining_dataset. Please "
                "explicitly set `streaming: true` in your configuration to maintain "
                "current behavior."
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_push_ds_auth(cls, data):
@@ -353,30 +340,6 @@ class TrainingValidationMixin:
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_multipack_buffer_size(cls, data):
        if data.get("pretrain_multipack_buffer_size") and not data.get(
            "streaming_multipack_buffer_size"
        ):
            LOG.warning(
                "`pretrain_multipack_buffer_size` is deprecated in v0.13.0, will be "
                "removed in v0.14.0. Use `streaming_multipack_buffer_size` instead."
            )
            data["streaming_multipack_buffer_size"] = data[
                "pretrain_multipack_buffer_size"
            ]
            del data["pretrain_multipack_buffer_size"]
        elif data.get("pretrain_multipack_buffer_size") and data.get(
            "streaming_multipack_buffer_size"
        ):
            raise ValueError(
                "pretrain_multipack_buffer_size is deprecated, use "
                "streaming_multipack_buffer_size; both are set, please remove the "
                "deprecated pretrain_multipack_buffer_size setting"
            )
        return data
    @model_validator(mode="after")
    def check_fft_possible_bad_config(self):
        if (
@@ -1111,50 +1074,6 @@ class PretrainingValidationMixin:
                    data["accelerator_config"]["dispatch_batches"] = False
        return data
    @model_validator(mode="before")
    @classmethod
    def check_pretraining_w_val_set_size(cls, data):
        if data.get("pretraining_dataset") and data.get("val_set_size"):
            raise ValueError(
                "val_set_size is not supported with pretraining_dataset. "
                "Use test_datasets to specify evaluation datasets for pretraining."
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_streaming_w_val_set_size(cls, data):
        if data.get("streaming") and data.get("val_set_size"):
            raise ValueError(
                "val_set_size is not supported with streaming datasets. "
                "Use test_datasets to specify evaluation datasets when streaming is enabled."
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_streaming_w_max_steps(cls, data):
        if data.get("streaming") and not data.get("max_steps"):
            raise ValueError(
                "max_steps must be set when using streaming datasets. "
                "Trainer cannot infer dataset length for iterable datasets."
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_streaming_w_multiple_datasets(cls, data):
        if (
            data.get("streaming")
            and data.get("sample_packing")
            and data.get("datasets")
            and len(data.get("datasets")) > 1
        ):
            raise NotImplementedError(
                "Sample packing with multiple streaming datasets is not yet supported"
            )
        return data
 class ModelCompatibilityValidationMixin:
    """Validation methods for specific model compatibility."""
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -475,9 +475,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                train_dataset.remove_columns(["length"]),
                batch_sampler=sampler,
            )
-            data_loader_len = max(
+            data_loader_len = len(data_loader) * cfg.micro_batch_size // cfg.batch_size
                1, len(data_loader) * cfg.micro_batch_size // cfg.batch_size
            )
            LOG.debug(f"data_loader_len: {data_loader_len}")
            # FIXME: is there a bug here somewhere? the total num steps depends
            # on the agreed on value for sample_packing_eff_est
@@ -549,13 +547,6 @@ def setup_deepspeed_env(cfg, stage=None):
        if stage == 3:
            os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"
    device_count = torch.cuda.device_count()
    if device_count == 1:
        os.environ.setdefault("WORLD_SIZE", "1")
        os.environ.setdefault("LOCAL_RANK", "0")
        os.environ.setdefault("MASTER_ADDR", "0.0.0.0")  # nosec B104
        os.environ.setdefault("MASTER_PORT", "29500")
    # NOTE(djsaunde): The distribued state cannot be initialized prior to the
    # ACCELERATE_USE_DEEPSPEED assignment, but it must be initialized some time prior
    # to model load.
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -25,7 +25,7 @@ def min_cfg(temp_dir):
        "liger_rms_norm": True,
        "liger_glu_activation": True,
        "torch_compile": True,
-        "chat_template": "qwen3",
+        "chat_template": "llama3",
        "kd_trainer": True,
        "kd_ce_alpha": 0.1,
        "kd_alpha": 0.9,
--- a/tests/e2e/test_diffusion.py
+++ b/tests/e2e/test_diffusion.py
@@ -1,139 +0,0 @@
 """E2E smoke test for diffusion training plugin."""
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault
 from tests.e2e.utils import check_model_output_exists
 class TestDiffusion:
    """Test case for diffusion training plugin."""
    def test_diffusion_smoke_test(self, temp_dir):
        """
        Smoke test for diffusion training to ensure the plugin loads and trains without
        error.
        """
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 256,
                "val_set_size": 0.1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "bf16": True,
                "save_safetensors": True,
                "save_first_step": False,
                "logging_steps": 1,
                "eval_steps": 3,
                # Diffusion-specific config
                "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"],
                "diffusion": {
                    # sample generation
                    "generate_samples": True,
                    "generation_interval": 1,
                    "num_generation_samples": 1,
                    "generation_steps": 2,
                    "generation_max_length": 32,
                    "generation_temperature": 0.0,
                    # training-specific
                    "mask_token_id": 16,
                    "eps": 1e-3,
                    "importance_weighting": False,
                },
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
    def test_diffusion_sft_labels(self, temp_dir):
        """Test that diffusion training properly handles SFT data with labels."""
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 256,
                "val_set_size": 0.1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "bf16": True,
                "save_safetensors": True,
                "save_first_step": False,
                "logging_steps": 1,
                "eval_steps": 2,
                # Diffusion-specific config
                "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"],
                "diffusion": {
                    # sample generation
                    "generate_samples": True,
                    "generation_interval": 1,
                    "num_generation_samples": 1,
                    "generation_steps": 2,
                    "generation_max_length": 32,
                    "generation_temperature": 0.0,
                    # training-specific
                    "mask_token_id": 16,
                    "eps": 1e-3,
                    "importance_weighting": True,
                },
                # Ensure we have proper SFT labels
                "train_on_inputs": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)
        # Verify that the dataset has labels
        sample = dataset_meta.train_dataset[0]
        assert "labels" in sample, "SFT dataset should have labels"
        # Check that some labels are -100 (prompt tokens)
        labels = sample["labels"]
        if hasattr(labels, "tolist"):
            labels = labels.tolist()
        assert -100 in labels, "SFT dataset should have -100 labels for prompt tokens"
        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -43,7 +43,7 @@ class TestQATLlama:
                "qat": {
                    "quantize_embedding": True,
                    "activation_dtype": "int8",
-                    "weight_dtype": "int4",
+                    "weight_dtype": "int8",
                    "group_size": 8,
                },
                "num_epochs": 1,
@@ -111,7 +111,7 @@ class TestQATLlama:
                "qat": {
                    "quantize_embedding": True,
                    "activation_dtype": "int8",
-                    "weight_dtype": "int4",
+                    "weight_dtype": "int8",
                    "group_size": 8,
                },
                "save_first_step": False,
--- a/tests/e2e/test_quantization.py
+++ b/tests/e2e/test_quantization.py
@@ -5,40 +5,41 @@ Tests for axolotl.utils.quantization
 import pytest
 import torch
 from torch import nn
-from torchao.quantization import LinearActivationQuantizedTensor
+from torchao.dtypes.affine_quantized_tensor import AffineQuantizedTensor
 from torchao.quantization.granularity import PerAxis, PerGroup
 from torchao.quantization.linear_activation_quantized_tensor import (
    LinearActivationQuantizedTensor,
 )
 from torchao.quantization.qat.embedding import FakeQuantizedEmbedding
 from torchao.quantization.qat.linear import FakeQuantizedLinear
 from torchao.quantization.quant_api import (
-    Float8DynamicActivationFloat8WeightConfig,
+    Int4DynamicActivationInt4WeightConfig,
-    Float8DynamicActivationInt4WeightConfig,
+    Int4WeightOnlyConfig,
-    Int8DynamicActivationInt4WeightConfig,
+    Int8DynamicActivationInt8WeightConfig,
    Int8WeightOnlyConfig,
    UIntXWeightOnlyConfig,
 )
 from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
 from transformers import AutoModelForCausalLM
 from transformers.trainer_callback import TrainerState
 from axolotl.utils.callbacks.qat import QATCallback
 from axolotl.utils.quantization import (
-    convert_qat_model,
+    convert_qat_model_for_ptq,
-    get_quantization_config,
+    get_ptq_config,
    prepare_model_for_qat,
-    quantize_model,
+    quantize_model_for_ptq,
 )
-from axolotl.utils.schemas.enums import TorchAOQuantDType
+from axolotl.utils.schemas.enums import TorchIntDType
 from axolotl.utils.schemas.quantization import QATConfig
-from tests.e2e.utils import (
+from tests.e2e.utils import require_torch_2_6_0
    require_torch_2_8_0,
    requires_cuda_ge_8_9,
    requires_sm_ge_100,
 )
@pytest.fixture()
 def model():
    dummy_model = AutoModelForCausalLM.from_pretrained(
-        "Qwen/Qwen2-0.5B",
+        "HuggingFaceTB/SmolLM2-135M",
-        device_map="auto",
+        device_map="cuda",
        torch_dtype=torch.bfloat16,
    )
    with torch.device(dummy_model.device):
@@ -47,56 +48,45 @@ def model():
            dummy_model.model.embed_tokens.weight.shape[1],
            dtype=dummy_model.model.embed_tokens.weight.dtype,
        )
-    yield dummy_model
+    return dummy_model
    del dummy_model
 ptq_config_test_cases = [
-    # weight_dtype, activation_dtype, group_size, expected_type
+    # weight_dtype, activation_dtype, group_size, expected_type, expected_params
    (
-        TorchAOQuantDType.int4,
+        TorchIntDType.uint4,
        TorchAOQuantDType.int8,
        None,
-        Int8DynamicActivationInt4WeightConfig,
+        None,
        UIntXWeightOnlyConfig,
        {"dtype": torch.uint4, "group_size": None},
    ),
    (TorchIntDType.int8, None, 32, Int8WeightOnlyConfig, {"group_size": 32}),
    (TorchIntDType.int4, None, 4, Int4WeightOnlyConfig, {"group_size": 4}),
    (
        TorchIntDType.int4,
        TorchIntDType.int4,
        None,
        Int4DynamicActivationInt4WeightConfig,
        {},
    ),
    (
-        TorchAOQuantDType.float8_e4m3fn,
+        TorchIntDType.int8,
-        TorchAOQuantDType.float8_e4m3fn,
+        TorchIntDType.int8,
        None,
-        Float8DynamicActivationFloat8WeightConfig,
+        Int8DynamicActivationInt8WeightConfig,
-    ),
+        {},
    (
        TorchAOQuantDType.int4,
        TorchAOQuantDType.float8_e4m3fn,
        None,
        Float8DynamicActivationInt4WeightConfig,
    ),
 ]
 ptq_test_cases = [
-    # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception, expected_tensor_class
+    # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception
-    (TorchAOQuantDType.int4, None, 4, True, None, Int4Tensor),
+    (TorchIntDType.int8, None, 8, False, None),
-    (
+    (TorchIntDType.int4, None, 4, True, None),
-        TorchAOQuantDType.int4,
+    (TorchIntDType.uint4, None, 8, False, None),
-        TorchAOQuantDType.int8,
+    (TorchIntDType.int4, TorchIntDType.int4, 8, False, None),
-        8,
+    (TorchIntDType.int8, TorchIntDType.int8, 8, True, None),
-        False,
+    (TorchIntDType.int8, None, None, False, ValueError),
-        None,
+    (TorchIntDType.int4, None, None, False, ValueError),
        LinearActivationQuantizedTensor,
    ),
    # (
    #     TorchAOQuantDType.int4,
    #     TorchAOQuantDType.float8_e4m3fn,
    #     None,
    #     False,
    #     None,
    #     Int4Tensor,
    # ),
    (TorchAOQuantDType.int4, None, None, False, None, Int4Tensor),
    # Deprecated configs
    (TorchAOQuantDType.int8, None, 8, False, ValueError, None),
    (TorchAOQuantDType.int4, TorchAOQuantDType.int4, 8, False, ValueError, None),
    (TorchAOQuantDType.int8, TorchAOQuantDType.int8, 8, True, ValueError, None),
 ]
@@ -106,132 +96,44 @@ class TestQuantization:
    """
    @pytest.mark.parametrize(
-        "weight_dtype,activation_dtype,group_size,expected_type",
+        "weight_dtype,activation_dtype,group_size,expected_type,expected_params",
        ptq_config_test_cases,
    )
-    @requires_cuda_ge_8_9
+    @require_torch_2_6_0
    @require_torch_2_8_0
    def test_get_ptq_config(
-        self, weight_dtype, activation_dtype, group_size, expected_type
+        self, weight_dtype, activation_dtype, group_size, expected_type, expected_params
    ):
-        config = get_quantization_config(weight_dtype, activation_dtype, group_size)
+        config = get_ptq_config(weight_dtype, activation_dtype, group_size)
        assert isinstance(config, expected_type)
-    @requires_cuda_ge_8_9
+        for param_name, param_value in expected_params.items():
-    @require_torch_2_8_0
+            if isinstance(param_value, (PerAxis, PerGroup)):
-    def test_get_ptq_config_int4_weight_only(self):
+                if isinstance(param_value, PerAxis):
-        from torchao.quantization.quant_api import Int4WeightOnlyConfig
+                    assert isinstance(getattr(config, param_name), PerAxis)
-
+                    assert getattr(config, param_name).axis == param_value.axis
-        config = get_quantization_config(TorchAOQuantDType.int4, None, 4)
+                else:
-        assert isinstance(config, Int4WeightOnlyConfig)
+                    assert isinstance(getattr(config, param_name), PerGroup)
                    assert (
                        getattr(config, param_name).group_size == param_value.group_size
                    )
            else:
                assert getattr(config, param_name) == param_value
    @pytest.mark.parametrize(
-        "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception,expected_tensor_class",
+        "weight_dtype", [TorchIntDType.int8, TorchIntDType.int4, TorchIntDType.uint4]
        ptq_test_cases,
    )
    @requires_cuda_ge_8_9
    @require_torch_2_8_0
    def test_quantize_model_for_ptq(
        self,
        model,
        weight_dtype,
        activation_dtype,
        group_size,
        quantize_embedding,
        expected_exception,
        expected_tensor_class,
    ):
        if expected_exception:
            with pytest.raises(expected_exception):
                quantize_model(
                    model,
                    weight_dtype,
                    group_size,
                    activation_dtype,
                    quantize_embedding,
                )
        else:
            quantize_model(
                model, weight_dtype, group_size, activation_dtype, quantize_embedding
            )
            if quantize_embedding:
                assert isinstance(
                    model.model.embed_tokens.weight, expected_tensor_class
                ), "Embedding weight should be quantized"
            for child in list(model.children()):
                if isinstance(child, torch.nn.Linear):
                    assert isinstance(child.weight, expected_tensor_class)
    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_quantize_model_for_ptq_fp8(
        self,
        model,
    ):
        """Quantize with float8_e4m3fn weights and activations, then check linear weights."""
        # Imported inside the test body rather than at module level.
        # NOTE(review): presumably so test collection does not depend on this
        # torchao module being importable - confirm.
        from torchao.quantization.quantize_.workflows.float8.float8_tensor import (
            Float8Tensor,
            QuantizeTensorToFloat8Kwargs,
        )
        # Positional order follows the other quantize_model calls in this file:
        # (model, weight_dtype, group_size, activation_dtype). group_size is None.
        quantize_model(
            model,
            TorchAOQuantDType.float8_e4m3fn,
            None,
            TorchAOQuantDType.float8_e4m3fn,
        )
        # children() yields direct submodules only, so linear layers nested
        # deeper in the model are not inspected by this loop.
        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child.weight, Float8Tensor)
                # The weight must also carry float8 activation-quantization kwargs.
                assert child.weight.act_quant_kwargs is not None and isinstance(
                    child.weight.act_quant_kwargs, QuantizeTensorToFloat8Kwargs
                )
    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_quantize_model_for_ptq_nvfp4(
        self,
        model,
    ):
        """Quantize with nvfp4 weights and activations, then check linear weights."""
        # Imported inside the test body rather than at module level.
        # NOTE(review): presumably because the prototype module may be missing
        # in some torchao builds - confirm.
        from torchao.prototype.mx_formats.nvfp4_tensor import (
            NVFP4Tensor,
            QuantizeTensorToNVFP4Kwargs,
        )
        # Positional order: (model, weight_dtype, group_size, activation_dtype);
        # the group size passed here is 16.
        quantize_model(model, TorchAOQuantDType.nvfp4, 16, TorchAOQuantDType.nvfp4)
        # children() yields direct submodules only, so linear layers nested
        # deeper in the model are not inspected by this loop.
        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child.weight, NVFP4Tensor)
                # The weight must also carry nvfp4 activation-quantization kwargs.
                assert child.weight.act_quant_kwargs is not None and isinstance(
                    child.weight.act_quant_kwargs, QuantizeTensorToNVFP4Kwargs
                )
    @pytest.mark.parametrize(
-        "weight_dtype,activation_dtype,group_size,quantize_embedding",
+        "activation_dtype", [None, TorchIntDType.int4, TorchIntDType.int8]
        [
            (TorchAOQuantDType.int4, None, 8, False),
            (TorchAOQuantDType.int4, None, 16, True),
            (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 8, False),
            (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 16, True),
            (
                TorchAOQuantDType.float8_e4m3fn,
                TorchAOQuantDType.float8_e4m3fn,
                None,
                False,
            ),
            (TorchAOQuantDType.int4, TorchAOQuantDType.float8_e4m3fn, None, True),
        ],
    )
-    @require_torch_2_8_0
+    @pytest.mark.parametrize("group_size", [4, 8])
-    @requires_cuda_ge_8_9
+    @pytest.mark.parametrize("quantize_embedding", [False, True])
    @require_torch_2_6_0
    def test_prepare_model_for_qat(
        self, model, weight_dtype, activation_dtype, group_size, quantize_embedding
    ):
        prepare_model_for_qat(
-            model,
+            model, weight_dtype, group_size, activation_dtype, quantize_embedding
            weight_dtype,
            group_size,
            activation_dtype,
            quantize_embedding,
        )
        if quantize_embedding:
            assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
@@ -240,19 +142,17 @@ class TestQuantization:
                model.model.embed_tokens.weight_fake_quantizer.config.dtype
                == weight_dtype.value
            )
-            if group_size:
+            assert (
-                assert (
+                model.model.embed_tokens.weight_fake_quantizer.config.group_size
-                    model.model.embed_tokens.weight_fake_quantizer.config.group_size
+                == group_size
-                    == group_size
+            )
                )
        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child, FakeQuantizedLinear)
                assert hasattr(child, "weight_fake_quantizer")
                assert child.weight_fake_quantizer.config.dtype == weight_dtype.value
-                if group_size:
+                assert child.weight_fake_quantizer.config.group_size == group_size
                    assert child.weight_fake_quantizer.config.group_size == group_size
                if activation_dtype:
                    assert hasattr(child, "activation_fake_quantizer")
                    assert (
@@ -262,40 +162,49 @@ class TestQuantization:
                else:
                    assert child.activation_fake_quantizer is None
-    @require_torch_2_8_0
+    @pytest.mark.parametrize(
-    @requires_cuda_ge_8_9
+        "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception",
-    def test_convert_qat_model(self, model):
+        ptq_test_cases,
-        config = QATConfig(
+    )
-            weight_dtype="int4",
+    @require_torch_2_6_0
-            activation_dtype="int8",
+    def test_quantize_model_for_ptq(
-            group_size=8,
+        self,
-            quantize_embedding=True,
+        model,
-        )
+        weight_dtype,
-
+        activation_dtype,
-        # quantize model for qat
+        group_size,
-        prepare_model_for_qat(
+        quantize_embedding,
-            model,
+        expected_exception,
-            config.weight_dtype,
+    ):
-            config.group_size,
+        if expected_exception:
-            config.activation_dtype,
+            with pytest.raises(expected_exception):
-            config.quantize_embedding,
+                quantize_model_for_ptq(
-        )
+                    model,
-
+                    weight_dtype,
-        assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
+                    group_size,
-        assert isinstance(model.lm_head, FakeQuantizedLinear)
+                    activation_dtype,
-
+                    quantize_embedding,
-        # apply conversion
+                )
-        convert_qat_model(
+        else:
-            model,
+            quantize_model_for_ptq(
-            config.quantize_embedding,
+                model, weight_dtype, group_size, activation_dtype, quantize_embedding
-        )
+            )
-        # ensure modules have been swapped out
+            if quantize_embedding:
-        assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
+                assert isinstance(
-        assert not isinstance(model.lm_head, FakeQuantizedLinear)
+                    model.model.embed_tokens.weight, AffineQuantizedTensor
-
+                ), "Embedding weight should be quantized"
-        # ensure weights have been quantized
+            for child in list(model.children()):
-        assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
+                if isinstance(child, torch.nn.Linear):
-        assert isinstance(model.lm_head.weight, nn.Parameter)
+                    if activation_dtype:
                        assert isinstance(
                            child.weight, LinearActivationQuantizedTensor
                        ), (
                            "Linear weight should be quantized with activation quantization"
                        )
                    else:
                        assert isinstance(child.weight, AffineQuantizedTensor), (
                            "Linear weight should be quantized without activation quantization"
                        )
 class TestQuantizationCallback:
@@ -309,10 +218,10 @@ class TestQuantizationCallback:
            global_step=0,
        )
-    @require_torch_2_8_0
+    @require_torch_2_6_0
    def test_qat_callback_fake_quant_after_n_steps(self, model, trainer_state):
        cfg = QATConfig(
-            weight_dtype="int4",
+            weight_dtype="int8",
            activation_dtype="int8",
            group_size=8,
            quantize_embedding=True,
@@ -359,10 +268,10 @@ class TestQuantizationCallback:
        assert model.model.embed_tokens.weight_fake_quantizer.enabled
        assert model.lm_head.weight_fake_quantizer.enabled
-    @require_torch_2_8_0
+    @require_torch_2_6_0
    def test_qat_callback_fake_quant_after_n_steps_is_none(self, model, trainer_state):
        cfg = QATConfig(
-            weight_dtype="int4",
+            weight_dtype="int8",
            activation_dtype="int8",
            group_size=8,
            quantize_embedding=True,
@@ -395,3 +304,43 @@ class TestQuantizationCallback:
        # quantization should be enabled from the get-go
        assert model.model.embed_tokens.weight_fake_quantizer.enabled
        assert model.lm_head.weight_fake_quantizer.enabled
 class TestConvertQATModelForPTQ:
    """
    Test convert_qat_model_for_ptq

    Prepares a model for QAT, converts it, and checks that the fake-quantized
    embedding and lm_head modules are no longer present afterwards.
    """
    @require_torch_2_6_0
    def test_convert_qat_model_for_ptq(self, model):
        """Round-trip a model through QAT preparation and conversion."""
        # int8 weights and activations, group size 8, embedding included.
        config = QATConfig(
            weight_dtype="int8",
            activation_dtype="int8",
            group_size=8,
            quantize_embedding=True,
        )
        # quantize model for qat
        prepare_model_for_qat(
            model,
            config.weight_dtype,
            config.group_size,
            config.activation_dtype,
            config.quantize_embedding,
        )
        # Precondition: preparation swapped in the fake-quantized module types.
        assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
        assert isinstance(model.lm_head, FakeQuantizedLinear)
        # apply conversion
        convert_qat_model_for_ptq(
            model,
            quantize_embedding=config.quantize_embedding,
        )
        # ensure modules have been swapped out
        assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
        assert not isinstance(model.lm_head, FakeQuantizedLinear)
        # NOTE(review): these two asserts only check that the weights are
        # nn.Parameter instances; they do not inspect any quantized tensor
        # type, so they cannot by themselves prove quantization happened.
        assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
        assert isinstance(model.lm_head.weight, nn.Parameter)
--- a/tests/e2e/test_streaming.py
+++ b/tests/e2e/test_streaming.py
@@ -1,73 +0,0 @@
 """E2E tests for streaming dataset functionality"""
 # pylint: disable=duplicate-code
 import pytest
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault
 from .utils import check_model_output_exists, check_tensorboard
 class TestStreamingDatasets:
    """End-to-end check that training runs from a streaming dataset."""

    @pytest.mark.parametrize("sample_packing", [True, False])
    def test_streaming_dataset(self, temp_dir, sample_packing):
        """Run three training steps on a streamed dataset, with and without packing."""
        model_options = {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "flash_attention": True,
            "sequence_len": 1024,
        }
        # Both packing-related switches follow the parametrized flag.
        data_options = {
            "sample_packing": sample_packing,
            "pretrain_multipack_attn": sample_packing,
            "streaming_multipack_buffer_size": 10000,
            "dataset_processes": 1,
            "special_tokens": {"pad_token": "<|endoftext|>"},
            "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
            "streaming": True,
        }
        run_options = {
            "max_steps": 3,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "val_set_size": 0.0,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_torch_fused",
            "lr_scheduler": "cosine",
            "save_safetensors": True,
            "bf16": "auto",
            "use_tensorboard": True,
            "save_first_step": False,
        }
        # Merge order keeps the keys in the same sequence as a single literal would.
        merged = DictDefault({**model_options, **data_options, **run_options})
        cfg = validate_config(merged)
        normalize_config(cfg)

        datasets = load_datasets(cfg=cfg)
        train(cfg=cfg, dataset_meta=datasets)
        check_model_output_exists(temp_dir, cfg)

        # Inspect the logged training loss with a 3.0 threshold.
        # NOTE(review): presumably fails when the logged loss exceeds the
        # threshold, judging by the failure message - confirm in check_tensorboard.
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            3.0,
            "Train Loss (%s) is too high",
        )
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -90,18 +90,6 @@ def require_torch_2_7_0(test_case):
    return unittest.skipUnless(is_min_2_7_0(), "test requires torch>=2.7.0")(test_case)
 def require_torch_2_8_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.8.0

    The installed torch version is compared once, when the decorator is
    applied; the test is skipped with "test requires torch>=2.8.0" if the
    installed version is older.
    """
    def is_min_2_8_0():
        # Parsed comparison rather than string comparison of the version text.
        torch_version = version.parse(torch.__version__)
        return torch_version >= version.parse("2.8.0")
    return unittest.skipUnless(is_min_2_8_0(), "test requires torch>=2.8.0")(test_case)
 def require_torch_lt_2_6_0(test_case):
    """
    Decorator marking a test that requires torch < 2.6.0
@@ -140,24 +128,6 @@ def require_llmcompressor(test_case):
    )(test_case)
 def requires_sm_ge_100(test_case):
    """Skip ``test_case`` unless a CUDA device reports compute capability >= (10, 0)."""
    # Checks run in this order so the device is only queried when CUDA is usable.
    if not torch.cuda.is_available() or not torch.version.cuda:
        supported = False
    else:
        supported = torch.cuda.get_device_capability() >= (10, 0)
    return unittest.skipUnless(supported, "test requires sm>=100")(test_case)
 def requires_cuda_ge_8_9(test_case):
    """Skip ``test_case`` unless a CUDA device reports compute capability >= (8, 9)."""
    # Checks run in this order so the device is only queried when CUDA is usable.
    if not torch.cuda.is_available() or not torch.version.cuda:
        supported = False
    else:
        supported = torch.cuda.get_device_capability() >= (8, 9)
    return unittest.skipUnless(supported, "test requires cuda>=8.9")(test_case)
 def is_hopper():
    """Return True when the current CUDA device reports compute capability (9, 0)."""
    return torch.cuda.get_device_capability() == (9, 0)
--- a/tests/integrations/test_diffusion.py
+++ b/tests/integrations/test_diffusion.py
@@ -1,274 +0,0 @@
 """Tests for diffusion trainer integration."""
 # pylint: disable=redefined-outer-name,protected-access
 from unittest.mock import Mock
 import pytest
 import torch
 from axolotl.integrations.diffusion import DiffusionTrainer
 from axolotl.integrations.diffusion.utils import create_bidirectional_attention_mask
 from axolotl.utils.dict import DictDefault
@pytest.fixture
def mock_tokenizer():
    """Provide a Mock tokenizer with fixed bos, eos and pad token ids."""
    # Keyword arguments to Mock are set as attributes on the created mock.
    return Mock(bos_token_id=1, eos_token_id=2, pad_token_id=0)
@pytest.fixture
def diffusion_config():
    """Provide a config with a ``diffusion`` section and sample packing disabled."""
    diffusion_settings = {
        "mask_token_id": 32000,
        "eps": 1e-3,
        "importance_weighting": False,
    }
    return DictDefault({"diffusion": diffusion_settings, "sample_packing": False})
@pytest.fixture
def diffusion_trainer_instance(mock_tokenizer, diffusion_config):
    """Provide a DiffusionTrainer built without running ``__init__``.

    Only the attributes assigned below are set, which lets tests call the
    trainer's methods directly without the full trainer setup.
    """
    # Allocate the object only; __init__ is deliberately not invoked.
    instance = object.__new__(DiffusionTrainer)
    instance.cfg = diffusion_config
    # Ids match the mock tokenizer: pad=0, bos=1, eos=2.
    instance._special_token_ids = {0, 1, 2}
    instance.processing_class = mock_tokenizer
    # Mocked so tests can inspect what metrics were recorded.
    instance.store_metrics = Mock()
    return instance
 class TestDiffusionTrainer:
    """Test the DiffusionTrainer class."""
    # Every test receives a trainer built without __init__ (see the
    # diffusion_trainer_instance fixture) and calls its methods directly.
    def test_forward_process_basic(self, diffusion_trainer_instance):
        """Test basic forward process without labels."""
        # Token ids 1 and 2 are the bos/eos ids registered as special tokens.
        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
        noisy_batch, masked_indices, p_mask = (
            diffusion_trainer_instance._forward_process(input_ids, eps=0.1)
        )
        # Check shapes
        assert noisy_batch.shape == input_ids.shape
        assert masked_indices.shape == input_ids.shape
        assert p_mask.shape == input_ids.shape
        # Check that special tokens are not masked
        special_token_positions = (input_ids == 1) | (input_ids == 2) | (input_ids == 0)
        assert not masked_indices[special_token_positions].any()
        # Check that mask token is applied
        mask_token_id = diffusion_trainer_instance.cfg.diffusion.mask_token_id
        masked_positions = masked_indices
        # Guarded because a given call may mask no positions at all.
        if masked_positions.any():
            assert (noisy_batch[masked_positions] == mask_token_id).all()
    def test_forward_process_with_labels(self, diffusion_trainer_instance):
        """Test forward process with SFT labels."""
        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
        # -100 marks the first two positions as non-answer (prompt) tokens.
        labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long)
        noisy_batch, masked_indices, p_mask = (
            diffusion_trainer_instance._forward_process(
                input_ids, labels=labels, eps=0.1
            )
        )
        # Check shapes
        assert noisy_batch.shape == input_ids.shape
        assert masked_indices.shape == input_ids.shape
        assert p_mask.shape == input_ids.shape
        # Check that only answer tokens can be masked (where labels != -100)
        non_answer_mask = labels == -100
        # No masking should occur on non-answer tokens
        assert not masked_indices[non_answer_mask].any()
        # p_mask should be the same for all positions (sampled timestep),
        # but masking is only applied to answer tokens
        # NOTE(review): only the shape is asserted here (a repeat of the shape
        # check above); the "same for all positions" claim is not verified.
        assert p_mask.shape == input_ids.shape
        # Verify that masked_indices respects the answer mask
        # NOTE(review): identical to the assertion a few lines above.
        assert not masked_indices[non_answer_mask].any()
    def test_forward_process_with_attention_mask(self, diffusion_trainer_instance):
        """Test forward process with attention mask."""
        # The last position is padding: token id 0 with attention mask 0.
        input_ids = torch.tensor([[1, 10, 20, 0]], dtype=torch.long)
        attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.long)
        _, masked_indices, p_mask = diffusion_trainer_instance._forward_process(
            input_ids, attention_mask=attention_mask, eps=0.1
        )
        # Check that padding tokens are not masked
        padding_positions = attention_mask == 0
        assert not masked_indices[padding_positions].any()
        assert (p_mask[padding_positions] == 0).all()
    def test_bidirectional_attention_mask_no_packing(self, diffusion_trainer_instance):
        """Test bidirectional attention mask without sample packing."""
        # The trainer fixture is requested but not used in this test body.
        input_ids = torch.tensor([[1, 10, 20, 2]], dtype=torch.long)
        mask = create_bidirectional_attention_mask(input_ids)
        # Should be all-to-all attention
        expected_shape = (1, 1, 4, 4)
        assert mask.shape == expected_shape
        assert mask.all()
    def test_bidirectional_attention_mask_with_packing(
        self, diffusion_trainer_instance
    ):
        """Test bidirectional attention mask with sample packing."""
        diffusion_trainer_instance.cfg.sample_packing = True
        input_ids = torch.tensor([[1, 10, 20, 30, 40, 2]], dtype=torch.long)
        # Sample IDs: first sample (1), second sample (2)
        attention_mask = torch.tensor([[1, 1, 1, 2, 2, 2]], dtype=torch.long)
        mask = create_bidirectional_attention_mask(
            input_ids, attention_mask, sample_packing=True
        )
        # Check that tokens within same sample can attend to each other
        # but not across samples
        assert mask[0, 0, 0, 1].item()  # First sample tokens can attend to each other
        assert mask[0, 0, 1, 2].item()
        assert not mask[0, 0, 0, 3].item()  # Can't attend across samples
        assert not mask[0, 0, 2, 4].item()
        assert mask[0, 0, 3, 4].item()  # Second sample tokens can attend to each other
    def test_compute_loss_basic(self, diffusion_trainer_instance):
        """Test basic loss computation."""
        # Mock model that returns logits
        mock_model = Mock()
        mock_outputs = Mock()
        vocab_size = 1000
        seq_len = 5
        # requires_grad=True so the resulting loss can be checked for gradient flow.
        mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True)
        mock_model.return_value = mock_outputs
        mock_model.training = True
        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
        loss, outputs = diffusion_trainer_instance._compute_diffusion_loss(
            mock_model, input_ids
        )
        # Check that loss is computed
        assert isinstance(loss, torch.Tensor)
        assert loss.requires_grad
        assert outputs == mock_outputs
        # Check that metrics were stored
        diffusion_trainer_instance.store_metrics.assert_called_once()
    def test_compute_loss_sft(self, diffusion_trainer_instance):
        """Test loss computation with SFT labels."""
        # Mock model
        mock_model = Mock()
        mock_outputs = Mock()
        vocab_size = 1000
        seq_len = 5
        mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True)
        mock_model.return_value = mock_outputs
        mock_model.training = True
        # NOTE(review): presumably a truthy cfg.datasets is what selects the
        # SFT code path in the trainer - confirm against the implementation.
        diffusion_trainer_instance.cfg.datasets = Mock()
        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
        labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long)
        loss, _ = diffusion_trainer_instance._compute_diffusion_loss(
            mock_model, input_ids, labels=labels
        )
        # Check that loss is computed
        assert isinstance(loss, torch.Tensor)
        assert loss.requires_grad
        # Check that SFT metrics were added
        # call_args[0][0] is the first positional argument of the last store_metrics call.
        call_args = diffusion_trainer_instance.store_metrics.call_args[0][0]
        assert "answer_ratio" in call_args
        assert "avg_answer_length" in call_args
    def test_compute_loss_no_masked_tokens(self, diffusion_trainer_instance):
        """Test loss computation when no tokens are masked."""
        # Mock model
        mock_model = Mock()
        mock_outputs = Mock()
        vocab_size = 1000
        seq_len = 3
        # The logits here do not require grad, yet the test below still
        # expects a loss with requires_grad set.
        mock_outputs.logits = torch.randn(1, seq_len, vocab_size)
        mock_model.return_value = mock_outputs
        mock_model.training = True
        # Only special tokens (which won't be masked)
        input_ids = torch.tensor([[1, 0, 2]], dtype=torch.long)
        loss, _ = diffusion_trainer_instance._compute_diffusion_loss(
            mock_model, input_ids
        )
        # Loss should be zero when no tokens are masked
        assert loss.item() == 0.0
        assert loss.requires_grad
    def test_cache_special_token_ids(self, mock_tokenizer):
        """Test caching of special token IDs."""
        # Built without __init__, so only processing_class is set before the call.
        trainer = object.__new__(DiffusionTrainer)
        trainer.processing_class = mock_tokenizer
        trainer._cache_special_token_ids()
        # pad=0, bos=1, eos=2 on the mock tokenizer.
        assert trainer._special_token_ids == {0, 1, 2}
    def test_cache_special_token_ids_no_tokenizer(self):
        """Test caching when no tokenizer is available."""
        trainer = object.__new__(DiffusionTrainer)
        trainer.processing_class = None
        trainer._cache_special_token_ids()
        assert trainer._special_token_ids == set()
    def test_main_compute_loss_interface(self, diffusion_trainer_instance):
        """Test the main compute_loss interface."""
        # Mock model
        mock_model = Mock()
        mock_outputs = Mock()
        mock_outputs.logits = torch.randn(1, 5, 1000)
        mock_model.return_value = mock_outputs
        mock_model.training = True
        inputs = {
            "input_ids": torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long),
            "attention_mask": torch.tensor([[1, 1, 1, 1, 1]], dtype=torch.long),
            "labels": torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long),
        }
        # Test without return_outputs
        loss = diffusion_trainer_instance.compute_loss(mock_model, inputs)
        assert isinstance(loss, torch.Tensor)
        # Test with return_outputs
        loss, outputs = diffusion_trainer_instance.compute_loss(
            mock_model, inputs, return_outputs=True
        )
        assert isinstance(loss, torch.Tensor)
        assert outputs == mock_outputs
    def test_missing_input_ids_raises_error(self, diffusion_trainer_instance):
        """Test that missing input_ids raises ValueError."""
        mock_model = Mock()
        # Deliberately omits the "input_ids" key.
        inputs = {"attention_mask": torch.tensor([[1, 1, 1]])}
        with pytest.raises(ValueError, match="input_ids is required"):
            diffusion_trainer_instance.compute_loss(mock_model, inputs)
--- a/tests/integrations/test_diffusion_callback.py
+++ b/tests/integrations/test_diffusion_callback.py
@@ -1,92 +0,0 @@
 """Tests for diffusion generation callback dataloader selection and triggering."""
 from types import SimpleNamespace
 from unittest.mock import Mock
 import pytest
 from axolotl.integrations.diffusion import DiffusionGenerationCallback
 class DummyTrainer:
    """Lightweight trainer stand-in exposing only what the generation callback touches."""
    def __init__(self, use_eval: bool):
        # Records, in call order, which dataloader getter was invoked.
        self.requested: list[str] = []
        # Diffusion generation settings read by the callback.
        diffusion_settings = {
            "generation_interval": 1,
            "num_generation_samples": 1,
            "generation_max_length": 32,
            "generation_steps": 4,
            "generation_temperature": 0.0,
            "mask_token_id": 16,
        }
        self.cfg = SimpleNamespace(
            diffusion=SimpleNamespace(**diffusion_settings),
            use_wandb=False,
        )
        # Model and tokenizer are opaque mocks; they are only handed along.
        self.model = Mock()
        self.processing_class = Mock()
        # An eval dataset exists only when requested; both loaders are unique sentinels.
        self.eval_dataset = None
        if use_eval:
            self.eval_dataset = object()
        self._train_loader = object()
        self._eval_loader = object()
        # Flag consulted for the world-process-zero check.
        self.state = SimpleNamespace(is_world_process_zero=True)
    def get_train_dataloader(self):
        """Log a "train" request and hand back the train-loader sentinel."""
        self.requested.append("train")
        return self._train_loader
    def get_eval_dataloader(self):
        """Log an "eval" request and hand back the eval-loader sentinel."""
        self.requested.append("eval")
        return self._eval_loader
@pytest.mark.parametrize("use_eval", [False, True])
def test_callback_uses_correct_dataloader(monkeypatch, use_eval):
    """Check the callback pulls from the eval loader when an eval dataset exists, else the train loader."""
    stub_trainer = DummyTrainer(use_eval=use_eval)
    generation_callback = DiffusionGenerationCallback(stub_trainer)
    seen = {}

    def record_generate_samples(**kwargs):
        # Remember which dataloader the callback forwarded.
        seen["dataloader"] = kwargs.get("dataloader")
        # One dummy sample so the logging path is exercised as well.
        sample = dict(
            original="o",
            masked="m",
            generated="g",
            mask_ratio=0.5,
            masked_tokens=1,
            total_tokens=2,
        )
        return [sample]

    # Replace generate_samples where the callback module looks it up.
    monkeypatch.setattr(
        "axolotl.integrations.diffusion.callbacks.generate_samples",
        record_generate_samples,
    )
    # global_step=1 with generation_interval=1 should trigger generation.
    generation_callback.on_step_end(
        args=SimpleNamespace(),
        state=SimpleNamespace(global_step=1),
        control=SimpleNamespace(),
    )
    # Pick the expectation for this parametrization, then verify both facts.
    if use_eval:
        expected_split, expected_loader = "eval", stub_trainer._eval_loader
    else:
        expected_split, expected_loader = "train", stub_trainer._train_loader
    assert stub_trainer.requested[0] == expected_split
    assert seen["dataloader"] is expected_loader
--- a/tests/monkeypatch/test_trainer_loss_calc.py
+++ b/tests/monkeypatch/test_trainer_loss_calc.py
@@ -3,6 +3,7 @@
 import unittest
 from axolotl.monkeypatch.transformers.trainer_loss_calc import (
    check_evaluation_loop_is_fsdp2_patchable,
    check_evaluation_loop_is_patchable,
    check_maybe_log_save_evaluate_is_patchable,
 )
@@ -19,6 +20,7 @@ class TestTrainerLossCalc(unittest.TestCase):
        the patched code changes upstream.
        """
        assert check_evaluation_loop_is_patchable()
        assert check_evaluation_loop_is_fsdp2_patchable()
        assert check_maybe_log_save_evaluate_is_patchable()
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -6,7 +6,7 @@ import unittest
 from transformers import LlamaTokenizer
-from axolotl.utils.data import encode_streaming, md5
+from axolotl.utils.data import encode_pretraining, md5
 from tests.hf_offline_utils import enable_hf_offline
@@ -39,7 +39,7 @@ class TestEncodePretraining(unittest.TestCase):
                "hello, hello",
            ]
        }
-        result = encode_streaming(examples, self.tokenizer, self.max_tokens)
+        result = encode_pretraining(self.tokenizer, self.max_tokens, examples)
        self.assertEqual(len(result["input_ids"]), 3)
--- a/tests/test_packed_dataset.py
+++ b/tests/test_packed_dataset.py
@@ -1,11 +1,16 @@
 """Module for testing dataset sequence packing"""
 import unittest
 from pathlib import Path
 from datasets import Dataset, load_dataset
 from transformers import AutoTokenizer
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter
 from axolotl.train import setup_model_and_trainer
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault
@@ -30,6 +35,43 @@ class TestPacking(unittest.TestCase):
            }
        )
    def test_increments_attention(self):
        """Packed samples should carry an incremented attention id and restarted position ids."""
        strategy = AlpacaPromptTokenizingStrategy(
            AlpacaPrompter("chat"),
            self.tokenizer,
            False,
            2048,
        )
        fixture_path = Path(__file__).parent / "fixtures/alpaca/alpaca.json"
        raw_rows = load_dataset("json", data_files=str(fixture_path))["train"]
        tokenized = Dataset.from_list(list(TokenizedPromptDataset(strategy, raw_rows)))
        packer = ConstantLengthDataset(
            self.tokenizer,
            [tokenized],
            seq_length=2048,
        )
        first_packed = Dataset.from_list(list(packer))[0]
        input_ids = first_packed["input_ids"]
        attention_mask = first_packed["attention_mask"]
        position_ids = first_packed["position_ids"]
        bos_id = self.tokenizer.bos_token_id
        # Find the second BOS; search from index 1, then shift back by one.
        second_bos = input_ids[1:].index(bos_id) + 1
        # The first packed sample starts at BOS with attention id 1 and positions 0, 1.
        assert input_ids[0] == bos_id
        assert attention_mask[0] == 1
        assert position_ids[0] == 0
        assert position_ids[1] == 1
        # The next sample also starts at BOS, with attention id 2 and positions restarting at 0.
        assert input_ids[second_bos] == bos_id
        assert attention_mask[second_bos] == 2
        assert position_ids[second_bos] == 0
        assert position_ids[second_bos + 1] == 1
    @with_temp_dir
    def test_lora_packing(self, temp_dir):
        cfg = DictDefault(
--- a/Show More
+++ b/Show More