use kwargs

max new tokens for online generation
handle input only for online
2026-02-04 12:04:53 -05:00 · 2026-02-04 11:55:19 -05:00 · 2026-02-04 10:53:10 -05:00 · 2026-02-04 09:49:35 -05:00
57 changed files with 398 additions and 1063 deletions
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -123,7 +123,7 @@ datasets:
 | --------------------------------- | -------------------------- | ----------------------------------- |
 | `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
 | `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
-| `dataset_num_proc`                | `4`                        | Number of preprocessing processes   |
+| `dataset_processes`               | `4`                        | Number of preprocessing processes   |
 | `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
 | `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
 | `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -39,6 +39,7 @@
 #     type: # linear | dynamic
 #     factor: # float

+
 # # Whether you are training a 4-bit GPTQ quantized model
 # gptq: true
 # gptq_groupsize: 128 # group size
@@ -106,7 +107,7 @@
 # push_dataset_to_hub: # repo path
 # # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # # if not set.
-# dataset_num_proc: # defaults to os.cpu_count() if not set
+# dataset_processes: # defaults to os.cpu_count() if not set
 # # push checkpoints to hub
 # hub_model_id: # repo path to push finetuned model
 # # how to push checkpoints to hub
@@ -348,6 +349,8 @@
 # # Allow overwrite yml config using from cli
 # strict:

+
+
 base_model: ${BASE_MODEL}
 base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
 base_model_config: ${BASE_MODEL_CONFIG}
@@ -406,7 +409,7 @@ chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
 default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
 dataset_prepared_path: ${DATASET_PREPARED_PATH}
 push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
-dataset_num_proc: ${DATASET_NUM_PROC}
+dataset_processes: ${DATASET_PROCESSES}
 dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
 hub_model_id: ${HUB_MODEL_ID}
 hub_strategy: ${HUB_STRATEGY}
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -251,6 +251,7 @@ website:
                - docs/models/olmo3.qmd
                - docs/models/trinity.qmd
                - docs/models/arcee.qmd
+                - docs/models/mistral.qmd
                - section: "Ministral3"
                  contents:
                    - docs/models/ministral3.qmd
@@ -265,7 +266,6 @@ website:
                - docs/models/mistral-small.qmd
                - docs/models/voxtral.qmd
                - docs/models/devstral.qmd
-                - docs/models/mistral.qmd
                - docs/models/llama-4.qmd
                - docs/models/llama-2.qmd
                - docs/models/qwen3-next.qmd
@@ -320,7 +320,6 @@ website:
            - docs/multipack.qmd
            - docs/mixed_precision.qmd
            - docs/optimizers.qmd
-            - docs/attention.qmd

        - section: "Advanced Features"
          contents:
--- a/docs/attention.qmd
+++ b/docs/attention.qmd
@@ -1,140 +0,0 @@
---
-title: Attention
-description: Supported attention modules in Axolotl
---
-
-## SDP Attention
-
-This is the default built-in attention in PyTorch.
-
-```yaml
-sdp_attention: true
-```
-
-For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
-
-## Flash Attention 2
-
-Uses efficient kernels to compute attention.
-
-```yaml
-flash_attention: true
-```
-
-For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)
-
-### Nvidia
-
-Requirements: Ampere, Ada, or Hopper GPUs
-
-Note: For Turing GPUs or lower, please use other attention methods.
-
-```bash
-pip install flash-attn --no-build-isolation
-```
-
-::: {.callout-tip}
-
-If you get `undefined symbol` while training, ensure you installed PyTorch prior to Axolotl. Alternatively, try reinstall or downgrade a version.
-
-:::
-
-#### Flash Attention 3
-
-Requirements: Hopper only and CUDA 12.8 (recommended)
-
-```bash
-git clone https://github.com/Dao-AILab/flash-attention.git
-cd flash-attention/hopper
-
-python setup.py install
-```
-
-### AMD
-
-Requirements: ROCm 6.0 and above.
-
-See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).
-
-## Flex Attention
-
-A flexible PyTorch API for attention used in combination with `torch.compile`.
-
-```yaml
-flex_attention: true
-
-# recommended
-torch_compile: true
-```
-
-::: {.callout-note}
-
-We recommend using latest stable version of PyTorch for best performance.
-
-:::
-
-For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)
-
-## SageAttention
-
-Attention kernels with QK Int8 and PV FP16 accumulator.
-
-```yaml
-sage_attention: true
-```
-
-Requirements: Ampere, Ada, or Hopper GPUs
-
-```bash
-pip install sageattention==2.2.0 --no-build-isolation
-```
-
-::: {.callout-warning}
-
-Only LoRA/QLoRA recommended at the moment. We found loss drop to 0 for full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).
-
-:::
-
-For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)
-
-::: {.callout-note}
-
-We do not support SageAttention 3 at the moment. If you are interested on adding this or improving SageAttention implementation, please make an Issue.
-
-:::
-
-
-## xFormers
-
-```yaml
-xformers_attention: true
-```
-
-::: {.callout-tip}
-
-We recommend using with Turing GPUs or below (such as on Colab).
-
-:::
-
-For more details: [xFormers](https://github.com/facebookresearch/xformers)
-
-## Shifted Sparse Attention
-
-::: {.callout-warning}
-
-We plan to deprecate this! If you use this feature, we recommend switching to methods above.
-
-:::
-
-Requirements: LLaMA model architecture
-
-```yaml
-flash_attention: true
-s2_attention: true
-```
-
-::: {.callout-tip}
-
-No sample packing support!
-
-:::
--- a/docs/cli.qmd
+++ b/docs/cli.qmd
@@ -210,8 +210,6 @@ axolotl lm-eval config.yml
 Configuration options:

 ```yaml
-lm_eval_model: # model to evaluate (local or hf path)
-
 # List of tasks to evaluate
 lm_eval_tasks:
  - arc_challenge
@@ -220,7 +218,7 @@ lm_eval_batch_size: # Batch size for evaluation
 output_dir: # Directory to save evaluation results
 ```

-See [LM Eval Harness integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#language-model-evaluation-harness-lm-eval) for full configuration details.
+See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details.

 ### delinearize-llama4

--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -89,10 +89,6 @@ lora_o_kernel: true
 Currently, LoRA kernels are not supported for RLHF training, only SFT.
 :::

-::: {.callout-warning}
-LoRA kernels do not support remote modeling code.
-:::
-
 ## Requirements

 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -19,7 +19,6 @@ format:
 - [Gemma-3n](#sec-gemma-3n)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)
- [GLM-4.6V](#sec-glm-4-6v)
 - [SmolVLM2](#sec-smolvlm2)
 - [LFM2-VL](#sec-lfm2-vl)
 - [Intern-VL](#sec-intern-vl)
@@ -184,18 +183,6 @@ base_model: Qwen/Qwen3-VL-4B-Instruct
 chat_template: qwen2_vl  # same as qwen2-vl
 ```

-### GLM-4.6V {#sec-glm-4-6v}
-
-Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.
-
-```yaml
-# GLM-4.6V (106B MoE version)
-base_model: zai-org/GLM-4.6V
-
-# OR GLM-4.6V-Flash (9B version)
-base_model: zai-org/GLM-4.6V-Flash
-```
-
 ### SmolVLM2 {#sec-smolvlm2}

 ::: {.callout-tip}
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712\""
   ]
  },
  {
--- a/examples/glm46v/README.md
+++ b/examples/glm46v/README.md
@@ -1,44 +0,0 @@
-# Finetune GLM-4.6V with Axolotl
-
-GLM-4.6V is a family of vision-language models from ZhipuAI found on [HuggingFace](https://huggingface.co/zai-org/GLM-4.6V). This guide shows how to fine-tune it with Axolotl for vision-language tasks.
-
-
-
-## Getting started
-
-1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).
-
-2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
-
-
-3. Run the fine-tuning:
-
-    glm-4-6v-flash(9B)
-    ```bash
-    axolotl train examples/glm46v/glm-4-6v-flash-qlora.yaml
-    ```
-
-Let us know how it goes. Happy finetuning! 🚀
-
-## Tips
-
- Vision datasets should follow the format described in the [multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format)
- You can run a **full finetuning** by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html).
-
-## Supported Models
-
- **GLM-4.6V**: Full vision-language model (`zai-org/GLM-4.6V`)
- **GLM-4.6V-Flash**: Faster variant (`zai-org/GLM-4.6V-Flash`)
-
-## Optimization Guides
-
-Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).
-
-## Related Resources
-
- [ZhipuAI GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
--- a/examples/glm46v/glm-4-6v-flash-ddp.yaml
+++ b/examples/glm46v/glm-4-6v-flash-ddp.yaml
@@ -1,53 +0,0 @@
-base_model: zai-org/GLM-4.6V-Flash
-trust_remote_code: true
-
-processor_type: AutoProcessor
-load_in_4bit: true
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-ddp_find_unused_parameters: true
-
-output_dir: ./outputs/glm-4-6v-flash-qlora
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-
-adapter: qlora
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-sequence_len: 2048
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-logging_steps: 1
-sdp_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 0
-saves_per_epoch: 1
-weight_decay: 0.0
--- a/examples/glm46v/glm-4-6v-flash-qlora.yaml
+++ b/examples/glm46v/glm-4-6v-flash-qlora.yaml
@@ -1,50 +0,0 @@
-base_model: zai-org/GLM-4.6V-Flash
-trust_remote_code: true
-
-processor_type: AutoProcessor
-load_in_4bit: true
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-output_dir: ./outputs/glm-4-6v-flash-qlora
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-
-adapter: qlora
-lora_r: 16
-lora_alpha: 32
-lora_dropout: 0.05
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-sequence_len: 2048
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-bf16: auto
-tf32: false
-
-gradient_checkpointing: true
-logging_steps: 1
-sdp_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch: 0
-saves_per_epoch: 1
-weight_decay: 0.0
--- a/requirements.txt
+++ b/requirements.txt
@@ -2,21 +2,21 @@

 # START section of dependencies that don't install on Darwin/MacOS
 bitsandbytes==0.49.1
-triton>=3.4.0
+triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
-liger-kernel==0.7.0
+liger-kernel==0.6.4
 # END section

 packaging==26.0
 huggingface_hub>=1.1.7
 peft>=0.18.1
 tokenizers>=0.22.1
-transformers @ git+https://github.com/winglian/transformers.git@refactor-inner-training-loop-reorder-only
+transformers==5.0.0
 accelerate==1.12.0
 datasets==4.5.0
 deepspeed>=0.18.3
-trl==0.28.0
+trl==0.27.1
 hf_xet==1.2.0
 kernels==0.11.5

@@ -63,7 +63,7 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2

-torchao==0.16.0
+torchao==0.13.0
 openenv-core==0.1.0
 schedulefree==1.4.1

--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""

 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712"'
 )
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -409,9 +409,6 @@ class TrainerBuilderBase(abc.ABC):
            if self.cfg.hub_strategy:
                training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy

-            if self.cfg.hub_revision:
-                training_args_kwargs["hub_revision"] = self.cfg.hub_revision
-
    def _configure_save_and_eval_strategy(self, training_args_kwargs: dict):
        # save_strategy and save_steps
        if self.cfg.save_steps:
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -246,8 +246,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            ddp_find_unused_parameters
        )

-        if self.cfg.group_by_length:
-            training_arguments_kwargs["train_sampling_strategy"] = "group_by_length"
+        training_arguments_kwargs["group_by_length"] = self.cfg.group_by_length
        training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling

        training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -11,6 +11,7 @@ from axolotl.core.trainers import (
 )
 from axolotl.core.trainers.dpo import DPOStrategy
 from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
+from axolotl.core.trainers.grpo import GRPOStrategy
 from axolotl.integrations.base import PluginManager
 from axolotl.loaders.utils import ensure_dtype
 from axolotl.utils.callbacks.qat import QATCallback
@@ -52,8 +53,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        trainer_cls_args = [self.model]

        if self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
-            from axolotl.core.trainers.grpo import GRPOStrategy
-
            trainer_cls = GRPOStrategy.get_trainer_class(
                sequence_parallel=self.cfg.context_parallel_size > 1
            )
@@ -134,17 +133,21 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            if self.cfg.cpo_alpha is not None:
                training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha

-            blocklist_args_kwargs.append("max_prompt_length")
+            # The defaults can leave max_prompt_length equal to max_length, but
+            # CPOTrainer requires max_prompt_length to be strictly less than max_length.
+            if (
+                training_args_kwargs["max_prompt_length"]
+                == training_args_kwargs["max_length"]
+            ):
+                training_args_kwargs["max_prompt_length"] -= 1

        elif self.cfg.rl is RLType.ORPO:
            training_args_cls = AxolotlORPOConfig

-            blocklist_args_kwargs.append("max_prompt_length")
-
        elif self.cfg.rl is RLType.KTO:
            training_args_cls = AxolotlKTOConfig
            # KTOConfig in TRL >= 0.27.0 no longer accepts max_prompt_length
-            blocklist_args_kwargs.append("max_prompt_length")
+            blocklist_args_kwargs = ["max_prompt_length"]

            training_args_kwargs["desirable_weight"] = (
                self.cfg.kto_desirable_weight or 1.0
@@ -154,8 +157,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            )

        elif self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
-            from axolotl.core.trainers.grpo import GRPOStrategy
-
            training_args_cls = GRPOStrategy.get_training_args_class()
            training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
            blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -719,13 +719,6 @@ class AxolotlTrainer(
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        LOG.info(f"Saving model checkpoint to {output_dir}")
-        if state_dict is None:
-            state_dict = self.accelerator.get_state_dict(self.model)
-        if state_dict is not None:
-            state_dict = {
-                k: v.clone() if isinstance(v, torch.Tensor) else v
-                for k, v in state_dict.items()
-            }
        supported_classes = (
            (PreTrainedModel,)
            if not is_peft_available()
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -57,18 +57,16 @@ class AxolotlDPOTrainer(
    def tokenize_row(
        features,
        processing_class,
-        max_prompt_length: int | None = None,
-        max_completion_length: int | None = None,
-        add_special_tokens: bool = True,
-        is_chat: bool = False,
+        max_prompt_length,
+        max_completion_length,
+        add_special_tokens,
    ) -> Dict:
        res = DPOTrainer.tokenize_row(
            features,
            processing_class,
-            max_prompt_length=max_prompt_length,
-            max_completion_length=max_completion_length,
-            add_special_tokens=add_special_tokens,
-            is_chat=is_chat,
+            max_prompt_length,
+            max_completion_length,
+            add_special_tokens,
        )
        # fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
        if processing_class.bos_token is None and res["prompt_input_ids"][0] is None:
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -126,6 +126,9 @@ class GRPOStrategy:
        if trl.use_liger_loss is not None:
            grpo_args_kwargs["use_liger_loss"] = trl.use_liger_loss

+        if trl.rollout_func:
+            grpo_args_kwargs["rollout_func"] = cls.get_rollout_func(trl.rollout_func)
+
        if trl.multi_objective_aggregation is not None:
            grpo_args_kwargs["multi_objective_aggregation"] = (
                trl.multi_objective_aggregation
@@ -151,8 +154,6 @@ class GRPOStrategy:
            trainer_kwargs["reward_processing_classes"] = (
                cfg.trl.reward_processing_classes
            )
-        if cfg.trl and cfg.trl.rollout_func:
-            trainer_kwargs["rollout_func"] = cls.get_rollout_func(cfg.trl.rollout_func)

        return trainer_kwargs

@@ -163,12 +164,7 @@ class GRPOStrategy:

    @classmethod
    def get_blocklist_args_kwargs(cls) -> list[str]:
-        return [
-            "dataset_num_proc",
-            "max_length",
-            "include_tokens_per_second",
-            "max_prompt_length",
-        ]
+        return ["dataset_num_proc", "max_length", "include_tokens_per_second"]

    @classmethod
    def get_reward_func(cls, reward_func_fqn: str) -> RewardFunc:
--- a/src/axolotl/core/trainers/mixins/optimizer.py
+++ b/src/axolotl/core/trainers/mixins/optimizer.py
@@ -104,7 +104,7 @@ class OptimizerMixin(Trainer):

        return optimizer_grouped_parameters

-    def create_optimizer(self, model=None):
+    def create_optimizer(self):
        if (
            self.args.loraplus_lr_ratio is None
            and self.args.embedding_lr_scale is None
@@ -112,9 +112,9 @@ class OptimizerMixin(Trainer):
            and self.args.lr_groups is None
            and self.optimizer_cls_and_kwargs is None
        ):
-            return super().create_optimizer(model=model)
+            return super().create_optimizer()

-        opt_model = self.model if model is None else model
+        opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model

        if (
            not self.optimizer
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh

 - If you are installing from pip
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712"
 ```

 ## Usage
@@ -54,8 +54,8 @@ plugins:
 - gpt_oss
 - granite
 - granitemoe
- granitemoehybrid
 - granitemoeshared
+- granitemoehybrid
 - hunyuan_v1_dense
 - hunyuan_v1_moe
 - internvl
@@ -80,17 +80,16 @@ plugins:
 - phi3
 - phi4_multimodal
 - qwen2
- qwen2_moe
 - qwen2_vl
+- qwen2_moe
 - qwen2_5_vl
 - qwen3
 - qwen3_moe
- qwen3_next
 - qwen3_vl
 - qwen3_vl_moe
- seed_oss
+- qwen3_next
 - smollm3
- step3p5
+- seed_oss
 - voxtral

 ## Citation
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)

 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0d4ce4b"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@f4b5712"`'
 )


@@ -104,7 +104,7 @@ class CutCrossEntropyPlugin(BasePlugin):

    def patch_llama_like(
        self,
-        model_type_to_patch: str,
+        model_type: str,
    ) -> None:
        """
        Generic patch for model architectures with causal lm similar to llama
@@ -112,10 +112,7 @@ class CutCrossEntropyPlugin(BasePlugin):
        from cut_cross_entropy.transformers.patch import PATCH_FNS

        def patch_generic(
-            maybe_model,
-            patch_options,
-            remote_model_id: str | None,
-            model_type: str,
+            maybe_model, patch_options, model_type: str, remote_model_id: str | None
        ):
            import cut_cross_entropy.transformers.llama
            from cut_cross_entropy.transformers.llama import cce_forward
@@ -139,13 +136,11 @@ class CutCrossEntropyPlugin(BasePlugin):
                    f"Error: {str(e)}"
                ) from e

-        if model_type_to_patch not in PATCH_FNS:
+        if model_type not in PATCH_FNS:
            LOG.warning_once(
-                "Setting up generic cce patch for model type: %s", model_type_to_patch
+                "Setting up generic cce patch for model type: %s", model_type
            )
            LOG.warning_once(
-                f"Generic Cut Cross Entropy + {model_type_to_patch} support is experimental and may not work as expected."
-            )
-            PATCH_FNS[model_type_to_patch] = partial(
-                patch_generic, model_type=model_type_to_patch
+                f"Generic Cut Cross Entropy + {model_type} support is experimental and may not work as expected."
            )
+            PATCH_FNS[model_type] = partial(patch_generic, model_type=model_type)
--- a/src/axolotl/integrations/kd/init.py
+++ b/src/axolotl/integrations/kd/init.py
@@ -39,7 +39,10 @@ class KDPlugin(BasePlugin):

    def get_trainer_cls(self, cfg):
        if cfg.kd_trainer:
-            from .trainer import AxolotlKDTrainer
+            from .trainer import AxolotlKDTrainer, AxolotlOnlineKDTrainer
+
+            if cfg.kd_online_server_base_url:
+                return AxolotlOnlineKDTrainer

            return AxolotlKDTrainer
        return None
--- a/src/axolotl/integrations/kd/args.py
+++ b/src/axolotl/integrations/kd/args.py
@@ -53,7 +53,9 @@ class KDArgs(BaseModel):
    kd_online_server: InferenceServerType | None = Field(
        default_factory=lambda: InferenceServerType.vllm
    )
+    kd_online_server_model: str | None = None
    kd_online_timeout: int | None = 120
+    kd_online_max_new_tokens: int | None = 2048
    kd_temperature_min: float | None = (
        None  # kd temperature scheduling during online kd
    )
@@ -74,3 +76,4 @@ class KDTrainingArgsMixin:
    kd_normalize_topk: float | None = (
        None  # whether to normalize student logits during KD
    )
+    kd_online_max_new_tokens: int | None = None
--- a/src/axolotl/integrations/kd/online_chat_template.py
+++ b/src/axolotl/integrations/kd/online_chat_template.py
@@ -0,0 +1,47 @@
+from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader
+from axolotl.prompters import IGNORE_TOKEN_ID
+from axolotl.utils.logging import get_logger
+
+# Configure the logger
+LOG = get_logger(__name__)
+LOG.setLevel("INFO")
+
+
+class ChatTemplateStrategyWithOnlineKD(ChatTemplateStrategy):
+    """
+    Chat-template strategy that emits prompt-only rows for online KD.
+
+    Each row is rendered as a single user turn followed by the generation
+    prompt, and every label position is masked with ``IGNORE_TOKEN_ID``.
+    """
+
+    @property
+    def supports_batched(self) -> bool:
+        """Always ``False``; rows are tokenized one at a time."""
+        # batching doesn't work well for logprob data
+        return False
+
+    def _get_messages(self, prompt):
+        """Wrap the row's ``"problem"`` field in a one-message user conversation."""
+        user_turn = {"role": "user", "content": prompt.get("problem")}
+        return [user_turn]
+
+    def _tokenize_single_prompt(self, prompt):
+        """
+        Build the prompt for one row and return it as a fully label-masked sample.
+
+        The value returned by ``self.prompter.build_prompt`` is used as the token
+        id list and is stored under both ``"input_ids"`` and ``"prompts"``.
+        """
+        conversation = self.get_conversation_thread(prompt)
+        token_ids = self.prompter.build_prompt(
+            conversation,
+            tools=self._get_tools(prompt),
+            add_generation_prompt=True,
+        )  # type: ignore
+        seq_len = len(token_ids)
+
+        sample = {
+            "input_ids": token_ids,
+            "prompts": token_ids,
+            # nothing in the prompt contributes to the loss
+            "labels": [IGNORE_TOKEN_ID] * seq_len,
+            "attention_mask": [1] * seq_len,
+        }
+        return sample
+
+
+class OnlineKDStrategyLoader(StrategyLoader):
+    """
+    StrategyLoader that always selects the online-KD chat template strategy.
+
+    Only the strategy class is overridden; everything else is inherited from
+    ``StrategyLoader``.
+    """
+
+    def _get_strategy_cls(self, cfg):
+        # `cfg` is not consulted: the online-KD strategy is returned unconditionally.
+        return ChatTemplateStrategyWithOnlineKD
+
+
+# Module-level loader instance.
+# NOTE(review): presumably the entry point the prompt-strategy machinery looks up
+# by the name `load` -- confirm against the caller.
+load = OnlineKDStrategyLoader()
--- a/src/axolotl/integrations/kd/trainer.py
+++ b/src/axolotl/integrations/kd/trainer.py
@@ -16,6 +16,14 @@
 KD trainer
 """

+import os
+from typing import Any, Optional, Union
+
+import requests
+import torch
+from torch import nn
+from transformers import GenerationConfig
+from trl.models import unwrap_model_for_generation
 from typing_extensions import override

 from axolotl.core.trainers.base import AxolotlTrainer
@@ -101,3 +109,214 @@ class AxolotlKDTrainer(AxolotlTrainer):
            loss = outputs.loss if hasattr(outputs, "loss") else outputs

        return (loss, outputs) if return_outputs else loss
+
+
+class AxolotlOnlineKDTrainer(AxolotlKDTrainer):
+    def __init__(self, *args, **kwargs):
+        """
+        Build the trainer, then the sampling config used for on-policy generation.
+        """
+        super().__init__(*args, **kwargs)
+
+        self.generation_config = GenerationConfig(
+            max_new_tokens=kwargs.get("kd_online_max_new_tokens"),
+            temperature=1.0,
+            do_sample=True,
+            top_k=0,
+            # the cache is switched off when gradient checkpointing is requested
+            use_cache=not kwargs.get("gradient_checkpointing"),
+            pad_token_id=self.processing_class.pad_token_id,
+        )
+        # Set custom EOS tokens if they are specified by the model's generation
+        # config. This is important for models with the Llama 3 chat template,
+        # which use special tokens <|eot_id|> and <|eom_id|> to mark the end of
+        # turns or messages.
+        model_eos = getattr(self.model.generation_config, "eos_token_id", None)
+        if model_eos is not None:
+            self.generation_config.eos_token_id = model_eos
+
+    @staticmethod
+    def generate_on_policy_outputs(model, inputs, generation_config, pad_token_id=None):
+        """
+        Generate from ``inputs["prompts"]`` and derive an attention mask and labels.
+
+        Returns ``(sequences, attention_mask, labels)``; when ``pad_token_id`` is
+        given, pad positions get attention 0 and label -100.
+        """
+        sequences = model.generate(
+            input_ids=inputs["prompts"],
+            attention_mask=inputs.get("prompt_attention_mask", None),
+            generation_config=generation_config,
+            return_dict_in_generate=True,
+        ).sequences
+
+        attention_mask = torch.ones_like(sequences)
+        labels = sequences.clone()
+        if pad_token_id is not None:
+            is_pad = sequences == pad_token_id
+            labels[is_pad] = -100
+            attention_mask[is_pad] = 0
+        return sequences, attention_mask, labels
+
+    def training_step(
+        self,
+        model: nn.Module,
+        inputs: dict[str, Union[torch.Tensor, Any]],
+        num_items_in_batch: Optional[int] = None,
+    ) -> torch.Tensor:
+        """
+        Perform one on-policy knowledge-distillation training step.
+
+        On every step `model` generates sequences from ``inputs["prompts"]``; these replace
+        ``input_ids``/``attention_mask``/``labels``, teacher logprobs are fetched for them via
+        `get_teacher_logprobs`, and the parent ``training_step`` computes the loss.
+        """
+        with unwrap_model_for_generation(model, self.accelerator) as unwrapped_model:
+            new_input_ids, new_attention_mask, new_labels = (
+                self.generate_on_policy_outputs(
+                    unwrapped_model,
+                    inputs,
+                    self.generation_config,
+                    self.processing_class.pad_token_id,
+                )
+            )
+        inputs["input_ids"] = new_input_ids
+        inputs["attention_mask"] = new_attention_mask
+        inputs["labels"] = new_labels
+
+        target_token_ids, target_logprobs, target_mask = self.get_teacher_logprobs(
+            inputs["input_ids"], inputs["labels"]
+        )
+        inputs["target_token_ids"] = target_token_ids
+        inputs["target_logprobs"] = target_logprobs
+        inputs["target_mask"] = target_mask
+
+        loss = super().training_step(model, inputs, num_items_in_batch)
+        return loss
+
+    def get_teacher_logprobs(self, input_ids, labels):
+        """Fetch top-k prompt logprobs from the completions server and transform them."""
+        request_body = {
+            "model": self.axolotl_cfg.kd_online_server_model,
+            # tensors are not JSON serializable; send plain token-id lists
+            "prompt": input_ids.tolist() if torch.is_tensor(input_ids) else input_ids,
+            "logprobs": self.axolotl_cfg.kd_online_topk,
+            "echo": True,
+            "skip_special_tokens": False,
+            "n": 1,
+            "max_tokens": 0,
+            "temperature": 1.0,
+        }
+        api_url = f"{self.args.kd_online_server_base_url}/v1/completions"
+        headers = {"Authorization": f"Bearer {os.getenv('OPENAI_API_KEY')}"}
+        response = requests.post(
+            api_url, json=request_body, headers=headers, timeout=30
+        )
+        # fail loudly on HTTP errors rather than parsing an error payload
+        response.raise_for_status()
+        # `requests` returns a Response (no `.choices`): decode the JSON body instead
+        top_logprobs = response.json()["choices"][0]["logprobs"]["top_logprobs"]
+        return self.transform_logprobs(input_ids, labels, top_logprobs[1:])  # prune first null position
+
+    def transform_logprobs(self, input_ids, labels, logprobs):
+        """
+        Transform logprobs to target format for KD training
+
+        Returns ``(target_token_ids, target_logprobs, target_mask)`` as nested lists.
+        Raises ``ValueError`` when no position carries any top-k logprobs.
+        """
+
+        target_seq_len = len(logprobs)
+        input_seq_len = len(input_ids)
+        input_padding_len = input_seq_len - target_seq_len
+        # get non-zero top-k (prune None logprobs from vllm data step)
+        top_k_vals = [len(row) for row in logprobs if row]
+        if not top_k_vals:
+            # max()/min() on an empty set would raise an unrelated "empty sequence" error
+            raise ValueError("No non-zero top-k logprobs found.")
+        max_top_k = max(set(top_k_vals), key=top_k_vals.count)
+        min_top_k = min(set(top_k_vals), key=top_k_vals.count)
+        top_k = min(max_top_k, min_top_k)
+
+        target_logprobs = []
+        target_token_ids = []
+        target_mask = []
+
+        if input_padding_len < 0:
+            # logprobs is longer than target_seq_len,
+            # so we need to slice from the left/beginning of logprobs
+            logprobs = logprobs[:-input_seq_len]
+            input_padding_len = 0
+            # NOTE(review): [:-input_seq_len] drops the LAST input_seq_len rows — confirm intent
+
+        # truncate the second dimension of the logprobs to top_k
+        logprobs = [row[:top_k] for row in logprobs]
+
+        # fill with -inf for padding_len tokens for top_k tokens
+        # extend target_logprobs with a padding_len x top_k 2D list filled with -inf
+
+        # we shift for causal models in the trainer, so start the range from 0
+        for _ in range(0, input_padding_len):
+            target_logprobs.append([-float("inf")] * top_k)
+            target_token_ids.append(list(range(top_k)))
+            target_mask.append([0] * top_k)
+
+        for position in range(input_padding_len, input_seq_len):
+            if labels[position] == -100:
+                target_mask.append([0] * top_k)
+            else:
+                target_mask.append([1] * top_k)
+
+        for token_pos_logprobs in logprobs:
+            # Initialize collections for logprobs and token_ids
+            position_logprobs = []
+            position_token_ids = []
+
+            # Process each token probability entry
+            for entry in token_pos_logprobs:
+                # Extract logprob value
+                logprob = entry["logprob"]
+
+                # Parse token_id from the "token_id:###" format
+                token_id = int(entry["token"].split(":")[1])
+
+                # Append to our collections
+                position_logprobs.append(logprob)
+                position_token_ids.append(token_id)
+
+            # Convert to a tensor for easier manipulation
+            position_logprobs_tensor = torch.tensor(
+                position_logprobs, dtype=torch.float
+            )
+
+            # Now we have distribution at T1 in log form, i.e. log p_{T1}(k).
+            # Next, re-scale to T2 = self.kd_temperature via exponent-based trick
+            # p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z
+            #
+            # Convert from log to probability
+            teacher_probs_t1 = position_logprobs_tensor.exp()
+            # normalize probabilities to sum to 1 in case they aren't already
+            teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
+            if teacher_probs_t1_sum > 1e-9:
+                teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
+            if self.kd_temperature != self.gen_temperature:
+                # Exponentiate by factor (T1 / T2)
+                exponent = self.gen_temperature / self.kd_temperature
+                teacher_probs_t2 = teacher_probs_t1**exponent
+            else:
+                teacher_probs_t2 = teacher_probs_t1
+            # Re-normalize
+            # teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum(
+            #     dim=0, keepdim=True
+            # )
+            # Convert back to log
+            position_logprobs_tensor = torch.log(teacher_probs_t2)
+
+            # Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor
+            position_logprobs_scaled = position_logprobs_tensor.tolist()
+
+            target_logprobs.append(position_logprobs_scaled)
+            target_token_ids.append(position_token_ids)
+
+        # Update sample with transformed logprobs
+        return target_token_ids, target_logprobs, target_mask
--- a/src/axolotl/integrations/kernels/README.md
+++ b/src/axolotl/integrations/kernels/README.md
@@ -1,44 +0,0 @@
-# Kernels Integration
-
-MoE (Mixture of Experts) kernels speed up training for MoE layers and reduce VRAM costs. In transformers v5, `batched_mm` and `grouped_mm` were integrated as built-in options via the `experts_implementation` config kwarg:
-
-```python
-class ExpertsInterface(GeneralInterface):
-    _global_mapping = {
-        "batched_mm": batched_mm_experts_forward,
-        "grouped_mm": grouped_mm_experts_forward,
-    }
-```
-
-In our custom integration, we add support for **ScatterMoE**, which is even more efficient and faster than `grouped_mm`.
-
-## Usage
-
-Add the following to your axolotl YAML config:
-
-```yaml
-plugins:
-  - axolotl.integrations.kernels.KernelsPlugin
-
-use_kernels: true
-use_scattermoe: true
-```
-
-**Important:** Setting `experts_implementation` is incompatible with `use_scattermoe`.
-
-## How It Works
-
-The `KernelsPlugin` runs before model loading and:
-
-1. Registers the ScatterMoE kernel from the [`axolotl-ai-co/scattermoe`](https://huggingface.co/axolotl-ai-co/scattermoe) Hub repo.
-2. Patches the model's `SparseMoeBlock` forward method with the optimized ScatterMoE implementation.
-
-This works for any MoE model in transformers that uses a `SparseMoeBlock` class (Mixtral, Qwen2-MoE, OLMoE, etc.).
-
-## Limitations
-
-ScatterMoE uses a softmax -> topk routing, so results may be different for some model arch as baseline (GPT-OSS, GLM_MOE_DSA).
-
-## Note on MegaBlocks
-
-We tested [MegaBlocks](https://huggingface.co/kernels-community/megablocks) but were unable to ensure numerical accuracy, so we did not integrate it. It was also incompatible with many newer model architectures in transformers.
--- a/src/axolotl/integrations/lm_eval/README.md
+++ b/src/axolotl/integrations/lm_eval/README.md
@@ -6,12 +6,6 @@ See https://github.com/EleutherAI/lm-evaluation-harness

 ## Usage

-There are two ways to use the LM Eval integration:
-
-### 1. Post-Training Evaluation
-
-When training with the plugin enabled, evaluation runs automatically after training completes:
-
 ```yaml
 plugins:
  - axolotl.integrations.lm_eval.LMEvalPlugin
@@ -22,50 +16,9 @@ lm_eval_tasks:
  - arc_easy

 lm_eval_batch_size: # Batch size for evaluation
-
-# Directory to save evaluation results.
-# The final model is loaded from this directory
-# unless specified otherwise (see below)
-output_dir:
+output_dir: # Directory to save evaluation results
 ```

-Run training as usual:
-```bash
-axolotl train config.yml
-```
-
-### 2. Standalone CLI Evaluation
-
-Evaluate any model directly without training:
-
-```yaml
-lm_eval_model: meta-llama/Llama-2-7b-hf
-
-plugins:
-  - axolotl.integrations.lm_eval.LMEvalPlugin
-
-lm_eval_tasks:
-  - gsm8k
-  - hellaswag
-  - arc_easy
-
-lm_eval_batch_size: 8
-output_dir: ./outputs
-```
-
-Run evaluation:
-```bash
-axolotl lm-eval config.yml
-```
-
-## Model Selection Priority
-
-The model to evaluate is selected in the following priority order:
-
-1. **`lm_eval_model`** - Explicit model path or HuggingFace repo (highest priority)
-2. **`hub_model_id`** - Trained model pushed to HuggingFace Hub
-3. **`output_dir`** - Local checkpoint directory containing trained model weights
-
 ## Citation

 ```bib
--- a/src/axolotl/integrations/lm_eval/init.py
+++ b/src/axolotl/integrations/lm_eval/init.py
@@ -5,7 +5,7 @@ Module for the Plugin for LM Eval Harness
 import subprocess  # nosec

 from axolotl.integrations.base import BasePlugin
-from axolotl.integrations.lm_eval.cli import build_lm_eval_command, get_model_path
+from axolotl.integrations.lm_eval.cli import build_lm_eval_command

 from .args import LMEvalArgs as LMEvalArgs

@@ -29,7 +29,7 @@ class LMEvalPlugin(BasePlugin):
                wandb_project=cfg.wandb_project,
                wandb_entity=cfg.wandb_entity,
                wandb_name=cfg.wandb_name,
-                model=get_model_path(cfg),
+                model=cfg.lm_eval_model or cfg.hub_model_id,
            ):
                subprocess.run(  # nosec
                    lm_eval_args,
--- a/src/axolotl/integrations/lm_eval/cli.py
+++ b/src/axolotl/integrations/lm_eval/cli.py
@@ -13,21 +13,6 @@ import yaml
 from axolotl.utils.dict import DictDefault


-def get_model_path(cfg: DictDefault) -> str | None:
-    """
-    Determine which model path to use for evaluation.
-
-    Priority order (highest to lowest):
-    1. lm_eval_model - Explicit model path override
-    2. hub_model_id - Model pushed to HuggingFace Hub
-    3. None - Falls back to output_dir in build_lm_eval_command
-
-    Returns:
-        Model path string or None to use output_dir fallback
-    """
-    return cfg.lm_eval_model or cfg.hub_model_id or None
-
-
 def build_lm_eval_command(
    tasks: list[str],
    bfloat16=True,
@@ -123,7 +108,7 @@ def lm_eval(config: str, cloud: Optional[str] = None):
            wandb_project=cfg.wandb_project,
            wandb_entity=cfg.wandb_entity,
            wandb_name=cfg.wandb_name,
-            model=get_model_path(cfg),
+            model=cfg.lm_eval_model or cfg.hub_model_id,
            revision=cfg.revision,
            apply_chat_template=cfg.apply_chat_template,
            fewshot_as_multiturn=cfg.fewshot_as_multiturn,
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -338,12 +338,7 @@ class ModelLoader:
            # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so
            # we need to convert them back to fp16/bf16 for flash-attn compatibility.
            (
-                (
-                    needs_fa2_dtype
-                    or self.cfg.flash_attention
-                    or self.cfg.flex_attention
-                    or self.cfg.sage_attention
-                )
+                (needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention)
                and not self.is_qlora_and_fsdp_enabled
            )
            or (
@@ -617,10 +612,6 @@ class ModelLoader:
        elif self.cfg.sdp_attention:
            self.model_kwargs["attn_implementation"] = "sdpa"
            self.model_config._attn_implementation = "sdpa"
-        elif self.cfg.sage_attention:
-            # sets FA2 attention to re-use same internal handling like masking
-            self.model_kwargs["attn_implementation"] = "flash_attention_2"
-            self.model_config._attn_implementation = "flash_attention_2"
        elif self.cfg.eager_attention:
            self.model_kwargs["attn_implementation"] = "eager"
            self.model_config._attn_implementation = "eager"
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -10,7 +10,6 @@ from functools import cached_property
 import addict
 import transformers
 from transformers import PretrainedConfig, PreTrainedModel
-from transformers.modeling_flash_attention_utils import is_flash_attn_available

 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import (
@@ -97,7 +96,6 @@ class PatchManager:
        # self._apply_flex_attention_patches()
        self._apply_flash_attention_patches()
        self._apply_chunked_cross_entropy_patch()
-        self._apply_sageattn_patches()
        self._apply_fsdp_patches()
        self._apply_adapter_patches()
        self._apply_model_specific_patches()
@@ -203,13 +201,6 @@ class PatchManager:
            flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
            patch_flex_wrapper(**flex_attn_compile_kwargs)

-    def _apply_sageattn_patches(self):
-        """Apply patches for SageAttention."""
-        if self.cfg.sage_attention:
-            from axolotl.monkeypatch.attention.sage_attn import patch_sageattn
-
-            patch_sageattn()
-
    def _apply_model_specific_patches(self):
        """Apply patches specific to model architectures."""
        if (
@@ -329,7 +320,7 @@ class PatchManager:
            else:
                has_remote_code = False

-            if has_remote_code and self.cfg.trust_remote_code is False:
+            if has_remote_code and self.cfg.trust_remote_code is not None:
                # If explicitly set in YAML, prefer that
                has_remote_code = self.cfg.trust_remote_code

@@ -501,7 +492,6 @@ class PatchManager:
            and not self.cfg.trust_remote_code
            and not self.cfg.gptq
            and self.cfg.flash_attention
-            and is_flash_attn_available()
            and not self.inference
        ):
            # TODO(MengqingCao): split these patches separately
--- a/src/axolotl/monkeypatch/attention/sage_attn.py
+++ b/src/axolotl/monkeypatch/attention/sage_attn.py
@@ -1,211 +0,0 @@
-"""
-Monkeypatch for SageAttention for use with transformers.
-
-https://github.com/thu-ml/SageAttention/
-"""
-
-import torch
-from transformers.integrations.sdpa_attention import repeat_kv
-
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-sageattn = None  # pylint: disable=invalid-name
-sageattn_varlen = None  # pylint: disable=invalid-name
-
-
-def _is_sageattn_available():
-    """Determine if SageAttention is available"""
-    try:
-        import sageattention  # noqa: F401 # pylint: disable=unused-import
-
-        return True
-    except ImportError:
-        return False
-
-
-if _is_sageattn_available():
-    # import sageattn here if available
-    from sageattention import sageattn, sageattn_varlen
-
-
-def _check_sageattn_imported():
-    """Check if SageAttention is imported. Raises an ImportError if not."""
-    if sageattn is None:
-        raise ImportError(
-            "SageAttention is not installed. Please install it from source: "
-            "`pip install git+https://github.com/thu-ml/SageAttention.git@1718ddc06dbc694bcf3c6b49ac28c1921aa2d8bd`"
-        )
-
-
-def sage_attention_forward(
-    module: torch.nn.Module,
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    attention_mask: torch.Tensor | None = None,
-    dropout: float = 0.0,
-    scaling: float | None = None,
-    is_causal: bool | None = None,
-    **kwargs,
-) -> tuple[torch.Tensor, None]:
-    """
-    Forward pass for SageAttention compatible with transformers attention interfaces.
-
-    https://github.com/thu-ml/SageAttention/
-    """
-
-    _check_sageattn_imported()
-
-    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
-        raise NotImplementedError(
-            "SageAttention does not support `output_attentions=True` or `head_mask`."
-        )
-
-    # The base sageattn API does not support dropout.
-    if dropout > 0.0:
-        raise NotImplementedError("SageAttention does not support dropout.")
-
-    # Handle Grouped-Query Attention (GQA) and Multi-Query Attention (MQA)
-    if hasattr(module, "num_key_value_groups"):
-        key = repeat_kv(key, module.num_key_value_groups)
-        value = repeat_kv(value, module.num_key_value_groups)
-
-    # Calculate is_causal following transformers
-    assert is_causal is not False, "is_causal must be True or None"
-    is_causal = True
-
-    position_ids = kwargs.get("position_ids", None)
-    query_length = query.shape[2]
-
-    cu_seqlens_q = kwargs.get("cu_seqlens_q", None)
-    cu_seqlens_k = kwargs.get("cu_seqlens_k", None)
-    max_length_q = kwargs.get("max_length_q", None)
-    max_length_k = kwargs.get("max_length_k", None)
-
-    # Sample packing uses position_ids, so we check for it first
-    if position_ids is not None and (
-        max_length_q is not None
-        or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
-    ):
-        # transpose inputs to NHD layout for use with FA2 utils
-        query = query.transpose(1, 2)
-        key = key.transpose(1, 2)
-        value = value.transpose(1, 2)
-
-        batch_size = query.size(0)
-
-        from transformers.modeling_flash_attention_utils import (
-            prepare_fa2_from_position_ids,
-        )
-
-        if cu_seqlens_q is None or cu_seqlens_k is None:
-            query, key, value, indices_q, cu_seq_lens, max_seq_lens = (
-                prepare_fa2_from_position_ids(query, key, value, position_ids)
-            )
-
-            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-            max_length_q, max_length_k = max_seq_lens
-
-        else:
-            query = query.reshape(-1, query.size(-2), query.size(-1))
-            key = key.reshape(-1, key.size(-2), key.size(-1))
-            value = value.reshape(-1, value.size(-2), value.size(-1))
-
-        attn_output_unpad = sageattn_varlen(
-            q=query,
-            k=key,
-            v=value,
-            cu_seqlens_q=cu_seqlens_q,
-            cu_seqlens_k=cu_seqlens_k,
-            max_seqlen_q=max_length_q,
-            max_seqlen_k=max_length_k,
-            is_causal=is_causal,
-            sm_scale=scaling,
-            smooth_k=False,  # reduces loss 0 / nan grad norms
-            tensor_layout="NHD",
-        )
-
-        attn_output = attn_output_unpad.view(
-            batch_size, -1, attn_output_unpad.size(-2), attn_output_unpad.size(-1)
-        )
-
-    elif attention_mask is not None:
-        # NOTE: When used without `pad_to_sequence_len`, the loss becomes unstable after a few steps.
-
-        assert attention_mask.ndim == 2, "Attention mask must be 2D"
-
-        from transformers.modeling_flash_attention_utils import (
-            _upad_input,
-        )
-
-        # transpose inputs to NHD layout for use with FA2 utils
-        query = query.transpose(1, 2)
-        key = key.transpose(1, 2)
-        value = value.transpose(1, 2)
-
-        batch_size = query.shape[0]
-
-        query, key, value, indices_q, cu_seq_lens, max_seq_lens = _upad_input(
-            query, key, value, attention_mask, query_length
-        )
-        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
-        max_seqlen_q, max_seqlen_k = max_seq_lens
-
-        attn_output_unpad = sageattn_varlen(
-            q=query,
-            k=key,
-            v=value,
-            cu_seqlens_q=cu_seqlens_q,
-            cu_seqlens_k=cu_seqlens_k,
-            max_seqlen_q=max_seqlen_q,
-            max_seqlen_k=max_seqlen_k,
-            is_causal=is_causal,
-            sm_scale=scaling,
-            tensor_layout="NHD",
-        )
-
-        from flash_attn.bert_padding import pad_input
-
-        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
-    else:
-        # Use standard sageattn
-        # The input layout for transformers models is (batch_size, num_heads, seq_len, head_dim),
-        # which corresponds to SageAttention's "HND" layout.
-        attn_output = sageattn(
-            q=query,
-            k=key,
-            v=value,
-            tensor_layout="HND",
-            is_causal=is_causal,
-            sm_scale=scaling,
-        )
-
-        # SageAttention with "HND" returns (batch, heads, seq_len, head_dim)
-        # Transformers expects (batch, seq_len, heads, head_dim) for the output
-        # So we need to transpose dimensions 1 and 2
-        attn_output = attn_output.transpose(1, 2).contiguous()
-
-    return attn_output, None
-
-
-def patch_sageattn():
-    """Patch SageAttention for use with transformers."""
-
-    _check_sageattn_imported()
-
-    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
-
-    # Replace flash attention with sage attention
-    ALL_ATTENTION_FUNCTIONS.register("flash_attention_2", sage_attention_forward)
-
-    # Note: New method after transformers refactor to use ALL_MASK_ATTENTION_FUNCTIONS
-    # Register sage_attention with the global attention interface
-    # ALL_ATTENTION_FUNCTIONS.register("sage_attention", sage_attention_forward)
-
-    # from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, flash_attention_mask
-
-    # ALL_MASK_ATTENTION_FUNCTIONS.register("sage_attention", flash_attention_mask)
-
-    LOG.info("SageAttention patched successfully")
--- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py
+++ b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py
@@ -59,12 +59,7 @@ class CPU_Offloaded_Gradient_Checkpointer(torch.autograd.Function):
        hidden_states = hidden_states.to("cuda", non_blocking=True).detach()
        hidden_states.requires_grad = True
        with torch.enable_grad():
-            output = ctx.forward_function(hidden_states, *ctx.args)
-            # Newer HF models (e.g. Qwen3MoE) using GradientCheckpointingLayer
-            # return a plain tensor, not a tuple.  Older models return tuples
-            # like (hidden_states, present_kv, ...).  Unwrap if needed.
-            if isinstance(output, (tuple, list)):
-                (output,) = output
+            (output,) = ctx.forward_function(hidden_states, *ctx.args)
        torch.autograd.backward(output, dY)
        return (
            None,
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -169,8 +169,7 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
        return attention_cls
    except (ImportError, AttributeError) as e:
        raise ValueError(
-            f"Axolotl could not import attention class for model_type: {model_type}. "
-            "Please raise an Issue and turn off lora kernels to continue training. "
+            f"Could not import attention class for model_type: {model_type}. "
            f"Error: {str(e)}"
        ) from e

--- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
@@ -28,12 +28,8 @@ PATCHED_EVAL_CODE = {
    "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()',
 }

-ORIGINAL_MAYBE_CODE = (
-    "tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).mean().item()"
-)
-PATCHED_MAYBE_CODE = (
-    "tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).nanmean().item()"
-)
+ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()"
+PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()"


 def check_evaluation_loop_is_patchable() -> bool:
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -485,58 +485,6 @@ class InternVLProcessingStrategy(ProcessingStrategy):
        return labels


-class Glm4vProcessingStrategy(ProcessingStrategy):
-    """Processing Strategy class for GLM4V and GLM4V-MoE vision models."""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
-
-        self.tokenizer = getattr(processor, "tokenizer", processor)
-
-        self.image_token = "<|image|>"  # nosec
-        self.begin_image_token = "<|begin_of_image|>"  # nosec
-        self.end_image_token = "<|end_of_image|>"  # nosec
-        self.video_token = "<|video|>"  # nosec
-        self.begin_video_token = "<|begin_of_video|>"  # nosec
-        self.end_video_token = "<|end_of_video|>"  # nosec
-
-        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
-        self.begin_image_token_id = self.tokenizer.convert_tokens_to_ids(
-            self.begin_image_token
-        )
-        self.end_image_token_id = self.tokenizer.convert_tokens_to_ids(
-            self.end_image_token
-        )
-        self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
-        self.begin_video_token_id = self.tokenizer.convert_tokens_to_ids(
-            self.begin_video_token
-        )
-        self.end_video_token_id = self.tokenizer.convert_tokens_to_ids(
-            self.end_video_token
-        )
-
-    def process_labels(self, input_ids):
-        labels = input_ids.clone()
-
-        labels[labels == self.tokenizer.pad_token_id] = -100
-
-        labels[labels == self.image_token_id] = -100
-        labels[labels == self.begin_image_token_id] = -100
-        labels[labels == self.end_image_token_id] = -100
-
-        labels[labels == self.video_token_id] = -100
-        labels[labels == self.begin_video_token_id] = -100
-        labels[labels == self.end_video_token_id] = -100
-
-        return labels
-
-
 def get_processing_strategy(
    processor: ProcessorMixin,
    chat_template,
@@ -553,10 +501,10 @@ def get_processing_strategy(
        "image_resize_algorithm": image_resize_algorithm,
    }

-    if chat_template_type in [None, "tokenizer_default"]:
-        tokenizer = getattr(processor, "tokenizer", processor)
-        if hasattr(tokenizer, "chat_template"):
-            processing_kwargs["chat_template"] = tokenizer.chat_template
+    if chat_template_type in [None, "tokenizer_default"] and hasattr(
+        processor.tokenizer, "chat_template"
+    ):
+        processing_kwargs["chat_template"] = processor.tokenizer.chat_template

    if chat_template_type == "qwen2_vl":
        return Qwen2VLProcessingStrategy(
@@ -585,15 +533,6 @@ def get_processing_strategy(
        return Mistral3ProcessingStrategy(
            **processing_kwargs,
        )
-    try:
-        from transformers.models.glm46v.processing_glm46v import Glm46VProcessor
-
-        if isinstance(processor, Glm46VProcessor):
-            return Glm4vProcessingStrategy(
-                **processing_kwargs,
-            )
-    except ImportError:
-        pass

    if isinstance(processor, InternVLProcessor):
        return InternVLProcessingStrategy(
--- a/src/axolotl/telemetry/callbacks.py
+++ b/src/axolotl/telemetry/callbacks.py
@@ -153,27 +153,13 @@ class TelemetryCallback(TrainerCallback):
                self.last_report_step = step

    def _extract_last_metrics(self, state: TrainerState) -> dict:
-        """Extract last loss, learning_rate, grad_norm, and token metrics from log history."""
+        """Extract last loss, learning_rate, and grad_norm from log history."""
        if not state.log_history:
-            return {
-                "loss": 0,
-                "ppl": 0,
-                "learning_rate": 0,
-                "grad_norm": 0,
-                "tokens/total": 0,
-                "tokens/trainable": 0,
-                "tokens/train_per_sec_per_gpu": 0,
-            }
+            return {"loss": 0, "learning_rate": 0, "grad_norm": 0}

        last_log = state.log_history[-1]
        return {
            "loss": last_log.get("loss", 0),
-            "ppl": last_log.get("ppl", 0),
            "learning_rate": last_log.get("learning_rate", 0),
            "grad_norm": last_log.get("grad_norm", 0),
-            "tokens/total": last_log.get("tokens/total", 0),
-            "tokens/trainable": last_log.get("tokens/trainable", 0),
-            "tokens/train_per_sec_per_gpu": last_log.get(
-                "tokens/train_per_sec_per_gpu", 0
-            ),
        }
--- a/src/axolotl/telemetry/errors.py
+++ b/src/axolotl/telemetry/errors.py
@@ -155,10 +155,6 @@ def send_errors(func: Callable) -> Callable:
                    },
                )

-                LOG.error(
-                    f"Error captured in telemetry. Run ID: {telemetry_manager.run_id}"
-                )
-
            raise

    return wrapper
--- a/src/axolotl/telemetry/manager.py
+++ b/src/axolotl/telemetry/manager.py
@@ -5,6 +5,7 @@ import importlib
 import logging
 import os
 import platform
+import time
 import uuid
 from pathlib import Path
 from typing import Any
@@ -19,6 +20,21 @@ LOG = logging.getLogger(__name__)
 POSTHOG_HOST = "https://app.posthog.com"
 POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y"

+OPT_OUT_WARNING_SLEEP_SECONDS = 10
+OPT_OUT_WARNING = (
+    "\nTelemetry is now enabled by default to help improve Axolotl. "
+    "If you'd like to disable it, set AXOLOTL_DO_NOT_TRACK=1 in your environment.\n\n"
+    "Telemetry data helps us understand:\n"
+    "- Which features are most used\n"
+    "- What hardware configurations to prioritize\n"
+    "- Where users encounter errors\n\n"
+    "Personally identifiable information (PII) is not collected.\n\n"
+    "To remove this warning, explicitly set AXOLOTL_DO_NOT_TRACK=0 (enable telemetry) "
+    "or AXOLOTL_DO_NOT_TRACK=1 (disable telemetry).\n\n"
+    "For details, see: https://docs.axolotl.ai/docs/telemetry.html\n\n"
+    f"Sleeping for {OPT_OUT_WARNING_SLEEP_SECONDS}s..."
+)
+
 WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")

 # NOTE: Need to keep these up to date with any config schema changes
@@ -30,8 +46,8 @@ FIELDS_TO_REDACT = {
    "resume_from_checkpoint",
    "hub_model_id",
 }
-PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_", "trackio_", "swanlab_"}
-PATH_INDICATORS = {"path", "dir", "data_files"}
+PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
+PATH_INDICATORS = {"path", "dir"}

 # pylint: disable=duplicate-code
 RELEVANT_PACKAGES = {
@@ -167,6 +183,11 @@ class TelemetryManager:
            "false",
            "true",
        ):
+            # Log the opt-out warning on the main process only; every process then sleeps
+            if is_main_process():
+                LOG.warning(OPT_OUT_WARNING)
+            time.sleep(OPT_OUT_WARNING_SLEEP_SECONDS)
+
            return True

        # Only rank 0 will send telemetry
--- a/src/axolotl/telemetry/whitelist.yaml
+++ b/src/axolotl/telemetry/whitelist.yaml
@@ -31,10 +31,3 @@ organizations:
  - "mistral-community"
  - "llava-hf"
  - "ByteDance-Seed"
-  - "ACE-Step"
-  - "openbmb"
-  - "MiniMaxAI"
-  - "stepfun-ai"
-  - "internlm"
-  - "katanemo"
-  - "XiaomiMiMo"
--- a/src/axolotl/utils/callbacks/tokens_per_second.py
+++ b/src/axolotl/utils/callbacks/tokens_per_second.py
@@ -78,19 +78,12 @@ class TokensPerSecondCallback(TrainerCallback):
        **kwargs,
    ):  # pylint: disable=unused-argument
        tokens = getattr(state, "tokens", None)
-        if not (tokens and "trainable_tokens" in tokens):
-            return
-        step_time = time.perf_counter() - self.start_time
-        if step_time <= 0:
-            return
-
-        num_tokens = tokens["trainable_tokens"].clone() / self.non_data_parallel_size
-        if torch.distributed.is_initialized():
-            dp_size = max(
-                1, torch.distributed.get_world_size() // self.non_data_parallel_size
-            )
-            num_tokens = num_tokens / dp_size
-        state.last_tokens_per_second = num_tokens / step_time
+        if tokens and "trainable_tokens" in tokens:
+            step_time = time.perf_counter() - self.start_time
+            num_tokens_per_device = tokens["trainable_tokens"].clone()
+            # non data parallel groups have duplicated tokens, so we avoid double-counting
+            num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size
+            state.last_tokens_per_second = num_tokens_per_device / step_time

    def on_log(
        self,
--- a/src/axolotl/utils/ctx_managers/sequence_parallel.py
+++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py
@@ -218,9 +218,6 @@ class SequenceParallelContextManager:
        self.original_seq_len = 0
        self.pad_len = 0

-        # Track local valid token count for eval loss correction across CP ranks
-        self._local_valid_tokens: torch.Tensor | None = None
-
        # Create a partially applied version of the apply_sequence_parallelism function
        self.apply_sequence_parallelism = functools.partial(
            apply_sequence_parallelism,
@@ -273,18 +270,6 @@ class SequenceParallelContextManager:
                self.apply_sequence_parallelism(updated_kwargs)
            )

-            # Track local valid tokens for eval loss correction
-            if "labels" in updated_kwargs and not self.models[0].training:
-                self._local_valid_tokens = (
-                    (updated_kwargs["labels"] != -100).sum().float()
-                )
-                # Strip num_items_in_batch during eval so the model uses
-                # reduction='mean', allowing the post-hook weighted all-reduce
-                # formula (loss * local_valid) to correctly recover the loss sum
-                updated_kwargs.pop("num_items_in_batch", None)
-            else:
-                self._local_valid_tokens = None
-
            return remaining_args, updated_kwargs

        # Forward post-hook to gather outputs
@@ -302,44 +287,6 @@ class SequenceParallelContextManager:

            return output

-        # Post-hook to correct eval loss via weighted all-reduce across CP ranks
-        def eval_loss_correction_post_hook(_, __, output: ModelOutput) -> ModelOutput:
-            if self._local_valid_tokens is None:
-                return output
-            if not hasattr(output, "loss") or output.loss is None:
-                return output
-
-            local_valid = self._local_valid_tokens.to(output.loss.device)
-            loss = output.loss.detach().clone()
-
-            # Handle rank with zero valid tokens (loss is NaN)
-            if local_valid.item() == 0:
-                weighted_loss = torch.zeros(1, device=loss.device, dtype=loss.dtype)
-            else:
-                weighted_loss = loss * local_valid
-
-            total_valid = local_valid.clone()
-            dist.all_reduce(
-                weighted_loss,
-                op=dist.ReduceOp.SUM,
-                group=self.process_group,
-            )
-            dist.all_reduce(
-                total_valid,
-                op=dist.ReduceOp.SUM,
-                group=self.process_group,
-            )
-
-            if total_valid.item() > 0:
-                output["loss"] = (weighted_loss / total_valid).squeeze()
-            else:
-                output["loss"] = torch.tensor(
-                    float("nan"), device=loss.device, dtype=loss.dtype
-                )
-
-            self._local_valid_tokens = None
-            return output
-
        # Register hooks
        for model in self.models:
            self.hook_handles.append(
@@ -351,10 +298,6 @@ class SequenceParallelContextManager:
                self.hook_handles.append(
                    model.register_forward_hook(sequence_parallel_post_hook)
                )
-            # Always register eval loss correction hook
-            self.hook_handles.append(
-                model.register_forward_hook(eval_loss_correction_post_hook)
-            )

    def _gather_outputs(self, output: CausalLMOutputWithPast) -> CausalLMOutputWithPast:
        """Gather sharded outputs from all ranks and reconstruct the full tensor."""
--- a/src/axolotl/utils/datasets.py
+++ b/src/axolotl/utils/datasets.py
@@ -2,19 +2,11 @@

 import os

-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-

 def get_default_process_count():
    if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
        return int(axolotl_dataset_num_proc)
    if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"):
-        LOG.warning(
-            "AXOLOTL_DATASET_PROCESSES and `dataset_processes` are deprecated and will be "
-            "removed in a future version. Please use `dataset_num_proc` instead."
-        )
        return int(axolotl_dataset_processes)
    if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"):
        return int(runpod_cpu_count)
--- a/src/axolotl/utils/mistral/mistral_tokenizer.py
+++ b/src/axolotl/utils/mistral/mistral_tokenizer.py
@@ -86,15 +86,15 @@ class HFMistralTokenizer(MistralCommonBackend):
        add_generation_prompt: bool = False,
        **kwargs,
    ) -> str | list[int]:
-        """Patched fn to handle setting test mode, remove chat_template and add_generation_prompt kwarg"""
+        """Patched fn to handle setting serving mode, continue_final_message, remove chat_template and add_generation_prompt kwarg"""

        # pop unnecessary kwarg for mistral
        kwargs.pop("real_last_index", None)
-        kwargs.pop("add_special_tokens", None)

        try:
            if add_generation_prompt:
-                self._set_mode(ValidationMode.test)
+                self._set_mode(ValidationMode.serving)
+                kwargs["continue_final_message"] = True

            out = super().apply_chat_template(conversation, **kwargs)

--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -446,16 +446,7 @@ class AxolotlInputConfig(
        },
    )

-    unfrozen_parameters: list[str] | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": "List of regex patterns for parameter names to keep unfrozen. "
-            "All other parameters will be frozen via requires_grad=False. "
-            "Note: range-based patterns (e.g. embed_tokens.weight$[:32000]) use gradient "
-            "zeroing rather than a true freeze, so weight decay will still apply to the "
-            "frozen portion and optimizer states are allocated for the full parameter."
-        },
-    )
+    unfrozen_parameters: list[str] | None = None

    sequence_len: int = Field(
        default=512,
@@ -618,12 +609,6 @@ class AxolotlInputConfig(
        default=None,
        json_schema_extra={"description": "Whether to use bettertransformers"},
    )
-    sage_attention: bool | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Whether to use SageAttention https://github.com/thu-ml/SageAttention"
-        },
-    )

    eager_attention: bool | None = None

@@ -1135,27 +1120,6 @@ class AxolotlInputConfig(
                    )
        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_sageattn_wo_sample_packing(cls, data):
-        if (not data.get("sample_packing", False)) and data.get("sage_attention"):
-            if not data.get("pad_to_sequence_len", False):
-                LOG.warning(
-                    "We recommend turning on `pad_to_sequence_len` for SageAttention without packing."
-                    "This is because there has been signs that the loss explodes after a few steps."
-                )
-        return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_sageattn_fft(cls, data):
-        if (not data.get("adapter", False)) and data.get("sage_attention"):
-            LOG.warning(
-                "We found loss to drop to 0 with SageAttention full finetuning."
-                "Please observe the loss, otherwise switch to LoRA/QLoRA or another attention method."
-            )
-        return data
-

 class AxolotlConfigWCapabilities(AxolotlInputConfig):
    """Wrapper to valdiate GPU capabilities with the configured options"""
@@ -1212,21 +1176,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):

        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_compute_capability_w_sageattn(cls, data):
-        if (
-            data.get("sage_attention")
-            and data.get("capabilities")
-            and data.get("capabilities").get("compute_capability")
-            not in ["sm_80", "sm_86", "sm_89", "sm_90", "sm_120"]
-        ):
-            raise ValueError(
-                "SageAttention supports compute capability between sm_80 and sm_120. "
-                "Please use a different attention implementation."
-            )
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_multigpu_unsloth(cls, data):
@@ -1280,10 +1229,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
            ):
                return data

-            # Skip if trust_remote_code is enabled, as lora kernels are not compatible
-            if data.get("trust_remote_code"):
-                return data
-
            # Skip if dropout is not 0, as auto enabling it would just disable it during runtime patch checks
            if data.get("lora_dropout") != 0:
                return data
--- a/src/axolotl/utils/schemas/model.py
+++ b/src/axolotl/utils/schemas/model.py
@@ -120,12 +120,6 @@ class ModelOutputConfig(BaseModel):
        default=None,
        json_schema_extra={"description": "how to push checkpoints to hub"},
    )
-    hub_revision: str | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": "branch/revision to push to on hub (default: main)"
-        },
-    )
    save_safetensors: bool | None = Field(
        default=True,
        json_schema_extra={
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -166,10 +166,9 @@ class AttentionValidationMixin:
        fields = (
            "xformers_attention",
            "sdp_attention",
-            # "s2_attention",  # requires both FA and this to be enabled
+            "s2_attention",
            "flash_attention",
            "flex_attention",
-            "sage_attention",
        )
        non_empty_count = sum(1 for field in fields if data.get(field))

@@ -186,10 +185,9 @@ class AttentionValidationMixin:
            and not data.get("sdp_attention")
            and not data.get("flex_attention")
            and not data.get("xformers_attention")
-            and not data.get("sage_attention")
        ):
            LOG.warning(
-                "sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination."
+                "sample_packing without flash, sdp, xformers or flex attention does not handle cross sample decontamination."
            )
        return data

@@ -690,21 +688,6 @@ class LoRAValidationMixin:
            )
        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_lora_kernels_trust_remote_code(cls, data):
-        if (
-            data.get("lora_mlp_kernel")
-            or data.get("lora_qkv_kernel")
-            or data.get("lora_o_kernel")
-        ) and data.get("trust_remote_code"):
-            raise ValueError(
-                "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not "
-                "compatible with trust_remote_code. Please disable trust_remote_code "
-                "or explicitly set lora_*_kernel to false."
-            )
-        return data
-

 class RLValidationMixin:
    """Validation methods related to RL training configuration."""
--- a/tests/core/test_builders.py
+++ b/tests/core/test_builders.py
@@ -79,7 +79,7 @@ def fixture_base_cfg():
            "ddp_timeout": 1800,
            "ddp_bucket_cap_mb": 25,
            "ddp_broadcast_buffers": False,
-            "dataset_num_proc": 4,
+            "dataset_processes": 4,
        }
    )

@@ -300,6 +300,7 @@ class TestHFRLTrainerBuilder:
        self._test_common_training_arguments(training_arguments, rl=orpo_cfg.rl)
        # ORPO specific
        assert training_arguments.beta == 0.1  # maps from orpo_alpha
+        assert training_arguments.max_prompt_length == 512

    def test_kto_training_arguments(self, kto_cfg, model, tokenizer):
        builder = HFRLTrainerBuilder(kto_cfg, model, tokenizer)
--- a/tests/e2e/multigpu/test_fsdp1.py
+++ b/tests/e2e/multigpu/test_fsdp1.py
@@ -186,7 +186,6 @@ class TestFSDP1:

        verify_training_success(temp_dir)

-    @pytest.mark.skip(reason="slow test, deprecate fsdp1 asap")
    def test_dpo_fft(self, temp_dir):
        cfg = DictDefault(
            {
--- a/tests/e2e/multigpu/test_fsdp2.py
+++ b/tests/e2e/multigpu/test_fsdp2.py
@@ -365,7 +365,6 @@ class TestFSDP2:

        verify_training_success(temp_dir)

-    @pytest.mark.skip(reason="slow test w cu129 + torch 2.9.1 + py3.12")
    @require_torch_2_7_0
    def test_dpo_fft(self, temp_dir):
        cfg = DictDefault(
--- a/tests/e2e/test_streaming.py
+++ b/tests/e2e/test_streaming.py
@@ -30,7 +30,7 @@ class TestStreamingDatasets:
                "sample_packing": sample_packing,
                "pretrain_multipack_attn": sample_packing,
                "streaming_multipack_buffer_size": 10000,
-                "dataset_num_proc": 1,
+                "dataset_processes": 1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -179,7 +179,7 @@ def check_tensorboard(
    tag: str,
    lt_val: float,
    assertion_err: str,
-    rtol: float = 0.02,
+    rtol: float = 0.05,
 ) -> None:
    """
    helper function to parse and check tensorboard logs
--- a/tests/prompt_strategies/test_chat_templates.py
+++ b/tests/prompt_strategies/test_chat_templates.py
@@ -115,9 +115,6 @@ class TestAssistantChatTemplateLlama3:

    def test_phi35(self, phi35_tokenizer, assistant_dataset):
        LOG.info("Testing phi-3.5 with assistant dataset")
-        assert "LlamaTokenizer" in phi35_tokenizer.__class__.__name__, (
-            "phi35 tokenizer should be a LlamaTokenizer"
-        )
        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                phi35_tokenizer,
@@ -143,13 +140,13 @@ class TestAssistantChatTemplateLlama3:
        # fmt: off
        expected_input_ids = [
            32010,  # user
-            12199, 32007,  # user eot
+            22172, 32007,  # user eot
            32001,  # assistant
-            12199, 32007,  # assistant eot
+            22172, 32007,  # assistant eot
            32010,  # user
-            16773, 26966, 32007,  # user eot
+            1781, 26966, 32007,  # user eot
            32001,  # assistant
-            16773, 26966, 32007,  # assistant eot
+            1781, 26966, 32007,  # assistant eot
        ]
        expected_labels = [
            -100,  # user
@@ -159,7 +156,7 @@ class TestAssistantChatTemplateLlama3:
            -100,  # user
            -100, -100, -100,  # user eot
            -100,  # assistant
-            16773, 26966, 32007,  # assistant eot
+            1781, 26966, 32007,  # assistant eot
        ]
        # fmt: on
        LOG.debug(f"Expected input_ids: {expected_input_ids}")
--- a/tests/telemetry/test_manager.py
+++ b/tests/telemetry/test_manager.py
@@ -118,6 +118,20 @@ def test_telemetry_disabled_for_non_main_process(telemetry_manager_class):
        assert not manager.enabled


+def test_opt_in_info_displayed(telemetry_manager_class):
+    """Test that the opt-out warning is displayed when telemetry is not explicitly configured"""
+    with (
+        patch.dict(os.environ, {"RANK": "0"}, clear=True),
+        patch("logging.Logger.warning") as mock_warning,
+        patch("time.sleep"),
+    ):
+        telemetry_manager_class()
+        assert any(
+            "Telemetry is now enabled by default" in str(call)
+            for call in mock_warning.call_args_list
+        )
+
+
 def test_is_whitelisted(telemetry_manager_class, mock_whitelist):
    """Test org whitelist functionality"""
    with (
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -84,8 +84,7 @@ class TestTokenizers:
            }
        )
        tokenizer = load_tokenizer(cfg)
-        assert "LlamaTokenizer" in tokenizer.__class__.__name__
-        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
+        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404]
        assert len(tokenizer) == 32001

        # ensure reloading the tokenizer again from cfg results in same vocab length
--- a/tests/utils/lora/test_config_validation_lora.py
+++ b/tests/utils/lora/test_config_validation_lora.py
@@ -90,62 +90,3 @@ class TestLoRAConfigValidation:
                }
            )
            validate_config(invalid_config)
-
-    @pytest.mark.parametrize(
-        "kernel_field", ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
-    )
-    def test_lora_kernels_trust_remote_code_incompatible(self, kernel_field):
-        """Test that lora kernels are incompatible with trust_remote_code"""
-        with pytest.raises(ValueError, match="not compatible with trust_remote_code"):
-            invalid_config = DictDefault(
-                {
-                    "adapter": "lora",
-                    kernel_field: True,
-                    "trust_remote_code": True,
-                    "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
-                    "micro_batch_size": 1,
-                    "gradient_accumulation_steps": 1,
-                    "learning_rate": 1e-5,
-                    "base_model": "dummy_model",
-                }
-            )
-            validate_config(invalid_config)
-
-    def test_lora_kernels_trust_remote_code_false(self):
-        """Test that lora kernels work when trust_remote_code is false"""
-        # Test with trust_remote_code=False, lora kernels should be allowed
-        valid_config = DictDefault(
-            {
-                "adapter": "lora",
-                "lora_mlp_kernel": True,
-                "lora_qkv_kernel": True,
-                "lora_o_kernel": True,
-                "trust_remote_code": False,
-                "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "learning_rate": 1e-5,
-                "base_model": "dummy_model",
-            }
-        )
-        result = validate_config(valid_config)
-        assert result["lora_mlp_kernel"] is True
-        assert result["lora_qkv_kernel"] is True
-        assert result["lora_o_kernel"] is True
-
-        # Test with trust_remote_code=None (unset), kernels should be allowed
-        valid_config = DictDefault(
-            {
-                "adapter": "lora",
-                "lora_qkv_kernel": True,
-                "trust_remote_code": None,
-                "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "learning_rate": 1e-5,
-                "base_model": "dummy_model",
-            }
-        )
-        result = validate_config(valid_config)
-        assert result["lora_qkv_kernel"] is True
-        assert result["trust_remote_code"] is None
Author	SHA1	Message	Date
Wing Lian	b8d52a2193	use kwargs	2026-02-04 12:04:53 -05:00
Wing Lian	002b1ac967	max new tokens for online generation	2026-02-04 11:55:19 -05:00
Wing Lian	17b01bfe36	handle input only for online	2026-02-04 10:53:10 -05:00
Wing Lian	a0669335e2	online top-k kd	2026-02-04 09:49:35 -05:00