fix import

add muon optimizer
optimizer_cls_and_kwargs is on trainer_kwargs; only add adamw_kwargs if they're non-null; fix mocks; better handling of override and check the optimizer; unwrap optimizer
2025-03-05 14:05:27 -05:00 · 2025-03-05 10:47:22 -05:00
34 changed files with 89 additions and 619 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -88,11 +88,6 @@ jobs:
            pytorch: 2.5.1
            axolotl_extras:
            is_latest: true
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -80,11 +80,6 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            axolotl_extras:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/README.md
+++ b/README.md
@@ -55,7 +55,6 @@ Features:
 ### Installation

 ```bash
-pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

 # Download example axolotl configs, deepspeed configs
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -32,9 +32,8 @@ website:
          contents:
            - docs/getting-started.qmd
            - docs/installation.qmd
-            - docs/inference.qmd
            - docs/cli.qmd
-            - docs/config.qmd
+            - docs/inference.qmd

        - section: "Dataset Formats"
          contents: docs/dataset-formats/*
@@ -75,6 +74,10 @@ website:
            - docs/debugging.qmd
            - docs/nccl.qmd

+        - section: "Reference"
+          contents:
+            - docs/config.qmd
+
 format:
  html:
    theme: darkly
--- a/docker/Dockerfile-cloud
+++ b/docker/Dockerfile-cloud
@@ -14,7 +14,7 @@ COPY scripts/motd /etc/motd

 RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
-RUN apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
+RUN apt install --yes --no-install-recommends openssh-server tmux && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -1,5 +1,5 @@
 ---
-title: Config Reference
+title: Config options
 description: A complete list of all configuration options.
 ---

@@ -30,8 +30,6 @@ tokenizer_legacy:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
-# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
-shrink_embeddings:

 # (Internal use only)
 # Used to identify which the model is based on
@@ -156,6 +154,8 @@ datasets:
      content: value
      # ...

+    message_property_mappings:
+
    # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
    roles:
      user: ["human", "user"]
@@ -207,46 +207,10 @@ test_datasets:
    data_files:
      - /workspace/data/eval.jsonl

-# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
+# use RL training: 'dpo', 'ipo', 'kto'
 rl:
-rl_beta:  # Optional[float]. The beta parameter for the RL training.
-
-# dpo
-dpo_use_weighting:  # Optional[bool]. Whether to perform weighting.
-rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.
-
-# orpo
-orpo_alpha: 0.1  # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.
-
-# kto
-kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.
-kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.
-
-# simpo
-cpo_alpha: 1.0  # Weight of the BC regularizer
-simpo_gamma: 0.5  # Target reward margin for the SimPO loss
-
-# grpo
-trl:
-  use_vllm: # Optional[bool]. Whether to use VLLM for RL training.
-  vllm_device: # Optional[str]. Device to use for VLLM.
-  vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for VLLM.
-  vllm_max_model_len: # Optional[int]. Maximum length of the model for VLLM.
-  vllm_dtype: # Optional[str]. Data type for VLLM.
-
-  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use
-  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
-
-  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.
-  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.
-
-  num_generations: # Optional[int]. Number of generations to sample.
-  log_completions: # Optional[bool]. Whether to log completions.
-
-  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
-  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
-  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
-
+# whether to perform weighting if doing DPO training. Boolean.
+dpo_use_weighting:

 # reward modelling: `True` or `False`
 reward_model:
@@ -270,7 +234,7 @@ default_system_message: You are a helpful assistant. Please give a long and deta
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
 # Push prepared dataset to hub
-push_dataset_to_hub: # Optional[str] repo_org/repo_name
+push_dataset_to_hub: # repo path
 # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # if not set.
 dataset_processes: # defaults to os.cpu_count() if not set
@@ -592,13 +556,6 @@ special_tokens:
 # Add extra tokens.
 tokens:

-# Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer.
-# Only works for tokens that are not part of the base vocab (aka are added_tokens).
-# Can be checked if they exist in tokenizer.json added_tokens.
-added_tokens_overrides:  # Dict[int, str]
-#  128041: "<|im_start|>"
-#  128042: "<|im_end|>"
-
 # FSDP
 fsdp:
 fsdp_config:
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -74,10 +74,6 @@ datasets:
    train_on_eos:
 ```

-::: {.callout-tip}
-If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
-:::
-
 2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

 ```yaml
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -27,16 +27,6 @@ description: Frequently asked questions

 > A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.

-**Q: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.**
-
-> A: This is likely due to vocab size mismatch. By default, Axolotl expands the model's embeddings if the tokenizer has more tokens than the model. Please use the `axolotl merge-lora` command to merge the adapters instead of using your own scripts.
-
-> On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model's embeddings unless `shrink_embeddings: true` is set in the config.
-
-**Q: How to call Axolotl via custom python scripts?**
-
-> A: Yes, since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.
-
 ### Chat templates

 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
@@ -62,7 +52,3 @@ description: Frequently asked questions
 **Q: The EOS/EOT token is incorrectly being masked or not being masked.**

 > A: This is because of the mismatch between `tokenizer.eos_token` and EOS/EOT token in template. Please make sure to set `eos_token` under `special_tokens` to the same EOS/EOT token as in template.
-
-**Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**
-
-> A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -36,9 +36,7 @@ The YAML configuration file controls everything about your training. Here's what

 ```yaml
 base_model: NousResearch/Llama-3.2-1B
-
-load_in_8bit: true
-adapter: lora
+# hub_model_id: username/custom_model_name

 datasets:
  - path: teknium/GPT4-LLM-Cleaned
@@ -46,15 +44,11 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
+
+adapter: lora
+lora_model_dir:
 ```

-::: {.callout-tip}
-`load_in_8bit: true` and `adapter: lora` enables LoRA adapter finetuning.
-
- To perform Full finetuning, remove these two lines.
- To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`.
-:::
-
 See our [Config options](config.qmd) for more details.

 ### Training {#sec-training}
@@ -62,7 +56,7 @@ See our [Config options](config.qmd) for more details.
 When you run `axolotl train`, Axolotl:

 1. Downloads the base model
-2. (If specified) applies QLoRA/LoRA adapter layers
+2. (If specified) applies LoRA adapter layers
 3. Loads and processes the dataset
 4. Runs the training loop
 5. Saves the trained model and / or LoRA weights
@@ -75,8 +69,6 @@ Let's modify the example for your own data:

 ```yaml
 base_model: NousResearch/Nous-Hermes-llama-1b-v1
-
-load_in_8bit: true
 adapter: lora

 # Training settings
@@ -112,6 +104,8 @@ format):
 {"instruction": "Classify this text", "input": "Not good at all", "output": "negative"}
 ```

+Please consult the supported [Dataset Formats](dataset-formats/) for more details.
+
 3. Run the training:

 ```bash
--- a/docs/inference.qmd
+++ b/docs/inference.qmd
@@ -1,5 +1,5 @@
 ---
-title: "Inference and Merging"
+title: "Inference"
 format:
  html:
    toc: true
@@ -9,14 +9,10 @@ execute:
  enabled: false
 ---

-This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.
+This guide covers how to use your trained models for inference, including model loading, interactive testing, and common troubleshooting steps.

 ## Quick Start {#sec-quickstart}

-::: {.callout-tip}
-Use the same config used for training on inference/merging.
-:::
-
 ### Basic Inference {#sec-basic}

 ::: {.panel-tabset}
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -22,7 +22,6 @@ This guide covers all the ways you can install and set up Axolotl for your envir
 ### PyPI Installation (Recommended) {#sec-pypi}

 ```{.bash}
-pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 ```

@@ -38,7 +37,7 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install -U packaging setuptools wheel ninja
+pip3 install packaging ninja
 pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```

@@ -108,7 +107,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 2. Install PyTorch: https://pytorch.org/get-started/locally/
 3. Install Axolotl:
   ```{.bash}
-   pip3 install -U packaging setuptools wheel ninja
+   pip3 install packaging
   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
   ```
 4. (Optional) Login to Hugging Face:
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -66,10 +66,6 @@ logic to be compatible with more of them.

 </details>

-::: {.callout-tip}
-Check out our [LoRA optimizations blog](https://axolotlai.substack.com/p/accelerating-lora-fine-tuning-with).
-:::
-
 ## Usage

 These optimizations can be enabled in your Axolotl config YAML file. The
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -28,23 +28,8 @@ val_set_size: 0.1
 eval_steps: 100
 ```

-Bradley-Terry chat templates expect single-turn conversations in the following format:
-
-```json
-{
-    "system": "...", // optional
-    "input": "...",
-    "chosen": "...",
-    "rejected": "..."
-}
-```
-
 ### Process Reward Models (PRM)

-::: {.callout-tip}
-Check out our [PRM blog](https://axolotlai.substack.com/p/process-reward-models).
-:::
-
 Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
 ```yaml
 base_model: Qwen/Qwen2.5-3B
@@ -60,5 +45,3 @@ datasets:
 val_set_size: 0.1
 eval_steps: 100
 ```
-
-Please see [stepwise_supervised](dataset-formats/stepwise_supervised.qmd) for more details on the dataset format.
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -3,7 +3,6 @@ title: "RLHF (Beta)"
 description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
 back-to-top-navigation: true
 toc: true
-toc-expand: 2
 toc-depth: 4
 ---

@@ -298,7 +297,7 @@ The input format is a simple JSON input with customizable fields based on the ab

 ### IPO

-As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.
+As IPO is just DPO with a different loss function, all supported options for DPO work here.

 ```yaml
 rl: ipo
@@ -344,9 +343,8 @@ ORPO supports the following types with the following dataset format:

 ```yaml
 rl: kto
-rl_beta: 0.1  # default
-kto_desirable_weight: 1.0  # default
-kto_undesirable_weight: 1.0  # default
+rl_beta: 0.5
+kto_desirable_weight: 0.2

 remove_unused_columns: false

@@ -498,10 +496,6 @@ The input format is a simple JSON input with customizable fields based on the ab

 ### GRPO

-::: {.callout-tip}
-Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
-:::
-
 GRPO uses custom reward functions and transformations. Please have them ready locally.

 For ex, to load OpenAI's GSM8K and use a random reward for completions:
@@ -534,7 +528,6 @@ trl:
    vllm_gpu_memory_utilization: 0.15
    num_generations: 4
    reward_funcs: ["rewards.rand_reward_func"]    # format: '{file_name}.{fn_name}'
-    reward_weights: [1.0]
 datasets:
  - path: openai/gsm8k
    name: main
@@ -543,21 +536,6 @@ datasets:

 To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

-To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
-
-### SimPO
-
-SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with alternative loss function.
-
-```yaml
-rl: simpo
-rl_beta: 0.1  # default in CPOTrainer
-cpo_alpha: 1.0  # default in CPOTrainer
-simpo_gamma: 0.5  # default in CPOTrainer
-```
-
-This method uses the same dataset format as [DPO](#dpo).
-
 ### Using local dataset files

 ```yaml
--- a/examples/llama-3/qlora-1b-kto.yaml
+++ b/examples/llama-3/qlora-1b-kto.yaml
@@ -55,7 +55,7 @@ tf32: true

 gradient_checkpointing: true
 gradient_checkpointing_kwargs:
-  use_reentrant: false
+  use_reentrant: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,5 +62,5 @@ antlr4-python3-runtime==4.13.2
 torchao==0.7.0
 schedulefree==1.3.0

-axolotl-contribs-lgpl==0.0.6
+axolotl-contribs-lgpl==0.0.3
 axolotl-contribs-mit==0.0.3
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -24,5 +24,5 @@ if cce_spec:

 print(
    UNINSTALL_PREFIX
-    + 'pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"'
+    + 'pip install "cut-cross-entropy @ git+https://github.com/apple/ml-cross-entropy.git@9c297c905f55b73594b5d650722d1e78183b77bd"'
 )
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -113,7 +113,7 @@ class ModalCloud(Cloud):
                [
                    # Random id for cache busting of branch commits
                    f"RUN echo '{str(randint(0, 1000000))}'",  # nosec B311
-                    f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch} && git pull",
+                    f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch}",
                ]
            )

@@ -270,7 +270,6 @@ def _preprocess(config_yaml: str, volumes=None):


 def _train(config_yaml: str, accelerate: bool = True, volumes=None, **kwargs):
-    Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
    with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
        f_out.write(config_yaml)
    run_folder = "/workspace/mounts"
@@ -289,7 +288,6 @@ def _train(config_yaml: str, accelerate: bool = True, volumes=None, **kwargs):


 def _lm_eval(config_yaml: str, volumes=None):
-    Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
    with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
        f_out.write(config_yaml)
    run_folder = "/workspace/mounts"
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -1,7 +1,6 @@
 """CLI to run training on a model."""

 import logging
-import os
 from pathlib import Path
 from typing import Union

@@ -35,8 +34,7 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
    """
    print_axolotl_text_art()
    check_accelerate_default_config()
-    if int(os.getenv("LOCAL_RANK", "0")) == 0:
-        check_user_token()
+    check_user_token()

    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/core/datasets/chat.py
+++ b/src/axolotl/core/datasets/chat.py
@@ -43,7 +43,7 @@ class TokenizedChatDataset(Dataset):
        process_or_cpu_count: int = (
            process_count or os.cpu_count()  # type: ignore[assignment]
        )
-        num_proc = min(32, process_or_cpu_count)
+        num_proc = min(64, process_or_cpu_count)
        features = data.features.keys()
        tokenized_data = data.map(
            map_fn,
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -17,7 +17,7 @@ Run the following command to install `cut_cross_entropy[transformers]` if you do
 python scripts/cutcrossentropy_install.py | sh

 # if you are not in dev environment
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy @ git+https://github.com/apple/ml-cross-entropy.git@9c297c905f55b73594b5d650722d1e78183b77bd"
 ```

 ## Usage
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -33,7 +33,7 @@ LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy")

 _CCE_INSTALL_MESSAGE = (
    "Please install cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/apple/ml-cross-entropy.git@24fbe4b5dab9a6c250a014573613c1890190536c"`'
+    '`pip install "cut-cross-entropy[transformers]==24.11.4"`'
 )


--- a/src/axolotl/integrations/spectrum/args.py
+++ b/src/axolotl/integrations/spectrum/args.py
@@ -17,7 +17,7 @@ Module for handling Spectrum input arguments.
 """
 from typing import Optional

-from pydantic import BaseModel, model_validator
+from pydantic import BaseModel


 class SpectrumArgs(BaseModel):
@@ -27,20 +27,3 @@ class SpectrumArgs(BaseModel):

    spectrum_top_fraction: Optional[float] = 0.5
    spectrum_model_name: Optional[str] = None
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_fsdp_use_orig_params(cls, data):
-        if (
-            data.get("fsdp")
-            and data.get("fsdp_config")
-            and not data["fsdp_config"].get("use_orig_params")
-            and data.get("plugins")
-            and any("SpectrumPlugin" in plugin for plugin in data["plugins"])
-        ):
-            # would otherwise raise
-            # ValueError: Must flatten tensors with uniform `requires_grad` when `use_orig_params=False`
-            raise ValueError(
-                "FSDP + SpectrumPlugin cannot be used together when `use_orig_params=False` is set"
-            )
-        return data
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -7,7 +7,7 @@ import signal
 import sys
 import weakref
 from pathlib import Path
-from typing import Any, Dict
+from typing import Any

 import torch
 import transformers.modelcard
@@ -20,7 +20,7 @@ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer

 from axolotl.common.datasets import TrainDatasetMeta
-from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
+from axolotl.contribs.lgpl.unsloth import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
 )
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
@@ -382,23 +382,21 @@ def handle_untrained_tokens_fix(
    if not cfg.fix_untrained_tokens:
        return

-    is_ds_zero3: bool = False
-    if os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
-        is_ds_zero3 = True
-
    # Check if the `token_ids_to_fix` kwarg exists in the fix_untrained_tokens args
    sig = inspect.signature(fix_untrained_tokens)

-    fix_kwargs: Dict[str, Any] = {}
    # If the function has the `token_ids_to_fix` arg, and fix_untrained_tokens is a list
    if "token_ids_to_fix" in sig.parameters and isinstance(
        cfg.fix_untrained_tokens, list
    ):
-        fix_kwargs["token_ids_to_fix"] = cfg.fix_untrained_tokens
-    if "is_ds_zero3" in sig.parameters:
-        fix_kwargs["is_ds_zero3"] = is_ds_zero3
-
-    fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)
+        fix_untrained_tokens(
+            model,
+            tokenizer,
+            train_dataset,
+            token_ids_to_fix=cfg.fix_untrained_tokens,
+        )
+    else:
+        fix_untrained_tokens(model, tokenizer, train_dataset)

    if cfg.local_rank == 0:
        model.save_pretrained(
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -1,5 +1,4 @@
 """Module with Pydantic models for configuration."""
-
 # pylint: disable=too-many-lines

 import logging
@@ -73,6 +72,7 @@ class CustomSupportedOptimizers(str, Enum):
    ao_adamw_8bit = "ao_adamw_8bit"  # pylint: disable=invalid-name
    ao_adamw_fp8 = "ao_adamw_fp8"  # pylint: disable=invalid-name
    adopt_adamw = "adopt_adamw"  # pylint: disable=invalid-name
+    lion_pytorch = "lion_pytorch"  # pylint: disable=invalid-name
    muon = "muon"  # pylint: disable=invalid-name


@@ -729,7 +729,7 @@ class AxolotlInputConfig(
        default=None,
        json_schema_extra={"description": "streaming dataset to use for pretraining"},
    )
-    dataset_processes: Optional[int] = Field(default=min(32, os.cpu_count()))  # type: ignore[type-var]
+    dataset_processes: Optional[int] = Field(default=os.cpu_count())
    dataset_exact_deduplication: Optional[bool] = None
    dataset_keep_in_memory: Optional[bool] = None
    dataloader_pin_memory: Optional[bool] = None
@@ -780,9 +780,9 @@ class AxolotlInputConfig(

    # torch_dtype: Optional[torch.dtype]

-    gradient_checkpointing: Optional[
-        Union[Literal["unsloth", "offload"], bool]
-    ] = Field(default=False)
+    gradient_checkpointing: Optional[Union[Literal["unsloth"], bool]] = Field(
+        default=False
+    )
    gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None

    unfrozen_parameters: Optional[List[str]] = None
@@ -857,7 +857,6 @@ class AxolotlInputConfig(

    special_tokens: Optional[SpecialTokensConfig] = None
    tokens: Optional[List[str]] = None
-    added_tokens_overrides: Optional[Dict[int, str]] = None

    torch_compile: Optional[Union[Literal["auto"], bool]] = None
    torch_compile_backend: Optional[str] = None
@@ -1156,15 +1155,6 @@ class AxolotlInputConfig(
            raise ValueError("gradient_checkpointing is not supported for MPT models")
        return self

-    @model_validator(mode="after")
-    def check_offload_grad_checkpointing(self):
-        if self.gradient_checkpointing and self.gradient_checkpointing == "unsloth":
-            LOG.warning(
-                "`unsloth` is deprecated for gradient_checkpointing, use `offload`"
-            )
-            self.gradient_checkpointing = "offload"
-        return self
-
    @model_validator(mode="after")
    def check_better_transformers(self):
        if self.flash_optimum is True:
@@ -1679,30 +1669,6 @@ class AxolotlInputConfig(

        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_rl_config_gradient_checkpointing(cls, data):
-        # TODO: SalmanMohammadi
-        # Distributed RL with QLoRA + gradient checkpointing
-        # and use_reentrant = True is broken upstream in TRL
-        # pylint: disable=too-many-boolean-expressions
-        if (
-            data.get("rl")
-            and data.get("gradient_checkpointing")
-            and data.get("gradient_checkpointing_kwargs")
-            and data.get("gradient_checkpointing_kwargs").get("use_reentrant")
-            and data.get("load_in_4bit")
-            and data.get("adapter") == "qlora"
-            and data.get("capabilities")
-            and data.get("capabilities").get("n_gpu", 1) > 1
-        ):
-            raise ValueError(
-                "The `use_reentrant: True` implementation of gradient checkpointing "
-                "is not supported for distributed RL training with QLoRA. Please set "
-                "`use_reentrant: False` in `gradient_checkpointing_kwargs`."
-            )
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_kto_config(cls, data):
@@ -1713,6 +1679,15 @@ class AxolotlInputConfig(
            if data.get("remove_unused_columns") is not False:
                raise ValueError("Set `remove_unused_columns: False` when using kto")

+            if data.get("gradient_checkpointing") and not (
+                data.get("gradient_checkpointing_kwargs")
+                and isinstance(data.get("gradient_checkpointing_kwargs"), dict)
+                and data["gradient_checkpointing_kwargs"].get("use_reentrant")
+            ):
+                raise ValueError(
+                    "Set `gradient_checkpointing_kwargs: {use_reentrant: true}` for when kto is enabled"
+                )
+
        return data


@@ -1843,14 +1818,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
                data["torch_compile"] = False
        return data

-    @model_validator(mode="before")
-    @classmethod
-    def check_beta_and_trl_beta_match(cls, data):
-        if data.get("beta") and data.get("trl", {}).get("beta"):
-            if data["beta"] != data["trl"]["beta"]:
-                raise ValueError("beta and trl.beta must match or one must be removed")
-        return data
-

 def handle_legacy_message_fields_logic(data: dict) -> dict:
    """
--- a/src/axolotl/utils/config/models/input/v0_4_1/trl.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/trl.py
@@ -1,8 +1,7 @@
 """
 GRPO specific configuration args
 """
-
-from typing import Optional
+from typing import List, Optional

 from pydantic import BaseModel, Field

@@ -12,10 +11,7 @@ class TRLConfig(BaseModel):
    Input args for TRL.
    """

-    beta: Optional[float] = Field(
-        default=None,
-        json_schema_extra={"description": "Beta for RL training"},
-    )
+    beta: Optional[float] = None
    max_completion_length: Optional[int] = Field(
        default=None,
        json_schema_extra={
@@ -24,68 +20,17 @@ class TRLConfig(BaseModel):
    )

    # GRPO specific args
-    # Ref: https://github.com/huggingface/trl/blob/e3244d2d096ff1e2e248c931d06d39e165e20623/trl/trainer/grpo_config.py#L22
-    use_vllm: Optional[bool] = Field(
-        default=False,
-        json_schema_extra={"description": "Whether to use VLLM for RL training"},
-    )
-    vllm_device: Optional[str] = Field(
-        default="auto",
-        json_schema_extra={"description": "Device to use for VLLM"},
-    )
-    vllm_gpu_memory_utilization: Optional[float] = Field(
-        default=0.9,
-        json_schema_extra={"description": "GPU memory utilization for VLLM"},
-    )
-    vllm_dtype: Optional[str] = Field(
-        default="auto",
-        json_schema_extra={"description": "Data type for VLLM"},
-    )
-    vllm_max_model_len: Optional[int] = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Maximum length of the model context for VLLM"
-        },
-    )
+    use_vllm: Optional[bool] = False
+    vllm_device: Optional[str] = "auto"
+    vllm_gpu_memory_utilization: Optional[float] = 0.9
+    vllm_max_model_len: Optional[int] = None
+    vllm_dtype: Optional[str] = "auto"

-    reward_funcs: Optional[list[str]] = Field(
-        default=None,
-        json_schema_extra={"description": "List of reward functions to load"},
-    )
-    reward_weights: Optional[list[float]] = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Weights for each reward function. Must match the number of reward functions."
-        },
-    )
-    num_generations: Optional[int] = Field(
-        default=None,
-        json_schema_extra={
-            "description": "Number of generations to sample. The global batch size (num_processes * per_device_batch_size) must be divisible by this value."
-        },
-    )
-    log_completions: Optional[bool] = Field(
-        default=False,
-        json_schema_extra={"description": "Whether to log completions"},
-    )
-    sync_ref_model: Optional[bool] = Field(
-        default=False,
-        json_schema_extra={
-            "description": (
-                "Whether to sync the reference model every `ref_model_sync_steps` "
-                "steps, using the `ref_model_mixup_alpha` parameter."
-            )
-        },
-    )
-    ref_model_mixup_alpha: Optional[float] = Field(
-        default=0.9,
-        json_schema_extra={
-            "description": "Mixup alpha for the reference model. Requires `sync_ref_model=True`."
-        },
-    )
-    ref_model_sync_steps: Optional[int] = Field(
-        default=64,
-        json_schema_extra={
-            "description": "Sync steps for the reference model. Requires `sync_ref_model=True`."
-        },
-    )
+    reward_funcs: Optional[List[str]] = None
+    reward_weights: Optional[List[float]] = None
+    num_generations: Optional[int] = None
+    log_completions: Optional[bool] = False
+
+    sync_ref_model: Optional[bool] = False
+    ref_model_mixup_alpha: Optional[float] = 0.9
+    ref_model_sync_steps: Optional[int] = 64
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -79,7 +79,7 @@ def is_main_process():


 def is_local_main_process():
-    return PartialState().is_local_main_process
+    return PartialState().is_main_process


 def get_world_size():
--- a/src/axolotl/utils/gradient_checkpointing/init.py
+++ b/src/axolotl/utils/gradient_checkpointing/init.py
@@ -4,7 +4,7 @@ from axolotl.utils.gradient_checkpointing.unsloth import (
 )


-def hf_grad_checkpoint_offload_wrapper(
+def hf_grad_checkpoint_unsloth_wrapper(
    decoder_layer, *args, use_reentrant=None
 ):  # pylint: disable=unused-argument
    return Unsloth_Offloaded_Gradient_Checkpointer.apply(
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -24,6 +24,7 @@ from peft import (
    PeftModelForCausalLM,
    prepare_model_for_kbit_training,
 )
+from peft.tuners.lora import QuantLinear
 from torch import nn
 from transformers import (  # noqa: F401
    AddedToken,
@@ -56,14 +57,8 @@ from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import (
-    barrier,
-    get_device_count,
-    get_device_type,
-    is_local_main_process,
-    zero_only,
-)
-from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrapper
+from axolotl.utils.distributed import get_device_count, get_device_type, zero_only
+from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_unsloth_wrapper
 from axolotl.utils.lora_embeddings import get_linear_embedding_layers
 from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant

@@ -170,95 +165,7 @@ def load_model_config(cfg):
    return model_config


-def modify_tokenizer_files(
-    tokenizer_path: str, token_mappings: Dict[int, str], output_dir: str
-) -> str:
-    """
-    Modify tokenizer files to replace added_tokens strings, save to output directory, and return the path to the modified tokenizer.
-
-    This only works with reserved tokens that were added to the tokenizer, not tokens already part of the vocab.
-
-    Args:
-        tokenizer_path: Path or name of the original tokenizer
-        token_mappings: Dict mapping {token_id (int): new_token_string}
-        output_dir: Directory to save the modified tokenizer
-
-    Returns:
-        Path to the modified tokenizer directory
-
-    Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
-    """
-
-    import json
-
-    # Create the tokenizer directory in output_dir if it doesn't exist
-    tokenizer_dir = os.path.join(output_dir, "tokenizer")
-    os.makedirs(tokenizer_dir, exist_ok=True)
-
-    if is_local_main_process():  # pylint: disable=too-many-nested-blocks
-        # Load the tokenizer
-        temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
-
-        # Save the tokenizer to the output directory
-        temp_tokenizer.save_pretrained(tokenizer_dir)
-
-        # Get the token IDs and map them to their new values
-        token_id_mappings = {
-            int(token_id): new_value for token_id, new_value in token_mappings.items()
-        }
-
-        # 1. Update tokenizer_config.json - added_tokens_decoder
-        config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
-        if os.path.exists(config_path):
-            with open(config_path, "r", encoding="utf-8") as f:
-                config_data = json.load(f)
-
-            # Update added_tokens_decoder
-            if "added_tokens_decoder" in config_data:
-                for token_id, new_value in token_id_mappings.items():
-                    token_id_str = str(token_id)
-                    if token_id_str in config_data["added_tokens_decoder"]:
-                        config_data["added_tokens_decoder"][token_id_str][
-                            "content"
-                        ] = new_value
-                    else:
-                        raise ValueError(
-                            f"Token ID {token_id_str} not found in added_tokens_decoder"
-                        )
-
-            # Write the updated config back
-            with open(config_path, "w", encoding="utf-8") as f:
-                json.dump(config_data, f, indent=2)
-
-        # 2. Update tokenizer.json - added_tokens
-        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
-        if os.path.exists(tokenizer_path):
-            with open(tokenizer_path, "r", encoding="utf-8") as f:
-                tokenizer_data = json.load(f)
-
-            # Update added_tokens
-            if "added_tokens" in tokenizer_data:
-                for token_id, new_value in token_id_mappings.items():
-                    for i, token_entry in enumerate(tokenizer_data["added_tokens"]):
-                        if token_entry["id"] == token_id:
-                            tokenizer_data["added_tokens"][i]["content"] = new_value
-                            break
-                    else:
-                        # Reaching this section means the token_id was not found in tokenizer.json added_tokens
-                        raise ValueError(
-                            f"Token ID {token_id} not found in added_tokens"
-                        )
-
-            # Write the updated tokenizer data back
-            with open(tokenizer_path, "w", encoding="utf-8") as f:
-                json.dump(tokenizer_data, f, indent=2)
-
-    barrier()
-    return tokenizer_dir
-
-
 def load_tokenizer(cfg):
-    """Load and configure the tokenizer based on the provided config."""
    model_config = load_model_config(cfg)
    tokenizer_kwargs = {}
    use_fast = True  # this is the default
@@ -273,18 +180,8 @@ def load_tokenizer(cfg):
    if cfg.tokenizer_type:
        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

-    # Set base tokenizer path
-    tokenizer_path = cfg.tokenizer_config
-
-    # Apply token string overrides if specified
-    if cfg.added_tokens_overrides:
-        # Modify tokenizer files and get path to modified tokenizer
-        tokenizer_path = modify_tokenizer_files(
-            tokenizer_path, cfg.added_tokens_overrides, output_dir=cfg.output_dir
-        )
-
    tokenizer = tokenizer_cls.from_pretrained(
-        tokenizer_path,
+        cfg.tokenizer_config,
        trust_remote_code=cfg.trust_remote_code or False,
        use_fast=use_fast,
        **tokenizer_kwargs,
@@ -492,8 +389,8 @@ class ModelLoader:

            patch_fa_peft_integration()

-        if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
-            transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
+        if self.cfg.gradient_checkpointing == "unsloth":
+            transformers.modeling_utils.checkpoint = hf_grad_checkpoint_unsloth_wrapper

        if self.cfg.flash_attention:
            self.patch_attention()
@@ -1359,7 +1256,7 @@ def load_llama_adapter(model, cfg):


 def find_all_linear_names(model):
-    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
+    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear, QuantLinear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if (
--- a/styles.css
+++ b/styles.css
@@ -14,7 +14,7 @@
 h1 {
    font-family: var(--font-title);
    font-weight: 400;
-    font-size: 5rem;
+    font-size: 6rem;
    line-height: 1.1;
    letter-spacing: -0.05em;
    font-feature-settings: "ss01" on;
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -69,51 +69,6 @@ class TestCutCrossEntropyIntegration:
            train(cfg=cfg, dataset_meta=dataset_meta)
            check_model_output_exists(temp_dir, cfg)

-    # pylint: disable=redefined-outer-name
-    def test_qwen2_w_cce(self, temp_dir):
-        cfg = DictDefault(
-            {
-                "base_model": "Qwen/Qwen2.5-0.5B",
-                "plugins": [
-                    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
-                ],
-                "cut_cross_entropy": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 1,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "output_dir": temp_dir,
-                "lr_scheduler": "cosine",
-                "save_safetensors": True,
-                "max_steps": 10,
-                "bf16": "auto",
-            }
-        )
-        prepare_plugins(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        major, minor, _ = get_pytorch_version()
-        if (major, minor) < (2, 4):
-            with pytest.raises(ImportError):
-                train(cfg=cfg, dataset_meta=dataset_meta)
-        else:
-            train(cfg=cfg, dataset_meta=dataset_meta)
-            check_model_output_exists(temp_dir, cfg)
-
    @pytest.mark.parametrize(
        "attention_type",
        [
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -750,66 +750,3 @@ class TestMultiGPULlama:
        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
        )
-
-    def test_fix_untrained_tokens(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "fix_untrained_tokens": True,
-                "sequence_len": 512,
-                "val_set_size": 0.0,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                    "bos_token": "<|custom_im_start|>",
-                    "eos_token": "<|custom_im_end|>",
-                },
-                "datasets": [
-                    {
-                        "chat_template": "jinja",
-                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "split": "train[:10%]",
-                        "field_messages": "conversations",
-                        "message_field_role": "from",
-                        "message_field_content": "value",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 5,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "sample_packing": True,
-                "bf16": True,
-                "save_safetensors": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero3_bf16.json"),
-                "use_tensorboard": True,
-            }
-        )
-
-        # write cfg to yaml file
-        Path(temp_dir).mkdir(parents=True, exist_ok=True)
-        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
-            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
-
-        execute_subprocess_async(
-            [
-                "axolotl",
-                "train",
-                str(Path(temp_dir) / "config.yaml"),
-                "--num-processes",
-                "2",
-                "--main-process-port",
-                f"{get_torch_dist_unique_port()}",
-            ]
-        )
-
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high"
-        )
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -66,54 +66,6 @@ class TestLlama:
        check_model_output_exists(temp_dir, cfg)

    def test_fix_untrained_tokens(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "fix_untrained_tokens": True,
-                "sequence_len": 512,
-                "val_set_size": 0.0,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                    "bos_token": "<|custom_im_start|>",
-                    "eos_token": "<|custom_im_end|>",
-                },
-                "datasets": [
-                    {
-                        "chat_template": "jinja",
-                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "split": "train[:10%]",
-                        "field_messages": "conversations",
-                        "message_field_role": "from",
-                        "message_field_content": "value",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 5,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "sample_packing": True,
-                "bf16": True,
-                "save_safetensors": True,
-            }
-        )
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(temp_dir, cfg)
-
-    def test_fix_untrained_tokens_already_trained(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -1,7 +1,6 @@
 """
 Test cases for the tokenizer loading
 """
-
 import unittest

 import pytest
@@ -10,7 +9,7 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_tokenizer


-class TestTokenizers:
+class TestTokenizers(unittest.TestCase):
    """
    test class for the load_tokenizer fn
    """
@@ -76,48 +75,12 @@ class TestTokenizers:
            }
        )
        tokenizer = load_tokenizer(cfg)
-        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1404]
-        assert len(tokenizer) == 32001
+        self.assertEqual(tokenizer("<|im_start|>user")["input_ids"], [1, 32000, 1404])
+        self.assertEqual(len(tokenizer), 32001)

        # ensure reloading the tokenizer again from cfg results in same vocab length
        tokenizer = load_tokenizer(cfg)
-        assert len(tokenizer) == 32001
-
-    def test_added_tokens_overrides(self, temp_dir):
-        cfg = DictDefault(
-            {
-                # use with tokenizer that has reserved_tokens in added_tokens
-                "tokenizer_config": "NousResearch/Llama-3.2-1B",
-                "added_tokens_overrides": {
-                    128041: "RANDOM_OVERRIDE_1",
-                    128042: "RANDOM_OVERRIDE_2",
-                },
-                "output_dir": temp_dir,
-            }
-        )
-
-        tokenizer = load_tokenizer(cfg)
-        assert tokenizer.encode("RANDOM_OVERRIDE_1", add_special_tokens=False) == [
-            128041
-        ]
-        assert tokenizer.encode("RANDOM_OVERRIDE_2", add_special_tokens=False) == [
-            128042
-        ]
-
-    def test_added_tokens_overrides_with_toolargeid(self, temp_dir):
-        cfg = DictDefault(
-            {
-                # use with tokenizer that has reserved_tokens in added_tokens
-                "tokenizer_config": "NousResearch/Llama-3.2-1B",
-                "added_tokens_overrides": {1000000: "BROKEN_RANDOM_OVERRIDE_1"},
-                "output_dir": temp_dir,
-            }
-        )
-
-        with pytest.raises(
-            ValueError, match=r".*Token ID 1000000 not found in added_tokens.*"
-        ):
-            load_tokenizer(cfg)
+        self.assertEqual(len(tokenizer), 32001)


 if __name__ == "__main__":
Author	SHA1	Message	Date
Wing Lian	76bb09784d	fix import	2025-03-05 14:05:27 -05:00
Wing Lian	0542c7dd56	add muon optimizer optimizer_cls_and_kwargs is on trainer_kwargs only add adamw_kwargs if they're non-null fix mocks better handling of override and check the optimizer unwrap optimizer	2025-03-05 10:47:22 -05:00