simplify by installing no deps

installing axolotl prior to quartodoc build
2025-03-21 13:27:54 -04:00 · 2025-03-21 16:52:51 +00:00
86 changed files with 815 additions and 4140 deletions
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -136,4 +136,4 @@ jobs:
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -63,7 +63,7 @@ jobs:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -137,7 +137,7 @@ jobs:
          path: |
            /home/runner/.cache/huggingface/hub/datasets--*
            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+          key: ${{ runner.os }}-hf-hub-cache-${{ hashFiles('**/conftest.py') }}

      - name: Setup Python
        uses: actions/setup-python@v5
@@ -171,9 +171,6 @@ jobs:
        run: |
          axolotl --help

-      - name: Show HF cache
-        run: huggingface-cli scan-cache
-
      - name: Run tests
        run: |
          pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -232,7 +229,7 @@ jobs:
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
@@ -279,4 +276,4 @@ jobs:
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
-          modal run cicd.e2e_tests
+          modal run cicd.tests
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -1,4 +1,3 @@
 [settings]
 profile=black
 known_third_party=wandb,comet_ml
-known_local_folder=src,tests
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -133,7 +133,6 @@ quartodoc:
        - utils.schemas.datasets
        - utils.schemas.peft
        - utils.schemas.trl
-        - utils.schemas.multimodal
        - utils.schemas.integrations
        - utils.schemas.enums
        - utils.schemas.utils
@@ -243,7 +242,6 @@ website:
            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
-            - docs/sequence_parallelism.qmd

        - section: "Troubleshooting"
          contents:
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -466,7 +466,6 @@ auto_find_batch_size: # Optional[bool]

 eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
-do_causal_lm_eval: # Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`.
 eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]

 profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
@@ -507,58 +506,36 @@ lr_div_factor: # Learning rate div factor

 # Specify optimizer
 # Valid values are driven by the Transformers OptimizerNames class, see:
-# https://github.com/huggingface/transformers/blob/cbf924b76c03828101a34069a96d209314114fd5/src/transformers/training_args.py#L144-L189
+# https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
 #
 # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
 # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
 # in the examples/ for your model and fine-tuning use case.
 #
 # Valid values for 'optimizer' include:
+# - adamw_hf
 # - adamw_torch
 # - adamw_torch_fused
 # - adamw_torch_xla
-# - adamw_torch_npu_fused
 # - adamw_apex_fused
-# - adopt_adamw  (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
+# - adopt_adamw (an EXPERIMENTAL optimizer, only for torch version >= 2.5.1)
 # - adafactor
 # - adamw_anyprecision
-# - adamw_torch_4bit
-# - ademamix
 # - sgd
 # - adagrad
 # - adamw_bnb_8bit
-# - adamw_8bit   # alias for adamw_bnb_8bit
-# - ademamix_8bit
 # - lion_8bit
 # - lion_32bit
 # - paged_adamw_32bit
 # - paged_adamw_8bit
-# - paged_ademamix_32bit
-# - paged_ademamix_8bit
 # - paged_lion_32bit
 # - paged_lion_8bit
-# - rmsprop
-# - rmsprop_bnb
-# - rmsprop_bnb_8bit
-# - rmsprop_bnb_32bit
 # - galore_adamw
 # - galore_adamw_8bit
 # - galore_adafactor
 # - galore_adamw_layerwise
 # - galore_adamw_8bit_layerwise
 # - galore_adafactor_layerwise
-# - lomo
-# - adalomo
-# - grokadamw
-# - schedule_free_adamw
-# - schedule_free_sgd
-# - apollo_adamw
-# - apollo_adamw_layerwise
-#
-# Additional custom optimizers include:
-# - optimi_adamw
-# - ao_adamw_8bit
-# - ao_adamw_fp8
 optimizer:
 # Dictionary of arguments to pass to the optimizer
 optim_args:
@@ -610,14 +587,6 @@ resume_from_checkpoint:
 # Be careful with this being turned on between different models.
 auto_resume_from_checkpoints: false

-## Multimodal section
-# int | tuple[int, int] | None . Size to resize images to, width x height.
-# Will read from model/processor config if not set.
-image_size:
-# str. Algorithm to use for image resizing. "bilinear", "bicubic", "lanczos". Default is "bilinear".
-image_resize_algorithm: 'bilinear'
-## End of multimodal section
-
 # Don't mess with this, it's here for accelerate and torchrun
 local_rank:

@@ -658,9 +627,6 @@ ddp_broadcast_buffers:
 # subsequences, or set to 4 to split into four equal-sized subsequences.
 # See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
 sequence_parallel_degree:
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-# Must evenly divide the number of KV heads in your model.
-heads_k_stride: 1

 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -103,7 +103,8 @@ This uses the same tags as the [`main` image](#sec-main-tags).

 - `JUPYTER_DISABLE`: Disable Jupyter lab.
 - `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
- `PUBLIC_KEY` / `SSH_KEY`: Add a public key for the SSH service.
+- `PUBLIC_KEY`: Add a public key for the SSH service.
+- `SSH_KEY`: Add a private key for the SSH service.

 #### Volume mounts

--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -18,7 +18,6 @@ Axolotl supports several methods for multi-GPU training:

 - DeepSpeed (recommended)
 - FSDP (Fully Sharded Data Parallel)
- Sequence parallelism
 - FSDP + QLoRA

 ## DeepSpeed {#sec-deepspeed}
@@ -67,28 +66,6 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

-## Sequence parallelism {#sec-sequence-parallelism}
-
-We support sequence parallelism (SP) via the
-[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
-allows one to split up sequences across GPUs, which is useful in the event that a
-single sequence causes OOM errors during model training.
-
-First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
-or from source with `pip install .[ring-flash-attn]`.
-
-Your Axolotl YAML config should contain the following lines:
-
-```{.yaml}
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
-flash_attention: true  # Required with sequence parallelism
-
-# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
-heads_k_stride: 1
-```
-
-See our [dedicated guide](sequence_parallelism.qmd) for more details.
-
 ### FSDP + QLoRA {#sec-fsdp-qlora}

 For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -1,171 +1,28 @@
---
-title: MultiModal / Vision Language Models (BETA)
-format:
-  html:
-    toc: true
-    toc-depth: 3
---
+# MultiModal / Vision Language Models (BETA)

-## Supported Models
+### Supported Models

- [Mllama](#sec-mllama)
- [Pixtral](#sec-pixtral)
- [Llava-1.5](#sec-llava-15)
- [Mistral-Small-3.1](#sec-mistral-small-31)
- [Gemma-3](#sec-gemma-3)
- [Qwen2-VL](#sec-qwen2-vl)
- [Qwen2.5-VL](#sec-qwen25-vl)
+- Mllama, i.e. llama with vision models

-## Usage
+### Usage

-Multimodal support is limited and doesn't have full feature parity.
-
-Here are the hyperparams you'll need to use to finetune a multimodal model.
+Currently, multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama w/ LoRA,
+add the following to your YAML config, in combination with the rest of the required hyperparams.

 ```yaml
+base_model: alpindale/Llama-3.2-11B-Vision-Instruct
 processor_type: AutoProcessor
-
 skip_prepare_dataset: true
-remove_unused_columns: false  # leave columns in place as they are needed to handle image embeddings during training
-sample_packing: false  # not yet supported with multimodal

-chat_template:  # see in next section
-
-# example dataset
+chat_template: llama3_2_vision
 datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages
+remove_unused_columns: false
+sample_packing: false

-# (optional) if doing lora, only finetune the Language model,
-# leave the vision model and vision tower frozen
-# load_in_8bit: true
-adapter: lora
+# only finetune the language model; leave the vision model and vision tower frozen
 lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-# (optional) if you want to resize images to a set size
-image_size: 512
-image_resize_algorithm: bilinear
-```
-
-Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.
-
-::: {.callout-warning}
-Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
-:::
-
-### Mllama {#sec-mllama}
-
-```yaml
-base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
-
-chat_template: llama3_2_vision
-```
-
-### Pixtral {#sec-pixtral}
-
-```yaml
-base_model: mistralai/Pixtral-12B-2409
-
-chat_template: pixtral
-```
-
-### Llava-1.5 {#sec-llava-15}
-
-```yaml
-base_model: llava-hf/llava-1.5-7b-hf
-
-chat_template: llava
-```
-
-### Mistral-Small-3.1 {#sec-mistral-small-31}
-
-```yaml
-base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
-
-chat_template: mistral_v7_tekken
-```
-
-### Gemma-3 {#sec-gemma-3}
-
-::: {.callout-tip}
-The Gemma3-1B model is a text-only model, so please train as regular text model.
-:::
-
-For multi-modal 4B/12B/27B models, use the following config:
-
-```yaml
-base_model: google/gemma-3-4b-it
-
-chat_template: gemma3
-```
-
-### Qwen2-VL {#sec-qwen2-vl}
-
-```yaml
-base_model: Qwen/Qwen2-VL-7B-Instruct
-
-chat_template: qwen2_vl
-```
-
-### Qwen2.5-VL {#sec-qwen25-vl}
-
-```yaml
-base_model: Qwen/Qwen2.5-VL-7B-Instruct
-
-chat_template: qwen2_vl  # same as qwen2-vl
-```
-
-## Dataset Format
-
-For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
-
- A message is a list of `role` and `content`.
- `role` can be `system`, `user`, `assistant`, etc.
- `content` is a list of `type` and (`text` or `image` or `path` or `url` or `base64`).
-
-::: {.callout-note}
-For backwards compatibility:
-
- If the dataset has a `images` or `image` column of `list[Image]`, it will be appended to the first `content` list as `{"type": "image", "image": ...}`. However, if the content already has a `{"type": "image"}` but no `image` key, it will be set the `image` key.
- If `content` is a string, it will be converted to a list with `type` as `text`.
-:::
-
-::: {.callout-tip}
-For image loading, you can use the following keys within `content` alongside `"type": "image"`:
-
- `"path": "/path/to/image.jpg"`
- `"url": "https://example.com/image.jpg"`
- `"base64": "..."`
- `"image": PIL.Image`
-:::
-
-Here is an example of a multi-modal dataset:
-```json
-[
-  {
-    "messages": [
-        {
-            "role": "system",
-            "content": [
-              {"type": "text", "text": "You are a helpful assistant."}
-              ]
-        },
-        {
-            "role": "user",
-            "content": [
-                {"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
-                {"type": "text", "text": "Describe this image in detail."}
-            ]
-        },
-        {
-            "role": "assistant",
-            "content": [
-              {"type": "text", "text": "The image is a bee."}
-            ]
-        }
-    ]
-  }
-]
 ```
--- a/docs/sequence_parallelism.qmd
+++ b/docs/sequence_parallelism.qmd
@@ -25,8 +25,6 @@ To enable sequence parallelism, add the following to your configuration file:
 ```yaml
 # Set to a divisor (> 1) of the number of GPUs available
 sequence_parallel_degree: 4  # Split sequences across 4 GPUs
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-heads_k_stride: 1
 ```

 The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
@@ -60,16 +58,11 @@ To use sequence parallelism, you need:
 ## Example

 ```yaml
+# Example config with sequence parallelism
 base_model: meta-llama/Llama-3-8B-Instruct
 sequence_len: 8192
-
-...
-
-sequence_parallel_degree: 4  # Split each sequence into 4 parts, one per GPU
+sequence_parallel_degree: 2  # Split each sequence into 2 parts
 flash_attention: true  # Required with sequence parallelism
-# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
-heads_k_stride: 1
-
 ...
 ```

--- a/examples/cohere/command-r-7b-qlora.yml
+++ b/examples/cohere/command-r-7b-qlora.yml
@@ -1,71 +0,0 @@
-base_model: CohereForAI/c4ai-command-r7b-12-2024
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-# huggingface repo
-chat_template: cohere
-datasets:
-  - path: cgato/SlimOrcaDedupCleaned
-    type: chat_template
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/gemma3/gemma-3-1b-qlora.yml
+++ b/examples/gemma3/gemma-3-1b-qlora.yml
@@ -1,74 +0,0 @@
-base_model: google/gemma-3-1b-it
-# optionally might have model_type or tokenizer_type
-model_type: AutoModelForCausalLM
-tokenizer_type: AutoTokenizer
-# Automatically upload checkpoint and final model to HF
-# hub_model_id: username/custom_model_name
-
-load_in_8bit: false
-load_in_4bit: true
-strict: false
-
-# huggingface repo
-chat_template: gemma3
-datasets:
-  - path: cgato/SlimOrcaDedupCleaned
-    type: chat_template
-    field_messages: conversations
-    message_property_mappings:
-      role: from
-      content: value
-
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: qlora
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-
-sequence_len: 2048
-sample_packing: true
-eval_sample_packing: false
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_ratio: 0.1
-evals_per_epoch:
-eval_table_size:
-eval_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/gemma3/gemma-3-4b-lora.yml
+++ b/examples/gemma3/gemma-3-4b-lora.yml
@@ -1,63 +0,0 @@
-base_model: google/gemma-3-4b-it
-processor_type: AutoProcessor
-strict: false
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: gemma3
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 2048
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-local_rank:
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
--- a/examples/llama-3/lora-1b-deduplicate-sft.yml
+++ b/examples/llama-3/lora-1b-deduplicate-sft.yml
@@ -19,6 +19,7 @@ val_set_size: 0.0
 output_dir: ./outputs/lora-out

 dataset_exact_deduplication: true
+test_value: true

 sequence_len: 4096
 sample_packing: true
--- a/examples/llava/lora-7b.yaml
+++ b/examples/llava/lora-7b.yaml
@@ -1,63 +0,0 @@
-base_model: llava-hf/llava-1.5-7b-hf
-processor_type: AutoProcessor
-strict: false
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: llava
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 8192
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-local_rank:
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
--- a/examples/mistral/mistral-small-3.1-24B-lora.yml
+++ b/examples/mistral/mistral-small-3.1-24B-lora.yml
@@ -1,66 +0,0 @@
-base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
-processor_type: AutoProcessor
-strict: false
-
-load_in_8bit: true
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: mistral_v7_tekken
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.01
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 2048
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 1
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-local_rank:
-logging_steps: 1
-flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet.
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/pixtral/lora-12b.yml
+++ b/examples/pixtral/lora-12b.yml
@@ -1,65 +0,0 @@
-base_model: mistral-community/pixtral-12b
-processor_type: AutoProcessor
-strict: false
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: pixtral
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 8192
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-local_rank:
-logging_steps: 1
-flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  pad_token: <pad>
--- a/examples/qwen2-vl/lora-7b.yaml
+++ b/examples/qwen2-vl/lora-7b.yaml
@@ -1,63 +0,0 @@
-base_model: Qwen/Qwen2-VL-7B-Instruct
-processor_type: AutoProcessor
-strict: false
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: qwen2_vl
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 8192
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-local_rank:
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
--- a/requirements.txt
+++ b/requirements.txt
@@ -6,16 +6,16 @@ triton>=3.0.0
 mamba-ssm==1.2.0.post1
 xformers>=0.0.23.post1
 autoawq==0.2.7.post3
-liger-kernel==0.5.5
+liger-kernel==0.5.3
 # END section

 packaging==23.2

 peft==0.15.0
-transformers==4.50.0
+transformers==4.49.0
 tokenizers>=0.21.1
 accelerate==1.5.2
-datasets==3.5.0
+datasets==3.4.1
 deepspeed==0.16.4
 trl==0.15.1

--- a/requirements_env.txt
+++ b/requirements_env.txt
@@ -0,0 +1,315 @@
+accelerate==0.34.1
+addict==2.4.0
+aiofiles==23.2.1
+aiohttp==3.9.0
+aiosignal==1.3.1
+aiostream==0.5.2
+alembic==1.13.1
+annotated-types==0.6.0
+annoy==1.17.3
+ansible==6.7.0
+ansible-core==2.13.13
+ansible-vault==2.1.0
+anyio==3.7.1
+appdirs==1.4.4
+art==6.0
+asgiref==3.7.2
+async-timeout==4.0.2
+attrdict==2.0.1
+attrs==22.2.0
+awscli==1.32.75
+-e git+ssh://git@github.com/OpenAccess-AI-Collective/axolotl.git@6e354682e3c1735d3f7fb9e362280c38e922260f#egg=axolotl
+backoff==2.2.1
+base58==2.1.1
+beartype==0.17.2
+bitnet==0.2.1
+bitsandbytes==0.42.0
+bittensor==6.7.0
+black==23.7.0
+blinker==1.7.0
+boto3==1.34.75
+botocore==1.34.75
+cachetools==5.3.3
+cachy==0.1.1
+certifi==2023.7.22
+cffi==1.16.0
+cfgv==3.3.1
+chai-guanaco==1.2.4
+charset-normalizer==3.2.0
+cleo==0.6.8
+click==8.1.7
+cloudpickle==2.0.0
+cohere==4.11.2
+colorama==0.4.4
+coloredlogs==15.0.1
+CoLT5-attention==0.10.20
+contextlib2==21.6.0
+contourpy==1.2.0
+cryptography==41.0.3
+cycler==0.12.1
+cytoolz==0.12.3
+databricks-cli==0.18.0
+dataclasses-json==0.5.7
+datasets==2.11.0
+ddt==1.6.0
+decorator==5.1.1
+deepspeed==0.15.0
+# Editable Git install with no remote (dialogpt==0.1)
+-e /Users/wing/Projects/ml/dialogpt/src
+dill==0.3.6
+distlib==0.3.6
+docker==7.0.0
+docker-pycreds==0.4.0
+docstring-parser==0.15
+docutils==0.16
+ecdsa==0.18.0
+einops==0.7.0
+einops-exts==0.0.4
+einx==0.1.3
+entrypoints==0.4
+eth-hash==0.6.0
+eth-keys==0.5.0
+eth-typing==4.0.0
+eth-utils==2.3.1
+evaluate==0.4.0
+exceptiongroup==1.1.1
+fastapi==0.109.2
+fastcore==1.5.29
+ffmpy==0.4.0
+filelock==3.12.2
+-e git+https://github.com/NousResearch/finetuning-subnet.git@24e9407d6b4430a7ca39d344692f89ce5a97d27e#egg=finetuning_subnet
+fire==0.5.0
+first==2.0.2
+flake8==7.0.0
+Flask==3.0.1
+fonttools==4.47.2
+frozendict==2.4.1
+frozenlist==1.3.3
+fschat @ git+https://github.com/lm-sys/FastChat.git@27a05b04a35510afb1d767ae7e5990cbd278f8fe
+fsspec==2023.6.0
+fuzzywuzzy==0.18.0
+gitdb==4.0.10
+GitPython==3.1.31
+google-pasta==0.2.0
+gradio==4.42.0
+gradio_client==1.3.0
+greenlet==2.0.2
+grpclib==0.4.7
+gunicorn==21.2.0
+h11==0.14.0
+h2==4.1.0
+hpack==4.0.0
+httpcore==0.17.3
+httpx==0.24.1
+huggingface-hub==0.23.4
+humanfriendly==10.0
+hyperframe==6.0.1
+identify==2.5.24
+idna==3.4
+immutables==0.20
+importlib-metadata==6.7.0
+importlib-resources==6.1.1
+inflection==0.5.1
+iniconfig==2.0.0
+itsdangerous==2.1.2
+Jinja2==3.1.2
+jmespath==1.0.1
+joblib==1.3.2
+jsonlines==3.1.0
+jsonschema==2.6.0
+kiwisolver==1.4.5
+langchain==0.0.144
+Levenshtein==0.24.0
+libcst==1.1.0
+liger-kernel==0.0.0
+lion-pytorch==0.1.2
+llama-cpp-python==0.1.36
+llvmlite==0.40.1
+local-attention==1.9.0
+loguru==0.7.0
+Mako==1.3.2
+Markdown==3.5.2
+markdown-it-py==3.0.0
+markdown2==2.4.10
+MarkupSafe==2.1.2
+marshmallow==3.19.0
+marshmallow-enum==1.5.1
+matplotlib==3.8.2
+mccabe==0.7.0
+mdurl==0.1.2
+MEGABYTE-pytorch==0.0.7
+-e git+https://github.com/cg123/mergekit.git@53c5f414774a0558b8d84858fb6374bc93a8f1c1#egg=mergekit
+mlflow==2.10.0
+modal==0.62.77
+more-itertools==10.2.0
+mpmath==1.2.1
+msgpack==1.0.7
+msgpack-numpy-opentensor==0.5.0
+multidict==6.0.4
+multiprocess==0.70.14
+munch==2.5.0
+mypy==1.3.0
+mypy-extensions==1.0.0
+nest-asyncio==1.6.0
+netaddr==0.10.1
+networkx==3.0rc1
+nh3==0.2.14
+nodeenv==1.8.0
+nomic==2.0.2
+numba==0.57.1
+numexpr==2.8.4
+numpy==1.24.4
+oauthlib==3.2.2
+openai==0.27.4
+openapi==1.1.0
+openapi-schema-pydantic==1.2.4
+optimum==1.8.6
+orjson==3.10.7
+packaging==23.1
+pandas==2.0.0
+parameterized==0.9.0
+password-strength==0.0.3.post2
+pastel==0.1.1
+pathos==0.3.0
+pathspec==0.11.1
+pathtools==0.1.2
+peft==0.11.1
+pendulum==3.0.0
+Pillow==9.5.0
+pip-tools==1.11.0
+platformdirs==3.2.0
+pluggy==1.4.0
+poetry==0.7.1
+pox==0.3.2
+ppft==1.7.6.6
+pre-commit==3.3.2
+prettytable==3.10.0
+prompt-toolkit==3.0.39
+protobuf==3.20.2
+protobuf3-to-dict==0.1.5
+psutil==5.9.5
+psycopg==3.1.18
+PuLP==2.8.0
+py==1.11.0
+py-bip39-bindings==0.1.11
+py-cpuinfo==9.0.0
+py-ed25519-zebra-bindings==1.0.1
+py-sr25519-bindings==0.2.0
+pyarrow==11.0.0
+pyasn1==0.6.0
+pycodestyle==2.11.1
+pycparser==2.21
+pycryptodome==3.20.0
+pydantic==2.5.3
+pydantic_core==2.14.6
+pydub==0.25.1
+pyfiglet==0.8.post1
+pyflakes==3.2.0
+Pygments==2.15.1
+PyJWT==2.8.0
+pylev==1.4.0
+PyNaCl==1.5.0
+pynvml==11.5.0
+pyparsing==2.4.7
+pyrsistent==0.14.11
+pytest==8.0.2
+pytest-asyncio==0.23.4
+python-dateutil==2.8.2
+python-dotenv==1.0.1
+python-Levenshtein==0.24.0
+python-multipart==0.0.9
+pytz==2023.3
+PyYAML==6.0.1
+querystring-parser==1.2.4
+rapidfuzz==3.6.1
+regex==2023.6.3
+requests==2.31.0
+requests-toolbelt==0.8.0
+resolvelib==0.8.1
+responses==0.18.0
+retry==0.9.2
+rich==13.7.0
+rsa==4.7.2
+ruff==0.6.3
+s3transfer==0.10.1
+safetensors==0.4.5
+sagemaker==2.148.0
+scalecodec==1.2.7
+schedulefree==1.2.1
+schema==0.7.5
+scikit-learn==1.4.0
+scipy==1.9.3
+seaborn==0.13.2
+semantic-version==2.10.0
+sentencepiece==0.2.0
+sentry-sdk==1.19.1
+setproctitle==1.3.2
+shellingham==1.5.4
+shortuuid==1.0.11
+shtab==1.6.5
+sigtools==4.0.1
+six==1.16.0
+skypilot==0.4.1
+smdebug-rulesconfig==1.0.1
+smmap==5.0.0
+sniffio==1.3.0
+SQLAlchemy==1.4.47
+sqlparse==0.4.4
+starlette==0.36.3
+substrate-interface==1.5.2
+svgwrite==1.4.3
+sympy==1.11.1
+synchronicity==0.6.7
+tabulate==0.9.0
+tblib==1.7.0
+tenacity==8.2.2
+tensor-parallel==2.0.0
+termcolor==2.2.0
+text2art==0.2.0
+threadpoolctl==3.2.0
+tiktoken==0.6.0
+time-machine==2.14.1
+timm==0.9.16
+tokenizers==0.19.1
+tokenmonster==1.1.12
+toml==0.9.6
+tomli==2.0.1
+tomlkit==0.12.0
+toolz==0.12.1
+torch==2.2.0
+torchdata==0.6.1
+torchdiffeq==0.2.3
+TorchFix==0.4.0
+torchtext==0.15.2
+torchvision==0.17.0
+tqdm==4.66.2
+transformers==4.44.2
+trl==0.9.6
+typer==0.12.5
+types-certifi==2021.10.8.3
+types-requests==2.31.0.20240125
+types-setuptools==69.0.0.20240125
+types-toml==0.10.8.7
+typing==3.7.4.3
+typing-inspect==0.8.0
+typing_extensions==4.9.0
+tyro==0.5.18
+tzdata==2023.3
+unique-names-generator==1.0.2
+urllib3==2.2.2
+uvicorn==0.22.0
+vector_quantize_pytorch==1.14.1
+virtualenv==20.23.0
+voyager==2.0.2
+wandb==0.16.2
+watchfiles==0.21.0
+wavedrom==2.0.3.post3
+wcwidth==0.2.6
+websocket-client==1.7.0
+websockets==12.0
+Werkzeug==3.0.1
+wonderwords==2.2.0
+xxhash==3.2.0
+yarl==1.8.2
+zetascale==2.2.7
+zipp==3.15.0
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,9 @@ def parse_requirements():
    with open("./requirements.txt", encoding="utf-8") as requirements_file:
        lines = [r.strip() for r in requirements_file.readlines()]
        for line in lines:
-            is_extras = "deepspeed" in line or "mamba-ssm" in line
+            is_extras = (
+                "deepspeed" in line or "mamba-ssm" in line or "lion-pytorch" in line
+            )
            if line.startswith("--extra-index-url"):
                # Handle custom index URLs
                _, url = line.split()
@@ -133,15 +135,15 @@ setup(
        "mlflow": [
            "mlflow",
        ],
+        "lion-pytorch": [
+            "lion-pytorch==0.1.2",
+        ],
        "galore": [
            "galore_torch",
        ],
-        "apollo": [
-            "apollo-torch",
-        ],
        "optimizers": [
            "galore_torch",
-            "apollo-torch",
+            "lion-pytorch==0.1.2",
            "lomo-optim==0.1.1",
            "torch-optimi==0.2.1",
        ],
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -56,7 +56,7 @@ def do_inference(
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Inference-specific CLI arguments.
    """
-    model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg, inference=True)
+    model, tokenizer = load_model_and_tokenizer(cfg=cfg, inference=True)
    prompter = cli_args.prompter

    prompter_module = None
@@ -151,7 +151,7 @@ def do_inference_gradio(
    """
    import gradio as gr

-    model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg, inference=True)
+    model, tokenizer = load_model_and_tokenizer(cfg=cfg, inference=True)
    prompter = cli_args.prompter

    prompter_module = None
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -27,7 +27,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
    """
    print_axolotl_text_art()

-    model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
+    model, tokenizer = load_model_and_tokenizer(cfg=cfg)
    safe_serialization = cfg.save_safetensors is True

    LOG.info("Running merge of LoRA with base model...")
@@ -44,9 +44,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
        )
        tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))

-        if processor:
-            processor.save_pretrained(str(Path(cfg.output_dir) / "merged"))
-

 def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
    """
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -17,7 +17,6 @@ from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.integrations.base import PluginManager
 from axolotl.train import train
-from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault

@@ -34,9 +33,6 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Training-specific CLI arguments.
    """
-    # Enable expandable segments for cuda allocation to improve VRAM usage
-    set_pytorch_cuda_alloc_conf()
-
    print_axolotl_text_art()
    check_accelerate_default_config()
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
--- a/src/axolotl/cli/utils.py
+++ b/src/axolotl/cli/utils.py
@@ -13,16 +13,11 @@ from typing import Any, Callable, Type, Union, get_args, get_origin
 import click
 import requests
 from pydantic import BaseModel
-from transformers import (
-    PreTrainedModel,
-    PreTrainedTokenizer,
-    PreTrainedTokenizerFast,
-    ProcessorMixin,
-)
+from transformers import PreTrainedModel, PreTrainedTokenizer, PreTrainedTokenizerFast

 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import load_model, load_processor, load_tokenizer
+from axolotl.utils.models import load_model, load_tokenizer

 configure_logging()
 LOG = logging.getLogger(__name__)
@@ -300,13 +295,9 @@ def load_model_and_tokenizer(
    *,
    cfg: DictDefault,
    inference: bool = False,
-) -> tuple[
-    PreTrainedModel,
-    PreTrainedTokenizer | PreTrainedTokenizerFast | Any,
-    ProcessorMixin | None,
-]:
+) -> tuple[PreTrainedModel, PreTrainedTokenizer | PreTrainedTokenizerFast | Any]:
    """
-    Helper function for loading a model, tokenizer, and processor specified in the given `axolotl`
+    Helper function for loading a model and tokenizer specified in the given `axolotl`
    config.

    Args:
@@ -314,7 +305,7 @@ def load_model_and_tokenizer(
        inference: Boolean denoting inference mode.

    Returns:
-        Tuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin).
+        `transformers` model and tokenizer.
    """
    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)
@@ -322,9 +313,4 @@ def load_model_and_tokenizer(
    LOG.info("loading model...")
    model, _ = load_model(cfg, tokenizer, inference=inference)

-    processor = None
-    if cfg.is_multimodal:
-        LOG.info("loading processor...")
-        processor = load_processor(cfg, tokenizer)
-
-    return model, tokenizer, processor
+    return model, tokenizer
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -60,7 +60,6 @@ from axolotl.core.training_args import (
 from axolotl.integrations.base import PluginManager
 from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
 from axolotl.monkeypatch.relora import ReLoRACallback
-from axolotl.processing_strategies import get_processing_strategy
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
@@ -69,6 +68,7 @@ from axolotl.utils.callbacks import (
    LossWatchDogCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveBetterTransformerModelCallback,
+    SaveModelCallback,
    bench_eval_callback_factory,
    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
@@ -248,6 +248,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if self.cfg.gc_steps:
            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
+        callbacks.append(SaveModelCallback())

        return callbacks

@@ -746,12 +747,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                self.cfg.accelerator_config
            )

-        if self.cfg.image_size:
-            training_arguments_kwargs["image_size"] = self.cfg.image_size
-        if self.cfg.image_resize_algorithm:
-            training_arguments_kwargs["image_resize_algorithm"] = (
-                self.cfg.image_resize_algorithm
-            )
        if self.cfg.kd_ce_alpha is not None:
            training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
        if self.cfg.kd_alpha is not None:
@@ -895,13 +890,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        else:
            if self.cfg.processor_type and self.processor:
                collator = MultiModalChatDataCollator
-                kwargs["processing_strategy"] = get_processing_strategy(
-                    self.processor,
-                    training_args.chat_template,
-                    self.cfg.chat_template,
-                    image_size=training_args.image_size,
-                    image_resize_algorithm=training_args.image_resize_algorithm,
-                )
+                kwargs["processor"] = self.processor
+                kwargs["chat_template"] = training_args.chat_template
            elif self.cfg.batch_flattening:
                collator = DataCollatorWithFlattening
                collator_args.pop(0)
@@ -935,6 +925,7 @@ class HFRLTrainerBuilder(TrainerBuilderBase):

    def get_callbacks(self):
        callbacks = super().get_callbacks()
+        callbacks.append(SaveModelCallback())

        return callbacks

--- a/src/axolotl/core/trainers/trl.py
+++ b/src/axolotl/core/trainers/trl.py
@@ -1,7 +1,5 @@
 """Module for TRL PPO trainer"""

-from typing import Literal, Union
-
 import torch
 from tqdm import tqdm
 from trl import (
@@ -81,78 +79,6 @@ class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):

    tag_names = ["axolotl", "orpo"]

-    def get_batch_loss_metrics(
-        self,
-        model,
-        batch: dict[str, Union[list, torch.LongTensor]],
-        train_eval: Literal["train", "eval"] = "train",
-    ):
-        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
-
-        # TODO remove once https://github.com/huggingface/trl/pull/3069 is included in a trl release
-
-        metrics = {}
-
-        forward_output = self.concatenated_forward(model, batch)
-        (
-            policy_chosen_logps,
-            policy_rejected_logps,
-            policy_chosen_logits,
-            policy_rejected_logits,
-            policy_nll_loss,
-        ) = forward_output[:5]
-        if self.aux_loss_enabled:
-            aux_loss = forward_output[5]
-
-        losses, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = (
-            self.odds_ratio_loss(policy_chosen_logps, policy_rejected_logps)
-        )
-        # full ORPO loss
-        loss = policy_nll_loss - losses.mean()
-
-        reward_accuracies = (chosen_rewards > rejected_rewards).float()
-
-        prefix = "eval_" if train_eval == "eval" else ""
-        metrics[f"{prefix}rewards/chosen"] = self.accelerator.gather_for_metrics(
-            chosen_rewards
-        ).mean()
-        metrics[f"{prefix}rewards/rejected"] = self.accelerator.gather_for_metrics(
-            rejected_rewards
-        ).mean()
-        metrics[f"{prefix}rewards/accuracies"] = self.accelerator.gather_for_metrics(
-            reward_accuracies
-        ).mean()
-        metrics[f"{prefix}rewards/margins"] = self.accelerator.gather_for_metrics(
-            chosen_rewards - rejected_rewards
-        ).mean()
-        metrics[f"{prefix}logps/rejected"] = (
-            self.accelerator.gather_for_metrics(policy_rejected_logps).detach().mean()
-        )
-        metrics[f"{prefix}logps/chosen"] = (
-            self.accelerator.gather_for_metrics(policy_chosen_logps).detach().mean()
-        )
-        metrics[f"{prefix}logits/rejected"] = self.accelerator.gather_for_metrics(
-            policy_rejected_logits.detach().mean()
-        ).mean()
-        metrics[f"{prefix}logits/chosen"] = self.accelerator.gather_for_metrics(
-            policy_chosen_logits.detach().mean()
-        ).mean()
-        metrics[f"{prefix}nll_loss"] = (
-            self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean()
-        )
-        metrics[f"{prefix}log_odds_ratio"] = (
-            self.accelerator.gather_for_metrics(log_odds_ratio).detach().mean()
-        )
-        metrics[f"{prefix}log_odds_chosen"] = (
-            self.accelerator.gather_for_metrics(log_odds_chosen).detach().mean()
-        )
-        for k, v in metrics.items():
-            metrics[k] = v.item()
-        if self.aux_loss_enabled:
-            loss += self.aux_loss_coef * aux_loss
-
-        return loss, metrics
-

 class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
    """
@@ -169,80 +95,6 @@ class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):

    tag_names = ["axolotl", "cpo"]

-    def get_batch_loss_metrics(
-        self,
-        model,
-        batch: dict[str, Union[list, torch.LongTensor]],
-        train_eval: Literal["train", "eval"] = "train",
-    ):
-        """Compute the CPO loss and other metrics for the given batch of inputs for train or test."""
-        metrics = {}
-
-        forward_output = self.concatenated_forward(model, batch)
-        (
-            policy_chosen_logps,
-            policy_rejected_logps,
-            policy_chosen_logits,
-            policy_rejected_logits,
-            policy_nll_loss,
-        ) = forward_output[:5]
-        if self.aux_loss_enabled:
-            aux_loss = forward_output[5]
-
-        losses, chosen_rewards, rejected_rewards = self.cpo_loss(
-            policy_chosen_logps,
-            policy_rejected_logps,
-        )
-
-        loss = losses.mean() + self.cpo_alpha * policy_nll_loss
-        reward_accuracies = (chosen_rewards > rejected_rewards).float()
-
-        prefix = "eval_" if train_eval == "eval" else ""
-        metrics[f"{prefix}rewards/chosen"] = (
-            self.accelerator.gather_for_metrics(chosen_rewards).mean().item()
-        )
-        metrics[f"{prefix}rewards/rejected"] = (
-            self.accelerator.gather_for_metrics(rejected_rewards).mean().item()
-        )
-        metrics[f"{prefix}rewards/accuracies"] = (
-            self.accelerator.gather_for_metrics(reward_accuracies).mean().item()
-        )
-        metrics[f"{prefix}rewards/margins"] = (
-            self.accelerator.gather_for_metrics(chosen_rewards - rejected_rewards)
-            .mean()
-            .item()
-        )
-        metrics[f"{prefix}logps/rejected"] = (
-            self.accelerator.gather_for_metrics(policy_rejected_logps)
-            .detach()
-            .mean()
-            .item()
-        )
-        metrics[f"{prefix}logps/chosen"] = (
-            self.accelerator.gather_for_metrics(policy_chosen_logps)
-            .detach()
-            .mean()
-            .item()
-        )
-        metrics[f"{prefix}logits/rejected"] = (
-            self.accelerator.gather_for_metrics(policy_rejected_logits.detach().mean())
-            .mean()
-            .item()
-        )
-        metrics[f"{prefix}logits/chosen"] = (
-            self.accelerator.gather_for_metrics(policy_chosen_logits.detach().mean())
-            .mean()
-            .item()
-        )
-        metrics[f"{prefix}nll_loss"] = (
-            self.accelerator.gather_for_metrics(policy_nll_loss).detach().mean().item()
-        )
-
-        if self.aux_loss_enabled:
-            loss += self.aux_loss_coef * aux_loss
-
-        return loss, metrics
-

 class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
    """
--- a/src/axolotl/core/training_args.py
+++ b/src/axolotl/core/training_args.py
@@ -5,7 +5,6 @@ extra axolotl specific training args
 from dataclasses import dataclass, field
 from typing import Optional

-from PIL.Image import Resampling
 from transformers import TrainingArguments
 from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig

@@ -213,20 +212,6 @@ class AxolotlTrainingMixins:
        metadata={"help": "The number of workers to use in sequence parallelism"},
    )

-    # multi-modal section
-
-    image_size: int | tuple[int, int] | None = field(
-        default=None,
-        metadata={"help": "The size of the image to resize to"},
-    )
-
-    image_resize_algorithm: Resampling | None = field(
-        default=None,
-        metadata={"help": "The algorithm to use for image resizing"},
-    )
-
-    # end of multi-modal section
-

@dataclass
 class AxolotlTrainingArguments(AxolotlTrainingMixins, TrainingArguments):
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -15,7 +15,6 @@ from axolotl.logging_config import configure_logging
 from axolotl.train import TrainDatasetMeta
 from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer

@@ -160,6 +159,4 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
    del model
    del tokenizer

-    cleanup_distributed()
-
    return all_metrics
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -1,6 +1,6 @@
 # Cut Cross Entropy

-Cut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.
+Cut Cross Entropy reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.

 See https://github.com/apple/ml-cross-entropy

@@ -29,20 +29,6 @@ plugins:
 cut_cross_entropy: true
 ```

-## Supported Models
-
- llama
- phi3
- gemma
- gemma2
- gemma3
- gemma3_text
- mistral
- mistral3
- qwen2
- cohere
- cohere2
-
 ## Citation

 ```bib
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -25,8 +25,8 @@ import torch

 from axolotl.integrations.base import BasePlugin
 from axolotl.utils import get_pytorch_version
-from axolotl.utils.distributed import zero_only

+from ...utils.distributed import zero_only
 from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401

 LOG = logging.getLogger("axolotl.integrations.cut_cross_entropy")
@@ -72,9 +72,7 @@ class CutCrossEntropyPlugin(BasePlugin):
        if cfg.cut_cross_entropy:
            self._check_requirements()

-            from axolotl.integrations.cut_cross_entropy.monkeypatch.patch import (
-                cce_patch,
-            )
+            from cut_cross_entropy.transformers import cce_patch

            with zero_only():
                LOG.info(
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/cohere.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/cohere.py
@@ -1,201 +0,0 @@
-"""Cohere and Cohere2 CCE patch."""
-
-# This patch is based off transformers 4.50.0.
-# It patches the forward function for CohereForCausalLM and Cohere2ForCausalLM.
-# It scales the hidden states by the logit scale in advance instead of the logits as the
-# operation is done internally and should be mathematically equivalent.
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.cohere.modeling_cohere import (
-    _CONFIG_FOR_DOC,
-    COHERE_INPUTS_DOCSTRING,
-    KwargsForCausalLM,
-)
-from transformers.processing_utils import Unpack
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs: Unpack[KwargsForCausalLM],
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
-    Example:
-
-    ```python
-    >> from transformers import AutoTokenizer, CohereForCausalLM
-
-    >> model = CohereForCausalLM.from_pretrained("CohereForAI/c4ai-command-r-v01")
-    >> tokenizer = AutoTokenizer.from_pretrained("CohereForAI/c4ai-command-r-v01")
-
-    >> prompt = "Hey, are you conscious? Can you talk to me?"
-    >> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >> # Generate
-    >> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-    ```"""
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        # scale weight by logit_scale in-place of logits
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight * self.logit_scale,
-            labels,
-            _PATCH_OPTS,
-            **kwargs,
-        )
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        logits = logits * self.logit_scale  # main diff from Llama
-
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def patch_cohere(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.cohere import modeling_cohere
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_cohere.CohereForCausalLM
-        ), f"Expected a CohereForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_cohere.CohereForCausalLM.forward = cce_forward
-    return None
-
-
-def patch_cohere2(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.cohere2 import modeling_cohere2
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_cohere2.Cohere2ForCausalLM
-        ), f"Expected a Cohere2ForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_cohere2.Cohere2ForCausalLM.forward = cce_forward
-    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma.py
@@ -1,175 +0,0 @@
-"""Gemma CCE patch"""
-
-# This patch is based off transformers 4.50.0.
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.gemma.modeling_gemma import (
-    _CONFIG_FOR_DOC,
-    GEMMA_INPUTS_DOCSTRING,
-    KwargsForCausalLM,
-)
-from transformers.processing_utils import Unpack
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **kwargs: Unpack[KwargsForCausalLM],
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, GemmaForCausalLM
-
-    >>> model = GemmaForCausalLM.from_pretrained("google/gemma-7b")
-    >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
-
-    >>> prompt = "What is your favorite condiment?"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "What is your favorite condiment?"
-    ```"""
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **kwargs,
-        )
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def patch_gemma(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.gemma import modeling_gemma
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_gemma.GemmaForCausalLM
-        ), f"Expected a GemmaForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_gemma.GemmaForCausalLM.forward = cce_forward
-    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma3.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/gemma3.py
@@ -1,459 +0,0 @@
-"""Gemma2 and Gemma3 (text and multimodal) CCE patch."""
-
-# Implementation originally adapted from https://github.com/apple/ml-cross-entropy/pull/29
-# and updated for transformers 4.50.0.
-# This is a modified version of the patch that allows for deferred logits calculation for gemma3 and works
-# with both gemma3 (text and multimodal) models.
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-)
-from torch import nn
-from transformers.cache_utils import Cache, HybridCache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.gemma3.modeling_gemma3 import (
-    _CONFIG_FOR_DOC,
-    GEMMA3_INPUTS_DOCSTRING,
-    Gemma3CausalLMOutputWithPast,
-    logger,
-)
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    is_torchdynamo_compiling,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-
-from axolotl.integrations.cut_cross_entropy.monkeypatch.utils import apply_lce
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[HybridCache] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    defer_logits_calculation: bool = False,
-    **loss_kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-        defer_logits_calculation (`bool`, *optional*):
-            If `True`, defer logits calculation to the ConditionalGeneration forward. This is used to avoid the
-            memory overhead of calculating logits using regular lm_head forward pass and to use CCE.
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, Gemma3ForCausalLM
-
-    >>> model = Gemma3ForCausalLM.from_pretrained("google/gemma-2-9b")
-    >>> tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
-
-    >>> prompt = "What is your favorite condiment?"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "What is your favorite condiment?"
-    ```"""
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        **loss_kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            softcap=getattr(self.config, "final_logit_softcapping", None),
-            **loss_kwargs,
-        )
-    elif _PATCH_OPTS is not None and defer_logits_calculation:
-        # defer logits calculation to the ConditionalGeneration forward
-        logits = hidden_states[:, slice_indices, :]
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if self.config.final_logit_softcapping is not None:
-            logits = logits / self.config.final_logit_softcapping
-            logits = torch.tanh(logits)
-            logits = logits * self.config.final_logit_softcapping
-
-        if labels is not None:
-            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward_multimodal(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    pixel_values: torch.FloatTensor | None = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Union[list[torch.FloatTensor], Cache]] = None,
-    token_type_ids: Optional[torch.LongTensor] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **lm_kwargs,
-) -> Union[Tuple, Gemma3CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from PIL import Image
-    >>> import requests
-    >>> from transformers import AutoProcessor, Gemma3ForConditionalGeneration
-
-    >>> model = Gemma3ForConditionalGeneration.from_pretrained("google/Gemma3-test-224px-hf")
-    >>> processor = AutoProcessor.from_pretrained("google/Gemma3-test-224px-hf")
-
-    >>> prompt = "answer en Where is the cow standing?"
-    >>> url = "https://huggingface.co/gv-hf/Gemma3-test-224px-hf/resolve/main/cow_beach_1.png"
-    >>> image = Image.open(requests.get(url, stream=True).raw)
-
-    >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(**inputs, max_length=30)
-    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "answer en Where is the cow standing?\nbeach"
-    ```"""
-
-    if (input_ids is None) ^ (inputs_embeds is not None):
-        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    is_training = token_type_ids is not None and labels is not None
-
-    # Replace image id woth PAD if the image token if OOV, to avoid index-errors
-    if input_ids is not None and self.config.image_token_index >= self.vocab_size:
-        special_image_mask = input_ids == self.config.image_token_index
-        llm_input_ids = input_ids.clone()
-        llm_input_ids[special_image_mask] = 0
-    else:
-        llm_input_ids = input_ids  # type: ignore
-
-    if inputs_embeds is None:
-        inputs_embeds = self.get_input_embeddings()(llm_input_ids)
-
-    if cache_position is None:
-        past_seen_tokens = (
-            past_key_values.get_seq_length() if past_key_values is not None else 0  # type: ignore
-        )
-        cache_position = torch.arange(  # type: ignore
-            past_seen_tokens,
-            past_seen_tokens + inputs_embeds.shape[1],
-            device=inputs_embeds.device,
-        )
-
-    # Merge text and images
-    if pixel_values is not None:
-        image_features = self.get_image_features(pixel_values)
-
-        if input_ids is None:
-            special_image_mask = inputs_embeds == self.get_input_embeddings()(
-                torch.tensor(
-                    self.config.image_token_index,
-                    dtype=torch.long,
-                    device=inputs_embeds.device,
-                )
-            )
-        else:
-            special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(
-                -1
-            )
-            special_image_mask = special_image_mask.expand_as(inputs_embeds).to(
-                inputs_embeds.device
-            )
-
-        if (
-            not is_torchdynamo_compiling()
-            and inputs_embeds[special_image_mask].numel() != image_features.numel()
-        ):
-            image_tokens_in_text = (special_image_mask).sum(dim=1).sum(dim=0)[0]
-            raise ValueError(
-                f"Number of images does not match number of special image tokens in the input text. "
-                f"Got {image_tokens_in_text} image tokens in the text but {image_features.shape[0] * image_features.shape[1]} "
-                "tokens from image embeddings."
-            )
-        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)  # type: ignore
-
-    # mask out pad-token-ids in labels for BC
-    if labels is not None and self.pad_token_id in labels:
-        logger.warning_once(
-            "`labels` contains `pad_token_id` which will be masked with `config.ignore_index`. "
-            "You have to mask out `pad_token_id` when preparing `labels`, this behavior will be removed in v.4.46.",
-        )
-        labels = torch.where(  # type: ignore
-            input_ids == self.pad_token_id, self.config.ignore_index, labels
-        )
-
-    causal_mask = self._update_causal_mask(  # pylint: disable=protected-access
-        attention_mask,
-        token_type_ids,
-        past_key_values,
-        cache_position,
-        inputs_embeds,
-        is_training,
-    )
-    outputs = self.language_model(
-        attention_mask=causal_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        logits_to_keep=logits_to_keep,
-        defer_logits_calculation=True,  # enable deferred logits calculation
-        **lm_kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states,
-            self.language_model.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            softcap=getattr(self.config, "final_logit_softcapping", None),
-            **lm_kwargs,
-        )
-    else:
-        logits = hidden_states
-        if labels is not None:
-            # Upcast to float if we need to compute the loss to avoid potential precision issues
-            logits = logits.float()
-            shift_logits = logits[..., :-1, :]
-            shift_labels = labels[..., 1:]
-            if attention_mask is not None:
-                # we use the input attention mask to shift the logits and labels, because it is 2D.
-                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
-                shift_attention_mask = attention_mask[:, -shift_logits.shape[1] :].to(
-                    logits.device
-                )
-                shift_logits = shift_logits[
-                    shift_attention_mask.to(logits.device) != 0
-                ].contiguous()
-                shift_labels = shift_labels[
-                    shift_attention_mask.to(shift_labels.device) != 0
-                ].contiguous()
-            else:
-                shift_logits = shift_logits.contiguous()
-                shift_labels = shift_labels.contiguous()
-            # Flatten the tokens
-            loss_fct = nn.CrossEntropyLoss()
-
-            flat_logits = shift_logits.view(-1, self.config.text_config.vocab_size)
-            flat_labels = shift_labels.view(-1).to(shift_logits.device)
-            loss = loss_fct(flat_logits, flat_labels)
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return Gemma3CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-        image_hidden_states=image_features if pixel_values is not None else None,
-    )
-
-
-def patch_gemma2(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.gemma2 import modeling_gemma2
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_gemma2.Gemma2ForCausalLM
-        ), f"Expected a Gemma2ForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_gemma2.Gemma2ForCausalLM.forward = cce_forward
-    return None
-
-
-def patch_gemma3_text(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.gemma3 import modeling_gemma3
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_gemma3.Gemma3ForCausalLM
-        ), f"Expected a Gemma3ForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_gemma3.Gemma3ForCausalLM.forward = cce_forward
-    return None
-
-
-def patch_gemma3(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.gemma3 import modeling_gemma3
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_gemma3.Gemma3ForConditionalGeneration
-        ), f"Expected a Gemma3ForConditionalGeneration model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
-
-        # patch the causal model to enable deferred logits calculation
-        maybe_model.language_model.forward = MethodType(
-            cce_forward, maybe_model.language_model
-        )
-        return maybe_model
-
-    modeling_gemma3.Gemma3ForConditionalGeneration.forward = cce_forward_multimodal
-    # patch the causal model to enable deferred logits calculation
-    modeling_gemma3.Gemma3ForCausalLM.forward = cce_forward
-    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mistral3.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mistral3.py
@@ -1,392 +0,0 @@
-"""Mistral and Mistral3 CCE patch."""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from torch import nn
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.mistral3.modeling_mistral3 import (
-    Mistral3CausalLMOutputWithPast,
-)
-from transformers.models.mistral.modeling_mistral import (
-    _CONFIG_FOR_DOC,
-    MISTRAL_INPUTS_DOCSTRING,
-    KwargsForCausalLM,
-)
-from transformers.processing_utils import Unpack
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    is_torchdynamo_compiling,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
-)
-def cce_forward(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    attention_mask: Optional[torch.Tensor] | None = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    defer_logits_calculation: bool = False,
-    **kwargs: Unpack[KwargsForCausalLM],
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-        defer_logits_calculation (`bool`, *optional*):
-            If `True`, defer logits calculation to the ConditionalGeneration forward. This is used to avoid the
-            memory overhead of calculating logits using regular lm_head forward pass and to use CCE.
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, MistralForCausalLM
-
-    >>> model = MistralForCausalLM.from_pretrained("meta-mistral/Mistral-2-7b-hf")
-    >>> tokenizer = AutoTokenizer.from_pretrained("meta-mistral/Mistral-2-7b-hf")
-
-    >>> prompt = "Hey, are you conscious? Can you talk to me?"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
-    ```"""
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        **kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **kwargs,
-        )
-    elif _PATCH_OPTS is not None and defer_logits_calculation:
-        # defer logits calculation to the ConditionalGeneration forward
-        logits = hidden_states[:, slice_indices, :]
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :])
-        if labels is not None:
-            loss = self.loss_function(
-                logits=logits,
-                labels=labels,
-                vocab_size=self.config.vocab_size,
-                **kwargs,
-            )
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def cce_forward_multimodal(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    pixel_values: torch.FloatTensor | None = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[list[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    vision_feature_layer: Optional[Union[int, list[int]]] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    image_sizes: torch.Tensor | None = None,
-    **lm_kwargs,
-) -> Union[Tuple, Mistral3CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from PIL import Image
-    >>> import requests
-    >>> from transformers import AutoProcessor, Mistral3ForConditionalGeneration
-
-    >>> model = Mistral3ForConditionalGeneration.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
-    >>> processor = AutoProcessor.from_pretrained("mistralai/Mistral-Small-3.1-24B-Instruct-2503")
-
-    >>> prompt = "<s>[INST][IMG]What is the image?[/INST]"
-    >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-    >>> image = Image.open(requests.get(url, stream=True).raw)
-
-    >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
-    >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    "What is the image?The image depicts two cats lying on a pink blanket."
-    ```"""
-
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-    vision_feature_layer = (
-        vision_feature_layer
-        if vision_feature_layer is not None
-        else self.config.vision_feature_layer
-    )
-
-    if (input_ids is None) ^ (inputs_embeds is not None):
-        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-    if pixel_values is not None and inputs_embeds is not None:
-        raise ValueError(
-            "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
-        )
-
-    if inputs_embeds is None:
-        inputs_embeds = self.get_input_embeddings()(input_ids)
-
-    if pixel_values is not None:
-        image_features = self.get_image_features(
-            pixel_values=pixel_values,
-            vision_feature_layer=vision_feature_layer,
-            image_sizes=image_sizes,
-        )
-
-        special_image_mask = (input_ids == self.config.image_token_index).unsqueeze(-1)
-        special_image_mask = special_image_mask.expand_as(inputs_embeds).to(
-            inputs_embeds.device
-        )
-        if (
-            not is_torchdynamo_compiling()
-            and inputs_embeds[special_image_mask].numel() != image_features.numel()
-        ):
-            n_image_tokens = (input_ids == self.config.image_token_index).sum()
-            n_image_features = image_features.shape[0] * image_features.shape[1]
-            raise ValueError(
-                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
-            )
-        image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
-        inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)  # type: ignore
-
-    outputs = self.language_model(
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        logits_to_keep=logits_to_keep,
-        defer_logits_calculation=True,  # enable deferred logits calculation
-        **lm_kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states,
-            self.language_model.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **lm_kwargs,
-        )
-    else:
-        logits = hidden_states
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                # we use the input attention mask to shift the logits and labels, because it is 2D.
-                # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
-                shift_attention_mask = attention_mask[:, -(logits.shape[1] - 1) :].to(
-                    logits.device
-                )
-                shift_logits = logits[..., :-1, :][
-                    shift_attention_mask.to(logits.device) != 0
-                ].contiguous()
-                shift_labels = labels[..., 1:][
-                    shift_attention_mask.to(labels.device) != 0
-                ].contiguous()
-            else:
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(
-                shift_logits.view(-1, shift_logits.size(-1)),
-                shift_labels.view(-1).to(shift_logits.device),
-            )
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return Mistral3CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-        image_hidden_states=image_features if pixel_values is not None else None,
-    )
-
-
-def patch_mistral(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.mistral import modeling_mistral
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_mistral.MistralForCausalLM
-        ), f"Expected a MistralForCausalLM model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward, maybe_model)
-        return maybe_model
-
-    modeling_mistral.MistralForCausalLM.forward = cce_forward
-    return None
-
-
-def patch_mistral3(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.mistral import modeling_mistral
-    from transformers.models.mistral3 import modeling_mistral3
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_mistral3.Mistral3ForConditionalGeneration
-        ), f"Expected a Mistral3ForConditionalGeneration model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
-
-        # patch the causal model to enable deferred logits calculation
-        maybe_model.language_model.forward = MethodType(
-            cce_forward, maybe_model.language_model
-        )
-        return maybe_model
-
-    modeling_mistral3.Mistral3ForConditionalGeneration.forward = cce_forward_multimodal
-    # patch the causal model to enable deferred logits calculation
-    modeling_mistral.MistralForCausalLM.forward = cce_forward
-    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/mllama.py
@@ -1,379 +0,0 @@
-"""Mllama CCE patch."""
-
-# pylint: disable=duplicate-code
-
-from types import MethodType
-from typing import Optional, Tuple, Union
-
-import torch
-import transformers
-from cut_cross_entropy.transformers.utils import (
-    PatchOptions,
-    TransformersModelT,
-    apply_lce,
-)
-from transformers.cache_utils import Cache
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.models.mllama.modeling_mllama import (
-    MLLAMA_INPUTS_DOCSTRING,
-    _prepare_cross_attention_mask,
-)
-from transformers.utils import (
-    add_start_docstrings_to_model_forward,
-    replace_return_docstrings,
-)
-from transformers.utils.deprecation import deprecate_kwarg
-
-_PATCH_OPTS: PatchOptions | None = None
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class="MllamaTextConfig"
-)
-def cce_forward(
-    self,
-    input_ids: torch.LongTensor | None = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    cross_attention_states: Optional[torch.LongTensor] = None,
-    cross_attention_mask: Optional[torch.LongTensor] = None,
-    full_text_row_masked_out_mask: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    defer_logits_calculation: bool = False,
-    **loss_kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-        defer_logits_calculation (`bool`, *optional*):
-            If `True`, defer logits calculation to the ConditionalGeneration forward. This is used to avoid the
-            memory overhead of calculating logits using regular lm_head forward pass and to use CCE.
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, MllamaForCausalLM
-
-    >>> model = MllamaForCausalLM.from_pretrained("Llama-3.2-11B-Vision")
-    >>> tokenizer = AutoTokenizer.from_pretrained("Llama-3.2-11B-Vision")
-
-    >>> prompt = "If I had to write a haiku, it would be:"
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=40, do_sample=True, temperature=0.6)
-    >>> result = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    >>> print(result)
-    If I had to write a haiku, it would be: "Snowflakes gently fall" - simple, yet peaceful.
-    I love the idea of snowflakes gently falling, each one
-    ```
-    """
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        cross_attention_states=cross_attention_states,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        cross_attention_mask=cross_attention_mask,
-        full_text_row_masked_out_mask=full_text_row_masked_out_mask,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-        cache_position=cache_position,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    slice_indices = (
-        slice(-logits_to_keep, None)
-        if isinstance(logits_to_keep, int)
-        else logits_to_keep
-    )
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states[:, slice_indices, :],
-            self.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **loss_kwargs,
-        )
-    elif _PATCH_OPTS is not None and defer_logits_calculation:
-        # defer logits calculation to the ConditionalGeneration forward
-        logits = hidden_states[:, slice_indices, :]
-    else:
-        logits = self.lm_head(hidden_states[:, slice_indices, :]).float()
-
-        loss = None
-        if labels is not None:
-            loss = self.loss_function(logits, labels, self.vocab_size, **loss_kwargs)
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
-@add_start_docstrings_to_model_forward(MLLAMA_INPUTS_DOCSTRING)
-@replace_return_docstrings(
-    output_type=CausalLMOutputWithPast, config_class="MllamaConfig"
-)
-def cce_forward_multimodal(
-    self,
-    input_ids: Optional[torch.LongTensor] = None,
-    pixel_values: Optional[torch.FloatTensor] = None,
-    aspect_ratio_mask: Optional[torch.Tensor] = None,
-    aspect_ratio_ids: Optional[torch.Tensor] = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    cross_attention_mask: Optional[torch.Tensor] = None,
-    cross_attention_states: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[list[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
-    **loss_kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from PIL import Image
-    >>> import requests
-    >>> from transformers import AutoProcessor, MllamaForConditionalGeneration
-
-    >>> checkpoint = "meta-llama/Llama-3.2-11B-Vision"
-    >>> model = MllamaForConditionalGeneration.from_pretrained(checkpoint)
-    >>> processor = AutoProcessor.from_pretrained(checkpoint)
-
-    >>> prompt = "<|image|>If I had to write a haiku for this one"
-    >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-    >>> image = Image.open(requests.get(url, stream=True).raw)
-
-    >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
-
-    >>> # Generate
-    >>> output = model.generate(**inputs, max_new_tokens=15)
-
-    >>> prompt_len = inputs.input_ids.shape[-1]
-    >>> generated_ids = output[:, prompt_len:]
-    >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-    >>> print(generated_text)
-    [', it would be:.\\nA stop sign in Chinatown.\\n']
-    ```
-    """
-    output_attentions = (
-        output_attentions
-        if output_attentions is not None
-        else self.config.output_attentions
-    )
-    output_hidden_states = (
-        output_hidden_states
-        if output_hidden_states is not None
-        else self.config.output_hidden_states
-    )
-    return_dict = (
-        return_dict if return_dict is not None else self.config.use_return_dict
-    )
-
-    if (input_ids is None) ^ (inputs_embeds is not None):
-        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
-
-    if pixel_values is not None and inputs_embeds is not None:
-        raise ValueError(
-            "You cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one"
-        )
-
-    if pixel_values is not None and cross_attention_states is not None:
-        raise ValueError(
-            "`pixel_values` and `cross_attention_states` cannot be provided simultaneously"
-        )
-
-    if pixel_values is not None:
-        if aspect_ratio_ids is None:
-            raise ValueError(
-                "`aspect_ratio_ids` must be provided if `pixel_values` is provided"
-            )
-        # get vision tokens from vision model
-        vision_outputs = self.vision_model(
-            pixel_values=pixel_values,
-            aspect_ratio_ids=aspect_ratio_ids,
-            aspect_ratio_mask=aspect_ratio_mask,
-            output_hidden_states=output_hidden_states,
-            output_attentions=output_attentions,
-            return_dict=return_dict,
-        )
-        cross_attention_states = vision_outputs[0]
-        cross_attention_states = self.multi_modal_projector(
-            cross_attention_states
-        ).reshape(
-            -1, cross_attention_states.shape[-2], self.hidden_size  # type: ignore
-        )
-
-    if cross_attention_mask is not None:
-        cross_attention_mask, full_text_row_masked_out_mask = (
-            _prepare_cross_attention_mask(
-                cross_attention_mask,
-                num_vision_tokens=self.vision_model.num_patches,
-                dtype=self.dtype,
-            )
-        )
-    else:
-        full_text_row_masked_out_mask = None
-
-    if cross_attention_mask is not None and cache_position is not None:
-        cross_attention_mask = cross_attention_mask[:, :, cache_position]
-        full_text_row_masked_out_mask = full_text_row_masked_out_mask[
-            :, :, cache_position
-        ]
-
-    outputs = self.language_model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        cross_attention_states=cross_attention_states,
-        cross_attention_mask=cross_attention_mask,
-        full_text_row_masked_out_mask=full_text_row_masked_out_mask,
-        past_key_values=past_key_values,
-        use_cache=use_cache,
-        inputs_embeds=inputs_embeds,
-        output_hidden_states=output_hidden_states,
-        output_attentions=output_attentions,
-        return_dict=return_dict,
-        cache_position=cache_position,
-        logits_to_keep=logits_to_keep,
-        defer_logits_calculation=True,  # enable deferred logits calculation
-        **loss_kwargs,
-    )
-
-    hidden_states = outputs[0]
-    loss = None
-    logits = None
-
-    if _PATCH_OPTS is not None and _PATCH_OPTS.use_lce(labels, self.training):
-        assert labels is not None
-        loss = apply_lce(
-            hidden_states,
-            self.language_model.lm_head.weight,
-            labels,
-            _PATCH_OPTS,
-            **loss_kwargs,
-        )
-    else:
-        # Temporary fix to calculate the loss in main class, as the model's vocab size may be resized
-        logits = hidden_states
-
-        if labels is not None:
-            loss = self.loss_function(
-                logits, labels, self.config.get_text_config().vocab_size, **loss_kwargs
-            )
-
-    if not return_dict:
-        return (loss,) + outputs if loss is not None else outputs
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=outputs.logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-def patch_mllama(
-    maybe_model: TransformersModelT | str | transformers.PretrainedConfig,
-    patch_options: PatchOptions,
-) -> TransformersModelT | None:
-
-    global _PATCH_OPTS  # pylint: disable=global-statement
-    from transformers.models.mllama import modeling_mllama
-
-    _PATCH_OPTS = patch_options
-
-    if isinstance(maybe_model, transformers.PreTrainedModel):
-        assert isinstance(
-            maybe_model, modeling_mllama.MllamaForConditionalGeneration
-        ), f"Expected a MllamaForConditionalGeneration model. Got {type(maybe_model)}."
-        maybe_model.forward = MethodType(cce_forward_multimodal, maybe_model)
-
-        # patch the language model
-        maybe_model.language_model.forward = MethodType(
-            cce_forward, maybe_model.language_model
-        )
-        return maybe_model
-
-    modeling_mllama.MllamaForConditionalGeneration.forward = cce_forward_multimodal
-
-    # patch the causal language model
-    modeling_mllama.MllamaForCausalLM.forward = cce_forward
-    return None
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/patch.py
@@ -1,85 +0,0 @@
-# Copyright (C) 2024 Apple Inc. All Rights Reserved.
-
-"""Cut Cross Entropy patcher"""
-
-import transformers
-from cut_cross_entropy.cce_utils import LinearCrossEntropyImpl
-from cut_cross_entropy.linear_cross_entropy import LCE_IMPL_DEFAULT
-from cut_cross_entropy.transformers.llama import patch_llama
-from cut_cross_entropy.transformers.phi3 import patch_phi3
-from cut_cross_entropy.transformers.qwen2 import patch_qwen2
-from cut_cross_entropy.transformers.utils import PatchOptions, TransformersModelT
-
-from axolotl.integrations.cut_cross_entropy.monkeypatch.cohere import (
-    patch_cohere,
-    patch_cohere2,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.gemma import patch_gemma
-from axolotl.integrations.cut_cross_entropy.monkeypatch.gemma3 import (
-    patch_gemma2,
-    patch_gemma3,
-    patch_gemma3_text,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.mistral3 import (
-    patch_mistral,
-    patch_mistral3,
-)
-from axolotl.integrations.cut_cross_entropy.monkeypatch.mllama import patch_mllama
-
-CUT_CROSS_ENTROPY_MODEL_MAPPING = {
-    "llama": patch_llama,
-    "mllama": patch_mllama,
-    "phi3": patch_phi3,
-    "gemma": patch_gemma,
-    "gemma2": patch_gemma2,
-    "gemma3": patch_gemma3,
-    "gemma3_text": patch_gemma3_text,
-    "mistral": patch_mistral,
-    "mistral3": patch_mistral3,
-    "qwen2": patch_qwen2,
-    "cohere": patch_cohere,
-    "cohere2": patch_cohere2,
-}
-
-
-def cce_patch(
-    model_type_or_model: str | TransformersModelT | transformers.PretrainedConfig,
-    impl: str | LinearCrossEntropyImpl = LCE_IMPL_DEFAULT,
-    reduction: str = "mean",
-    filter_eps: float | str | None = "auto",
-    accum_e_fp32: bool = False,
-    accum_c_fp32: bool = False,
-    filter_e_grad: bool = True,
-    filter_c_grad: bool = True,
-    train_only: bool = False,
-) -> TransformersModelT | None:
-    if isinstance(impl, LinearCrossEntropyImpl):
-        impl = impl.name.lower()
-
-    if impl not in (v.name.lower() for v in LinearCrossEntropyImpl):
-        raise ValueError(f"Unknown {impl=}")
-
-    if isinstance(model_type_or_model, transformers.PreTrainedModel):
-        model_type = model_type_or_model.config.model_type
-    elif isinstance(model_type_or_model, transformers.PretrainedConfig):
-        model_type = model_type_or_model.model_type
-    else:
-        model_type = model_type_or_model
-
-    patch_options = PatchOptions(
-        impl=impl,
-        reduction=reduction,
-        filter_eps=filter_eps,
-        accum_e_fp32=accum_e_fp32,
-        accum_c_fp32=accum_c_fp32,
-        filter_e_grad=filter_e_grad,
-        filter_c_grad=filter_c_grad,
-        train_only=train_only,
-    )
-
-    if model_type in CUT_CROSS_ENTROPY_MODEL_MAPPING:
-        return CUT_CROSS_ENTROPY_MODEL_MAPPING[model_type](
-            model_type_or_model, patch_options
-        )
-
-    raise RuntimeError(f"Unknown model type {model_type}")
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/utils.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/utils.py
@@ -1,40 +0,0 @@
-# Copyright (C) 2024 Apple Inc. All Rights Reserved.
-
-"""Monkeypatch for apply_lce to add softcap."""
-
-import torch
-from cut_cross_entropy import linear_cross_entropy
-from cut_cross_entropy.transformers.utils import PatchOptions
-
-
-def apply_lce(
-    e: torch.Tensor,
-    c: torch.Tensor,
-    labels: torch.Tensor,
-    opts: PatchOptions,
-    bias: torch.Tensor | None = None,
-    softcap: float | None = None,
-    **loss_kwargs,
-) -> torch.Tensor:
-    """Monkey patch for apply_lce to support softcap kwarg."""
-    num_items_in_batch = loss_kwargs.get("num_items_in_batch", None)
-    cce_kwargs = opts.to_kwargs()
-    if num_items_in_batch is not None and cce_kwargs["reduction"] == "mean":
-        cce_kwargs["reduction"] = "sum"
-    else:
-        num_items_in_batch = None
-
-    loss = linear_cross_entropy(
-        e,
-        c,
-        labels.to(e.device),
-        bias=bias,
-        shift=True,
-        softcap=softcap,
-        **cce_kwargs,
-    )
-
-    if num_items_in_batch is not None:
-        loss = loss / num_items_in_batch
-
-    return loss
--- a/src/axolotl/integrations/liger/README.md
+++ b/src/axolotl/integrations/liger/README.md
@@ -20,26 +20,6 @@ liger_layer_norm: true
 liger_fused_linear_cross_entropy: true
 ```

-## Supported Models
-
- deepseek_v2
- gemma
- gemma2
- gemma3 (partial support, no support for FLCE yet)
- granite
- jamba
- llama
- mistral
- mixtral
- mllama
- mllama_text_model
- olmo2
- paligemma
- phi3
- qwen2
- qwen2_5_vl
- qwen2_vl
-
 ## Citation

 ```bib
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -21,7 +21,6 @@ It is designed to be performant, correct, and light-weight.
 import inspect
 import logging
 import sys
-from functools import partial

 from axolotl.integrations.base import BasePlugin

@@ -42,18 +41,11 @@ class LigerPlugin(BasePlugin):
    def pre_model_load(self, cfg):
        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
        from liger_kernel.transformers.functional import liger_cross_entropy
-        from liger_kernel.transformers.geglu import LigerGEGLUMLP
-        from liger_kernel.transformers.layer_norm import LigerLayerNorm
        from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
        from liger_kernel.transformers.rms_norm import LigerRMSNorm
        from liger_kernel.transformers.rope import liger_rotary_pos_emb
        from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

-        if cfg.liger_cross_entropy and cfg.liger_fused_linear_cross_entropy:
-            raise ValueError(
-                "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set."
-            )
-
        if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
            apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
            liger_fn_sig = inspect.signature(apply_liger_fn)
@@ -90,8 +82,6 @@ class LigerPlugin(BasePlugin):
                modeling_jamba.JambaRMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                modeling_jamba.JambaMLP = LigerSwiGLUMLP
-            if cfg.liger_layer_norm:
-                modeling_jamba.nn.LayerNorm = LigerLayerNorm
            if cfg.liger_cross_entropy:
                from transformers.loss.loss_utils import nn

@@ -114,51 +104,13 @@ class LigerPlugin(BasePlugin):
                # The DeepseekV2 version of RoPE is different than upstream LLaMA.
                # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528
                logging.warning("Fused liger_rope is not supported for DeepseekV2.")
-            if cfg.liger_glu_activation:
-                logging.warning("liger_glu_activation is not supported for DeepseekV2.")
            if cfg.liger_rms_norm:
                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
-            if cfg.liger_layer_norm:
-                modeling_mod.DeepseekV2MLP.forward = LigerLayerNorm.forward
            if cfg.liger_cross_entropy:
                # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
                # nn.CrossEntropyLoss in the forward method.
                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
-        elif cfg.model_config_type in ["gemma3", "gemma3_text"]:
-            from transformers.models.gemma3 import modeling_gemma3
-
-            if cfg.liger_rope:
-                modeling_gemma3.apply_rotary_pos_emb = liger_rotary_pos_emb
-            if cfg.liger_rms_norm:
-
-                def _liger_rms_norm_wrapper(dim, **kwargs):
-                    "Convert 'dim' keyword to 'hidden_size' to pass to LigerRMSNorm"
-                    return LigerRMSNorm(hidden_size=dim, **kwargs)
-
-                modeling_gemma3.Gemma3RMSNorm = partial(
-                    _liger_rms_norm_wrapper,
-                    offset=1.0,
-                    casting_mode="gemma",
-                    init_fn="zeros",
-                    in_place=False,
-                )
-            if cfg.liger_glu_activation:
-                modeling_gemma3.Gemma3MLP = LigerGEGLUMLP
-            if cfg.liger_layer_norm:
-                modeling_gemma3.nn.LayerNorm = LigerLayerNorm
-
-            if cfg.liger_cross_entropy:
-                from transformers.loss.loss_utils import nn
-
-                nn.functional.cross_entropy = liger_cross_entropy
-
-            if cfg.liger_fused_linear_cross_entropy:
-                raise NotImplementedError(
-                    "Fused linear cross entropy is not yet supported for Gemma3."
-                )
-        elif cfg.model_config_type in ["deepseek_v3"]:
-            raise ValueError(f"Unsupported model config type: {cfg.model_config_type}")
--- a/src/axolotl/monkeypatch/attention/ring_attn.py
+++ b/src/axolotl/monkeypatch/attention/ring_attn.py
@@ -27,7 +27,7 @@ def get_ring_attn_group() -> dist.ProcessGroup:
    return RING_ATTN_GROUP


-def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
+def set_ring_attn_group(ring_attn_group: dist.ProcessGroup):
    """
    Setter for ring attention group on this rank.

@@ -38,19 +38,13 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
    RING_ATTN_GROUP = ring_attn_group


-def register_ring_attn(sequence_parallel_degree: int, heads_k_stride: int | None):
+def register_ring_attn(sequence_parallel_degree: int):
    """
    Create ring attention group and substitute flash attn with ring flash attn.

    Args:
        sequence_parallel_degree: Sequence parallelism factor.
-        heads_k_stride: Sequence parallelism K head stride size. Passed
-            through to `ring_flash_attn.substitute_hf_flash_attn`.
    """
-    if get_ring_attn_group() is not None:
-        LOG.info("Ring attention already registered, exiting early...")
-        return
-
    LOG.info(
        "Enabling ring attention sequence parallelism: "
        f"each sequence will be processed across {sequence_parallel_degree} GPUs"
@@ -90,11 +84,6 @@ def register_ring_attn(sequence_parallel_degree: int, heads_k_stride: int | None
    if rank == 0:
        LOG.info(f"Sequence parallel group assignments: {group_assignments}")

-    if heads_k_stride is None:
-        heads_k_stride = 1
-
    from ring_flash_attn import substitute_hf_flash_attn

-    substitute_hf_flash_attn(
-        process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride
-    )
+    substitute_hf_flash_attn(get_ring_attn_group(), sequence_parallel_degree)
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -22,9 +22,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "phi3",
    "gemma",
    "gemma2",
-    "gemma3_text",
-    "cohere",
-    "cohere2",
    "gemmoe",
    "starcoder2",
    "deepseek_v2",
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -1,278 +0,0 @@
-"""Module containing ProcessingStrategy classes and its derivative for different MultiModal Model types"""
-
-from copy import deepcopy
-from typing import Optional
-
-from PIL import Image, ImageOps
-from PIL.Image import Resampling
-from torch import Tensor
-from transformers import ProcessorMixin
-from transformers.image_utils import load_image
-
-
-class ProcessingStrategy:
-    """Base Processing Strategy class"""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        self.processor = processor
-        self.chat_template = chat_template
-        self.image_token = None
-        self.image_token_id = None
-
-        self.image_size = image_size
-        self.image_resize_algorithm = (
-            image_resize_algorithm or Image.Resampling.BILINEAR
-        )
-
-        if hasattr(processor, "image_token"):
-            self.image_token = processor.image_token
-            self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
-                self.image_token
-            )
-
-    def __call__(self, examples: list[dict]) -> list[dict]:
-        """
-        Preprocess conversation examples to ensure consistent format.
-        Converts different conversation formats to OpenAI format with 'messages'.
-        Supports two formats:
-        1. OpenAI format with 'messages'
-        2. Legacy format with 'conversations'
-
-        Args:
-            examples: list of conversation dictionaries
-
-        Returns:
-            list of dicts in OpenAI format with 'messages' key
-
-        Raises:
-            ValueError: If the conversation format is not supported
-        """
-        role_mapping = {
-            "human": "user",
-            "gpt": "assistant",
-        }
-
-        def normalize_role(role: str) -> str:
-            """Normalize role names to OpenAI format. Default to original role if not found."""
-            return role_mapping.get(role, role)
-
-        def convert_legacy_format(example: dict) -> dict:
-            """Convert legacy 'conversations' format to OpenAI 'messages' format."""
-            messages = [
-                {"role": normalize_role(convo["from"]), "content": convo["value"]}
-                for convo in example["conversations"]
-            ]
-
-            # Create new dict without 'conversations' key
-            result = deepcopy(example)
-            result.pop("conversations")
-            result["messages"] = messages
-            return result
-
-        def convert_messages_to_multimedia_messages(messages: list[dict]) -> list[dict]:
-            """Convert regular messages format to Messages format with content type"""
-
-            new_messages = []
-            for message in messages:
-                if isinstance(message["content"], str):
-                    new_messages.append(
-                        {
-                            "role": message["role"],
-                            "content": [
-                                {
-                                    "type": "text",
-                                    "text": message["content"],
-                                }
-                            ],
-                        }
-                    )
-                elif isinstance(message["content"], list):
-                    content = message["content"]
-
-                    new_messages.append(
-                        {
-                            "role": message["role"],
-                            "content": content,
-                        }
-                    )
-
-            return new_messages
-
-        processed_examples = []
-        for example in examples:
-            if not ("messages" in example or "conversations" in example):
-                raise ValueError(
-                    "Only `messages` and `conversations` message keys are currently supported."
-                )
-
-            processed_example = None
-            if "messages" in example:  # OpenAI format
-                processed_example = example
-            else:  # Legacy format
-                processed_example = convert_legacy_format(example)
-
-            # convert regular messages format to Messages format with content type
-            # for compatibility with apply_chat_template
-            processed_example["messages"] = convert_messages_to_multimedia_messages(
-                processed_example["messages"]
-            )
-
-            # find the image key if it exists
-            possible_image_keys = ["images", "image"]
-            image_key = None
-            for key in possible_image_keys:
-                if key in processed_example:
-                    image_key = key
-                    break
-
-            # if the image key exists, add the image to the first message
-            if image_key is not None:
-                # TODO: check if it's normal to be single image only for common datasets
-                # From observation, it's usually a list of single image but some datasets may have several columns for images
-                # Temporary solution: take the first image and suggest people convert their datasets to use multi-content Messages
-                image_value = processed_example[image_key][0]
-
-                # Handle image loading (Image, url, path, base64)
-                image_value = load_image(image_value)
-
-                if self.image_size is not None:
-                    assert hasattr(
-                        image_value, "resize"
-                    ), "Image does not have a resize method"
-
-                    if isinstance(self.image_size, tuple):
-                        image_value = image_value.resize(
-                            self.image_size, self.image_resize_algorithm
-                        )
-                    else:
-                        # Set the padding value; here we use black (0, 0, 0) for RGB images
-                        padding_color = (0, 0, 0)
-
-                        # When image_size is an int (square target), preserve aspect ratio then pad
-                        # This is to prevent aspect ratio distortion when resizing to square
-                        image_value = ImageOps.pad(
-                            image_value,
-                            (self.image_size, self.image_size),
-                            method=self.image_resize_algorithm,
-                            color=padding_color,
-                        )
-
-                # Look for any image type in the first message
-                # some dataset have an {type: "image"} in the first message
-                ind_to_add = None
-
-                for i, content in enumerate(
-                    processed_example["messages"][0]["content"]
-                ):
-                    # Usually datasets created with image columns, don't have it in the messages itself
-                    if content["type"] == "image" and all(
-                        k not in content for k in ["image", "url", "path", "base64"]
-                    ):
-                        ind_to_add = i
-                        break
-
-                # If an image type is found, add the image to that index
-                if ind_to_add is not None:
-                    processed_example["messages"][0]["content"][ind_to_add][
-                        "image"
-                    ] = image_value
-                else:
-                    # if no image type is found, add it to end of the first message
-                    processed_example["messages"][0]["content"].append(
-                        {
-                            "type": "image",
-                            "image": image_value,
-                        }
-                    )
-
-            processed_examples.append(processed_example)
-
-        return processed_examples
-
-    def process_labels(self, input_ids: Tensor) -> Tensor:
-        labels = input_ids.clone()
-
-        # The labels are the input_ids, and we mask the padding tokens in the loss computation
-        labels[labels == self.processor.tokenizer.pad_token_id] = -100
-
-        # Ignore the image token index in the loss computation (model specific)
-        labels[labels == self.image_token_id] = -100
-
-        return labels
-
-
-class Qwen2VLProcessingStrategy(ProcessingStrategy):
-    """Processing Strategy class for Qwen2-VL"""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
-        self.image_token = "<|image_pad|>"  # nosec
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
-            self.image_token
-        )
-
-
-class Gemma3ProcessingStrategy(ProcessingStrategy):
-    """Processing Strategy class for Gemma3"""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
-        self.image_token = processor.tokenizer.special_tokens_map["boi_token"]
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
-            self.image_token
-        )
-
-    def process_labels(self, input_ids):
-        labels = input_ids.clone()
-
-        # Follows https://ai.google.dev/gemma/docs/core/huggingface_vision_finetune_qlora
-        labels[labels == self.processor.tokenizer.pad_token_id] = -100
-        labels[labels == self.image_token_id] = -100
-        labels[labels == 262144] = -100  # corresponds to <image_soft_token>
-
-        return labels
-
-
-def get_processing_strategy(
-    processor: ProcessorMixin,
-    chat_template,
-    chat_template_type,
-    image_size: int | tuple[int, int] | None = None,
-    image_resize_algorithm: Resampling | None = None,
-):
-    if chat_template_type == "qwen2_vl":
-        return Qwen2VLProcessingStrategy(
-            processor, chat_template, image_size, image_resize_algorithm
-        )
-    if chat_template_type == "gemma3":
-        return Gemma3ProcessingStrategy(
-            processor, chat_template, image_size, image_resize_algorithm
-        )
-    if chat_template_type in [
-        "llama3_2_vision",
-        "llava",
-        "mistral_v7_tekken",
-        "pixtral",
-    ]:
-        return ProcessingStrategy(
-            processor, chat_template, image_size, image_resize_algorithm
-        )
-    raise ValueError(f"Unsupported chat template type: {chat_template_type}")
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -411,15 +411,11 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
        if turn_idx >= len(turns):
            raise ValueError(f"Turn index {turn_idx} out of range")

-        # mistral/gemma3 does not output message if it contains only system message
+        # mistral does not output message if it contains only system message
        if (
            turn_idx == 0
            and turns[0].get("role") == "system"
-            and (
-                "mistral" in self.tokenizer.name_or_path.lower()
-                # gemma3 uses gemma tokenizer
-                or "gemma" in self.tokenizer.name_or_path.lower()
-            )
+            and "mistral" in self.tokenizer.name_or_path.lower()
        ):
            return -1, -1

--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -14,7 +14,6 @@ import transformers.modelcard
 from accelerate.logging import get_logger
 from accelerate.utils import save_fsdp_model
 from datasets import Dataset
-from huggingface_hub.errors import OfflineModeIsEnabled
 from peft import PeftConfig, PeftModel
 from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
@@ -27,7 +26,6 @@ from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.freeze import freeze_layers_except
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer
@@ -158,8 +156,6 @@ def setup_signal_handler(
                _model.save_pretrained(
                    cfg.output_dir, safe_serialization=safe_serialization
                )
-
-            cleanup_distributed()
            sys.exit(0)

        _model_weakref = weakref.ref(model)
@@ -306,7 +302,7 @@ def create_model_card(cfg: DictDefault, trainer: Trainer):
                    model_card_kwarg["dataset_tags"] = dataset_tags

            trainer.create_model_card(**model_card_kwarg)
-        except (AttributeError, UnicodeDecodeError, OfflineModeIsEnabled):
+        except (AttributeError, UnicodeDecodeError):
            pass
    elif cfg.hub_model_id:
        # Defensively push to the hub to ensure the model card is updated
@@ -318,7 +314,6 @@ def save_initial_configs(
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    peft_config: PeftConfig | None,
-    processor: ProcessorMixin | None,
 ):
    """
    Save initial configurations before training.
@@ -346,10 +341,6 @@ def save_initial_configs(
        LOG.info(f"Pre-saving model config to {cfg.output_dir}...")
        model.config.save_pretrained(str(output_dir))

-    if processor:
-        LOG.info(f"Pre-saving processor to {cfg.output_dir}...")
-        processor.save_pretrained(str(output_dir))
-

 def setup_model_card(cfg: DictDefault):
    """
@@ -417,7 +408,6 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
    PeftModel | PreTrainedModel,
    PreTrainedTokenizer,
    PeftConfig | None,
-    ProcessorMixin | None,
 ]:
    """
    Load model, tokenizer, trainer, etc. Helper function to encapsulate the full
@@ -433,7 +423,6 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
            - Model
            - Tokenizer
            - PEFT config
-            - Processor
    """
    # Load tokenizer, processor and model
    model, tokenizer, peft_config, processor = setup_model_and_tokenizer(cfg)
@@ -464,7 +453,6 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
        model,
        tokenizer,
        peft_config,
-        processor,
    )


@@ -481,35 +469,42 @@ def train(
    Returns:
        Tuple of (model, tokenizer) after training
    """
-    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
+    # Setup model, tokenizer, (causal or RLHF) trainer etc.
    (
        trainer,
        model,
        tokenizer,
        peft_config,
-        processor,
    ) = setup_model_and_trainer(cfg, dataset_meta)

-    # Handle untrained tokens if configured
+    # Determine if we need to resume from a checkpoint
+    resume_from_checkpoint = determine_resume_checkpoint(cfg)
+
+    # Configuration for saving
    safe_serialization = cfg.save_safetensors is True
+
+    # Handle untrained tokens if configured
    train_dataset = dataset_meta.train_dataset
    handle_untrained_tokens_fix(
        cfg, model, tokenizer, train_dataset, safe_serialization
    )

-    # Additional setup
-    save_initial_configs(cfg, tokenizer, model, peft_config, processor)
+    # Save initial configs
+    save_initial_configs(cfg, tokenizer, model, peft_config)
+
+    # Set up signal handler for graceful termination
    setup_signal_handler(cfg, model, safe_serialization)
+
+    # Set up badges and config info for model card
    setup_model_card(cfg)

    # Execute the training
-    resume_from_checkpoint = determine_resume_checkpoint(cfg)
    execute_training(cfg, trainer, resume_from_checkpoint)

-    # Save the trained model and cleanup
+    # Save the trained model
    save_trained_model(cfg, trainer, model, safe_serialization)
+
+    # Create model card
    create_model_card(cfg, trainer)
-    if not cfg.use_ray:
-        cleanup_distributed()

    return model, tokenizer, trainer
--- a/src/axolotl/utils/callbacks/init.py
+++ b/src/axolotl/utils/callbacks/init.py
@@ -816,6 +816,27 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
        return control


+class SaveModelCallback(TrainerCallback):
+    """Callback to flag a model save on the final training step and on train end"""
+
+    def on_step_end(  # pylint: disable=unused-argument
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        # Save
+        if state.global_step >= state.max_steps:
+            control.should_save = True
+
+    def on_train_end(  # pylint: disable=unused-argument
+        self, args, state, control, **kwargs
+    ):
+        control.should_save = True
+        return control
+
+
 class GCCallback(TrainerCallback):
    """Callback to garbage collect torch cache"""

--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
--- a/src/axolotl/utils/collators/mm_chat.py
+++ b/src/axolotl/utils/collators/mm_chat.py
@@ -2,17 +2,15 @@
 Collators for multi-modal chat messages and packing
 """

+from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any, Optional, Union

-import torch
-from torch import Tensor
-from transformers import PreTrainedTokenizerBase
+from PIL import Image
+from transformers import PreTrainedTokenizerBase, ProcessorMixin
 from transformers.data.data_collator import DataCollatorMixin
 from transformers.utils import PaddingStrategy

-from axolotl.processing_strategies import ProcessingStrategy
-

@dataclass
 class MultiModalChatDataCollator(DataCollatorMixin):
@@ -21,9 +19,11 @@ class MultiModalChatDataCollator(DataCollatorMixin):
    """

    tokenizer: PreTrainedTokenizerBase
-    processing_strategy: ProcessingStrategy
-    packing: bool = False
+    processor: ProcessorMixin
    return_tensors: str = "pt"
+    chat_template: Optional[str] = None
+    packing: bool = False
+    max_images: int = -1
    padding: Union[bool, str, PaddingStrategy] = True
    pad_to_multiple_of: Optional[int] = None

@@ -31,62 +31,162 @@ class MultiModalChatDataCollator(DataCollatorMixin):
        if self.packing:
            raise ValueError("Packing is currently not supported.")

-    def torch_call(self, examples: list[dict]) -> dict[str, Any]:
-        return self.process_rows(examples)
+    def torch_call(
+        self, examples: list[Union[list[int], Any, dict[str, Any]]]
+    ) -> dict[str, Any]:
+        # Handle dict or lists with proper padding and conversion to tensor.
+
+        return self.__class__.process_rows(
+            examples, self.processor, self.chat_template, self.max_images
+        )
+
+    @staticmethod
+    def process_rows(examples, processor, chat_template, max_images, length_only=False):
+        # HINT: use `_torch_collate_batch` to stack and pad tensors
+        # see also DataCollatorWithFlattening and DefaultDataCollator
+
+        # *** This is COPIED from the trl example sft_vlm.py code ***
+        # use this as a starting point
+
+        def _preprocess(examples: list[dict]) -> list[dict]:
+            """
+            Preprocess conversation examples to ensure consistent format.
+
+            Converts different conversation formats to OpenAI format with 'messages'.
+            Supports two formats:
+            1. OpenAI format with 'messages'
+            2. Legacy format with 'conversations'
+
+            Args:
+                examples: list of conversation dictionaries
+
+            Returns:
+                list of dicts in OpenAI format with 'messages' key
+
+            Raises:
+                ValueError: If the conversation format is not supported
+            """
+            role_mapping = {
+                "human": "user",
+                "gpt": "assistant",
+            }
+
+            def normalize_role(role: str) -> str:
+                """Normalize role names to OpenAI format. Default to original role if not found."""
+                return role_mapping.get(role, role)
+
+            def convert_legacy_format(example: dict) -> dict:
+                """Convert legacy 'conversations' format to OpenAI 'messages' format."""
+                messages = [
+                    {
+                        "role": normalize_role(convo["from"]),
+                        "content": convo["value"],
+                    }
+                    for convo in example["conversations"]
+                ]
+
+                # Create new dict without 'conversations' key
+                result = deepcopy(example)
+                result.pop("conversations")
+                return {"messages": messages, **result}
+
+            processed_examples = []
+            for example in examples:
+                # OpenAI format
+                if "messages" in example:
+                    processed_examples.append(example)
+
+                # Legacy format
+                elif "conversations" in example:
+                    processed_examples.append(convert_legacy_format(example))
+
+                else:
+                    raise ValueError(
+                        "Only `messages` and `conversations` message keys are currently supported."
+                    )
+
+            return processed_examples
+
+        def _process_images(examples, max_images):
+            """
+            Process images from examples, ensuring consistency in image presence and applying max_images limit.
+
+            Args:
+                examples: List of dictionaries that may contain 'images' key
+                max_images: Maximum number of images to keep per example (values <= 0 mean no limit)
+
+            Returns:
+                Either None (if no images) or List[Image objects] (if all examples have images)
+
+            Raises:
+                ValueError: If there's a mix of None and non-None images
+            """
+
+            def get_image(example):
+                if "images" not in example:
+                    return None
+                images = example["images"]
+                if isinstance(images, str):
+                    return Image.open(images)
+                return images
+
+            images = [get_image(example) for example in examples]
+
+            # Count None and non-None images
+            none_count = sum(1 for img in images if img is None)
+
+            # All images are None
+            if none_count == len(images):
+                return None
+
+            # Mix of None and non-None images
+            if none_count > 0:
+                raise ValueError(
+                    "All images should be either None or not None. "
+                    "Please provide images for all examples or None."
+                )
+
+            # Apply max_images limit if specified
+            if max_images > 0:
+                images = [
+                    (
+                        img_batch[:max_images]
+                        if isinstance(img_batch, (list, tuple))
+                        else img_batch
+                    )
+                    for img_batch in images
+                ]
+
+            return images

-    def process_rows(
-        self,
-        examples: list[dict],
-    ) -> dict[str, Tensor]:
        # Preprocess the examples
-        examples = self.processing_strategy(examples)
+        examples = _preprocess(examples)

-        # Initialize batch
-        batch: dict[str, Any] = {}
-
-        # Process each example
-        for example in examples:
-            # Apply chat template to process the example
-            # This method requires transformers>=4.49.0
-            result = self.processing_strategy.processor.apply_chat_template(
-                example["messages"],
-                add_generation_prompt=True,
-                tokenize=True,
-                return_tensors="pt",
-                padding=True,
-                return_dict=True,
-                chat_template=self.processing_strategy.chat_template,
+        # Get the texts and images, and apply the chat template
+        texts = [
+            processor.apply_chat_template(
+                example["messages"], chat_template=chat_template, tokenize=False
            )
+            for example in examples
+        ]

-            # TODO: Check if need handling for len(input_ids) > sequence_len
+        images = _process_images(examples, max_images=max_images)

-            # Add the processed tensors to our batch
-            for key in result.keys():
-                if key not in batch:
-                    batch[key] = []
+        # Tokenize the texts and process the images
+        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)

-                batch[key].append(result[key].squeeze(0))
-
-        # Pad sequences to the same length
-        input_ids = torch.nn.utils.rnn.pad_sequence(
-            batch["input_ids"],
-            batch_first=True,
-            padding_value=self.tokenizer.pad_token_id,
+        # The labels are the input_ids, and we mask the padding tokens in the loss computation
+        labels = batch["input_ids"].clone()
+        labels[labels == processor.tokenizer.pad_token_id] = -100  #
+        # Ignore the image token index in the loss computation (model specific)
+        image_token_id = processor.tokenizer.convert_tokens_to_ids(
+            processor.image_token
        )
+        labels[labels == image_token_id] = -100
+        batch["labels"] = labels

-        attention_mask = torch.nn.utils.rnn.pad_sequence(
-            batch["attention_mask"], batch_first=True, padding_value=0
-        )
-
-        # Create the final batch
-        final_batch = {
-            "input_ids": input_ids,
-            "attention_mask": attention_mask,
-        }
-
-        # Process the labels
-        final_batch["labels"] = self.processing_strategy.process_labels(
-            final_batch["input_ids"]
-        )
-
-        return final_batch
+        if length_only:
+            return {
+                "length": [len(input_ids) for input_ids in batch["input_ids"]]
+            }
+        return batch
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -13,7 +13,7 @@ from axolotl.integrations.base import PluginManager
 from axolotl.integrations.config import merge_input_args
 from axolotl.utils.bench import log_gpu_memory_usage
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config
+from axolotl.utils.models import load_model_config
 from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
 )
@@ -158,7 +158,7 @@ def normalize_config(cfg):

    cfg.is_multimodal = (
        hasattr(model_config, "model_type")
-        and model_config.model_type in MULTIMODAL_AUTO_MODEL_MAPPING
+        and model_config.model_type in ["llava", "mllama"]
        or any(
            multimodal_name in cfg.base_model.lower()
            for multimodal_name in [
@@ -171,6 +171,7 @@ def normalize_config(cfg):
        cfg.processor_config = (
            cfg.processor_config or cfg.base_model_config or cfg.base_model
        )
+        model_config = model_config.text_config

    cfg.model_config_type = model_config.model_type

--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -6,12 +6,8 @@ from pathlib import Path
 from typing import Optional, Union

 from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
-from huggingface_hub import hf_hub_download, snapshot_download
-from huggingface_hub.errors import (
-    HFValidationError,
-    RepositoryNotFoundError,
-    RevisionNotFoundError,
-)
+from huggingface_hub import hf_hub_download
+from huggingface_hub.errors import HFValidationError

 from axolotl.utils.dict import DictDefault

@@ -74,25 +70,20 @@ def load_dataset_w_config(
    # pylint: disable=invalid-name
    ds: Optional[Union[Dataset, DatasetDict]] = None  # pylint: disable=invalid-name
    ds_from_hub = False
+    ds_trust_remote_code = config_dataset.trust_remote_code
    try:
        # this is just a basic check to see if the path is a
        # valid HF dataset that's loadable
-        snapshot_download(
-            repo_id=config_dataset.path,
-            repo_type="dataset",
+        load_dataset(
+            config_dataset.path,
+            name=config_dataset.name,
+            streaming=True,
            token=use_auth_token,
            revision=config_dataset.revision,
-            ignore_patterns=["*"],
+            trust_remote_code=ds_trust_remote_code,
        )
        ds_from_hub = True
-    except (
-        RepositoryNotFoundError,
-        RevisionNotFoundError,
-        FileNotFoundError,
-        ConnectionError,
-        HFValidationError,
-        ValueError,
-    ):
+    except (FileNotFoundError, ConnectionError, HFValidationError, ValueError):
        pass

    ds_from_cloud = False
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -71,8 +71,8 @@ def barrier():

 def is_main_process():
    """
-    Check if the current process is the main process. If not in distributed mode,
-    always return `True`.
+    Check if the current process is the main process.
+    If not in distributed mode, always return True.
    """
    if not is_distributed():
        return True
@@ -87,18 +87,6 @@ def get_world_size():
    return int(os.getenv("WORLD_SIZE", "1"))


-def cleanup_distributed():
-    """
-    Destroy process group if torch distributed is initialized. Called in training early
-    termination or when training successfully completes.
-    """
-    # Ensure that all operations are completed before destroying the process group
-    torch.cuda.synchronize()
-    # Destroy the process group
-    if torch.distributed.is_initialized():
-        torch.distributed.destroy_process_group()
-
-
@contextmanager
 def zero_only():
    """
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -8,7 +8,7 @@ import math
 import os
 import types
 from functools import cached_property
-from typing import Any, Dict, Optional, Tuple
+from typing import Any, Dict, Optional, Tuple, Union  # noqa: F401

 import addict
 import bitsandbytes as bnb
@@ -25,7 +25,7 @@ from peft import (
    prepare_model_for_kbit_training,
 )
 from torch import nn
-from transformers import (
+from transformers import (  # noqa: F401
    AddedToken,
    AutoConfig,
    AutoModelForCausalLM,
@@ -34,17 +34,12 @@ from transformers import (
    AutoTokenizer,
    AwqConfig,
    BitsAndBytesConfig,
-    Gemma3ForConditionalGeneration,
    GPTQConfig,
    LlavaForConditionalGeneration,
-    Mistral3ForConditionalGeneration,
    MllamaForConditionalGeneration,
-    PretrainedConfig,
    PreTrainedModel,
    PreTrainedTokenizerBase,
    ProcessorMixin,
-    Qwen2_5_VLForConditionalGeneration,
-    Qwen2VLForConditionalGeneration,
 )
 from transformers.integrations.deepspeed import (
    HfTrainerDeepSpeedConfig,
@@ -74,13 +69,9 @@ from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_mod

 LOG = logging.getLogger(__name__)

-MULTIMODAL_AUTO_MODEL_MAPPING = {
-    "mllama": MllamaForConditionalGeneration,
+MULTIMODEL_AUTO_MODEL_MAPPING = {
    "llava": LlavaForConditionalGeneration,
-    "qwen2_vl": Qwen2VLForConditionalGeneration,
-    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
-    "mistral3": Mistral3ForConditionalGeneration,
-    "gemma3": Gemma3ForConditionalGeneration,
+    "mllama": MllamaForConditionalGeneration,
 }


@@ -108,30 +99,9 @@ def get_module_class_from_name(module, name):
    return None


-def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
-    # Set use_cache to False
-    if hasattr(model_config, "use_cache"):
-        model_config.use_cache = False
-
+def check_model_config(cfg: DictDefault, model_config: Union[AutoConfig, DictDefault]):
    if cfg.is_multimodal:
-        # For multimodal configs, use_cache is set in the text_config
-        if hasattr(model_config, "get_text_config"):
-            text_config = model_config.get_text_config()
-            if hasattr(text_config, "use_cache"):
-                text_config.use_cache = False
-        else:
-            raise ValueError(
-                "No text config found for multimodal model. Please raise an Issue with model details."
-            )
-
-        # check if image_size is not set and load image size from model config if available
-        if (
-            cfg.image_size is None
-            and hasattr(model_config, "vision_config")
-            and hasattr(model_config.vision_config, "image_size")
-        ):
-            cfg.image_size = model_config.vision_config.image_size
-            LOG.debug(f"Loaded image size: {cfg.image_size} from model config")
+        model_config = model_config.text_config

    quant_config_exists = (
        hasattr(model_config, "quantization_config")
@@ -470,31 +440,6 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
        **processor_kwargs,
    )

-    # Attempt to load image size from processor if available
-    if (
-        cfg.image_size is None
-        and hasattr(processor, "size")
-        and any(dim in processor.size for dim in ["width", "height"])
-    ):
-        im_width = None
-        im_height = None
-        if "width" in processor.size:
-            im_width = processor.size["width"]
-        if "height" in processor.size:
-            im_height = processor.size["height"]
-
-        # If both width and height are set, use a tuple
-        if im_width is not None and im_height is not None:
-            cfg.image_size = (im_width, im_height)
-        # If only width is set, use as integer
-        elif im_width is not None:
-            cfg.image_size = im_width
-        # If only height is set, use as integer
-        elif im_height is not None:
-            cfg.image_size = im_height
-
-        LOG.debug(f"Loaded image size: {cfg.image_size} from processor")
-
    return processor


@@ -531,6 +476,10 @@ class ModelLoader:

        # init model config
        self.model_config = load_model_config(cfg)
+        if cfg.is_multimodal:
+            self.text_model_config = self.model_config.text_config
+        else:
+            self.text_model_config = self.model_config

        self.auto_model_loader = AutoModelForCausalLM  # pylint: disable=invalid-name

@@ -609,10 +558,7 @@ class ModelLoader:
            # Initialize ring attn for sequence parallelism. This must be done after
            # model init but before the first forward pass, since it modifies flash
            # attn to use ring comm for SP training across multiple GPUs.
-            register_ring_attn(
-                sequence_parallel_degree=self.cfg.sequence_parallel_degree,
-                heads_k_stride=self.cfg.heads_k_stride,
-            )
+            register_ring_attn(self.cfg.sequence_parallel_degree)

    def patch_attention(self) -> None:
        if hasattr(self.model_config, "model_type"):
@@ -727,7 +673,7 @@ class ModelLoader:
        should be set according to the type of the model.
        """
        if self.cfg.is_multimodal:
-            self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
+            self.auto_model_loader = MULTIMODEL_AUTO_MODEL_MAPPING.get(
                self.model_config.model_type, AutoModelForVision2Seq
            )

@@ -950,6 +896,8 @@ class ModelLoader:
            quantization_config = (
                quantization_config or self.model_kwargs["quantization_config"]
            )
+            if self.cfg.is_multimodal:
+                self.model_config.text_config = self.text_model_config
            self.model = load_sharded_model_quant(
                self.base_model,
                self.model_config,
@@ -970,6 +918,9 @@ class ModelLoader:

            _ = _configure_zero3_memory_efficient_loading()

+            if self.cfg.is_multimodal:
+                self.model_config.text_config = self.text_model_config
+
            # Load model with random initialization if specified
            if self.cfg.random_init_weights:
                # AutoModel classes support the from_config method
@@ -1024,6 +975,8 @@ class ModelLoader:
            and self.model_type != "AutoModelForCausalLM"
            and not self.cfg.trust_remote_code
        ):
+            if self.cfg.is_multimodal:
+                self.model_config.text_config = self.text_model_config
            if self.cfg.gptq:
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
@@ -1039,7 +992,25 @@ class ModelLoader:
                    **self.model_kwargs,
                )
        else:
+            # Raise the config's max sequence length to cfg.sequence_len when it is smaller. Usually
+            # harmless; a model that cannot support the longer context will error once training starts.
+            if (
+                hasattr(self.text_model_config, "max_seq_len")
+                and self.text_model_config.max_seq_len
+                and self.cfg.sequence_len > self.text_model_config.max_seq_len
+            ):
+                self.text_model_config.max_seq_len = self.cfg.sequence_len
+                LOG.warning(f"increasing context length to {self.cfg.sequence_len}")
+            elif (
+                hasattr(self.text_model_config, "max_sequence_length")
+                and self.text_model_config.max_sequence_length
+                and self.cfg.sequence_len > self.text_model_config.max_sequence_length
+            ):
+                self.text_model_config.max_sequence_length = self.cfg.sequence_len
+                LOG.warning(f"increasing context length to {self.cfg.sequence_len}")
            if self.cfg.gptq:
+                if self.cfg.is_multimodal:
+                    self.model_config.text_config = self.text_model_config
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
@@ -1058,6 +1029,8 @@ class ModelLoader:

                _ = _configure_zero3_memory_efficient_loading()

+                if self.cfg.is_multimodal:
+                    self.model_config.text_config = self.text_model_config
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
@@ -1221,9 +1194,7 @@ class ModelLoader:
            )
        ):
            resize_kwargs = {}
-            if self.cfg.mean_resizing_embeddings is not None and not (
-                self.model_config.model_type == "llava"
-            ):
+            if self.cfg.mean_resizing_embeddings is not None:
                resize_kwargs["mean_resizing"] = self.cfg.mean_resizing_embeddings
            self.model.resize_token_embeddings(embeddings_len, **resize_kwargs)
        else:
@@ -1322,6 +1293,8 @@ class ModelLoader:
                requires_grad.append(f"{name}: {param.requires_grad}")
        if len(requires_grad) == 0:
            LOG.warning("there are no parameters that require gradient updates")
+        if hasattr(self.model, "config"):
+            self.model.config.use_cache = False

        if self.cfg.flash_optimum:
            from optimum.bettertransformer import BetterTransformer
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -42,7 +42,6 @@ from axolotl.utils.schemas.model import (
    ModelOutputConfig,
    SpecialTokensConfig,
 )
-from axolotl.utils.schemas.multimodal import MultiModalConfig
 from axolotl.utils.schemas.peft import LoraConfig, ReLoRAConfig
 from axolotl.utils.schemas.training import HyperparametersConfig
 from axolotl.utils.schemas.trl import TRLConfig
@@ -65,7 +64,6 @@ class AxolotlInputConfig(
    LISAConfig,
    GradioConfig,
    RayConfig,
-    MultiModalConfig,
    RemappedParameters,
    DeprecatedParameters,
    BaseModel,
@@ -248,7 +246,6 @@ class AxolotlInputConfig(
    val_set_size: float | None = Field(default=0.0)

    sequence_parallel_degree: int | None = None
-    heads_k_stride: int | None = None

    special_tokens: SpecialTokensConfig | None = None
    tokens: list[str] | None = None
@@ -1109,7 +1106,7 @@ class AxolotlInputConfig(

    @field_validator("sequence_parallel_degree", mode="before")
    @classmethod
-    def check_sequence_parallel_degree(cls, value, info):
+    def check_sequence_parallel_config(cls, value, info):
        if not value:
            value = 1

--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -22,7 +22,6 @@ class ChatTemplate(str, Enum):
    mistral_v1 = "mistral_v1"  # pylint: disable=invalid-name
    mistral_v2v3 = "mistral_v2v3"  # pylint: disable=invalid-name
    mistral_v3_tekken = "mistral_v3_tekken"  # pylint: disable=invalid-name
-    mistral_v7_tekken = "mistral_v7_tekken"  # pylint: disable=invalid-name
    gemma = "gemma"  # pylint: disable=invalid-name
    cohere = "cohere"  # pylint: disable=invalid-name
    llama3 = "llama3"  # pylint: disable=invalid-name
@@ -37,10 +36,6 @@ class ChatTemplate(str, Enum):
    tokenizer_default = "tokenizer_default"  # pylint: disable=invalid-name
    exaone = "exaone"  # pylint: disable=invalid-name
    metharme = "metharme"  # pylint: disable=invalid-name
-    pixtral = "pixtral"  # pylint: disable=invalid-name
-    llava = "llava"  # pylint: disable=invalid-name
-    qwen2_vl = "qwen2_vl"  # pylint: disable=invalid-name
-    gemma3 = "gemma3"  # pylint: disable=invalid-name


 class CustomSupportedOptimizers(str, Enum):
--- a/src/axolotl/utils/schemas/multimodal.py
+++ b/src/axolotl/utils/schemas/multimodal.py
@@ -1,48 +0,0 @@
-"""Pydantic models for multimodal-related configuration"""
-
-from typing import Literal
-
-from PIL.Image import Resampling
-from pydantic import BaseModel, Field, field_validator
-
-
-class MultiModalConfig(BaseModel):
-    """Multi-modal configuration subset"""
-
-    image_size: int | tuple[int, int] | None = Field(
-        default=None,
-        json_schema_extra={
-            "description": (
-                "The size of the image to resize to. It can be an integer (resized into padded-square image) or a tuple (width, height)."
-                "If not provided, we will attempt to load from preprocessor.size, otherwise, images won't be resized."
-            )
-        },
-    )
-    image_resize_algorithm: (
-        Literal["bilinear", "bicubic", "lanczos"] | Resampling | None
-    ) = Field(
-        default=None,
-        json_schema_extra={
-            "description": "The resampling algorithm to use for image resizing. Default is bilinear. Please refer to PIL.Image.Resampling for more details."
-        },
-    )
-
-    @field_validator("image_resize_algorithm", mode="before")
-    @classmethod
-    def convert_image_resize_algorithm(cls, image_resize_algorithm):
-        """
-        Convert the image resize algorithm to a PIL.Image.Resampling enum.
-        """
-        if isinstance(image_resize_algorithm, str):
-            image_resize_algorithm = image_resize_algorithm.lower()
-            if image_resize_algorithm == "bilinear":
-                image_resize_algorithm = Resampling.BILINEAR
-            elif image_resize_algorithm == "bicubic":
-                image_resize_algorithm = Resampling.BICUBIC
-            elif image_resize_algorithm == "lanczos":
-                image_resize_algorithm = Resampling.LANCZOS
-            else:
-                raise ValueError(
-                    f"Invalid image resize algorithm: {image_resize_algorithm}"
-                )
-        return image_resize_algorithm
--- a/tests/init.py
+++ b/tests/init.py
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -11,11 +11,7 @@ import time

 import pytest
 import requests
-from datasets import load_dataset
 from huggingface_hub import snapshot_download
-from transformers import AutoTokenizer
-
-from tests.hf_offline_utils import disable_hf_offline, enable_hf_offline


 def retry_on_request_exceptions(max_retries=3, delay=1):
@@ -29,11 +25,9 @@ def retry_on_request_exceptions(max_retries=3, delay=1):
                except (
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
-                    requests.exceptions.HTTPError,
                ) as exc:
                    if attempt < max_retries - 1:
-                        wait = 2**attempt * delay  # in seconds
-                        time.sleep(wait)
+                        time.sleep(delay)
                    else:
                        raise exc

@@ -43,7 +37,6 @@ def retry_on_request_exceptions(max_retries=3, delay=1):


@retry_on_request_exceptions(max_retries=3, delay=5)
-@disable_hf_offline
 def snapshot_download_w_retry(*args, **kwargs):
    return snapshot_download(*args, **kwargs)

@@ -51,19 +44,19 @@ def snapshot_download_w_retry(*args, **kwargs):
@pytest.fixture(scope="session", autouse=True)
 def download_smollm2_135m_model():
    # download the model
-    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")
+    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M")


@pytest.fixture(scope="session", autouse=True)
 def download_llama_68m_random_model():
    # download the model
-    snapshot_download_w_retry("JackFram/llama-68m", repo_type="model")
+    snapshot_download_w_retry("JackFram/llama-68m")


@pytest.fixture(scope="session", autouse=True)
 def download_qwen_2_5_half_billion_model():
    # download the model
-    snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")
+    snapshot_download_w_retry("Qwen/Qwen2.5-0.5B")


@pytest.fixture(scope="session", autouse=True)
@@ -108,37 +101,6 @@ def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset():
    )


-@pytest.fixture(scope="session", autouse=True)
-def download_fozzie_alpaca_dpo_dataset():
-    # download the dataset
-    snapshot_download_w_retry(
-        "fozziethebeat/alpaca_messages_2k_dpo_test", repo_type="dataset"
-    )
-    snapshot_download_w_retry(
-        "fozziethebeat/alpaca_messages_2k_dpo_test",
-        repo_type="dataset",
-        revision="ea82cff",
-    )
-
-
-@pytest.fixture(scope="session")
-@disable_hf_offline
-def dataset_fozzie_alpaca_dpo_dataset(
-    download_fozzie_alpaca_dpo_dataset,
-):  # pylint: disable=unused-argument,redefined-outer-name
-    return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train")
-
-
-@pytest.fixture(scope="session")
-@disable_hf_offline
-def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff(
-    download_fozzie_alpaca_dpo_dataset,
-):  # pylint: disable=unused-argument,redefined-outer-name
-    return load_dataset(
-        "fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff"
-    )
-
-
@pytest.fixture(scope="session", autouse=True)
 def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
    # download the dataset
@@ -147,141 +109,10 @@ def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
    )


-@pytest.fixture(scope="session", autouse=True)
-def download_argilla_dpo_pairs_dataset():
-    # download the dataset
-    snapshot_download_w_retry(
-        "argilla/distilabel-intel-orca-dpo-pairs", repo_type="dataset"
-    )
-
-
@pytest.fixture(scope="session", autouse=True)
 def download_tiny_shakespeare_dataset():
    # download the dataset
-    snapshot_download_w_retry("winglian/tiny-shakespeare", repo_type="dataset")
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_deepseek_model_fixture():
-    snapshot_download_w_retry("axolotl-ai-co/DeepSeek-V3-11M", repo_type="model")
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_huggyllama_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "huggyllama/llama-7b",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_llama_1b_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "NousResearch/Llama-3.2-1B",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_llama3_8b_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "NousResearch/Meta-Llama-3-8B", repo_type="model", allow_patterns=["*token*"]
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_llama3_8b_instruct_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "NousResearch/Meta-Llama-3-8B-Instruct",
-        repo_type="model",
-        allow_patterns=["*token*"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_phi_35_mini_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "microsoft/Phi-3.5-mini-instruct", repo_type="model", allow_patterns=["*token*"]
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_phi_3_medium_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "microsoft/Phi-3-medium-128k-instruct",
-        repo_type="model",
-        allow_patterns=["*token*"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_mistral_7b_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "casperhansen/mistral-7b-instruct-v0.1-awq",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_gemma_2b_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "unsloth/gemma-2b-it",
-        revision="703fb4a",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_gemma2_9b_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "mlx-community/gemma-2-9b-it-4bit",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_mlx_mistral_7b_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-def download_llama2_model_fixture():
-    # download the tokenizer only
-    snapshot_download_w_retry(
-        "NousResearch/Llama-2-7b-hf",
-        repo_type="model",
-        allow_patterns=["*token*", "config.json"],
-    )
-
-
-@pytest.fixture(scope="session", autouse=True)
-@enable_hf_offline
-def tokenizer_huggyllama(
-    download_huggyllama_model_fixture,
-):  # pylint: disable=unused-argument,redefined-outer-name
-    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
-    tokenizer.pad_token = "</s>"
-
-    return tokenizer
+    snapshot_download_w_retry("Trelis/tiny-shakespeare", repo_type="dataset")


@pytest.fixture
@@ -347,34 +178,3 @@ def cleanup_monkeypatches():
            module_globals = module_name_tuple[1]
            for module_global in module_globals:
                globals().pop(module_global, None)
-
-
-# # pylint: disable=redefined-outer-name,unused-argument
-# def test_load_fixtures(
-#     download_smollm2_135m_model,
-#     download_llama_68m_random_model,
-#     download_qwen_2_5_half_billion_model,
-#     download_tatsu_lab_alpaca_dataset,
-#     download_mhenrichsen_alpaca_2k_dataset,
-#     download_mhenrichsen_alpaca_2k_w_revision_dataset,
-#     download_mlabonne_finetome_100k_dataset,
-#     download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
-#     download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
-#     download_fozzie_alpaca_dpo_dataset,
-#     download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
-#     download_argilla_dpo_pairs_dataset,
-#     download_tiny_shakespeare_dataset,
-#     download_deepseek_model_fixture,
-#     download_huggyllama_model_fixture,
-#     download_llama_1b_model_fixture,
-#     download_llama3_8b_model_fixture,
-#     download_llama3_8b_instruct_model_fixture,
-#     download_phi_35_mini_model_fixture,
-#     download_phi_3_medium_model_fixture,
-#     download_mistral_7b_model_fixture,
-#     download_gemma_2b_model_fixture,
-#     download_gemma2_9b_model_fixture,
-#     download_mlx_mistral_7b_model_fixture,
-#     download_llama2_model_fixture,
-# ):
-#     pass
--- a/tests/core/chat/test_messages.py
+++ b/tests/core/chat/test_messages.py
@@ -10,13 +10,10 @@ from transformers import AddedToken, AutoTokenizer
 from axolotl.core.chat.format.chatml import format_message
 from axolotl.core.chat.messages import ChatFormattedChats, Chats

-from tests.hf_offline_utils import enable_hf_offline  # noqa
-

@pytest.fixture(scope="session", name="llama_tokenizer")
-@enable_hf_offline
 def llama_tokenizer_fixture():
-    return AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")
+    return AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3.1-8B")


@pytest.fixture(scope="session", name="chatml_tokenizer")
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -5,6 +5,7 @@ e2e tests for kd trainer support in Axolotl
 from pathlib import Path

 import pytest
+from e2e.utils import check_tensorboard, require_torch_2_5_1

 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
@@ -12,8 +13,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault

-from tests.e2e.utils import check_tensorboard, require_torch_2_5_1
-

@pytest.fixture(name="kd_min_cfg")
 def min_cfg(temp_dir):
--- a/tests/e2e/integrations/test_liger.py
+++ b/tests/e2e/integrations/test_liger.py
@@ -2,13 +2,15 @@
 Simple end-to-end test for Liger integration
 """

+from e2e.utils import require_torch_2_4_1
+
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins
 from axolotl.utils.dict import DictDefault

-from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1
+from ..utils import check_model_output_exists


 class LigerIntegrationTestCase:
--- a/tests/e2e/multigpu/test_grpo.py
+++ b/tests/e2e/multigpu/test_grpo.py
@@ -8,12 +8,11 @@ from pathlib import Path
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from e2e.utils import require_vllm
 from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault

-from tests.e2e.utils import require_vllm
-

 class TestGRPO:
    """
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -9,13 +9,12 @@ from pathlib import Path
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from e2e.utils import check_tensorboard
 from huggingface_hub import snapshot_download
 from transformers.testing_utils import get_torch_dist_unique_port

 from axolotl.utils.dict import DictDefault

-from tests.e2e.utils import check_tensorboard
-
 LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
 os.environ["WANDB_DISABLED"] = "true"

--- a/tests/e2e/multigpu/test_ray.py
+++ b/tests/e2e/multigpu/test_ray.py
@@ -9,11 +9,10 @@ from pathlib import Path
 import pytest
 import yaml
 from accelerate.test_utils import execute_subprocess_async
+from e2e.utils import check_tensorboard, require_torch_lt_2_6_0

 from axolotl.utils.dict import DictDefault

-from tests.e2e.utils import check_tensorboard, require_torch_lt_2_6_0
-
 LOG = logging.getLogger(__name__)
 os.environ["WANDB_DISABLED"] = "true"

--- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
+++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
@@ -144,7 +144,7 @@ def test_swiglu_mlp_integration(small_llama_model):
 def test_geglu_model_integration():
    """Test GeGLU activation with Gemma model."""
    model = AutoModelForCausalLM.from_pretrained(
-        "mhenrichsen/gemma-2b", torch_dtype=torch.float16, device_map="auto"
+        "mhenrichsen/gemma-2b", torch_dtype=torch.float16, device_map="cuda"
    )
    peft_config = get_peft_config(
        {
@@ -347,7 +347,7 @@ def test_model_architecture(model_config):
    """Test LoRA kernel patches across different model architectures."""
    # Load model with appropriate dtype
    model = AutoModelForCausalLM.from_pretrained(
-        model_config["name"], torch_dtype=model_config["dtype"], device_map="auto"
+        model_config["name"], torch_dtype=model_config["dtype"], device_map="cuda"
    )

    # Apply LoRA configuration
@@ -408,7 +408,7 @@ def test_kernel_training_integration():
    )

    # Load model
-    model, _, _ = load_model_and_tokenizer(cfg=cfg)
+    model, _ = load_model_and_tokenizer(cfg=cfg)

    # Verify correct activation function
    layer = model.model.model.layers[0]
--- a/tests/e2e/patched/test_sp.py
+++ b/tests/e2e/patched/test_sp.py
@@ -8,13 +8,14 @@ import pytest
 import torch
 from accelerate.state import PartialState

-from axolotl.monkeypatch.attention.ring_attn import (
-    get_ring_attn_group,
-    set_ring_attn_group,
-)
-from axolotl.utils.collators.batching import adjust_position_ids_for_slice
 from axolotl.utils.dict import DictDefault

+# Stub ring_flash_attn in sys.modules during these imports so they succeed even when it is not installed
+ring_flash_attn_mock = MagicMock()
+with patch.dict("sys.modules", {"ring_flash_attn": ring_flash_attn_mock}):
+    from axolotl.monkeypatch.attention.ring_attn import get_ring_attn_group
+    from axolotl.utils.collators.batching import adjust_position_ids_for_slice
+

@pytest.fixture
 def partial_state():
@@ -78,22 +79,6 @@ class TestSequenceParallelHelpers:
 class TestRingAttention:
    """Tests for the ring attention functionality."""

-    @patch("torch.distributed.get_rank")
-    @patch("torch.distributed.get_world_size")
-    def test_get_ring_attn_group_no_registration(
-        self, mock_world_size, mock_rank, partial_state
-    ):
-        """Test that get_ring_attn_group returns None when no group has been registered."""
-        # Setup mocks
-        mock_world_size.return_value = 4
-        mock_rank.return_value = 0
-
-        # Get the group without registration
-        group = get_ring_attn_group()
-
-        # Verify that None was returned
-        assert group is None
-
    @patch("torch.distributed.new_group")
    @patch("torch.distributed.get_rank")
    @patch("torch.distributed.get_world_size")
@@ -110,16 +95,29 @@ class TestRingAttention:
        mock_new_group.return_value = mock_group

        # Call register_ring_attn with size 4
-        register_ring_attn(sequence_parallel_degree=4, heads_k_stride=1)
+        register_ring_attn(sequence_parallel_degree=4)

        # Verify the number of calls without examining the arguments
        assert mock_new_group.call_count == 2

-        # Verify that new_group was called
+        # Just verify that new_group was called
        mock_new_group.assert_called()

-        # Clean up
-        set_ring_attn_group(None)
+    @patch("torch.distributed.get_rank")
+    @patch("torch.distributed.get_world_size")
+    def test_get_ring_attn_group_no_registration(
+        self, mock_world_size, mock_rank, partial_state
+    ):
+        """Test that get_ring_attn_group returns None when no group has been registered."""
+        # Setup mocks
+        mock_world_size.return_value = 4
+        mock_rank.return_value = 0
+
+        # Get the group without registration
+        group = get_ring_attn_group()
+
+        # Verify that None was returned
+        assert group is None


 # Mock a simplified DataCollator test
--- a/tests/e2e/test_deepseekv3.py
+++ b/tests/e2e/test_deepseekv3.py
@@ -1,5 +1,5 @@
 """
-E2E tests for deepseekv3
+E2E tests for deepseekv3
 """

 import logging
@@ -14,8 +14,6 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault

-from tests.hf_offline_utils import enable_hf_offline
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -25,7 +23,6 @@ class TestDeepseekV3:
    Test case for DeepseekV3 models
    """

-    @enable_hf_offline
    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
@@ -83,7 +80,6 @@ class TestDeepseekV3:
        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()

-    @enable_hf_offline
    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
--- a/tests/e2e/test_gemma2.py
+++ b/tests/e2e/test_gemma2.py
@@ -1,133 +0,0 @@
-"""
-E2E tests for gemma2
-"""
-
-import logging
-import os
-from pathlib import Path
-
-import pytest
-
-from axolotl.cli.args import TrainerCliArgs
-from axolotl.common.datasets import load_datasets
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
-from axolotl.utils.dict import DictDefault
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestGemma2:
-    """
-    Test case for Gemma2 models
-    """
-
-    @pytest.mark.parametrize(
-        "sample_packing",
-        [True, False],
-    )
-    def test_lora_gemma2(self, temp_dir, sample_packing):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/gemma-2-33M",
-                "trust_remote_code": True,
-                "sample_packing": sample_packing,
-                "flash_attention": True,
-                "sequence_len": 2048,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0,
-                "datasets": [
-                    {
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "field_messages": "conversations",
-                        "message_property_mappings": {
-                            "role": "from",
-                            "content": "value",
-                        },
-                        "drop_system_message": True,
-                        "split": "train[:1%]",
-                    },
-                ],
-                "special_tokens": {
-                    "bos_token": "<bos>",
-                    "eos_token": "<eos>",
-                },
-                "chat_template": "gemma",  # gemma2's template is same as gemma
-                "num_epochs": 1,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "save_safetensors": True,
-                "bf16": True,
-            }
-        )
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.safetensors").exists()
-
-    @pytest.mark.parametrize(
-        "sample_packing",
-        [True, False],
-    )
-    def test_fft_gemma2(self, temp_dir, sample_packing):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/gemma-2-33M",
-                "trust_remote_code": True,
-                "sample_packing": sample_packing,
-                "flash_attention": True,
-                "sequence_len": 2048,
-                "val_set_size": 0,
-                "datasets": [
-                    {
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "field_messages": "conversations",
-                        "message_property_mappings": {
-                            "role": "from",
-                            "content": "value",
-                        },
-                        "split": "train[:1%]",
-                        "drop_system_message": True,
-                    },
-                ],
-                "chat_template": "gemma",  # gemma2's template is same as gemma
-                "special_tokens": {
-                    "bos_token": "<bos>",
-                    "eos_token": "<eos>",
-                },
-                "num_epochs": 1,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "save_safetensors": True,
-                "bf16": True,
-            }
-        )
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "model.safetensors").exists()
--- a/tests/e2e/test_gemma3_text.py
+++ b/tests/e2e/test_gemma3_text.py
@@ -1,131 +0,0 @@
-"""
-E2E tests for gemma3_text
-"""
-
-import logging
-import os
-from pathlib import Path
-
-import pytest
-
-from axolotl.cli.args import TrainerCliArgs
-from axolotl.common.datasets import load_datasets
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
-from axolotl.utils.dict import DictDefault
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestGemma3Text:
-    """
-    Test case for Gemma3Text models
-    """
-
-    @pytest.mark.parametrize(
-        "sample_packing",
-        [True, False],
-    )
-    def test_lora_gemma3_text(self, temp_dir, sample_packing):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/gemma-3-34M",
-                "trust_remote_code": True,
-                "sample_packing": sample_packing,
-                "flash_attention": True,
-                "sequence_len": 2048,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0,
-                "datasets": [
-                    {
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "field_messages": "conversations",
-                        "message_property_mappings": {
-                            "role": "from",
-                            "content": "value",
-                        },
-                        "split": "train[:1%]",
-                    },
-                ],
-                "special_tokens": {
-                    "bos_token": "<bos>",
-                    "eos_token": "<eos>",
-                },
-                "chat_template": "gemma3",
-                "num_epochs": 1,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "save_safetensors": True,
-                "bf16": True,
-            }
-        )
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "adapter_model.safetensors").exists()
-
-    @pytest.mark.parametrize(
-        "sample_packing",
-        [True, False],
-    )
-    def test_fft_gemma3_text(self, temp_dir, sample_packing):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/gemma-3-34M",
-                "trust_remote_code": True,
-                "sample_packing": sample_packing,
-                "flash_attention": True,
-                "sequence_len": 2048,
-                "val_set_size": 0,
-                "datasets": [
-                    {
-                        "path": "mlabonne/FineTome-100k",
-                        "type": "chat_template",
-                        "field_messages": "conversations",
-                        "message_property_mappings": {
-                            "role": "from",
-                            "content": "value",
-                        },
-                        "split": "train[:1%]",
-                    },
-                ],
-                "chat_template": "gemma3",
-                "special_tokens": {
-                    "bos_token": "<bos>",
-                    "eos_token": "<eos>",
-                },
-                "num_epochs": 1,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 4,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_bnb_8bit",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "save_safetensors": True,
-                "bf16": True,
-            }
-        )
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        assert (Path(temp_dir) / "model.safetensors").exists()
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -5,14 +5,14 @@ E2E tests for llama
 import logging
 import os

+from e2e.utils import check_model_output_exists
+
 from axolotl.cli.args import TrainerCliArgs
 from axolotl.common.datasets import load_datasets
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.dict import DictDefault

-from tests.e2e.utils import check_model_output_exists
-
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -54,7 +54,7 @@ class TestCustomSchedulers(unittest.TestCase):
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
+                "optimizer": "adamw_hf",
                "max_steps": 20,
                "lr_scheduler": "rex",
                "warmup_steps": 5,
--- a/tests/hf_offline_utils.py
+++ b/tests/hf_offline_utils.py
@@ -1,85 +0,0 @@
-"""
-test utils for helpers and decorators
-"""
-
-import os
-from functools import wraps
-
-from huggingface_hub.utils import reset_sessions
-
-
-def reload_modules(hf_hub_offline):
-    # Force reload of the modules that check this variable
-    import importlib
-
-    import datasets
-    import huggingface_hub.constants
-
-    # Reload the constants module first, as others depend on it
-    importlib.reload(huggingface_hub.constants)
-    huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
-    importlib.reload(datasets.config)
-    setattr(datasets.config, "HF_HUB_OFFLINE", hf_hub_offline)
-    reset_sessions()
-
-
-def enable_hf_offline(test_func):
-    """
-    test decorator that sets HF_HUB_OFFLINE environment variable to True and restores it after the test even if the test fails.
-    :param test_func:
-    :return:
-    """
-
-    @wraps(test_func)
-    def wrapper(*args, **kwargs):
-        # Save the original value of HF_HUB_OFFLINE environment variable
-        original_hf_offline = os.getenv("HF_HUB_OFFLINE")
-
-        # Set HF_OFFLINE environment variable to True
-        os.environ["HF_HUB_OFFLINE"] = "1"
-
-        reload_modules(True)
-        try:
-            # Run the test function
-            return test_func(*args, **kwargs)
-        finally:
-            # Restore the original value of HF_HUB_OFFLINE environment variable
-            if original_hf_offline is not None:
-                os.environ["HF_HUB_OFFLINE"] = original_hf_offline
-                reload_modules(bool(original_hf_offline))
-            else:
-                del os.environ["HF_HUB_OFFLINE"]
-                reload_modules(False)
-
-    return wrapper
-
-
-def disable_hf_offline(test_func):
-    """
-    test decorator that sets HF_HUB_OFFLINE environment variable to False and restores it after the wrapped func
-    :param test_func:
-    :return:
-    """
-
-    @wraps(test_func)
-    def wrapper(*args, **kwargs):
-        # Save the original value of HF_HUB_OFFLINE environment variable
-        original_hf_offline = os.getenv("HF_HUB_OFFLINE")
-
-        # Set HF_OFFLINE environment variable to True
-        os.environ["HF_HUB_OFFLINE"] = "0"
-
-        reload_modules(False)
-        try:
-            # Run the test function
-            return test_func(*args, **kwargs)
-        finally:
-            # Restore the original value of HF_HUB_OFFLINE environment variable
-            if original_hf_offline is not None:
-                os.environ["HF_HUB_OFFLINE"] = original_hf_offline
-                reload_modules(bool(original_hf_offline))
-            else:
-                del os.environ["HF_HUB_OFFLINE"]
-                reload_modules(False)
-
-    return wrapper
--- a/tests/prompt_strategies/conftest.py
+++ b/tests/prompt_strategies/conftest.py
@@ -4,13 +4,12 @@ shared fixtures for prompt strategies tests

 import pytest
 from datasets import Dataset
+from huggingface_hub import hf_hub_download
 from transformers import AutoTokenizer

 from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnalyzer
 from axolotl.utils.chat_templates import _CHAT_TEMPLATES

-from tests.hf_offline_utils import enable_hf_offline
-

@pytest.fixture(name="assistant_dataset")
 def fixture_assistant_dataset():
@@ -109,27 +108,31 @@ def fixture_toolcalling_dataset():


@pytest.fixture(name="llama3_tokenizer", scope="session", autouse=True)
-@enable_hf_offline
-def fixture_llama3_tokenizer(
-    download_llama3_8b_instruct_model_fixture,
-):  # pylint: disable=unused-argument,redefined-outer-name
+def fixture_llama3_tokenizer():
+    hf_hub_download(
+        repo_id="NousResearch/Meta-Llama-3-8B-Instruct",
+        filename="special_tokens_map.json",
+    )
+    hf_hub_download(
+        repo_id="NousResearch/Meta-Llama-3-8B-Instruct",
+        filename="tokenizer_config.json",
+    )
+    hf_hub_download(
+        repo_id="NousResearch/Meta-Llama-3-8B-Instruct", filename="tokenizer.json"
+    )
    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")

    return tokenizer


@pytest.fixture(name="smollm2_tokenizer", scope="session", autouse=True)
-@enable_hf_offline
 def fixture_smollm2_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
    return tokenizer


@pytest.fixture(name="mistralv03_tokenizer", scope="session", autouse=True)
-@enable_hf_offline
-def fixture_mistralv03_tokenizer(
-    download_mlx_mistral_7b_model_fixture,
-):  # pylint: disable=unused-argument,redefined-outer-name
+def fixture_mistralv03_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(
        "mlx-community/Mistral-7B-Instruct-v0.3-4bit"
    )
@@ -137,7 +140,6 @@ def fixture_mistralv03_tokenizer(


@pytest.fixture(name="phi35_tokenizer", scope="session", autouse=True)
-@enable_hf_offline
 def fixture_phi35_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
    return tokenizer
--- a/tests/prompt_strategies/test_alpaca.py
+++ b/tests/prompt_strategies/test_alpaca.py
@@ -11,8 +11,6 @@ from axolotl.datasets import TokenizedPromptDataset
 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter, PromptStyle

-from tests.hf_offline_utils import enable_hf_offline
-

@pytest.fixture(name="alpaca_dataset")
 def fixture_alpaca_dataset():
@@ -28,7 +26,6 @@ def fixture_alpaca_dataset():


@pytest.fixture(name="tokenizer")
-@enable_hf_offline
 def fixture_tokenizer():
    # pylint: disable=all
    tokenizer = AutoTokenizer.from_pretrained(
--- a/tests/prompt_strategies/test_chat_template_utils.py
+++ b/tests/prompt_strategies/test_chat_template_utils.py
@@ -13,11 +13,8 @@ from axolotl.utils.chat_templates import (
    get_chat_template,
 )

-from tests.hf_offline_utils import enable_hf_offline
-

@pytest.fixture(name="llama3_tokenizer")
-@enable_hf_offline
 def fixture_llama3_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")

--- a/tests/prompt_strategies/test_chat_templates_advanced.py
+++ b/tests/prompt_strategies/test_chat_templates_advanced.py
@@ -17,8 +17,6 @@ from axolotl.prompt_strategies.chat_template import (
 from axolotl.prompters import IGNORE_TOKEN_ID
 from axolotl.utils.chat_templates import get_chat_template

-from tests.hf_offline_utils import enable_hf_offline
-
 logging.basicConfig(level=logging.DEBUG)
 LOG = logging.getLogger("axolotl")

@@ -32,14 +30,12 @@ PARAMETRIZE_PARAMS = [
        "mistralv03_tokenizer_chat_template_jinja",
        "[/INST]",
    ),
-    # TODO: temporarily skip gemma due to gemma3 template
-    # Re-enable on new chat_template implementation for perf
-    # (
-    #     "gemma2_tokenizer",
-    #     "jinja",
-    #     "gemma2_tokenizer_chat_template_jinja",
-    #     "<end_of_turn>",
-    # ),
+    (
+        "gemma2_tokenizer",
+        "jinja",
+        "gemma2_tokenizer_chat_template_jinja",
+        "<end_of_turn>",
+    ),
    ("phi35_tokenizer", "phi_35", None, "<|end|>"),
 ]

@@ -97,11 +93,7 @@ class TestChatTemplateConfigurations:
        if (
            turn_idx == 0
            and turn.get("from") in ["system", "context"]
-            and (
-                "mistral" in tokenizer.name_or_path.lower()
-                or "gemma"
-                in tokenizer.name_or_path.lower()  # temporarily skip gemma due to gemma3 template
-            )
+            and "mistral" in tokenizer.name_or_path.lower()
        ):
            assert (
                start_idx == -1 and end_idx == -1
@@ -109,7 +101,6 @@ class TestChatTemplateConfigurations:
            return True
        return False

-    @enable_hf_offline
    def test_train_on_inputs_true(
        self,
        tokenizer,
--- a/tests/prompt_strategies/test_dpo_chat_templates.py
+++ b/tests/prompt_strategies/test_dpo_chat_templates.py
@@ -11,8 +11,6 @@ from transformers import AutoTokenizer
 from axolotl.prompt_strategies.dpo.chat_template import default
 from axolotl.utils.dict import DictDefault

-from tests.hf_offline_utils import enable_hf_offline
-

@pytest.fixture(name="assistant_dataset")
 def fixture_assistant_dataset():
@@ -80,8 +78,15 @@ def fixture_custom_assistant_dataset():
    )


+@pytest.fixture(name="llama3_tokenizer")
+def fixture_llama3_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")
+    tokenizer.eos_token = "<|eot_id|>"
+
+    return tokenizer
+
+
@pytest.fixture(name="phi3_tokenizer")
-@enable_hf_offline
 def fixture_phi3_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")

@@ -89,7 +94,6 @@ def fixture_phi3_tokenizer():


@pytest.fixture(name="gemma_tokenizer")
-@enable_hf_offline
 def fixture_gemma_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-2b-it", revision="703fb4a")

--- a/tests/prompt_strategies/test_dpo_chatml.py
+++ b/tests/prompt_strategies/test_dpo_chatml.py
@@ -10,8 +10,6 @@ from axolotl.prompt_strategies.dpo import load as load_dpo
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.dict import DictDefault

-from tests.hf_offline_utils import enable_hf_offline
-

@pytest.fixture(name="minimal_dpo_cfg")
 def fixture_cfg():
@@ -36,8 +34,6 @@ class TestDPOChatml:
    Test loading DPO preference datasets with chatml formatting
    """

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
-    @enable_hf_offline
    def test_default(self, minimal_dpo_cfg):
        cfg = DictDefault(
            {
--- a/tests/test_data.py
+++ b/tests/test_data.py
@@ -8,15 +8,12 @@ from transformers import LlamaTokenizer

 from axolotl.utils.data import encode_pretraining, md5

-from tests.hf_offline_utils import enable_hf_offline
-

 class TestEncodePretraining(unittest.TestCase):
    """
    test class for encode pretraining and md5 helper
    """

-    @enable_hf_offline
    def setUp(self):
        self.tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -4,37 +4,31 @@ Test dataset loading under various conditions.

 import shutil
 import tempfile
+import unittest
 from pathlib import Path
-from unittest.mock import patch

-import pytest
+from conftest import snapshot_download_w_retry
+from constants import (
+    ALPACA_MESSAGES_CONFIG_OG,
+    ALPACA_MESSAGES_CONFIG_REVISION,
+    SPECIAL_TOKENS,
+)
 from datasets import Dataset
-from huggingface_hub import snapshot_download
-from transformers import PreTrainedTokenizer
+from transformers import AutoTokenizer

 from axolotl.utils.data import load_tokenized_prepared_datasets
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.dict import DictDefault

-from tests.constants import (
-    ALPACA_MESSAGES_CONFIG_OG,
-    ALPACA_MESSAGES_CONFIG_REVISION,
-    SPECIAL_TOKENS,
-)
-from tests.hf_offline_utils import enable_hf_offline

-
-class TestDatasetPreparation:
+class TestDatasetPreparation(unittest.TestCase):
    """Test a configured dataloader."""

-    @pytest.fixture
-    def tokenizer(self, tokenizer_huggyllama) -> PreTrainedTokenizer:
-        tokenizer_huggyllama.add_special_tokens(SPECIAL_TOKENS)
-        yield tokenizer_huggyllama
-
-    @pytest.fixture
-    def dataset_fixture(self):
-        yield Dataset.from_list(
+    def setUp(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(SPECIAL_TOKENS)
+        # Alpaca dataset.
+        self.dataset = Dataset.from_list(
            [
                {
                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
@@ -44,9 +38,7 @@ class TestDatasetPreparation:
            ]
        )

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
-    @enable_hf_offline
-    def test_load_hub(self, tokenizer):
+    def test_load_hub(self):
        """Core use case.  Verify that processing data from the hub works"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            prepared_path = Path(tmp_dir) / "prepared"
@@ -63,28 +55,25 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @enable_hf_offline
-    @pytest.mark.skip("datasets bug with local datasets when offline")
-    def test_load_local_hub(self, tokenizer):
+    def test_load_local_hub(self):
        """Niche use case.  Verify that a local copy of a hub dataset can be loaded"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_path = snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
            )
-            # offline mode doesn't actually copy it to local_dir, so we
-            # have to copy all the contents in the dir manually from the returned snapshot_path
-            shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

            prepared_path = Path(tmp_dir) / "prepared"
            # Right now a local copy that doesn't fully conform to a dataset
@@ -107,7 +96,9 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
@@ -115,12 +106,11 @@ class TestDatasetPreparation:
            assert "labels" in dataset.features
            shutil.rmtree(tmp_ds_path)

-    @enable_hf_offline
-    def test_load_from_save_to_disk(self, tokenizer, dataset_fixture):
+    def test_load_from_save_to_disk(self):
        """Usual use case.  Verify datasets saved via `save_to_disk` can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
-            dataset_fixture.save_to_disk(str(tmp_ds_name))
+            self.dataset.save_to_disk(str(tmp_ds_name))

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -136,21 +126,22 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @enable_hf_offline
-    def test_load_from_dir_of_parquet(self, tokenizer, dataset_fixture):
+    def test_load_from_dir_of_parquet(self):
        """Usual use case.  Verify a directory of parquet files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
-            dataset_fixture.to_parquet(tmp_ds_path)
+            self.dataset.to_parquet(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -171,21 +162,22 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @enable_hf_offline
-    def test_load_from_dir_of_json(self, tokenizer, dataset_fixture):
+    def test_load_from_dir_of_json(self):
        """Standard use case.  Verify a directory of json files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.json"
-            dataset_fixture.to_json(tmp_ds_path)
+            self.dataset.to_json(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -206,19 +198,20 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @enable_hf_offline
-    def test_load_from_single_parquet(self, tokenizer, dataset_fixture):
+    def test_load_from_single_parquet(self):
        """Standard use case.  Verify a single parquet file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
-            dataset_fixture.to_parquet(tmp_ds_path)
+            self.dataset.to_parquet(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -235,19 +228,20 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @enable_hf_offline
-    def test_load_from_single_json(self, tokenizer, dataset_fixture):
+    def test_load_from_single_json(self):
        """Standard use case.  Verify a single json file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
-            dataset_fixture.to_json(tmp_ds_path)
+            self.dataset.to_json(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -264,15 +258,15 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @pytest.mark.skip(reason="TODO: fix hf offline mode for CI rate limits")
-    @enable_hf_offline
    def test_load_hub_with_dpo(self):
        """Verify that processing dpo data from the hub works"""

@@ -291,9 +285,7 @@ class TestDatasetPreparation:
        assert len(train_dataset) == 1800
        assert "conversation" in train_dataset.features

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
-    @enable_hf_offline
-    def test_load_hub_with_revision(self, tokenizer):
+    def test_load_hub_with_revision(self):
        """Verify that processing data from the hub works with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            prepared_path = Path(tmp_dir) / "prepared"
@@ -315,17 +307,16 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

-    @enable_hf_offline
-    def test_load_hub_with_revision_with_dpo(
-        self, dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff
-    ):
+    def test_load_hub_with_revision_with_dpo(self):
        """Verify that processing dpo data from the hub works with a specific revision"""

        cfg = DictDefault(
@@ -338,34 +329,22 @@ class TestDatasetPreparation:
            }
        )

-        # pylint: disable=duplicate-code
-        with patch(
-            "axolotl.utils.data.shared.load_dataset_w_config"
-        ) as mock_load_dataset:
-            # Set up the mock to return different values on successive calls
-            mock_load_dataset.return_value = (
-                dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff
-            )
+        train_dataset, _ = load_prepare_preference_datasets(cfg)

-            train_dataset, _ = load_prepare_preference_datasets(cfg)
+        assert len(train_dataset) == 1800
+        assert "conversation" in train_dataset.features

-            assert len(train_dataset) == 1800
-            assert "conversation" in train_dataset.features
-
-    @enable_hf_offline
-    @pytest.mark.skip("datasets bug with local datasets when offline")
-    def test_load_local_hub_with_revision(self, tokenizer):
+    def test_load_local_hub_with_revision(self):
        """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_path = snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
                revision="d05c1cb",
            )
-            shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -386,7 +365,9 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
@@ -394,19 +375,17 @@ class TestDatasetPreparation:
            assert "labels" in dataset.features
            shutil.rmtree(tmp_ds_path)

-    @enable_hf_offline
-    def test_loading_local_dataset_folder(self, tokenizer):
+    def test_loading_local_dataset_folder(self):
        """Verify that a dataset downloaded to a local folder can be loaded"""

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
-            snapshot_path = snapshot_download(
+            snapshot_download_w_retry(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
            )
-            shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
@@ -422,10 +401,16 @@ class TestDatasetPreparation:
                }
            )

-            dataset, _ = load_tokenized_prepared_datasets(tokenizer, cfg, prepared_path)
+            dataset, _ = load_tokenized_prepared_datasets(
+                self.tokenizer, cfg, prepared_path
+            )

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features
            shutil.rmtree(tmp_ds_path)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_exact_deduplication.py
+++ b/tests/test_exact_deduplication.py
@@ -8,8 +8,9 @@ import hashlib
 import unittest
 from unittest.mock import patch

-import pytest
+from constants import ALPACA_MESSAGES_CONFIG_REVISION, SPECIAL_TOKENS
 from datasets import Dataset
+from transformers import AutoTokenizer

 from axolotl.utils.config import normalize_config
 from axolotl.utils.data import prepare_dataset
@@ -18,9 +19,6 @@ from axolotl.utils.data.utils import deduplicate_and_log_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_processor, load_tokenizer

-from tests.constants import ALPACA_MESSAGES_CONFIG_REVISION
-from tests.hf_offline_utils import enable_hf_offline
-

 def verify_deduplication(actual_dataset, expected_dataset, dataset_name):
    """
@@ -216,12 +214,13 @@ class TestDeduplicateIndividualFunctions(unittest.TestCase):
        verify_deduplication(eval_dataset, expected_dataset_eval, "eval_dataset")


-class TestDeduplicateRLDataset:
+class TestDeduplicateRLDataset(unittest.TestCase):
    """Test a configured dataloader with deduplication."""

-    @pytest.fixture
-    def cfg(self):
-        fixture = DictDefault(
+    def setUp(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(SPECIAL_TOKENS)
+        self.cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
@@ -234,66 +233,34 @@ class TestDeduplicateRLDataset:
                ],
            }
        )
-        yield fixture

-    @enable_hf_offline
-    def test_load_with_deduplication(
-        self, cfg, dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff, tokenizer_huggyllama
-    ):
+    def test_load_with_deduplication(self):
        """Verify that loading with deduplication removes duplicates."""

-        # pylint: disable=duplicate-code
-        with (
-            patch(
-                "axolotl.utils.data.shared.load_dataset_w_config"
-            ) as mock_load_dataset,
-            patch("axolotl.utils.models.load_tokenizer") as mock_load_tokenizer,
-        ):
-            # Set up the mock to return different values on successive calls
-            mock_load_dataset.side_effect = [
-                dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
-                dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
-            ]
-            mock_load_tokenizer.return_value = tokenizer_huggyllama
+        # Load the dataset using the deduplication setting
+        train_dataset, _ = load_prepare_preference_datasets(self.cfg)

-            train_dataset, _ = load_prepare_preference_datasets(cfg)
+        # Verify that the dataset has been deduplicated
+        assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"

-            # Verify that the dataset has been deduplicated
-            assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"
+    def test_load_without_deduplication(self):
+        """Verify that loading without deduplication retains duplicates."""
+        self.cfg.dataset_exact_deduplication = False
+        # Load the dataset without deduplication
+        train_dataset, _ = load_prepare_preference_datasets(self.cfg)

-    @enable_hf_offline
-    def test_load_without_deduplication(
-        self, cfg, dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff, tokenizer_huggyllama
-    ):
-        # pylint: disable=duplicate-code
-        with (
-            patch(
-                "axolotl.utils.data.shared.load_dataset_w_config"
-            ) as mock_load_dataset,
-            patch("axolotl.utils.models.load_tokenizer") as mock_load_tokenizer,
-        ):
-            # Set up the mock to return different values on successive calls
-            mock_load_dataset.side_effect = [
-                dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
-                dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff,
-            ]
-            mock_load_tokenizer.return_value = tokenizer_huggyllama
-
-            cfg.dataset_exact_deduplication = False
-            # Load the dataset without deduplication
-            train_dataset, _ = load_prepare_preference_datasets(cfg)
-
-            # Verify that the dataset retains duplicates
-            assert (
-                len(train_dataset) == 1800 * 2
-            ), "Dataset deduplication occurred when it should not have"
+        # Verify that the dataset retains duplicates
+        assert (
+            len(train_dataset) == 1800 * 2
+        ), "Dataset deduplication occurred when it should not have"


 class TestDeduplicateNonRL(unittest.TestCase):
    """Test prepare_dataset function with different configurations."""

-    @enable_hf_offline
    def setUp(self) -> None:
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.add_special_tokens(SPECIAL_TOKENS)
        self.cfg_1 = DictDefault(
            {
                "base_model": "huggyllama/llama-7b",
@@ -319,8 +286,6 @@ class TestDeduplicateNonRL(unittest.TestCase):
        )
        normalize_config(self.cfg_1)

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
-    @enable_hf_offline
    def test_prepare_dataset_with_deduplication_train(self):
        """Verify that prepare_dataset function processes the dataset correctly with deduplication."""
        self.cfg_1.dataset_exact_deduplication = True
@@ -346,8 +311,6 @@ class TestDeduplicateNonRL(unittest.TestCase):
            "Train dataset should have 2000 samples after deduplication.",
        )

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
-    @enable_hf_offline
    def test_prepare_dataset_with_deduplication_eval(self):
        """Verify that prepare_dataset function processes the dataset correctly with deduplication."""
        self.cfg_1.dataset_exact_deduplication = True
@@ -373,8 +336,6 @@ class TestDeduplicateNonRL(unittest.TestCase):
            "Eval dataset should have 2000 samples after deduplication.",
        )

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
-    @enable_hf_offline
    def test_prepare_dataset_without_deduplication(self):
        """Verify that prepare_dataset function processes the dataset correctly without deduplication."""
        self.cfg_1.dataset_exact_deduplication = False
--- a/tests/test_packed_batch_sampler.py
+++ b/tests/test_packed_batch_sampler.py
@@ -12,8 +12,6 @@ from axolotl.utils.data.utils import drop_long_seq_in_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

-from tests.hf_offline_utils import enable_hf_offline
-

 @pytest.fixture(name="tokenizer")
 def fixture_tokenizer():
@@ -27,7 +25,6 @@ class TestBatchedSamplerPacking:
    Test class for packing streaming dataset sequences
    """

-    @pytest.mark.skip(reason="TODO: fix hf offline mode for CI rate limits")
    @pytest.mark.parametrize(
        "batch_size, num_workers",
        [
@@ -38,12 +35,11 @@ class TestBatchedSamplerPacking:
        ],
    )
    @pytest.mark.parametrize("max_seq_length", [4096, 512])
-    @enable_hf_offline
    def test_packing(self, batch_size, num_workers, tokenizer, max_seq_length):
        import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import  # noqa: F401

        dataset = load_dataset(
-            "winglian/tiny-shakespeare",
+            "Trelis/tiny-shakespeare",
            split="train",
        )

--- a/tests/test_packed_dataset.py
+++ b/tests/test_packed_dataset.py
@@ -10,15 +10,12 @@ from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
 from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter

-from tests.hf_offline_utils import enable_hf_offline
-

 class TestPacking(unittest.TestCase):
    """
    Test class for packing dataset sequences
    """

-    @enable_hf_offline
    def setUp(self) -> None:
        # pylint: disable=duplicate-code
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
--- a/tests/test_packed_pretraining.py
+++ b/tests/test_packed_pretraining.py
@@ -1,60 +1,43 @@
 """Module for testing streaming dataset sequence packing"""

 import functools
-import random
-import string
+import unittest

 import pytest
 import torch
-from datasets import IterableDataset
+from datasets import load_dataset
 from torch.utils.data import DataLoader
+from transformers import AutoTokenizer

 from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset
 from axolotl.utils.dict import DictDefault


-class TestPretrainingPacking:
+class TestPretrainingPacking(unittest.TestCase):
    """
    Test class for packing streaming dataset sequences
    """

-    @pytest.fixture
-    def random_text(self):
-        # seed with random.seed(0) for reproducibility
-        random.seed(0)
+    def setUp(self) -> None:
+        # pylint: disable=duplicate-code
+        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
+        self.tokenizer.pad_token = "</s>"

-        # generate row of random text with "words" of between 2 and 10 characters and
-        # between 400 to 1200 characters per line
-        def rand_txt():
-            return " ".join(
-                [
-                    "".join(
-                        random.choices(string.ascii_lowercase, k=random.randint(2, 10))
-                    )
-                    for _ in range(random.randint(50, 200))
-                ]
-            )
-
-        # Create a list of 2000 random texts rather than just using it within the
-        # generator so the test runs faster
-        data = [rand_txt() for _ in range(500)]
-
-        # Create an IterableDataset
-        def generator():
-            for row in data:
-                yield {"text": row}
-
-        return IterableDataset.from_generator(generator)
-
-    @pytest.mark.flaky(retries=1, delay=5)
-    def test_packing_stream_dataset(self, tokenizer_huggyllama, random_text):
-        dataset = random_text
+    @pytest.mark.flaky(retries=3, delay=5)
+    def test_packing_stream_dataset(self):
+        # pylint: disable=duplicate-code
+        dataset = load_dataset(
+            "allenai/c4",
+            "en",
+            streaming=True,
+        )["train"]

        cfg = DictDefault(
            {
                "pretraining_dataset": [
                    {
-                        "path": "winglian/tiny-shakespeare",
+                        "path": "allenai/c4",
+                        "name": "en",
                        "type": "pretrain",
                    }
                ],
@@ -71,16 +54,15 @@ class TestPretrainingPacking:
        ds_wrapper_partial = functools.partial(
            get_dataset_wrapper,
            cfg.pretraining_dataset[0],
-            tokenizer_huggyllama,
+            self.tokenizer,
            cfg,
            cfg.pretraining_dataset[0]["type"] or "pretrain",
        )

-        # pylint: disable=duplicate-code
        original_bsz = cfg.micro_batch_size
        train_dataset = wrap_pretraining_dataset(
            dataset,
-            tokenizer_huggyllama,
+            self.tokenizer,
            cfg,
            ds_wrapper_partial,
            max_tokens=cfg.sequence_len,
@@ -96,7 +78,7 @@ class TestPretrainingPacking:
        )
        idx = 0
        for data in trainer_loader:
-            if idx > 3:
+            if idx > 10:
                break
            assert data["input_ids"].shape == torch.Size(
                [1, original_bsz * cfg.sequence_len]
@@ -113,3 +95,7 @@ class TestPretrainingPacking:
            #     [1, original_bsz * cfg.sequence_len]
            # )
            idx += 1
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/tests/test_prompt_tokenizers.py
+++ b/tests/test_prompt_tokenizers.py
@@ -5,7 +5,6 @@ import logging
 import unittest
 from pathlib import Path

-import pytest
 from datasets import load_dataset
 from transformers import AddedToken, AutoTokenizer, LlamaTokenizer

@@ -23,8 +22,6 @@ from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter, PromptStyle
 from axolotl.utils.dict import DictDefault

-from tests.hf_offline_utils import enable_hf_offline
-
 LOG = logging.getLogger("axolotl")

 test_data = {
@@ -66,7 +63,6 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
    Test class for prompt tokenization strategies.
    """

-    @enable_hf_offline
    def setUp(self) -> None:
        # pylint: disable=duplicate-code
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
@@ -123,7 +119,6 @@ class InstructionWSystemPromptTokenizingStrategyTest(unittest.TestCase):
    Test class for prompt tokenization strategies with sys prompt from the dataset
    """

-    @enable_hf_offline
    def setUp(self) -> None:
        # pylint: disable=duplicate-code
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
@@ -165,7 +160,6 @@ class Llama2ChatTokenizationTest(unittest.TestCase):
    Test class for prompt tokenization strategies with sys prompt from the dataset
    """

-    @enable_hf_offline
    def setUp(self) -> None:
        # pylint: disable=duplicate-code
        self.tokenizer = LlamaTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
@@ -244,7 +238,6 @@ If a question does not make any sense, or is not factually coherent, explain why
 class OrpoTokenizationTest(unittest.TestCase):
    """test case for the ORPO tokenization"""

-    @enable_hf_offline
    def setUp(self) -> None:
        # pylint: disable=duplicate-code
        tokenizer = LlamaTokenizer.from_pretrained(
@@ -269,7 +262,6 @@ class OrpoTokenizationTest(unittest.TestCase):
            "argilla/ultrafeedback-binarized-preferences-cleaned", split="train"
        ).select([0])

-    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    def test_orpo_integration(self):
        strat = load(
            self.tokenizer,
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -9,15 +9,12 @@ import pytest
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_tokenizer

-from tests.hf_offline_utils import enable_hf_offline
-

 class TestTokenizers:
    """
    test class for the load_tokenizer fn
    """

-    @enable_hf_offline
    def test_default_use_fast(self):
        cfg = DictDefault(
            {
@@ -27,7 +24,6 @@ class TestTokenizers:
        tokenizer = load_tokenizer(cfg)
        assert "Fast" in tokenizer.__class__.__name__

-    @enable_hf_offline
    def test_dont_use_fast(self):
        cfg = DictDefault(
            {
@@ -38,7 +34,6 @@ class TestTokenizers:
        tokenizer = load_tokenizer(cfg)
        assert "Fast" not in tokenizer.__class__.__name__

-    @enable_hf_offline
    def test_special_tokens_modules_to_save(self):
        # setting special_tokens to new token
        cfg = DictDefault(
@@ -73,7 +68,6 @@ class TestTokenizers:
        )
        load_tokenizer(cfg)

-    @enable_hf_offline
    def test_add_additional_special_tokens(self):
        cfg = DictDefault(
            {
@@ -89,7 +83,6 @@ class TestTokenizers:
        tokenizer = load_tokenizer(cfg)
        assert len(tokenizer) == 32001

-    @enable_hf_offline
    def test_added_tokens_overrides(self, temp_dir):
        cfg = DictDefault(
            {
@@ -111,12 +104,11 @@ class TestTokenizers:
            128042
        ]

-    @enable_hf_offline
    def test_added_tokens_overrides_with_toolargeid(self, temp_dir):
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
-                "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
+                "tokenizer_config": "NousResearch/Llama-3.2-1B",
                "added_tokens_overrides": {1000000: "BROKEN_RANDOM_OVERRIDE_1"},
                "output_dir": temp_dir,
            }
--- a/tests/utils/__init__.py
+++ b/tests/utils/__init__.py
Author	SHA1	Message	Date
Dan Saunders	c649d569b4	simplify by installing no deps	2025-03-21 13:27:54 -04:00
Dan Saunders	b88b389b17	installing axolotl prior to quartodoc build	2025-03-21 16:52:51 +00:00