From fc4e37920b9bb00d65804f64d66a0243c94b4451 Mon Sep 17 00:00:00 2001
From: Wing Lian <wing@axolotl.ai>
Date: Tue, 27 Jan 2026 17:08:24 -0500
Subject: [PATCH] transformers v5 upgrade (#3272)

* Prepare for transformers v5 upgrade

* fix hf cli

* update for hf hub changes

* fix tokenizer apply_chat_template args

* remap include_tokens_per_second

* fix tps

* handle migration for warmup

* use latest hf hub

* Fix scan -> ls

* fix import

* fix for renaming of mistral common tokenizer -> backend

* update for fixed tokenziation for llama

* Skip phi35 tests for now

* remove mistral patch fixed upstream in huggingface/transformers#41439

* use namespacing for patch

* don't rely on sdist for e2e tests for now

* run modal ci without waiting too

* Fix dep for ci

* fix imports

* Fix fp8 check

* fsdp2 fixes

* fix version handling

* update fsdp version tests for new v5 behavior

* Fail multigpu tests after 3 failures

* skip known v5 broken tests for now and cleanup

* bump deps

* unmark skipped test

* re-enable test_fsdp_qlora_prequant_packed test

* increase multigpu ci timeout

* skip broken gemma3 test

* reduce timout back to original 120min now that the hanging test is skipped

* fix for un-necessary collator for pretraining with bsz=1

* fix: safe_serialization deprecated in transformers v5 rc01 (#3318)

* torch_dtype deprecated

* load model in float32 for consistency with tests

* revert some test fixtures back

* use hf cache ls instead of scan

* don't strip fsdp_version

more fdsp_Version fixes for v5
fix version in fsdp_config
fix aliasing
fix fsdp_version check
check fsdp_version is 2 in both places

* Transformers v5 rc2 (#3347)

* bump dep

* use latest fbgemm, grab model config as part of fixture, un-skip test

* import AutoConfig

* don't need more problematic autoconfig when specifying config.json manually

* add fixtures for argilla ultrafeedback datasets

* download phi4-reasoning

* fix arg

* update tests for phi fast tokenizer changes

* use explicit model types for gemma3

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>

* fix: AutoModelForVision2Seq -> AutoModelForImageTextToText

* chore: remove duplicate

* fix: attempt fix gemma3 text mode

* chore: lint

* ga release of v5

* need property setter for name_or_path for mistral tokenizer

* vllm not compatible with transformers v5

* setter for chat_template w mistral too

---------

Co-authored-by: NanoCode012 <nano@axolotl.ai>
Co-authored-by: salman <salman.mohammadi@outlook.com>
---
 .github/workflows/main.yml                    |  2 +-
 .github/workflows/multi-gpu-e2e.yml           |  2 +-
 .github/workflows/tests.yml                   | 16 ++---
 .runpod/src/config/config.yaml                |  4 --
 cicd/multigpu.sh                              |  2 +-
 docs/amd_hpc.qmd                              |  2 +-
 docs/installation.qmd                         |  2 +-
 examples/jamba/qlora_fsdp_large.yaml          |  1 -
 examples/llama-3/qlora-fsdp-405b.yaml         |  1 -
 examples/mamba/config.yml                     |  1 -
 requirements.txt                              |  6 +-
 src/axolotl/cli/checks.py                     |  2 +-
 src/axolotl/cli/merge_lora.py                 |  2 -
 src/axolotl/cli/merge_sharded_fsdp_weights.py | 36 +++-------
 src/axolotl/cli/quantize.py                   |  4 +-
 src/axolotl/core/builders/base.py             | 10 +--
 src/axolotl/core/builders/causal.py           |  4 +-
 src/axolotl/core/trainers/base.py             | 53 +++++++-------
 .../integrations/llm_compressor/utils.py      |  3 -
 src/axolotl/loaders/model.py                  |  7 +-
 src/axolotl/loaders/patch_manager.py          |  7 --
 src/axolotl/loaders/processor.py              |  2 +-
 src/axolotl/models/mamba/modeling_mamba.py    |  1 -
 .../mistral3/mistral_common_tokenizer.py      | 16 ++---
 src/axolotl/monkeypatch/relora.py             |  3 +-
 .../transformers/trainer_context_parallel.py  | 10 ++-
 src/axolotl/processing_strategies.py          |  5 +-
 .../prompt_strategies/chat_template.py        |  2 +
 src/axolotl/train.py                          | 34 +++------
 src/axolotl/utils/callbacks/perplexity.py     |  6 +-
 .../utils/mistral/mistral_tokenizer.py        | 24 ++++---
 src/axolotl/utils/schemas/fsdp.py             |  7 +-
 src/axolotl/utils/schemas/model.py            | 14 +++-
 src/axolotl/utils/schemas/validation.py       | 71 ++++++++++---------
 tests/conftest.py                             | 51 ++++++++++---
 tests/core/test_builders.py                   |  1 -
 .../integrations/test_cut_cross_entropy.py    |  4 +-
 tests/e2e/integrations/test_fp8.py            |  1 -
 tests/e2e/integrations/test_hooks.py          |  2 +-
 tests/e2e/integrations/test_kd.py             |  1 -
 tests/e2e/integrations/test_liger.py          |  2 -
 tests/e2e/integrations/test_llm_compressor.py |  1 -
 tests/e2e/multigpu/solo/test_grpo.py          |  3 -
 tests/e2e/multigpu/test_fp8_fsdp2.py          |  5 +-
 tests/e2e/multigpu/test_fsdp1.py              |  1 +
 tests/e2e/multigpu/test_fsdp2.py              |  4 ++
 tests/e2e/multigpu/test_gemma3.py             |  2 +
 tests/e2e/multigpu/test_llama.py              |  1 -
 .../patched/test_activation_checkpointing.py  |  1 -
 tests/e2e/patched/test_peft_embeddings.py     |  1 -
 tests/e2e/patched/test_resume.py              |  1 -
 tests/e2e/solo/test_relora_llama.py           |  1 -
 tests/e2e/test_activation_offloading.py       |  1 -
 tests/e2e/test_deepseekv3.py                  |  2 -
 tests/e2e/test_diffusion.py                   |  2 -
 tests/e2e/test_embeddings_lr.py               |  2 -
 tests/e2e/test_gemma2.py                      |  2 -
 tests/e2e/test_gemma3_text.py                 |  2 -
 tests/e2e/test_llama.py                       |  4 --
 tests/e2e/test_llama_pretrain.py              |  1 -
 tests/e2e/test_llama_vision.py                |  2 -
 tests/e2e/test_mamba.py                       |  1 -
 tests/e2e/test_optimizers.py                  |  1 -
 tests/e2e/test_qat.py                         |  1 -
 tests/e2e/test_save_first_step.py             |  2 -
 tests/e2e/test_streaming.py                   |  1 -
 tests/e2e/utils.py                            | 26 +++----
 tests/hf_offline_utils.py                     |  1 +
 .../test_mistral_tokenizer_patch.py           | 35 ---------
 .../test_chat_templates_advanced.py           |  2 +-
 tests/test_normalize_config.py                |  7 +-
 tests/test_perplexity.py                      |  4 +-
 tests/test_tokenizers.py                      |  2 +
 tests/utils/schemas/validation/test_fsdp.py   | 25 +++++--
 74 files changed, 262 insertions(+), 309 deletions(-)
 delete mode 100644 tests/monkeypatch/test_mistral_tokenizer_patch.py

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 0e1ccb89a..e081f2127 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -38,7 +38,7 @@ jobs:
             cuda_version: 12.9.1
             python_version: "3.12"
             pytorch: 2.9.1
-            axolotl_extras: vllm
+            axolotl_extras:
             platforms: "linux/amd64,linux/arm64"
           - cuda: 130
             cuda_version: 13.0.0
diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml
index 107572ad6..5187a08c7 100644
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -45,7 +45,7 @@ jobs:
             cuda_version: 12.9.1
             python_version: "3.12"
             pytorch: 2.9.1
-            axolotl_extras: "fbgemm-gpu,vllm"
+            axolotl_extras: "fbgemm-gpu"
             num_gpus: 2
             dockerfile: "Dockerfile-uv.jinja"
           - cuda: 130
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index bcbb76df3..c866e0cfc 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -115,10 +115,10 @@ jobs:
 
       - name: Pre-Download dataset fixture
         run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
+          hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
 
       - name: Show HF cache
-        run: hf cache scan
+        run: hf cache ls
 
       - name: Run tests
         run: |
@@ -132,7 +132,7 @@ jobs:
           pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml
 
       - name: Show HF cache
-        run: hf cache scan
+        run: hf cache ls
 
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v5
@@ -210,7 +210,7 @@ jobs:
           axolotl --help
 
       - name: Show HF cache
-        run: hf cache scan
+        run: hf cache ls
 
       - name: Run tests
         run: |
@@ -219,10 +219,10 @@ jobs:
           pytest -v --durations=10 tests/cli/
 
       - name: Show HF cache
-        run: hf cache scan
+        run: hf cache ls
 
   gate-skip-e2e:
-    needs: [pre-commit, pytest, pytest-sdist]
+    needs: [pre-commit]
     runs-on: ubuntu-latest
     outputs:
       skip: ${{ steps.compute.outputs.skip }}
@@ -258,7 +258,7 @@ jobs:
     # this job needs to be run on self-hosted GPU runners...
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
-    needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e]
+    needs: [pre-commit, pytest]
 
     strategy:
       fail-fast: false
@@ -269,7 +269,7 @@ jobs:
             python_version: "3.12"
             pytorch: 2.9.1
             num_gpus: 1
-            axolotl_extras: vllm
+            axolotl_extras:
             dockerfile: "Dockerfile-uv.jinja"
     steps:
       - name: Checkout
diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml
index f482a7331..fde3730b2 100644
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -224,9 +224,6 @@
 # eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
 # eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
 
-# # Save model as safetensors (require safetensors package)
-# save_safetensors:
-
 # # Whether to mask out or include the human's prompt from the training labels
 # train_on_inputs: false
 # # Group similarly sized data to minimize padding.
@@ -512,7 +509,6 @@ profiler_steps: ${PROFILER_STEPS}
 loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
 loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
 
-save_safetensors: ${SAVE_SAFETENSORS}
 train_on_inputs: ${TRAIN_ON_INPUTS}
 group_by_length: ${GROUP_BY_LENGTH}
 gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
diff --git a/cicd/multigpu.sh b/cicd/multigpu.sh
index 307dd4960..e00fe909e 100755
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -2,7 +2,7 @@
 set -e
 
 # Only run two tests at a time to avoid OOM on GPU (with coverage collection)
-pytest -v --durations=10 -n2 --maxfail=4 \
+pytest -v --durations=10 -n2 --maxfail=3 \
   --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
   --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
   /workspace/axolotl/tests/e2e/multigpu/ \
diff --git a/docs/amd_hpc.qmd b/docs/amd_hpc.qmd
index c6dbe82d0..259b01ab5 100644
--- a/docs/amd_hpc.qmd
+++ b/docs/amd_hpc.qmd
@@ -86,7 +86,7 @@ export HF_DATASETS_OFFLINE=1
 Download a base model using the Hugging Face CLI:
 
 ```bash
-huggingface-cli download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
+hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
 ```
 
 ### 10. Create Axolotl Configuration
diff --git a/docs/installation.qmd b/docs/installation.qmd
index b8d427eb0..5df8f87e8 100644
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -165,7 +165,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
    ```
 4. (Optional) Login to Hugging Face:
    ```{.bash}
-   huggingface-cli login
+   hf auth login
    ```
 
 ## Troubleshooting {#sec-troubleshooting}
diff --git a/examples/jamba/qlora_fsdp_large.yaml b/examples/jamba/qlora_fsdp_large.yaml
index 150e5e2ec..4db889fbc 100644
--- a/examples/jamba/qlora_fsdp_large.yaml
+++ b/examples/jamba/qlora_fsdp_large.yaml
@@ -19,7 +19,6 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: jamba-large-fsdp-qlora-ft
-save_safetensors: true
 adapter: qlora
 sequence_len: 2048
 sample_packing: true
diff --git a/examples/llama-3/qlora-fsdp-405b.yaml b/examples/llama-3/qlora-fsdp-405b.yaml
index 8ddb84d65..5c236f2cf 100644
--- a/examples/llama-3/qlora-fsdp-405b.yaml
+++ b/examples/llama-3/qlora-fsdp-405b.yaml
@@ -12,7 +12,6 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.0
 output_dir: ./outputs/out/qlora-llama3_1-405b
-save_safetensors: true
 
 adapter: qlora
 
diff --git a/examples/mamba/config.yml b/examples/mamba/config.yml
index e6b335804..5f36595a3 100644
--- a/examples/mamba/config.yml
+++ b/examples/mamba/config.yml
@@ -47,6 +47,5 @@ saves_per_epoch: 1
 weight_decay: 0.0
 special_tokens:
 tokens:
-save_safetensors: False
 
 # save_first_step: true  # uncomment this to validate checkpoint saving works with your config
diff --git a/requirements.txt b/requirements.txt
index 2d5fa12fc..21fdda226 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,17 +9,17 @@ liger-kernel==0.6.4
 # END section
 
 packaging==26.0
-
-huggingface_hub>=0.36.0
+huggingface_hub>=1.1.7
 peft>=0.18.1
 tokenizers>=0.22.1
-transformers==4.57.6
+transformers==5.0.0
 accelerate==1.12.0
 datasets==4.5.0
 deepspeed>=0.18.3
 trl==0.27.0
 hf_xet==1.2.0
 kernels==0.11.5
+
 trackio>=0.13.0
 typing-extensions>=4.15.0
 
diff --git a/src/axolotl/cli/checks.py b/src/axolotl/cli/checks.py
index a743e74dc..254da8bae 100644
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -44,7 +44,7 @@ def check_user_token() -> bool:
         return bool(user_info)
     except LocalTokenNotFoundError:
         LOG.warning(
-            "Error verifying HuggingFace token. Remember to log in using `huggingface-cli login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
+            "Error verifying HuggingFace token. Remember to log in using `hf auth login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
         )
         return False
     except HTTPError:
diff --git a/src/axolotl/cli/merge_lora.py b/src/axolotl/cli/merge_lora.py
index 482767b12..e7ad89036 100644
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -24,7 +24,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
         cfg: Dictionary mapping `axolotl` config keys to values.
     """
     model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
-    safe_serialization = cfg.save_safetensors is True
 
     LOG.info("Running merge of LoRA with base model...")
     model = model.merge_and_unload(progressbar=True)
@@ -42,7 +41,6 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
         LOG.info(f"Saving merged model to: {str(Path(cfg.output_dir) / 'merged')}...")
         model.save_pretrained(
             str(Path(cfg.output_dir) / "merged"),
-            safe_serialization=safe_serialization,
             progressbar=True,
         )
         tokenizer.save_pretrained(
diff --git a/src/axolotl/cli/merge_sharded_fsdp_weights.py b/src/axolotl/cli/merge_sharded_fsdp_weights.py
index 1d9736b9d..f12d5ab5d 100644
--- a/src/axolotl/cli/merge_sharded_fsdp_weights.py
+++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py
@@ -14,8 +14,6 @@ from accelerate import PartialState
 from accelerate.utils import (
     SAFE_WEIGHTS_INDEX_NAME,
     SAFE_WEIGHTS_NAME,
-    WEIGHTS_INDEX_NAME,
-    WEIGHTS_NAME,
     is_torch_version,
 )
 from huggingface_hub import split_torch_state_dict_into_shards
@@ -40,17 +38,15 @@ class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
 def _distributed_checkpoint_to_merged_weights(
     checkpoint_dir: Union[str, Path],
     save_path: str,
-    safe_serialization: bool = False,
     max_shard_size: str = "5GB",
 ) -> Path:
     """
     Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`. Will
-    save under `save_path` as either `model.safetensors` or `pytorch_model.bin`.
+    save under `save_path` as `model.safetensors`.
 
     Args:
         checkpoint_dir: Directory where distributed checkpoint is saved.
         save_path: Path to save model to.
-        safe_serialization: Whether to save in safetensors format.
         max_shard_size: Max size of model shards to save.
 
     Returns:
@@ -76,11 +72,7 @@ def _distributed_checkpoint_to_merged_weights(
         if isinstance(value, torch.Tensor) and value.dtype != torch.bfloat16:
             state_dict[key] = value.to(torch.bfloat16)
 
-    weights_name = SAFE_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
-
-    filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(
-        ".safetensors", "{suffix}.safetensors"
-    )
+    filename_pattern = SAFE_WEIGHTS_NAME.replace(".safetensors", "{suffix}.safetensors")
     state_dict_split = split_torch_state_dict_into_shards(
         state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
     )
@@ -98,19 +90,12 @@ def _distributed_checkpoint_to_merged_weights(
 
     for shard_file, tensors in filename_to_tensors:
         shard = {tensor: state_dict[tensor] for tensor in tensors}
-
-        if safe_serialization:
-            safe_save_file(
-                shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
-            )
-        else:
-            torch.save(shard, os.path.join(save_path_, shard_file))
+        safe_save_file(
+            shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
+        )
 
     if index is not None:
-        save_index_file = (
-            SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
-        )
-        save_index_file = os.path.join(save_path_, save_index_file)
+        save_index_file = os.path.join(save_path_, SAFE_WEIGHTS_INDEX_NAME)
         # Save the index as well
         with open(save_index_file, "w", encoding="utf-8") as fout:
             content = json.dumps(index, indent=2, sort_keys=True) + "\n"
@@ -123,13 +108,11 @@ def _distributed_checkpoint_to_merged_weights(
 def merge_fsdp_weights(
     checkpoint_dir: str,
     output_path: str,
-    safe_serialization: bool = False,
     remove_checkpoint_dir: bool = False,
 ):
     """
     Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
-    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors` if
-    `safe_serialization` else `pytorch_model.bin`.
+    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors`.
 
     Note: this is a CPU-bound process.
 
@@ -138,8 +121,6 @@ def merge_fsdp_weights(
             The directory containing the FSDP checkpoints (can be either the model or optimizer).
         output_path (`str`):
             The path to save the merged checkpoint.
-        safe_serialization (`bool`, *optional*, defaults to `True`):
-            Whether to save the merged weights with safetensors (recommended).
         remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
             Whether to remove the checkpoint directory after merging.
 
@@ -177,7 +158,7 @@ def merge_fsdp_weights(
     if state.is_main_process:
         LOG.info(f"Merging FSDP weights from {checkpoint_dir_}")
         save_path = _distributed_checkpoint_to_merged_weights(
-            checkpoint_dir_, output_path, safe_serialization
+            checkpoint_dir_, output_path
         )
         LOG.info(f"Successfully merged FSDP weights and saved to {save_path}")
         if remove_checkpoint_dir:
@@ -210,7 +191,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
     merge_fsdp_weights(
         checkpoint_dir=str(fsdp_dir),
         output_path=output_path,
-        safe_serialization=True,
     )
     state = PartialState()
     state.wait_for_everyone()
diff --git a/src/axolotl/cli/quantize.py b/src/axolotl/cli/quantize.py
index f4fcc6d7d..939443a01 100644
--- a/src/axolotl/cli/quantize.py
+++ b/src/axolotl/cli/quantize.py
@@ -102,12 +102,10 @@ def do_quantize(
     LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}.")
     model.save_pretrained(
         str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
         progressbar=True,
     )
     tokenizer.save_pretrained(
         str(Path(output_dir) / "quantized"),
-        safe_serialization=False,
         progressbar=True,
         save_jinja_files=cfg.tokenizer_save_jinja_files,
     )
@@ -121,7 +119,7 @@ def do_quantize(
             hub_model_id.rstrip("-")
             + f"-{quantization_config_to_str[type(quantization_config)]}"
         )
-        model.push_to_hub(hub_model_id, safe_serialization=False)
+        model.push_to_hub(hub_model_id)
         tokenizer.push_to_hub(hub_model_id)
         if processor:
             processor.push_to_hub(hub_model_id)
diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py
index 412f6da2f..f3a965435 100644
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -216,7 +216,7 @@ class TrainerBuilderBase(abc.ABC):
     def _configure_warmup_and_logging(
         self, total_num_steps: int, training_args_kwargs: dict
     ):
-        warmup_steps = 0
+        warmup_steps: int | float = 0
         warmup_ratio = 0.0
         if self.cfg.warmup_steps is not None:
             warmup_steps = self.cfg.warmup_steps
@@ -230,6 +230,10 @@ class TrainerBuilderBase(abc.ABC):
         else:
             warmup_ratio = 0.03
 
+        # transformers v5
+        if warmup_ratio > 0.0 and warmup_steps == 0:
+            warmup_steps = warmup_ratio
+
         if warmup_steps == 1:
             warmup_steps = 2
 
@@ -242,7 +246,6 @@ class TrainerBuilderBase(abc.ABC):
                 else max(min(int(0.005 * total_num_steps), 10), 1)
             )
 
-        training_args_kwargs["warmup_ratio"] = warmup_ratio
         training_args_kwargs["warmup_steps"] = warmup_steps
 
     def _configure_precision_settings(self, training_args_kwargs: dict):
@@ -530,9 +533,7 @@ class TrainerBuilderBase(abc.ABC):
             "loraplus_lr_ratio",
             "loraplus_lr_embedding",
             "output_dir",
-            "save_safetensors",
             "save_only_model",
-            "include_tokens_per_second",
             "weight_decay",
             "seed",
             "dion_momentum",
@@ -545,6 +546,7 @@ class TrainerBuilderBase(abc.ABC):
 
         arg_map = {
             "dion_learning_rate": "dion_lr",
+            "include_num_input_tokens_seen": "include_tokens_per_second",
         }
         for kwarg, cfg_arg in arg_map.items():
             if hasattr(self.cfg, cfg_arg) and getattr(self.cfg, cfg_arg) is not None:
diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py
index cda98087f..3a9f8ba1b 100644
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -437,7 +437,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                 or self.cfg.micro_batch_size > 1
             ):
                 return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
-            if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn):
+            if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn) or (
+                self.cfg.micro_batch_size == 1 and is_eval is False
+            ):
                 return None
 
         if self.cfg.model_config_type == "mamba":
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 799dcf02e..a45e246a1 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -25,7 +25,7 @@ from torch.utils.data import (
 from transformers import PreTrainedModel, Trainer
 from transformers.trainer import TRAINING_ARGS_NAME
 from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker
-from transformers.utils import SAFE_WEIGHTS_NAME, WEIGHTS_NAME, is_peft_available
+from transformers.utils import SAFE_WEIGHTS_NAME, is_peft_available
 from trl.trainer.utils import pad_to_length
 from typing_extensions import override
 
@@ -738,43 +738,38 @@ class AxolotlTrainer(
                 ).save_pretrained(
                     output_dir,
                     state_dict=state_dict,
-                    safe_serialization=self.args.save_safetensors,
                 )
             else:
                 LOG.info(
                     "Trainer.model is not a `PreTrainedModel`, only saving its state dict."
                 )
-                if self.args.save_safetensors:
-                    safetensors.torch.save_file(
-                        state_dict,
-                        os.path.join(output_dir, SAFE_WEIGHTS_NAME),
-                        metadata={"format": "pt"},
-                    )
-                else:
-                    torch.save(state_dict, os.path.join(output_dir, WEIGHTS_NAME))
+                safetensors.torch.save_file(
+                    state_dict,
+                    os.path.join(output_dir, SAFE_WEIGHTS_NAME),
+                    metadata={"format": "pt"},
+                )
         else:
             self.model.save_pretrained(
                 output_dir,
                 state_dict=state_dict,
-                safe_serialization=self.args.save_safetensors,
                 is_main_process=self.accelerator.is_main_process,
             )
 
-            if self.processing_class is not None:
-                self.processing_class.save_pretrained(output_dir)
-            elif (
-                self.data_collator is not None
-                and hasattr(self.data_collator, "tokenizer")
-                and self.data_collator.tokenizer is not None
-            ):
-                LOG.info(
-                    "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
-                )
-                save_jinja_files = True
-                if self.axolotl_cfg:
-                    save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
-                self.data_collator.tokenizer.save_pretrained(
-                    output_dir, save_jinja_files=save_jinja_files
-                )
-            # Good practice: save your training arguments together with the trained model
-            torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
+        if self.processing_class is not None:
+            self.processing_class.save_pretrained(output_dir)
+        elif (
+            self.data_collator is not None
+            and hasattr(self.data_collator, "tokenizer")
+            and self.data_collator.tokenizer is not None
+        ):
+            LOG.info(
+                "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
+            )
+            save_jinja_files = True
+            if self.axolotl_cfg:
+                save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files
+            self.data_collator.tokenizer.save_pretrained(
+                output_dir, save_jinja_files=save_jinja_files
+            )
+        # Good practice: save your training arguments together with the trained model
+        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
diff --git a/src/axolotl/integrations/llm_compressor/utils.py b/src/axolotl/integrations/llm_compressor/utils.py
index f04454e5b..1abcb6bd4 100644
--- a/src/axolotl/integrations/llm_compressor/utils.py
+++ b/src/axolotl/integrations/llm_compressor/utils.py
@@ -12,7 +12,6 @@ def save_compressed_model(
     model: PreTrainedModel,
     output_dir: Union[str, bytes],
     trainer: Trainer,
-    safe_serialization: bool = False,
     save_compressed: bool = False,
 ) -> None:
     """
@@ -22,7 +21,6 @@ def save_compressed_model(
         model (PreTrainedModel): The model to be saved.
         output_dir (str or bytes): Path where the model files will be written.
         trainer (Trainer): Hugging Face Trainer for process synchronization.
-        safe_serialization (bool): Use safe serialization if True.
         save_compressed (bool): Write compressed tensors if True.
     """
     trainer.accelerator.wait_for_everyone()
@@ -34,7 +32,6 @@ def save_compressed_model(
     modify_save_pretrained(model)
     model.save_pretrained(
         output_dir,
-        safe_serialization=safe_serialization,
         save_compressed=save_compressed,
         skip_sparsity_compression_stats=not save_compressed,
     )
diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py
index 1eeed3565..0133148eb 100644
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -26,7 +26,6 @@ from torch.distributed import DeviceMesh
 from transformers import (
     AutoModelForCausalLM,
     AutoModelForImageTextToText,
-    AutoModelForVision2Seq,
     AwqConfig,
     BitsAndBytesConfig,
     GPTQConfig,
@@ -434,7 +433,7 @@ class ModelLoader:
         """
         if self.cfg.is_multimodal:
             self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
-                self.model_config.model_type, AutoModelForVision2Seq
+                self.model_config.model_type, AutoModelForImageTextToText
             )
             if isinstance(self.auto_model_loader, str):
                 self.auto_model_loader = AutoModelForImageTextToText
@@ -476,6 +475,7 @@ class ModelLoader:
             max_memory = None
 
         self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
+        self.model_kwargs["dtype"] = self.cfg.torch_dtype
 
         is_ds_zero3 = is_deepspeed_zero3_enabled()
 
@@ -670,7 +670,7 @@ class ModelLoader:
         Uses the selected loader when provided; otherwise falls back to the auto loader.
         """
         loader = model_loader_class or self.auto_model_loader
-        if loader in [AutoModelForCausalLM, AutoModelForVision2Seq]:
+        if loader in [AutoModelForCausalLM, AutoModelForImageTextToText]:
             model = loader.from_config(
                 config=self.model_config,
                 trust_remote_code=self.cfg.trust_remote_code or False,
@@ -788,6 +788,7 @@ class ModelLoader:
                 # Use auto model loader (handles gptq and default cases)
                 model_loader_class = self.auto_model_loader
 
+            self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
             if self.cfg.reinit_weights:
                 self.model = self._load_model_from_config(model_loader_class)
             else:
diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
index b7a53c4d5..30c3ba0fd 100644
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -220,13 +220,6 @@ class PatchManager:
 
             patch_qwen3_next_modeling_packing()
 
-        if self.cfg.model_config_type == "mistral3" and self.cfg.processor_type:
-            from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import (
-                apply_mistral_tokenizer_image_patch,
-            )
-
-            apply_mistral_tokenizer_image_patch()
-
         if self.cfg.model_config_type == "kimi_linear":
             from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
                 patch_kimi_model,
diff --git a/src/axolotl/loaders/processor.py b/src/axolotl/loaders/processor.py
index 827b4be35..124dad39e 100644
--- a/src/axolotl/loaders/processor.py
+++ b/src/axolotl/loaders/processor.py
@@ -31,7 +31,7 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
 
             from axolotl.utils.mistral import HFMistralTokenizer
 
-            tokenization_mistral_common.MistralCommonTokenizer = HFMistralTokenizer
+            tokenization_mistral_common.MistralCommonBackend = HFMistralTokenizer
 
         _patch_mistralcommontokenizer()
 
diff --git a/src/axolotl/models/mamba/modeling_mamba.py b/src/axolotl/models/mamba/modeling_mamba.py
index 2cfe11544..e6158a0a9 100644
--- a/src/axolotl/models/mamba/modeling_mamba.py
+++ b/src/axolotl/models/mamba/modeling_mamba.py
@@ -111,7 +111,6 @@ class MambaLMHeadModel(nn.Module, GenerationMixin):
         self,
         save_directory: Union[str, os.PathLike],
         state_dict: Optional[dict] = None,
-        safe_serialization: Optional[bool] = None,
     ):
         if state_dict is None:
             state_dict = self.state_dict()
diff --git a/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
index 9e7259a05..a77a0129e 100644
--- a/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
+++ b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
@@ -1,5 +1,5 @@
 """
-Monkeypatch to fix inefficient tensor conversion in MistralCommonTokenizer.apply_chat_template
+Monkeypatch to fix inefficient tensor conversion in MistralCommonBackend.apply_chat_template
 """
 
 import importlib
@@ -12,11 +12,11 @@ LOG = get_logger(__name__)
 
 
 def apply_mistral_tokenizer_image_patch():
-    """Apply patch to MistralCommonTokenizer.apply_chat_template to fix image tensor conversion."""
-    from transformers.tokenization_mistral_common import MistralCommonTokenizer
+    """Apply patch to MistralCommonBackend.apply_chat_template to fix image tensor conversion."""
+    from transformers.tokenization_mistral_common import MistralCommonBackend
 
     # Get original source
-    original_source = inspect.getsource(MistralCommonTokenizer.apply_chat_template)
+    original_source = inspect.getsource(MistralCommonBackend.apply_chat_template)
     original_source, _ = detab_code(original_source)
 
     # Define the replacement
@@ -41,7 +41,7 @@ def apply_mistral_tokenizer_image_patch():
         )
 
         # Load necessary imports from the module
-        module_name = MistralCommonTokenizer.__module__
+        module_name = MistralCommonBackend.__module__
         module = importlib.import_module(module_name)
 
         # Detect what needs to be imported
@@ -79,7 +79,7 @@ def apply_mistral_tokenizer_image_patch():
         exec(patched_source, globals())  # nosec B102
 
         # Replace the method
-        MistralCommonTokenizer.apply_chat_template = patched_apply_chat_template
-        LOG.info("Successfully applied MistralCommonTokenizer tensor conversion patch")
+        MistralCommonBackend.apply_chat_template = patched_apply_chat_template
+        LOG.info("Successfully applied MistralCommonBackend tensor conversion patch")
     else:
-        LOG.warning("Could not find target code for MistralCommonTokenizer patching")
+        LOG.warning("Could not find target code for MistralCommonBackend patching")
diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py
index a01d850b3..cf93c32dd 100644
--- a/src/axolotl/monkeypatch/relora.py
+++ b/src/axolotl/monkeypatch/relora.py
@@ -155,7 +155,6 @@ class ReLoRACallback(TrainerCallback):
                     f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
                     "adapter",
                 ),
-                safe_serialization=True,
             )
             with torch.no_grad():
                 merge_and_save(
@@ -214,7 +213,7 @@ class ReLoRACallback(TrainerCallback):
 
                     self.last_full_model = checkpoint_folder
             else:
-                model.model.save_pretrained(checkpoint_folder, safe_serialization=True)
+                model.model.save_pretrained(checkpoint_folder)
 
         return control
 
diff --git a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
index ba8b16dda..15f90423e 100644
--- a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
+++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
@@ -52,9 +52,15 @@ def patch_prepare_context_parallel_inputs() -> None:
         if item in patched_source:
             items_to_import.append(item)
 
-    exec(f"from {module_name} import ({', '.join(items_to_import)})", globals())
-    exec(patched_source, globals())
+    # Use a separate namespace to capture the exec'd function
+    namespace = {}
+    exec(f"from {module_name} import ({', '.join(items_to_import)})", namespace)
+    exec(patched_source, namespace)
 
+    # Explicitly get the function from the namespace
+    axolotl_prepare_context_parallel_inputs = namespace[
+        "axolotl_prepare_context_parallel_inputs"
+    ]
     Trainer._original_prepare_context_parallel_inputs = (
         Trainer._prepare_context_parallel_inputs
     )
diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py
index c209c892a..077db4388 100644
--- a/src/axolotl/processing_strategies.py
+++ b/src/axolotl/processing_strategies.py
@@ -14,7 +14,6 @@ from transformers.models.voxtral import VoxtralProcessor
 
 from axolotl.utils.dict import remove_none_values
 from axolotl.utils.logging import get_logger
-from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
 
 LOG = get_logger(__name__)
 
@@ -430,7 +429,7 @@ class Mistral3ProcessingStrategy(ProcessingStrategy):
 
     def __init__(
         self,
-        processor: Mistral3Processor,
+        processor,
         chat_template: Optional[str] = None,
         image_size: int | tuple[int, int] | None = None,
         image_resize_algorithm: Resampling | None = None,
@@ -493,6 +492,8 @@ def get_processing_strategy(
     image_size: int | tuple[int, int] | None = None,
     image_resize_algorithm: Resampling | None = None,
 ):
+    from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
+
     processing_kwargs = {
         "processor": processor,
         "chat_template": chat_template,
diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py
index 0fec64d81..57d3bfdf2 100644
--- a/src/axolotl/prompt_strategies/chat_template.py
+++ b/src/axolotl/prompt_strategies/chat_template.py
@@ -150,6 +150,8 @@ class ChatTemplatePrompter(Prompter):
 
         return self.tokenizer.apply_chat_template(
             conversation,
+            tokenize=True,
+            return_dict=False,
             **chat_template_kwargs,
         )
 
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index cce3b8a6a..856996b62 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -135,16 +135,13 @@ def setup_reference_model(
     return model_ref
 
 
-def setup_signal_handler(
-    cfg: DictDefault, model: PreTrainedModel, safe_serialization: bool
-):
+def setup_signal_handler(cfg: DictDefault, model: PreTrainedModel):
     """
     Set up signal handler for graceful termination.
 
     Args:
         cfg: Dictionary mapping `axolotl` config keys to values.
         model: The model to save on termination
-        safe_serialization: Whether to use safe serialization when saving
     """
     # ray workers don't have access to this signal
     if cfg.local_rank == 0 and not cfg.use_ray:
@@ -152,9 +149,7 @@ def setup_signal_handler(
         def terminate_handler(_, __, model_weakref):
             if model_weakref() is not None:
                 _model = model_weakref()
-                _model.save_pretrained(
-                    cfg.output_dir, safe_serialization=safe_serialization
-                )
+                _model.save_pretrained(cfg.output_dir)
 
             cleanup_distributed()
             sys.exit(0)
@@ -219,7 +214,6 @@ def save_trained_model(
     cfg: DictDefault,
     trainer: Any,
     model: PreTrainedModel,
-    safe_serialization: bool,
 ):
     """
     Save the trained model according to configuration and training setup.
@@ -228,7 +222,6 @@ def save_trained_model(
         cfg: Dictionary mapping `axolotl` config keys to values.
         trainer: The trainer object.
         model: The trained model to save.
-        safe_serialization: Whether to use safe serialization.
     """
     LOG.info(f"Training completed! Saving trained model to {cfg.output_dir}.")
 
@@ -283,7 +276,6 @@ def save_trained_model(
                 merge_fsdp_weights(
                     checkpoint_dir=str(fsdp_dir),
                     output_path=merged_path,
-                    safe_serialization=True,
                 )
                 trainer.accelerator.wait_for_everyone()
                 if trainer.accelerator.is_main_process:
@@ -330,11 +322,9 @@ def save_trained_model(
                 pass
     elif cfg.local_rank == 0:
         if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
-            trainer.model.save_pretrained(
-                cfg.output_dir, safe_serialization=safe_serialization
-            )
+            trainer.model.save_pretrained(cfg.output_dir)
 
-        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
+        model.save_pretrained(cfg.output_dir)
 
     if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
         # TODO: add integration support so this can be implemented completely within the plugin
@@ -344,7 +334,6 @@ def save_trained_model(
             model=model,
             output_dir=cfg.output_dir,
             trainer=trainer,
-            safe_serialization=safe_serialization,
             save_compressed=cfg.llmcompressor.save_compressed,
         )
 
@@ -449,7 +438,6 @@ def handle_untrained_tokens_fix(
     model: PreTrainedModel,
     tokenizer: PreTrainedTokenizer,
     train_dataset: Dataset,
-    safe_serialization: bool,
 ):
     """
     Apply fixes for untrained tokens if configured.
@@ -459,7 +447,6 @@ def handle_untrained_tokens_fix(
         model: The model to apply fixes to.
         tokenizer: The tokenizer for token identification.
         train_dataset: The training dataset to use.
-        safe_serialization: Whether to use safe serialization when saving.
     """
     if not cfg.fix_untrained_tokens:
         return
@@ -483,9 +470,7 @@ def handle_untrained_tokens_fix(
     fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)
 
     if cfg.local_rank == 0:
-        model.save_pretrained(
-            str(Path(cfg.output_dir)), safe_serialization=safe_serialization
-        )
+        model.save_pretrained(str(Path(cfg.output_dir)))
 
 
 def setup_model_and_trainer(
@@ -582,15 +567,12 @@ def train(
     ) = setup_model_and_trainer(cfg, dataset_meta)
 
     # Handle untrained tokens if configured
-    safe_serialization = cfg.save_safetensors is True
     train_dataset = dataset_meta.train_dataset
-    handle_untrained_tokens_fix(
-        cfg, model, tokenizer, train_dataset, safe_serialization
-    )
+    handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)
 
     # Additional setup
     save_initial_configs(cfg, tokenizer, model, peft_config, processor)
-    setup_signal_handler(cfg, model, safe_serialization)
+    setup_signal_handler(cfg, model)
     setup_model_card(cfg)
 
     # Execute the training
@@ -602,7 +584,7 @@ def train(
         torch.cuda.empty_cache()
 
     # Save the trained model and cleanup
-    save_trained_model(cfg, trainer, model, safe_serialization)
+    save_trained_model(cfg, trainer, model)
     tokenizer.save_pretrained(
         str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files
     )
diff --git a/src/axolotl/utils/callbacks/perplexity.py b/src/axolotl/utils/callbacks/perplexity.py
index a5b39c304..36cacfb81 100644
--- a/src/axolotl/utils/callbacks/perplexity.py
+++ b/src/axolotl/utils/callbacks/perplexity.py
@@ -7,7 +7,11 @@ from torch import Tensor
 from tqdm import tqdm
 from transformers.modeling_outputs import CausalLMOutput
 from transformers.modeling_utils import PreTrainedModel
-from transformers.tokenization_utils import PreTrainedTokenizer
+
+try:
+    from transformers.tokenization_python import PreTrainedTokenizer
+except ImportError:
+    from transformers.tokenization_utils import PreTrainedTokenizer
 
 from axolotl.utils.distributed import is_main_process
 
diff --git a/src/axolotl/utils/mistral/mistral_tokenizer.py b/src/axolotl/utils/mistral/mistral_tokenizer.py
index 3ce6be780..a5ffe9a28 100644
--- a/src/axolotl/utils/mistral/mistral_tokenizer.py
+++ b/src/axolotl/utils/mistral/mistral_tokenizer.py
@@ -7,11 +7,11 @@ import numpy as np
 from mistral_common.protocol.instruct.validator import ValidationMode
 from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub
 from torch import Tensor
-from transformers.tokenization_mistral_common import MistralCommonTokenizer
+from transformers.tokenization_mistral_common import MistralCommonBackend
 from transformers.tokenization_utils_base import VERY_LARGE_INTEGER
 
 
-class HFMistralTokenizer(MistralCommonTokenizer):
+class HFMistralTokenizer(MistralCommonBackend):
     """
     Wraps mistral_common.tokens.tokenizers.mistral.MistralTokenizer
     and exposes HuggingFace API for special tokens.
@@ -37,11 +37,19 @@ class HFMistralTokenizer(MistralCommonTokenizer):
     def name_or_path(self) -> str:
         return self._name_or_path
 
+    @name_or_path.setter
+    def name_or_path(self, name_or_path: str) -> None:
+        self._name_or_path = name_or_path
+
     @property
     def chat_template(self) -> str | None:
         """Chat template is not supported. Dummy method to satisfy HuggingFace API."""
         return "[This is a dummy chat template]"
 
+    @chat_template.setter
+    def chat_template(self, chat_template: str | None) -> None:
+        pass
+
     def _set_mode(self, mode: ValidationMode):
         """Set the mode of the MistralRequestValidator.
 
@@ -133,7 +141,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
         r"""
         Patched fn to pass `name_or_path` and remove extra kwargs.
 
-        Instantiate a `MistralCommonTokenizer` from a predefined
+        Instantiate a `MistralCommonBackend` from a predefined
         tokenizer.
 
         Args:
@@ -142,7 +150,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
 
                 - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                 - A path to a *directory* containing the tokenizer config, for instance saved
-                  using the [`MistralCommonTokenizer.tokenization_mistral_common.save_pretrained`] method, e.g.,
+                  using the [`MistralCommonBackend.tokenization_mistral_common.save_pretrained`] method, e.g.,
                   `./my_model_directory/`.
             mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
                 Validation mode for the `MistralTokenizer` tokenizer.
@@ -154,7 +162,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
                 exist.
             token (`str` or *bool*, *optional*):
                 The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
-                when running `huggingface-cli login` (stored in `~/.huggingface`).
+                when running `hf auth login` (stored in `~/.huggingface`).
             local_files_only (`bool`, *optional*, defaults to `False`):
                 Whether or not to only rely on local files and not to attempt to download any files.
             revision (`str`, *optional*, defaults to `"main"`):
@@ -179,12 +187,12 @@ class HFMistralTokenizer(MistralCommonTokenizer):
                 Whether or not the model should cleanup the spaces that were added when splitting the input text during the
                 tokenization process.
             kwargs (additional keyword arguments, *optional*):
-                Not supported by `MistralCommonTokenizer.from_pretrained`.
+                Not supported by `MistralCommonBackend.from_pretrained`.
                 Will raise an error if used.
         """
         if init_inputs:
             raise ValueError(
-                "`init_inputs` are not supported by `MistralCommonTokenizer.from_pretrained`."
+                "`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`."
             )
 
         # Delete trust_remote_code as it does nothing
@@ -196,7 +204,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
         # Handle kwargs and AutoTokenizer case
         if kwargs and not kwargs.keys() == {"_from_auto"}:
             raise ValueError(
-                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonTokenizer.from_pretrained`."
+                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
             )
 
         if not os.path.isfile(pretrained_model_name_or_path):
diff --git a/src/axolotl/utils/schemas/fsdp.py b/src/axolotl/utils/schemas/fsdp.py
index f34f40e8e..60b5819c5 100644
--- a/src/axolotl/utils/schemas/fsdp.py
+++ b/src/axolotl/utils/schemas/fsdp.py
@@ -4,7 +4,7 @@ FSDP Configuration Schema
 
 from typing import Literal
 
-from pydantic import BaseModel, Field
+from pydantic import AliasChoices, BaseModel, Field
 
 
 class FSDPConfig(BaseModel):
@@ -12,6 +12,11 @@ class FSDPConfig(BaseModel):
     FSDP Configuration Schema
     """
 
+    fsdp_version: int | None = Field(
+        validation_alias=AliasChoices("fsdp_version", "version"),
+        default=None,
+        json_schema_extra={"description": "FSDP version"},
+    )
     activation_checkpointing: bool | None = Field(
         default=None,
         description="Enable activation checkpointing to reduce memory usage during forward passes",
diff --git a/src/axolotl/utils/schemas/model.py b/src/axolotl/utils/schemas/model.py
index 0931608a6..31de7b45e 100644
--- a/src/axolotl/utils/schemas/model.py
+++ b/src/axolotl/utils/schemas/model.py
@@ -123,10 +123,22 @@ class ModelOutputConfig(BaseModel):
     save_safetensors: bool | None = Field(
         default=True,
         json_schema_extra={
-            "description": "Save model as safetensors (require safetensors package). Default True"
+            "description": "Whether to save the model using safetensors format. Defaults to True."
         },
     )
 
+    @field_validator("save_safetensors")
+    @classmethod
+    def validate_save_safetensors(cls, v):
+        if v is False:
+            raise ValueError(
+                "save_safetensors=False is not supported in Transformers V5. "
+                "Transformers V5 always uses safetensors format for model serialization. "
+                "This field is deprecated and will be removed in a future version."
+            )
+        # Allow None and True, will default to True if None
+        return True if v is None else v
+
 
 class SpecialTokensConfig(BaseModel):
     """Special tokens configuration subset"""
diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py
index bb9c3c673..9f225b75e 100644
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -900,6 +900,43 @@ class OptimizationValidationMixin:
 
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def check_fsdp_config_kwargs_prefix(cls, data):
+        if fsdp_config := data.get("fsdp_config"):
+            should_fix = False
+            for key, _ in fsdp_config.items():
+                if key.startswith("fsdp_"):
+                    should_fix = True
+                    LOG.warning_once(
+                        "Configuring FSDP fields with the `fsdp_` prefix is deprecated. "
+                        "Please omit the `fsdp_` prefix from the any fields in `fsdp_config`."
+                    )
+            if should_fix:
+                update_fsdp_config = {}
+                for key, value in fsdp_config.items():
+                    if key.startswith("fsdp_") and key != "fsdp_version":
+                        update_fsdp_config[key.replace("fsdp_", "")] = value
+                    else:
+                        update_fsdp_config[key] = value
+                data["fsdp_config"] = update_fsdp_config
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_fsdp_version_in_fsdp_config(cls, data):
+        fsdp_config = data.get("fsdp_config") or {}
+        fsdp_version = data.get("fsdp_version", None)
+        if not fsdp_version and fsdp_config and fsdp_config.get("version"):
+            fsdp_cfg_version = fsdp_config.pop("version")
+            data["fsdp_version"] = fsdp_cfg_version
+            data["fsdp_config"]["fsdp_version"] = fsdp_cfg_version
+        elif not fsdp_version and fsdp_config and fsdp_config.get("fsdp_version"):
+            data["fsdp_version"] = fsdp_config.get("fsdp_version")
+        if fsdp_version and fsdp_config and not fsdp_config.get("fsdp_version"):
+            data["fsdp_config"]["fsdp_version"] = fsdp_version
+        return data
+
     @model_validator(mode="after")
     def check_fsdp_offload_w_8bit_optimizer(self):
         if (
@@ -1001,40 +1038,6 @@ class OptimizationValidationMixin:
 
         return data
 
-    @model_validator(mode="before")
-    @classmethod
-    def check_fsdp_version_in_fsdp_config(cls, data):
-        fsdp_config = data.get("fsdp_config") or {}
-        if fsdp_config and fsdp_config.get("fsdp_version"):
-            LOG.warning(
-                "Configuring `fsdp_version` in `fsdp_config` is deprecated. "
-                "Please configure `fsdp_version` as a top-level field."
-            )
-            data["fsdp_version"] = fsdp_config.pop("fsdp_version")
-        return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_fsdp_config_kwargs_prefix(cls, data):
-        if fsdp_config := data.get("fsdp_config"):
-            should_fix = False
-            for key, _ in fsdp_config.items():
-                if key.startswith("fsdp_"):
-                    should_fix = True
-                    LOG.warning_once(
-                        "Configuring FSDP fields with the `fsdp_` prefix is deprecated. "
-                        "Please omit the `fsdp_` prefix from the any fields in `fsdp_config`."
-                    )
-            if should_fix:
-                update_fsdp_config = {}
-                for key, value in fsdp_config.items():
-                    if key.startswith("fsdp_") and key != "fsdp_version":
-                        update_fsdp_config[key.replace("fsdp_", "")] = value
-                    else:
-                        update_fsdp_config[key] = value
-                data["fsdp_config"] = update_fsdp_config
-        return data
-
 
 class SystemValidationMixin:
     """Validation methods related to system and hardware configuration."""
diff --git a/tests/conftest.py b/tests/conftest.py
index 4c8c80cb7..b542d377b 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -83,6 +83,12 @@ def download_smollm2_135m_model():
     snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")
 
 
+@pytest.fixture(scope="session", autouse=True)
+def download_smollm2_135m_instruct_model():
+    # download the model
+    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M-Instruct", repo_type="model")
+
+
 @pytest.fixture(scope="session", autouse=True)
 def download_smollm2_135m_gptq_model():
     # download the model
@@ -143,12 +149,20 @@ def download_argilla_distilabel_intel_orca_dpo_dataset():
     )
 
 
-# @pytest.fixture(scope="session", autouse=True)
-# def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset():
-#     # download the dataset
-#     snapshot_download_w_retry(
-#         "argilla/ultrafeedback-binarized-preferences-cleaned", repo_type="dataset"
-#     )
+@pytest.fixture(scope="session", autouse=True)
+def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset():
+    # download the dataset
+    snapshot_download_w_retry(
+        "argilla/ultrafeedback-binarized-preferences-cleaned", repo_type="dataset"
+    )
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset():
+    # download the dataset
+    snapshot_download_w_retry(
+        "argilla/ultrafeedback-binarized-preferences-cleaned-kto", repo_type="dataset"
+    )
 
 
 # @pytest.fixture(scope="session", autouse=True)
@@ -251,7 +265,9 @@ def download_llama_1b_model_fixture():
 def download_llama3_8b_model_fixture():
     # download the tokenizer only
     snapshot_download_w_retry(
-        "NousResearch/Meta-Llama-3-8B", repo_type="model", allow_patterns=["*token*"]
+        "NousResearch/Meta-Llama-3-8B",
+        repo_type="model",
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -261,7 +277,7 @@ def download_llama3_8b_instruct_model_fixture():
     snapshot_download_w_retry(
         "NousResearch/Meta-Llama-3-8B-Instruct",
         repo_type="model",
-        allow_patterns=["*token*"],
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -269,7 +285,19 @@ def download_llama3_8b_instruct_model_fixture():
 def download_phi_35_mini_model_fixture():
     # download the tokenizer only
     snapshot_download_w_retry(
-        "microsoft/Phi-3.5-mini-instruct", repo_type="model", allow_patterns=["*token*"]
+        "microsoft/Phi-3.5-mini-instruct",
+        repo_type="model",
+        allow_patterns=["*token*", "config.json"],
+    )
+
+
+@pytest.fixture(scope="session", autouse=True)
+def download_phi_4_reasoning_model_fixture():
+    # download the tokenizer only
+    snapshot_download_w_retry(
+        "microsoft/Phi-4-reasoning",
+        repo_type="model",
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -279,7 +307,7 @@ def download_phi_3_medium_model_fixture():
     snapshot_download_w_retry(
         "microsoft/Phi-3-medium-128k-instruct",
         repo_type="model",
-        allow_patterns=["*token*"],
+        allow_patterns=["*token*", "config.json"],
     )
 
 
@@ -562,6 +590,8 @@ def test_load_fixtures(
     download_mhenrichsen_alpaca_2k_dataset,
     download_mhenrichsen_alpaca_2k_w_revision_dataset,
     download_mlabonne_finetome_100k_dataset,
+    download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
+    download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset,
     download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
     download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
     download_argilla_dpo_pairs_dataset,
@@ -573,6 +603,7 @@ def test_load_fixtures(
     download_llama3_8b_instruct_model_fixture,
     download_phi_35_mini_model_fixture,
     download_phi_3_medium_model_fixture,
+    download_phi_4_reasoning_model_fixture,
     download_mistral_7b_model_fixture,
     download_gemma_2b_model_fixture,
     download_gemma2_9b_model_fixture,
diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py
index c2d81cbcb..5f1481101 100644
--- a/tests/core/test_builders.py
+++ b/tests/core/test_builders.py
@@ -53,7 +53,6 @@ def fixture_base_cfg():
             # Checkpointing and saving
             "save_steps": 100,
             "output_dir": "./model-out",
-            "save_safetensors": True,
             "save_total_limit": 4,
             "save_only_model": False,
             # Hardware/performance settings
diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py
index 1ba05077c..7da644ec3 100644
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -10,7 +10,7 @@ from axolotl.utils import get_pytorch_version
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 
-from ..utils import check_model_output_exists
+from tests.e2e.utils import check_model_output_exists
 
 
 @pytest.fixture()
@@ -39,7 +39,6 @@ def min_cfg(temp_dir):
         "optimizer": "adamw_torch_fused",
         "output_dir": temp_dir,
         "lr_scheduler": "cosine",
-        "save_safetensors": True,
         "max_steps": 10,
         "bf16": "auto",
         "save_first_step": False,
@@ -92,7 +91,6 @@ class TestCutCrossEntropyIntegration:
                 "optimizer": "adamw_torch_fused",
                 "output_dir": temp_dir,
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "max_steps": 10,
                 "bf16": "auto",
                 "save_first_step": False,
diff --git a/tests/e2e/integrations/test_fp8.py b/tests/e2e/integrations/test_fp8.py
index 7db63cc4d..b708e8806 100644
--- a/tests/e2e/integrations/test_fp8.py
+++ b/tests/e2e/integrations/test_fp8.py
@@ -48,7 +48,6 @@ class FP8IntegrationTestCase:
                 "sample_packing": True,
                 "fp8": True,
                 "torch_compile": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
diff --git a/tests/e2e/integrations/test_hooks.py b/tests/e2e/integrations/test_hooks.py
index b85505caa..e056d1491 100644
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -11,7 +11,7 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault
 
-from ..utils import check_model_output_exists
+from tests.e2e.utils import check_model_output_exists
 
 
 class LogHooksPlugin(BasePlugin):
diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py
index d89044247..9e0e9406b 100644
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -65,7 +65,6 @@ def min_cfg(temp_dir):
         },
         "max_steps": 5,
         "output_dir": temp_dir,
-        "save_safetensors": True,
         "use_tensorboard": True,
         "save_first_step": False,
     }
diff --git a/tests/e2e/integrations/test_liger.py b/tests/e2e/integrations/test_liger.py
index e50483e6c..ba19cf41c 100644
--- a/tests/e2e/integrations/test_liger.py
+++ b/tests/e2e/integrations/test_liger.py
@@ -48,7 +48,6 @@ class LigerIntegrationTestCase:
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "max_steps": 5,
                 "save_first_step": False,
@@ -99,7 +98,6 @@ class LigerIntegrationTestCase:
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "max_steps": 5,
                 "save_first_step": False,
diff --git a/tests/e2e/integrations/test_llm_compressor.py b/tests/e2e/integrations/test_llm_compressor.py
index dceecea9f..5804bca10 100644
--- a/tests/e2e/integrations/test_llm_compressor.py
+++ b/tests/e2e/integrations/test_llm_compressor.py
@@ -57,7 +57,6 @@ class TestLLMCompressorIntegration:
                 "learning_rate": 1e-5,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "max_steps": 5,
                 "llmcompressor": {
diff --git a/tests/e2e/multigpu/solo/test_grpo.py b/tests/e2e/multigpu/solo/test_grpo.py
index 257a388d0..8d0bc3f68 100644
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -220,7 +220,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                 "learning_rate": 0.0001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
@@ -315,7 +314,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                 "learning_rate": 0.0001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
@@ -408,7 +406,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                 "learning_rate": 0.0001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py
index dc369f3de..8d7c01ce8 100644
--- a/tests/e2e/multigpu/test_fp8_fsdp2.py
+++ b/tests/e2e/multigpu/test_fp8_fsdp2.py
@@ -11,7 +11,7 @@ from transformers.testing_utils import get_torch_dist_unique_port
 
 from axolotl.utils.dict import DictDefault
 
-from tests.e2e.utils import most_recent_subdir, require_hopper, require_torch_2_7_0
+from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0, supports_fp8
 
 AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent
 
@@ -49,7 +49,7 @@ class TestFP8FSDP2:
     """Test class for FP8 mixed precision with FSDP2 functionality."""
 
     @require_torch_2_7_0
-    @require_hopper
+    @supports_fp8
     def test_fp8_fsdp2_smoke(self, temp_dir):
         """Smoke test for 2-GPU FP8 + torch.compile + FSDP2 training"""
         cfg = DictDefault(
@@ -94,7 +94,6 @@ class TestFP8FSDP2:
                     "reshard_after_forward": True,
                 },
                 "use_tensorboard": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py
index cb92c80b5..e50316287 100644
--- a/tests/e2e/multigpu/test_fsdp1.py
+++ b/tests/e2e/multigpu/test_fsdp1.py
@@ -244,6 +244,7 @@ class TestFSDP1:
 
         verify_training_success(temp_dir)
 
+    @pytest.mark.skip("broken in transformers v5")
     @pytest.mark.parametrize(
         "adapter_config",
         [
diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py
index 8b7ee710e..19239a3ec 100644
--- a/tests/e2e/multigpu/test_fsdp2.py
+++ b/tests/e2e/multigpu/test_fsdp2.py
@@ -150,6 +150,10 @@ class TestFSDP2:
                 },
                 "use_tensorboard": True,
                 "bf16": True,
+                # explicitly disable LORA kernels, as they may be auto-enabled
+                "lora_mlp_kernel": False,
+                "lora_qkv_kernel": False,
+                "lora_o_kernel": False,
             }
         )
 
diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py
index 51ec68b11..34f98c037 100644
--- a/tests/e2e/multigpu/test_gemma3.py
+++ b/tests/e2e/multigpu/test_gemma3.py
@@ -23,6 +23,7 @@ def download_model():
     snapshot_download("axolotl-mirrors/gemma-3-4b-pt", repo_type="model")
 
 
+@pytest.mark.skip(reason="FIXME")
 class TestMultiGPUGemma3:
     """
     Test case for Gemma3 models using LoRA
@@ -32,6 +33,7 @@ class TestMultiGPUGemma3:
         cfg = DictDefault(
             {
                 "base_model": "axolotl-mirrors/gemma-3-4b-pt",
+                "unfrozen_parameters": ["model.language_model.*", "lm_head"],
                 "sequence_len": 2048,
                 "ddp_find_unused_parameters": True,
                 "sample_packing": True,
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 3383e71d1..1e3757dcf 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -901,7 +901,6 @@ class TestMultiGPULlama:
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                 "use_tensorboard": True,
                 "save_first_step": False,
diff --git a/tests/e2e/patched/test_activation_checkpointing.py b/tests/e2e/patched/test_activation_checkpointing.py
index e8006c162..e53097828 100644
--- a/tests/e2e/patched/test_activation_checkpointing.py
+++ b/tests/e2e/patched/test_activation_checkpointing.py
@@ -66,7 +66,6 @@ class TestActivationCheckpointing:
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "gradient_checkpointing": gradient_checkpointing,
                 "save_first_step": False,
                 "dataset_num_proc": 4,
diff --git a/tests/e2e/patched/test_peft_embeddings.py b/tests/e2e/patched/test_peft_embeddings.py
index 374ef97d8..ae3145de8 100644
--- a/tests/e2e/patched/test_peft_embeddings.py
+++ b/tests/e2e/patched/test_peft_embeddings.py
@@ -46,7 +46,6 @@ class TestLlamaPeftEmbeddings:
                 "flash_attention": True,
                 "sample_packing": False,
                 "bf16": "auto",
-                "save_safetensors": True,
                 "embeddings_skip_upcast": True,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py
index e6240f208..f6c7585c3 100644
--- a/tests/e2e/patched/test_resume.py
+++ b/tests/e2e/patched/test_resume.py
@@ -58,7 +58,6 @@ class TestResumeLlama:
                 "save_total_limit": 5,
                 "max_steps": 15,
                 "use_tensorboard": True,
-                "save_safetensors": True,
                 "save_first_step": False,
                 "include_tkps": True,
             }
diff --git a/tests/e2e/solo/test_relora_llama.py b/tests/e2e/solo/test_relora_llama.py
index be77684ba..091bb90c6 100644
--- a/tests/e2e/solo/test_relora_llama.py
+++ b/tests/e2e/solo/test_relora_llama.py
@@ -63,7 +63,6 @@ class TestReLoraLlama(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_8bit",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "use_tensorboard": True,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/test_activation_offloading.py b/tests/e2e/test_activation_offloading.py
index 9df85ab31..5715e68ba 100644
--- a/tests/e2e/test_activation_offloading.py
+++ b/tests/e2e/test_activation_offloading.py
@@ -57,7 +57,6 @@ class TestActivationOffloading:
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": "auto",
-                "save_safetensors": True,
                 "gradient_checkpointing": True,
                 "activation_offloading": True,
                 "save_first_step": False,
diff --git a/tests/e2e/test_deepseekv3.py b/tests/e2e/test_deepseekv3.py
index e11be8265..0e3aafaf0 100644
--- a/tests/e2e/test_deepseekv3.py
+++ b/tests/e2e/test_deepseekv3.py
@@ -64,7 +64,6 @@ class TestDeepseekV3:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
@@ -113,7 +112,6 @@ class TestDeepseekV3:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/test_diffusion.py b/tests/e2e/test_diffusion.py
index cc3d8070b..89d35adc1 100644
--- a/tests/e2e/test_diffusion.py
+++ b/tests/e2e/test_diffusion.py
@@ -41,7 +41,6 @@ class TestDiffusion:
                 "optimizer": "adamw_torch",
                 "lr_scheduler": "cosine",
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
                 "logging_steps": 1,
                 "eval_steps": 3,
@@ -97,7 +96,6 @@ class TestDiffusion:
                 "optimizer": "adamw_torch",
                 "lr_scheduler": "cosine",
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
                 "logging_steps": 1,
                 "eval_steps": 2,
diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py
index 633e449ef..2b2e8e5e8 100644
--- a/tests/e2e/test_embeddings_lr.py
+++ b/tests/e2e/test_embeddings_lr.py
@@ -44,7 +44,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                 "optimizer": "adamw_torch_fused",
                 "embedding_lr_scale": 0.5,
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
@@ -89,7 +88,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
                 "optimizer": "adamw_torch_fused",
                 "embedding_lr": 0.000005,
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
diff --git a/tests/e2e/test_gemma2.py b/tests/e2e/test_gemma2.py
index 9e9f1a9cc..1719deeee 100644
--- a/tests/e2e/test_gemma2.py
+++ b/tests/e2e/test_gemma2.py
@@ -61,7 +61,6 @@ class TestGemma2:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
             }
         )
@@ -111,7 +110,6 @@ class TestGemma2:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
             }
         )
diff --git a/tests/e2e/test_gemma3_text.py b/tests/e2e/test_gemma3_text.py
index 6cd999242..b723f21e5 100644
--- a/tests/e2e/test_gemma3_text.py
+++ b/tests/e2e/test_gemma3_text.py
@@ -60,7 +60,6 @@ class TestGemma3Text:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
@@ -110,7 +109,6 @@ class TestGemma3Text:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/test_llama.py b/tests/e2e/test_llama.py
index de085cbe2..795b0de37 100644
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -43,7 +43,6 @@ class TestLlama:
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
@@ -90,7 +89,6 @@ class TestLlama:
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
@@ -134,7 +132,6 @@ class TestLlama:
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
@@ -174,7 +171,6 @@ class TestLlama:
                 "sample_packing": False,
                 "batch_flattening": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py
index f0daa9dd6..3aa594fbd 100644
--- a/tests/e2e/test_llama_pretrain.py
+++ b/tests/e2e/test_llama_pretrain.py
@@ -49,7 +49,6 @@ class TestPretrainLlama:
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
diff --git a/tests/e2e/test_llama_vision.py b/tests/e2e/test_llama_vision.py
index 0cc927f76..edfc7a9b3 100644
--- a/tests/e2e/test_llama_vision.py
+++ b/tests/e2e/test_llama_vision.py
@@ -51,7 +51,6 @@ class TestLlamaVision(unittest.TestCase):
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
@@ -97,7 +96,6 @@ class TestLlamaVision(unittest.TestCase):
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/test_mamba.py b/tests/e2e/test_mamba.py
index 67935377d..c45026bf5 100644
--- a/tests/e2e/test_mamba.py
+++ b/tests/e2e/test_mamba.py
@@ -49,7 +49,6 @@ class TestMamba(unittest.TestCase):
                 "max_steps": 20,
                 "save_steps": 10,
                 "eval_steps": None,
-                "save_safetensors": False,
                 "save_first_step": False,
             }
         )
diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py
index dbea92a5b..de6c41fbe 100644
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -224,7 +224,6 @@ class TestCustomOptimizers(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "schedule_free_adamw",
                 "lr_scheduler": "constant",
-                "save_safetensors": True,
                 "max_steps": 10,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py
index 2f8398ef7..5cfbc8553 100644
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -54,7 +54,6 @@ class TestQATLlama:
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
                 "max_steps": 5,
-                "save_safetensors": True,
                 "bf16": True,
                 "save_first_step": False,
             }
diff --git a/tests/e2e/test_save_first_step.py b/tests/e2e/test_save_first_step.py
index ce2d3f145..c717edb6a 100644
--- a/tests/e2e/test_save_first_step.py
+++ b/tests/e2e/test_save_first_step.py
@@ -46,7 +46,6 @@ class TestSaveFirstStepCallback(unittest.TestCase):
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": True,
             }
         )
@@ -86,7 +85,6 @@ class TestSaveFirstStepCallback(unittest.TestCase):
                 "flash_attention": True,
                 "sample_packing": True,
                 "bf16": True,
-                "save_safetensors": True,
                 "save_first_step": False,
             }
         )
diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py
index 5dccf00dd..125eb43eb 100644
--- a/tests/e2e/test_streaming.py
+++ b/tests/e2e/test_streaming.py
@@ -50,7 +50,6 @@ class TestStreamingDatasets:
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "save_safetensors": True,
                 "bf16": "auto",
                 "use_tensorboard": True,
                 "save_first_step": False,
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index a2dd8bc5e..842cbf118 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -167,6 +167,13 @@ def require_hopper(test_case):
     return unittest.skipUnless(is_hopper(), "test requires h100/hopper GPU")(test_case)
 
 
+def supports_fp8(test_case):
+    compute_capability = torch.cuda.get_device_capability()
+    return unittest.skipUnless(
+        compute_capability >= (9, 0), "test requires h100 or newer GPU"
+    )(test_case)
+
+
 def check_tensorboard(
     temp_run_dir: str,
     tag: str,
@@ -193,21 +200,10 @@ def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None:
     """
     helper function to check if a model output file exists after training
 
-    checks based on adapter or not and if safetensors saves are enabled or not
+    checks based on adapter or not (always safetensors in Transformers V5)
     """
 
-    if cfg.save_safetensors:
-        if not cfg.adapter:
-            assert (Path(temp_dir) / "model.safetensors").exists()
-        else:
-            assert (Path(temp_dir) / "adapter_model.safetensors").exists()
+    if not cfg.adapter:
+        assert (Path(temp_dir) / "model.safetensors").exists()
     else:
-        # check for both, b/c in trl, it often defaults to saving safetensors
-        if not cfg.adapter:
-            assert (Path(temp_dir) / "pytorch_model.bin").exists() or (
-                Path(temp_dir) / "model.safetensors"
-            ).exists()
-        else:
-            assert (Path(temp_dir) / "adapter_model.bin").exists() or (
-                Path(temp_dir) / "adapter_model.safetensors"
-            ).exists()
+        assert (Path(temp_dir) / "adapter_model.safetensors").exists()
diff --git a/tests/hf_offline_utils.py b/tests/hf_offline_utils.py
index 221db1c51..b93b83f9f 100644
--- a/tests/hf_offline_utils.py
+++ b/tests/hf_offline_utils.py
@@ -13,6 +13,7 @@ def reload_modules(hf_hub_offline):
 
     import datasets
     import huggingface_hub.constants
+    # from huggingface_hub.utils import reset_sessions
 
     # Reload the constants module first, as others depend on it
     importlib.reload(huggingface_hub.constants)
diff --git a/tests/monkeypatch/test_mistral_tokenizer_patch.py b/tests/monkeypatch/test_mistral_tokenizer_patch.py
deleted file mode 100644
index cb82c0890..000000000
--- a/tests/monkeypatch/test_mistral_tokenizer_patch.py
+++ /dev/null
@@ -1,35 +0,0 @@
-"""Integration tests for MistralCommonTokenizer patches."""
-
-import pytest
-
-
-class TestMistralTokenizerPatchIntegration:
-    """Test MistralCommonTokenizer patch integration."""
-
-    @pytest.mark.integration
-    def test_mistral_tokenizer_image_patch(self):
-        """Test that MistralCommonTokenizer image patch can be applied."""
-        try:
-            from transformers.tokenization_mistral_common import MistralCommonTokenizer
-        except ImportError:
-            pytest.skip("MistralCommonTokenizer not available")
-
-        from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import (
-            apply_mistral_tokenizer_image_patch,
-        )
-
-        # Store original method
-        original_apply_chat_template = MistralCommonTokenizer.apply_chat_template
-
-        # Apply patch
-        apply_mistral_tokenizer_image_patch()
-
-        # Verify patch was applied
-        assert (
-            MistralCommonTokenizer.apply_chat_template != original_apply_chat_template
-        ), "apply_chat_template was not patched"
-
-        # Verify the method is still callable
-        assert callable(MistralCommonTokenizer.apply_chat_template), (
-            "Patched method is not callable"
-        )
diff --git a/tests/prompt_strategies/test_chat_templates_advanced.py b/tests/prompt_strategies/test_chat_templates_advanced.py
index fd39a4305..7d4e6883f 100644
--- a/tests/prompt_strategies/test_chat_templates_advanced.py
+++ b/tests/prompt_strategies/test_chat_templates_advanced.py
@@ -37,7 +37,7 @@ PARAMETRIZE_PARAMS = [
         "gemma2_tokenizer_chat_template_jinja",
         "<end_of_turn>",
     ),
-    ("phi35_tokenizer", "phi_35", None, "<|end|>"),
+    # ("phi35_tokenizer", "phi_35", None, "<|end|>"),  # seems to be broken w transformers v5
     ("phi4_tokenizer", "phi_4", None, "<|im_end|>"),
 ]
 
diff --git a/tests/test_normalize_config.py b/tests/test_normalize_config.py
index f0d3a2d72..ae93a8bd2 100644
--- a/tests/test_normalize_config.py
+++ b/tests/test_normalize_config.py
@@ -127,8 +127,7 @@ class NormalizeConfigTestCase(unittest.TestCase):
         self.assertNotIn("fsdp_auto_wrap_policy", cfg_with_version.fsdp_config)
         self.assertNotIn("fsdp_offload_params", cfg_with_version.fsdp_config)
         self.assertNotIn("fsdp_cpu_ram_efficient_loading", cfg_with_version.fsdp_config)
-        self.assertNotIn("fsdp_version", cfg_with_version.fsdp_config)
-        self.assertNotIn("version", cfg_with_version.fsdp_config)
+        self.assertIn("fsdp_version", cfg_with_version.fsdp_config)
 
         cfg_without_version = self._get_base_cfg() | DictDefault(
             {
@@ -191,9 +190,7 @@ class NormalizeConfigTestCase(unittest.TestCase):
         self.assertEqual(cfg.fsdp_config.activation_checkpointing, True)
 
         # Check original fsdp_ keys are removed
-        self.assertNotIn("fsdp_version", cfg.fsdp_config)
         self.assertNotIn("fsdp_state_dict_type", cfg.fsdp_config)
         self.assertNotIn("fsdp_reshard_after_forward", cfg.fsdp_config)
 
-        # Ensure no duplicate version key
-        self.assertNotIn("version", cfg.fsdp_config)
+        self.assertIn("fsdp_version", cfg.fsdp_config)
diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py
index 8f4306994..899308ba6 100644
--- a/tests/test_perplexity.py
+++ b/tests/test_perplexity.py
@@ -16,7 +16,9 @@ def metric(tokenizer):
 
 @fixture()
 def model():
-    return AutoModelForCausalLM.from_pretrained(MODEL_NAME, trust_remote_code=True)
+    return AutoModelForCausalLM.from_pretrained(
+        MODEL_NAME, trust_remote_code=True, dtype="float32"
+    )
 
 
 @fixture()
diff --git a/tests/test_tokenizers.py b/tests/test_tokenizers.py
index 406462038..114c2bea2 100644
--- a/tests/test_tokenizers.py
+++ b/tests/test_tokenizers.py
@@ -17,6 +17,7 @@ class TestTokenizers:
     test class for the load_tokenizer fn
     """
 
+    @pytest.mark.skip("LlamaTokenizer no longer has a Fast/Slow tokenizer")
     @enable_hf_offline
     def test_default_use_fast(self):
         cfg = DictDefault(
@@ -27,6 +28,7 @@ class TestTokenizers:
         tokenizer = load_tokenizer(cfg)
         assert "Fast" in tokenizer.__class__.__name__
 
+    @pytest.mark.skip("LlamaTokenizer no longer has a Fast/Slow tokenizer")
     @enable_hf_offline
     def test_dont_use_fast(self):
         cfg = DictDefault(
diff --git a/tests/utils/schemas/validation/test_fsdp.py b/tests/utils/schemas/validation/test_fsdp.py
index 9fa327797..ce3f3aa07 100644
--- a/tests/utils/schemas/validation/test_fsdp.py
+++ b/tests/utils/schemas/validation/test_fsdp.py
@@ -13,17 +13,29 @@ class TestFSDPValidation:
     test class for pydantic fsdp validation
     """
 
-    def test_fsdp_version_in_fsdp_config(self, min_base_cfg):
+    def test_fsdp_version_from_fsdp_config(self, min_base_cfg):
         cfg = min_base_cfg | DictDefault(
             fsdp_config={
-                "fsdp_version": 2,
+                "version": 2,
             },
         )
         cfg = validate_config(
             cfg,
         )
         assert cfg.fsdp_version == 2
-        assert cfg.fsdp_config.fsdp_version is None
+
+    def test_fsdp_version_in_fsdp_config(self, min_base_cfg):
+        cfg = min_base_cfg | DictDefault(
+            fsdp_version=2,
+            fsdp_config={
+                "reshard_after_forward": True,
+            },
+        )
+        cfg = validate_config(
+            cfg,
+        )
+        assert cfg.fsdp_version == 2
+        assert cfg.fsdp_config.fsdp_version == 2
 
     def test_fsdp_offload_w_8bit_optim(self, min_base_cfg):
         cfg = min_base_cfg | DictDefault(
@@ -116,9 +128,10 @@ class TestFSDPValidation:
         )
         cfg = validate_config(cfg)
         assert cfg.fsdp_version == 2
-        assert cfg.fsdp_config.fsdp_version is None
-        for keys in cfg.fsdp_config.keys():
-            assert not keys.startswith("fsdp_")
+        assert cfg.fsdp_config.fsdp_version == 2
+        for key in cfg.fsdp_config.keys():
+            if key != "fsdp_version":
+                assert not key.startswith("fsdp_")
         assert cfg.fsdp_config.auto_wrap_policy == "TRANSFORMER_BASED_WRAP"
         assert cfg.fsdp_config.transformer_layer_cls_to_wrap == "LlamaDecoderLayer"
         assert cfg.fsdp_config.reshard_after_forward is True