From e0a2523a3bf875c9e82de0ecea67371e634553f1 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 13 Aug 2025 06:39:39 -0400 Subject: [PATCH 001/115] Workaround to unblock docs build in main (#3055) Co-authored-by: Salman Mohammadi --- TODO.md | 10 ---------- src/axolotl/cli/utils/train.py | 6 ++---- 2 files changed, 2 insertions(+), 14 deletions(-) delete mode 100644 TODO.md diff --git a/TODO.md b/TODO.md deleted file mode 100644 index 2002bbbaf..000000000 --- a/TODO.md +++ /dev/null @@ -1,10 +0,0 @@ -# todo list - -- [] Validation of parameters for combinations that won't work - - - -## things that are known not to work - -- FSDP offload and gradient_checkpointing - https://github.com/pytorch/pytorch/issues/82203 -- adamw_bnb_8bit doesn't play well with FSDP offload diff --git a/src/axolotl/cli/utils/train.py b/src/axolotl/cli/utils/train.py index f1ac857b3..31b0bcf58 100644 --- a/src/axolotl/cli/utils/train.py +++ b/src/axolotl/cli/utils/train.py @@ -67,14 +67,12 @@ def build_command(base_cmd: list[str], options: dict[str, Any]) -> list[str]: def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str, bool]]: """ - Generate list of configuration files to process. + Generate list of configuration files to process. Yields a tuple of the configuration file name and a boolean indicating + whether this is a group of configurations (i.e., a sweep). Args: config: Base configuration file sweep: Sweep configuration file - - Yields: - Tuple of configuration file name and whether this is a group of configurations """ if not sweep: From 09145de8fa0306c3b88212da71564d3e3892ad31 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 13 Aug 2025 19:41:07 -0400 Subject: [PATCH 002/115] upgrade transformers==4.55.1 and bitsandbytes==0.47.0 (#3064) * upgrade transformers==4.55.1 * also upgrade bnb * remove bnb params4bit patch (upstreamed) * use latest causal-conv1d * fix patching ring-flash-attn with now missing imports --------- Co-authored-by: Dan Saunders --- docker/Dockerfile-base | 2 +- requirements.txt | 4 +- src/axolotl/loaders/patch_manager.py | 2 - src/axolotl/monkeypatch/fsdp2_qlora.py | 61 ----------- .../monkeypatch/ring_attn/adapters/batch.py | 11 +- src/axolotl/monkeypatch/ring_attn/patch.py | 11 +- src/axolotl/utils/schemas/validation.py | 21 +++- tests/e2e/patched/test_fsdp2_qlora.py | 102 +----------------- 8 files changed, 38 insertions(+), 176 deletions(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 0434a583f..d1151cedd 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -37,7 +37,7 @@ WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ - python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \ + CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ python3 -m pip cache purge diff --git a/requirements.txt b/requirements.txt index 370bf5a5e..5f7767812 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # START section of dependencies that don't install on Darwin/MacOS -bitsandbytes==0.46.1 
+bitsandbytes==0.47.0 # triton 3.4.0 is not compatible with CCE triton>=3.0.0,<3.4.0 mamba-ssm==1.2.0.post1 @@ -14,7 +14,7 @@ packaging==23.2 huggingface_hub>=0.33.0 peft==0.17.0 -transformers==4.55.0 +transformers==4.55.1 tokenizers>=0.21.1 accelerate==1.10.0 datasets==4.0.0 diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index f1ca3c725..628d897d0 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -285,12 +285,10 @@ class PatchManager: and self.cfg.adapter == "qlora" ): from axolotl.monkeypatch.fsdp2_qlora import ( - apply_bnb_torch_function_patch, apply_init_sharded_param_patch, apply_init_unsharded_param_patch, ) - apply_bnb_torch_function_patch() apply_init_sharded_param_patch() apply_init_unsharded_param_patch() diff --git a/src/axolotl/monkeypatch/fsdp2_qlora.py b/src/axolotl/monkeypatch/fsdp2_qlora.py index a2cb7e472..5a4332fff 100644 --- a/src/axolotl/monkeypatch/fsdp2_qlora.py +++ b/src/axolotl/monkeypatch/fsdp2_qlora.py @@ -9,73 +9,12 @@ Params4bit parameters. import importlib import inspect -import torch -from torch.nn import Parameter - from axolotl.monkeypatch.utils import detab_code from axolotl.utils.logging import get_logger LOG = get_logger(__name__) -def patched_torch_function(cls, func, types, args=(), kwargs=None): - """ - Patched version of Params4bit.__torch_function__ for preserving Params4bit - class identity and attributes. - """ - if kwargs is None: - kwargs = {} - - if func in [torch.chunk, torch.split]: - tensor = args[0] - result = Parameter.__torch_function__(func, types, args, kwargs) - - if isinstance(result, tuple): - return tuple( - cls( - data=chunk, - requires_grad=tensor.requires_grad, - quant_state=tensor.quant_state, - blocksize=tensor.blocksize, - compress_statistics=tensor.compress_statistics, - quant_type=tensor.quant_type, - quant_storage=tensor.quant_storage, - module=tensor.module, - bnb_quantized=tensor.bnb_quantized, - ) - for chunk in result - ) - - return cls( - data=result, - requires_grad=tensor.requires_grad, - quant_state=tensor.quant_state, - blocksize=tensor.blocksize, - compress_statistics=tensor.compress_statistics, - quant_type=tensor.quant_type, - quant_storage=tensor.quant_storage, - module=tensor.module, - bnb_quantized=tensor.bnb_quantized, - ) - - return Parameter.__torch_function__(func, types, args, kwargs) - - -# pylint: disable=protected-access -def apply_bnb_torch_function_patch(): - """ - Patch Params4bit.__torch_function__ using Axolotl-style approach. - - Returns: - True if patching succeeded, False otherwise. 
- """ - from bitsandbytes.nn.modules import Params4bit - - Params4bit.__torch_function__ = classmethod(patched_torch_function) - - LOG.info("Successfully patched Params4bit.__torch_function__") - - # pylint: disable=protected-access def apply_init_sharded_param_patch(): """Apply patch to FSDPParam._init_sharded_param to support Params4bit.""" diff --git a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py index ebed9ebdc..607b4dd71 100644 --- a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py +++ b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py @@ -20,12 +20,15 @@ from ring_flash_attn import ring_flash_attn_func from ring_flash_attn.adapters.hf_adapter import check_params from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or_equal -try: +try: # pylint: disable=duplicate-code from transformers.modeling_flash_attention_utils import _flash_supports_window except ImportError: - from transformers.modeling_flash_attention_utils import ( - _flash_supports_window_size as _flash_supports_window, - ) + try: + from transformers.modeling_flash_attention_utils import ( + _flash_supports_window_size as _flash_supports_window, + ) + except ImportError: + _flash_supports_window = True from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS diff --git a/src/axolotl/monkeypatch/ring_attn/patch.py b/src/axolotl/monkeypatch/ring_attn/patch.py index 934687a16..ea0f9dd02 100644 --- a/src/axolotl/monkeypatch/ring_attn/patch.py +++ b/src/axolotl/monkeypatch/ring_attn/patch.py @@ -15,12 +15,15 @@ import torch import torch.distributed as dist from torch.distributed import DeviceMesh -try: +try: # pylint: disable=duplicate-code from transformers.modeling_flash_attention_utils import _flash_supports_window except ImportError: - from transformers.modeling_flash_attention_utils import ( - _flash_supports_window_size as _flash_supports_window, - ) + try: + from transformers.modeling_flash_attention_utils import ( + _flash_supports_window_size as _flash_supports_window, + ) + except ImportError: + _flash_supports_window = True from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids from axolotl.utils.logging import get_logger diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 72991c947..0d6d05a0e 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -3,6 +3,7 @@ # pylint: disable=too-many-boolean-expressions import json +import sys import tempfile from pathlib import Path @@ -1251,10 +1252,26 @@ class ComplexValidationMixin: try: import transformers.modeling_flash_attention_utils + from transformers.utils import is_flash_attn_greater_or_equal # pylint: disable=protected-access - transformers.modeling_flash_attention_utils._flash_supports_window_size = ( - transformers.modeling_flash_attention_utils._flash_supports_window + transformers.modeling_flash_attention_utils._flash_supports_window = ( + True + ) + setattr( + sys.modules["transformers.modeling_flash_attention_utils"], + "_flash_supports_window", + True, + ) + setattr( + sys.modules["transformers.modeling_flash_attention_utils"], + "_flash_supports_window_size", + True, + ) + setattr( + sys.modules["transformers.modeling_flash_attention_utils"], + "is_flash_attn_greater_or_equal", + is_flash_attn_greater_or_equal, ) import ring_flash_attn # noqa: F401 # pylint:disable=unused-import except ImportError as exception: diff --git 
a/tests/e2e/patched/test_fsdp2_qlora.py b/tests/e2e/patched/test_fsdp2_qlora.py index 9dd053ad8..ca17b81d1 100644 --- a/tests/e2e/patched/test_fsdp2_qlora.py +++ b/tests/e2e/patched/test_fsdp2_qlora.py @@ -1,126 +1,28 @@ -"""Integration tests for FSDP Params4bit patches.""" +"""Integration tests for FSDP2 Params4bit patches.""" -from unittest.mock import Mock, patch - -import bitsandbytes as bnb import pytest -import torch from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam -from axolotl.monkeypatch.fsdp2_qlora import ( - apply_bnb_torch_function_patch, - patched_torch_function, -) - - -@pytest.fixture -def mock_params4bit(): - """Create a mock Params4bit instance with test attributes.""" - mock_instance = Mock() - mock_instance.requires_grad = True - mock_instance.quant_state = "test_state" - mock_instance.blocksize = 128 - mock_instance.compress_statistics = True - mock_instance.quant_type = "fp4" - mock_instance.quant_storage = "test_storage" - mock_instance.module = "test_module" - mock_instance.bnb_quantized = True - return mock_instance - - -class TestBnbTorchFunctionPatch: - """Test the Params4bit.__torch_function__ patch.""" - - def test_apply_patch(self): - """Test that the patch can be applied.""" - with patch("bitsandbytes.nn.modules.Params4bit") as mock_cls: - apply_bnb_torch_function_patch() - assert hasattr(mock_cls, "__torch_function__") - assert isinstance(mock_cls.__torch_function__, classmethod) - - # pylint: disable=redefined-outer-name - def test_torch_chunk_preserves_attributes(self, mock_params4bit): - """Test that torch.chunk preserves Params4bit attributes.""" - mock_cls = Mock() - chunks = (torch.tensor([1, 2]), torch.tensor([3, 4])) - - with patch("torch.nn.Parameter.__torch_function__", return_value=chunks): - result = patched_torch_function( - mock_cls, - torch.chunk, - (type(mock_params4bit),), - args=(mock_params4bit, 2), - ) - - assert isinstance(result, tuple) - assert len(result) == 2 - - # Check that Params4bit constructor was called with preserved attributes - assert mock_cls.call_count == 2 - for call in mock_cls.call_args_list: - kwargs = call[1] - assert kwargs["requires_grad"] == mock_params4bit.requires_grad - assert kwargs["quant_state"] == mock_params4bit.quant_state - assert kwargs["blocksize"] == mock_params4bit.blocksize - - # pylint: disable=redefined-outer-name - def test_other_functions_fallback(self, mock_params4bit): - """Test that non-chunk/split functions use Parameter fallback.""" - mock_cls = Mock() - fallback_result = torch.tensor([5, 6, 7]) - - with patch( - "torch.nn.Parameter.__torch_function__", return_value=fallback_result - ) as mock_fallback: - result = patched_torch_function( - mock_cls, torch.add, (type(mock_params4bit),), args=(mock_params4bit, 1) - ) - - # Should call Parameter.__torch_function__ and return its result - mock_fallback.assert_called_once() - assert result is fallback_result - mock_cls.assert_not_called() - class TestFSDPPatchIntegration: """Test FSDP patch integration.""" @pytest.mark.integration - def test_all_patches_together(self): + def test_fsdp2_init_patches(self): """Test that all patches can be applied together.""" from axolotl.monkeypatch.fsdp2_qlora import ( apply_init_sharded_param_patch, apply_init_unsharded_param_patch, ) - # Store original methods before patching - original_torch_function = getattr( - bnb.nn.modules.Params4bit, "__torch_function__", None - ) - # pylint: disable=protected-access original_init_sharded = FSDPParam._init_sharded_param original_init_unsharded = 
FSDPParam.init_unsharded_param # Apply patches - apply_bnb_torch_function_patch() apply_init_sharded_param_patch() apply_init_unsharded_param_patch() - # Verify patches were applied - current_torch_function = getattr( - bnb.nn.modules.Params4bit, "__torch_function__", None - ) - if original_torch_function is not None: - assert ( - current_torch_function != original_torch_function - ), "Params4bit.__torch_function__ was not patched" - else: - assert ( - current_torch_function is not None - ), "Params4bit.__torch_function__ was not added" - - # Check that FSDP methods were patched assert ( # pylint: disable=protected-access FSDPParam._init_sharded_param From 506e3a39074a76df223af211af8e503343ea6b3e Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 14 Aug 2025 08:21:50 +0700 Subject: [PATCH 003/115] fix: fsdp_config validation being None (#3061) [skip ci] * fix: fsdp_config validation being None * fix: handling --------- Co-authored-by: salman --- src/axolotl/utils/schemas/validation.py | 26 ++++++++++++------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 0d6d05a0e..217244b01 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -370,10 +370,10 @@ class TrainingValidationMixin: "see speed improvements. Please consider setting `torch_compile: " "true` in your config." ) + fsdp_config = data.get("fsdp_config") or {} if data.get("fp8") and ( - data.get("fsdp_config", {}).get("activation_checkpointing", False) is True - or data.get("fsdp_config", {}).get("fsdp_activation_checkpointing", False) - is True + fsdp_config.get("activation_checkpointing", False) is True + or fsdp_config.get("fsdp_activation_checkpointing", False) is True ): LOG.warning( "FP8 + FSDP2 + activation checkpointing may be slower than BF16 " @@ -818,13 +818,13 @@ class OptimizationValidationMixin: @model_validator(mode="before") @classmethod def check_fsdp_version_in_fsdp_config(cls, data): - if data.get("fsdp_config"): - if data.get("fsdp_config", {}).get("fsdp_version"): - LOG.warning( - "Configuring `fsdp_version` in `fsdp_config` is deprecated. " - "Please configure `fsdp_version` as a top-level field." - ) - data["fsdp_version"] = data.get("fsdp_config").pop("fsdp_version") + fsdp_config = data.get("fsdp_config") or {} + if fsdp_config and fsdp_config.get("fsdp_version"): + LOG.warning( + "Configuring `fsdp_version` in `fsdp_config` is deprecated. " + "Please configure `fsdp_version` as a top-level field." + ) + data["fsdp_version"] = fsdp_config.pop("fsdp_version") return data @model_validator(mode="before") @@ -1152,10 +1152,8 @@ class ModelCompatibilityValidationMixin: @classmethod def check_gpt_oss_fsdp_loading(cls, data): if data.get("model_quantization_config", "") == "Mxfp4Config": - if ( - data.get("fsdp_config", {}).get("cpu_ram_efficient_loading", False) - is True - ): + fsdp_config = data.get("fsdp_config") or {} + if fsdp_config.get("cpu_ram_efficient_loading", False) is True: raise ValueError( "FSDP cpu_ram_efficient_loading is not supported for Mxfp4Config model quantization." 
) From 48b7ae16778674ca01223fcfff362d5ad8798b48 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 13 Aug 2025 21:23:05 -0400 Subject: [PATCH 004/115] use updated patch releasE (#3066) --- docker/Dockerfile-base | 2 +- requirements.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index d1151cedd..87918cc41 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -37,7 +37,7 @@ WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ - CAUSAL_CONV1D_SKIP_CUDA_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \ + CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \ python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ python3 -m pip cache purge diff --git a/requirements.txt b/requirements.txt index 5f7767812..c2552002f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -14,7 +14,7 @@ packaging==23.2 huggingface_hub>=0.33.0 peft==0.17.0 -transformers==4.55.1 +transformers==4.55.2 tokenizers>=0.21.1 accelerate==1.10.0 datasets==4.0.0 From d1de6f5f3d5966068099fcbb904446b21bfd7b29 Mon Sep 17 00:00:00 2001 From: salman Date: Thu, 14 Aug 2025 03:57:51 +0100 Subject: [PATCH 005/115] Add option to skip slow tests in PRs (#3060) [skip ci] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * testing e2e skip [skip-e2e] * stop running multigpu [skip-e2e] * should work now [skip-e2e] * reverting [skip-e2e] * testing [skip-e2e] * debug [skip-e2e] * debug [skip-e2e] * round 2[skip-e2e] * removing debug [skip-e2e] * support skipping whole PR [skip-e2e] * use script for e2e skip [skip-e2e] * contributing [skip-e2e] * contributing [skip-e2e] --------- Co-authored-by: Wing Lian --- .github/CONTRIBUTING.md | 7 +++++++ .github/workflows/tests.yml | 42 +++++++++++++++++++++++++++++++++---- 2 files changed, 45 insertions(+), 4 deletions(-) diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 8f67908e8..fcfd96891 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -57,6 +57,13 @@ We welcome ideas for improvements and new features. To suggest an enhancement, o 5. Push your branch to your fork on GitHub. 6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues. +#### Skipping CI Checks + +You can skip certain CI checks by including specific keywords in your commit messages: + +- `[skip ci]` or `skip ci` - Skips all CI checks for that commit +- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR. 
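As a quick illustration of the tokens described above (these commit messages are hypothetical examples, not from this repository):

```bash
# Skip only the GPU end-to-end suite; pre-commit and unit tests still run.
git commit -m "docs: clarify merge instructions [skip-e2e]"

# Skip all CI checks for this commit.
git commit -m "chore: bump changelog [skip ci]"
```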
+ ## Style Guidelines ### Code Style diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 912b3f1d6..fe63aa313 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -188,13 +188,44 @@ jobs: run: | find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \; + gate-skip-e2e: + needs: [pre-commit, pytest, pytest-sdist] + runs-on: ubuntu-latest + outputs: + skip: ${{ steps.compute.outputs.skip }} + steps: + - uses: actions/github-script@v7 + id: compute + with: + script: | + const token = /\[skip-e2e\]/i; + let msg = ''; + if (context.eventName === 'push') { + msg = context.payload.head_commit?.message || ''; + } else if (context.eventName === 'pull_request') { + const { owner, repo } = context.repo; + const prNumber = context.payload.pull_request.number; + const commits = await github.paginate( + github.rest.pulls.listCommits, + { owner, repo, pull_number: prNumber, per_page: 100 } + ); + msg = commits.at(-1)?.commit?.message || ''; + } + const title = context.payload.pull_request?.title || ''; + const body = context.payload.pull_request?.body || ''; + const skip = token.test(msg) || token.test(title) || token.test(body); + core.setOutput('skip', String(skip)); + docker-e2e-tests-1st: # Run this job first as a gate for running the remainder of the test matrix - if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }} + if: > + github.repository_owner == 'axolotl-ai-cloud' && + (github.event_name != 'pull_request' || !github.event.pull_request.draft) && + needs.gate-skip-e2e.outputs.skip != 'true' # this job needs to be run on self-hosted GPU runners... runs-on: [self-hosted, modal] timeout-minutes: 120 - needs: [pre-commit, pytest, pytest-sdist] + needs: [pre-commit, pytest, pytest-sdist, gate-skip-e2e] strategy: fail-fast: false @@ -240,13 +271,16 @@ jobs: modal run cicd.e2e_tests docker-e2e-tests: - if: ${{ github.repository_owner == 'axolotl-ai-cloud' && !github.event.pull_request.draft }} + if: > + github.repository_owner == 'axolotl-ai-cloud' && + (github.event_name != 'pull_request' || !github.event.pull_request.draft) && + needs.gate-skip-e2e.outputs.skip != 'true' # this job needs to be run on self-hosted GPU runners... 
runs-on: [self-hosted, modal] timeout-minutes: 120 # Only run the remainder of the matrix if the first e2e check passed; # this is to save on wasted compute costs for known failures that get caught in the first run - needs: [pre-commit, pytest, docker-e2e-tests-1st] + needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st] strategy: fail-fast: false From 130ef7c51acdf5fcb44cf16a91defcd4d5976966 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 15 Aug 2025 10:52:57 -0400 Subject: [PATCH 006/115] Various fixes for VLMs (#3063) * fix to not use batch feature indexing * more vlm fixes * use AutoModelForImageTextToText * add example yaml and need num2words for chat template * improve handling of adding image tokens to conversation * add lfm2-vl support * update the lfm readme * fix markdown and add rtol for loss checks * feat: add smolvlm2 processing strat * fix: check for causal-conv1d in lfm models * feat: add docs for lfm2 * feat: add new models and tips to docs * feat: add smolvlm2 docs and remove extra dep * chore: update docs * feat: add video instructions * chore: cleanup * chore: comments * fix: typo * feat: add usage stats * chore: refactor --------- Co-authored-by: NanoCode012 --- docs/multimodal.qmd | 49 ++++++++- examples/LiquidAI/README.md | 58 +++++++++++ .../{lfm2 => LiquidAI}/lfm2-350m-fft.yaml | 1 - examples/LiquidAI/lfm2-vl-lora.yaml | 58 +++++++++++ examples/lfm2/README.md | 7 -- examples/smolvlm2/README.md | 49 +++++++++ examples/smolvlm2/smolvlm2-2B-lora.yaml | 56 +++++++++++ src/axolotl/loaders/constants.py | 25 ++--- src/axolotl/loaders/model.py | 14 +++ src/axolotl/processing_strategies.py | 99 +++++++++++++------ .../prompt_strategies/chat_template.py | 23 +++-- src/axolotl/utils/collators/mm_chat.py | 66 +++---------- tests/e2e/utils.py | 7 +- 13 files changed, 391 insertions(+), 121 deletions(-) create mode 100644 examples/LiquidAI/README.md rename examples/{lfm2 => LiquidAI}/lfm2-350m-fft.yaml (96%) create mode 100644 examples/LiquidAI/lfm2-vl-lora.yaml delete mode 100644 examples/lfm2/README.md create mode 100644 examples/smolvlm2/README.md create mode 100644 examples/smolvlm2/smolvlm2-2B-lora.yaml diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd index dbb365f73..d839ce211 100644 --- a/docs/multimodal.qmd +++ b/docs/multimodal.qmd @@ -13,10 +13,13 @@ format: - [Pixtral](#sec-pixtral) - [Llava-1.5](#sec-llava-15) - [Mistral-Small-3.1](#sec-mistral-small-31) +- [Voxtral](#sec-voxtral) - [Gemma-3](#sec-gemma-3) - [Gemma-3n](#sec-gemma-3n) - [Qwen2-VL](#sec-qwen2-vl) - [Qwen2.5-VL](#sec-qwen25-vl) +- [SmolVLM2](#sec-smolvlm2) +- [LFM2-VL](#sec-lfm2-vl) ## Usage @@ -31,7 +34,7 @@ skip_prepare_dataset: true remove_unused_columns: false # leave columns in place as they are needed to handle image embeddings during training sample_packing: false # not yet supported with multimodal -chat_template: # see in next section +chat_template: # see in next section if specified # example dataset datasets: @@ -97,6 +100,16 @@ base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 chat_template: mistral_v7_tekken ``` +### Voxtral {#sec-voxtral} + +::: {.callout-tip} +Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'` +::: + +```yaml +base_model: mistralai/Voxtral-Mini-3B-2507 +``` + ### Gemma-3 {#sec-gemma-3} ::: {.callout-tip} @@ -143,6 +156,26 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct chat_template: qwen2_vl # same as qwen2-vl ``` +### SmolVLM2 {#sec-smolvlm2} + +::: {.callout-tip} +Please make sure to install 
`num2words` via `pip3 install num2words==0.5.14` +::: + +```yaml +base_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct +``` + +### LFM2-VL {#sec-lfm2-vl} + +::: {.callout-warning} +Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d` +::: + +```yaml +base_model: LiquidAI/LFM2-VL-450M +``` + ## Dataset Format For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format. @@ -181,6 +214,20 @@ You may need to install `librosa` via `pip3 install librosa==0.11.0`. ::: +### Video + +::: {.callout-warning} + +This is not well tested at the moment. We welcome contributors! + +::: + +For video loading, you can use the following keys within `content` alongside `"type": "video"`: + +- `"path": "/path/to/video.mp4"` +- `"url": "https://example.com/video.mp4"` +- `"video": np.ndarray | list[PIL.Image.Image] | torch.Tensor` (or list of the aforementioned) + ### Example Here is an example of a multi-modal dataset: diff --git a/examples/LiquidAI/README.md b/examples/LiquidAI/README.md new file mode 100644 index 000000000..96fc74a92 --- /dev/null +++ b/examples/LiquidAI/README.md @@ -0,0 +1,58 @@ +# Finetune Liquid Foundation Models 2 (LFM2) with Axolotl + +[Liquid Foundation Models 2 (LFM2)](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) are a family of small, open-weight models from [Liquid AI](https://www.liquid.ai/) focused on quality, speed, and memory efficiency. Liquid AI released text-only [LFM2](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) and text+vision [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) models. + +LFM2 features a new hybrid Liquid architecture with multiplicative gates, short-range convolutions, and grouped query attention, enabling fast training and inference. + +This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl. + +## Getting Started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + ```bash + # Ensure you have a compatible version of Pytorch installed + pip3 install packaging setuptools wheel ninja + pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' + ``` + +2. Run one of the finetuning examples below. + + **LFM2** + ```bash + # FFT SFT (1x48GB @ 25GiB) + axolotl train examples/LiquidAI/lfm2-350m-fft.yaml + ``` + + **LFM2-VL** + ```bash + # LoRA SFT (1x48GB @ 2.7GiB) + axolotl train examples/LiquidAI/lfm2-vl-lora.yaml + ``` + +### TIPS + +- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it: + ```bash + pip uninstall -y causal-conv1d + ``` + +- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html). +- **Dataset Formats**: + - For LFM2 models, the dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). + - For LFM2-VL models, Axolotl follows the multi-content Messages format. See our [Multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format) for details. 
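To make the multi-content Messages format above concrete, a minimal LFM2-VL training sample might look like the following sketch (the image path and text are placeholders; see the Multimodal docs linked above for the authoritative schema):

```json
{
  "messages": [
    {
      "role": "user",
      "content": [
        {"type": "image", "path": "/data/images/example.jpg"},
        {"type": "text", "text": "What is shown in this image?"}
      ]
    },
    {
      "role": "assistant",
      "content": [{"type": "text", "text": "A short description of the image."}]
    }
  ]
}
```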
+ +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) + +## Related Resources + +- [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) +- [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/lfm2/lfm2-350m-fft.yaml b/examples/LiquidAI/lfm2-350m-fft.yaml similarity index 96% rename from examples/lfm2/lfm2-350m-fft.yaml rename to examples/LiquidAI/lfm2-350m-fft.yaml index 16a0a028e..d19815008 100644 --- a/examples/lfm2/lfm2-350m-fft.yaml +++ b/examples/LiquidAI/lfm2-350m-fft.yaml @@ -2,7 +2,6 @@ base_model: LiquidAI/LFM2-350M chunked_cross_entropy: true -chat_template: tokenizer_default eot_tokens: - "<|im_end|>" datasets: diff --git a/examples/LiquidAI/lfm2-vl-lora.yaml b/examples/LiquidAI/lfm2-vl-lora.yaml new file mode 100644 index 000000000..7fee17f92 --- /dev/null +++ b/examples/LiquidAI/lfm2-vl-lora.yaml @@ -0,0 +1,58 @@ +base_model: LiquidAI/LFM2-VL-450M +trust_remote_code: true +model_type: AutoModelForImageTextToText +processor_type: AutoProcessor + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +datasets: + - path: HuggingFaceH4/llava-instruct-mix-vsft + type: chat_template + split: train[:1%] + +dataset_prepared_path: last_run_prepared +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: lora +lora_model_dir: + +sequence_len: 8192 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +logging_steps: 1 +flash_attention: true +eager_attention: + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/lfm2/README.md b/examples/lfm2/README.md deleted file mode 100644 index eb9ca911f..000000000 --- a/examples/lfm2/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Liquid Foundation Models 2 - -LFM2 support in transformers exists in the main branch, but is not yet included in the transformers release. - -```bash -pip install --upgrade --no-deps --force-reinstall git+https://github.com/huggingface/transformers.git -``` diff --git a/examples/smolvlm2/README.md b/examples/smolvlm2/README.md new file mode 100644 index 000000000..9c0ae4836 --- /dev/null +++ b/examples/smolvlm2/README.md @@ -0,0 +1,49 @@ +# Finetune SmolVLM2 with Axolotl + +[SmolVLM2](https://huggingface.co/collections/HuggingFaceTB/smolvlm2-smallest-video-lm-ever-67ab6b5e84bf8aaa60cb17c7) are a family of lightweight, open-source multimodal models from HuggingFace designed to analyze and understand video, image, and text content. 
+ +These models are built for efficiency, making them well-suited for on-device applications where computational resources are limited. Models are available in multiple sizes, including 2.2B, 500M, and 256M. + +This guide shows how to fine-tune SmolVLM2 models with Axolotl. + +## Getting Started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). + + Here is an example of how to install from pip: + ```bash + # Ensure you have a compatible version of Pytorch installed + pip3 install packaging setuptools wheel ninja + pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' + ``` + +2. Install an extra dependency: + + ```bash + pip3 install num2words==0.5.14 + ``` + +3. Run the finetuning example: + + ```bash + # LoRA SFT (1x48GB @ 6.8GiB) + axolotl train examples/smolvlm2/smolvlm2-2B-lora.yaml + ``` + +## TIPS + +- **Dataset Format**: For video finetuning, your dataset must be compatible with the multi-content Messages format. For more details, see our documentation on [Multimodal Formats](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). +- **Dataset Loading**: Read more on how to prepare and load your own datasets in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html). + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) + +## Related Resources + +- [SmolVLM2 Blog](https://huggingface.co/blog/smolvlm2) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/smolvlm2/smolvlm2-2B-lora.yaml b/examples/smolvlm2/smolvlm2-2B-lora.yaml new file mode 100644 index 000000000..1aeff408d --- /dev/null +++ b/examples/smolvlm2/smolvlm2-2B-lora.yaml @@ -0,0 +1,56 @@ +base_model: HuggingFaceTB/SmolVLM2-2.2B-Instruct +trust_remote_code: true +processor_type: AutoProcessor + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +datasets: + - path: HuggingFaceH4/llava-instruct-mix-vsft + type: chat_template + split: train[:1%] +dataset_prepared_path: last_run_prepared +val_set_size: 0.0 +output_dir: ./outputs/out + +adapter: lora +lora_model_dir: + +sequence_len: 8192 +pad_to_sequence_len: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.text_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +logging_steps: 1 +flash_attention: true +eager_attention: + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/src/axolotl/loaders/constants.py b/src/axolotl/loaders/constants.py index 3fabf9d94..4939cb28d 100644 --- a/src/axolotl/loaders/constants.py +++ b/src/axolotl/loaders/constants.py @@ -1,26 +1,13 @@ """Shared constants for axolotl.loaders module""" -from transformers import ( - Gemma3ForConditionalGeneration, - 
Gemma3nForConditionalGeneration, - Llama4ForConditionalGeneration, - LlavaForConditionalGeneration, - Mistral3ForConditionalGeneration, - MllamaForConditionalGeneration, - Qwen2_5_VLForConditionalGeneration, - Qwen2VLForConditionalGeneration, +from transformers import AutoModelForImageTextToText +from transformers.models.auto.modeling_auto import ( + MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES, ) -MULTIMODAL_AUTO_MODEL_MAPPING = { - "mllama": MllamaForConditionalGeneration, - "llama4": Llama4ForConditionalGeneration, - "llava": LlavaForConditionalGeneration, - "qwen2_vl": Qwen2VLForConditionalGeneration, - "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration, - "mistral3": Mistral3ForConditionalGeneration, - "gemma3": Gemma3ForConditionalGeneration, - "gemma3n": Gemma3nForConditionalGeneration, -} +MULTIMODAL_AUTO_MODEL_MAPPING = dict(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES) + +MULTIMODAL_AUTO_MODEL_MAPPING["lfm2-vl"] = AutoModelForImageTextToText try: from transformers import VoxtralForConditionalGeneration diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py index 6bf1f149b..53ae428a2 100644 --- a/src/axolotl/loaders/model.py +++ b/src/axolotl/loaders/model.py @@ -25,6 +25,7 @@ from peft import ( from torch.distributed import DeviceMesh from transformers import ( AutoModelForCausalLM, + AutoModelForImageTextToText, AutoModelForVision2Seq, AwqConfig, BitsAndBytesConfig, @@ -212,6 +213,7 @@ class ModelLoader: self.model_kwargs["use_kernels"] = self.cfg.use_kernels self._set_quantization_config() self._set_attention_config() + self._check_model_requirements() def _apply_post_model_load_setup(self): """Configure the model after it has been loaded.""" @@ -432,6 +434,8 @@ class ModelLoader: self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get( self.model_config.model_type, AutoModelForVision2Seq ) + if isinstance(self.auto_model_loader, str): + self.auto_model_loader = AutoModelForImageTextToText def _set_device_map_config(self): """Setup `device_map` according to config""" @@ -628,6 +632,16 @@ class ModelLoader: if self.cfg.low_cpu_mem_usage: self.model_kwargs["low_cpu_mem_usage"] = True + def _check_model_requirements(self): + if self.cfg.model_config_type in ["lfm2-vl", "lfm2"]: + from transformers.utils.import_utils import is_causal_conv1d_available + + if is_causal_conv1d_available(): + raise ImportError( + "The 'causal-conv1d' package is installed but causes compatibility issues with LFM2 models. 
" + "Please uninstall it by running: `pip uninstall -y causal-conv1d`" + ) + def _configure_zero3_memory_efficient_loading( self, ) -> HfTrainerDeepSpeedConfig | None: diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 4cc5e85a1..31597d5a6 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -6,7 +6,7 @@ from typing import Optional from PIL import Image, ImageOps from PIL.Image import Resampling from torch import Tensor, zeros_like -from transformers import ProcessorMixin, VoxtralProcessor +from transformers import ProcessorMixin, SmolVLMProcessor, VoxtralProcessor from transformers.image_utils import load_image from axolotl.utils.dict import remove_none_values @@ -138,7 +138,7 @@ class ProcessingStrategy: image_key = key break - # if the image key exists, add the image to the first message + # if the image key exists, add the image to the first user message if image_key is not None and processed_example[image_key] is not None: # TODO: check if it's normal to be single image only for common datasets # From observation, it's usually a list of single image but some datasets may have several columns for images @@ -179,26 +179,34 @@ class ProcessingStrategy: # Look for any image type in the first message # some dataset have an {type: "image"} in the first message + msg_ind_to_add = None ind_to_add = None + first_user_idx = None - for i, content in enumerate( - processed_example["messages"][0]["content"] - ): - # Usually datasets created with image columns, don't have it in the messages itself - if content["type"] == "image" and all( - k not in content for k in ["image", "url", "path", "base64"] + for msg_idx, msg_content in enumerate(processed_example["messages"]): + if first_user_idx is None and msg_content["role"] == "user": + first_user_idx = msg_idx + for i, content in enumerate( + processed_example["messages"][msg_idx]["content"] ): - ind_to_add = i - break + # Usually datasets created with image columns, don't have it in the messages itself + if content["type"] == "image" and all( + k not in content for k in ["image", "url", "path", "base64"] + ): + msg_ind_to_add = msg_idx + ind_to_add = i + break # If an image type is found, add the image to that index - if ind_to_add is not None: - processed_example["messages"][0]["content"][ind_to_add][ - "image" - ] = image_value + if ind_to_add is not None and msg_ind_to_add is not None: + processed_example["messages"][msg_ind_to_add]["content"][ + ind_to_add + ]["image"] = image_value else: - # if no image type is found, add it to end of the first message - processed_example["messages"][0]["content"].append( + # if no image type is found, add it to end of the first user message + if first_user_idx is None: + first_user_idx = 0 + processed_example["messages"][first_user_idx]["content"].append( { "type": "image", "image": image_value, @@ -395,6 +403,24 @@ class VoxtralProcessingStrategy(ProcessingStrategy): return labels +class SmolVLM2ProcessingStrategy(ProcessingStrategy): + """Processing Strategy class for SmolVLM2""" + + def __init__( + self, + processor: ProcessorMixin, + chat_template: Optional[str] = None, + image_size: int | tuple[int, int] | None = None, + image_resize_algorithm: Resampling | None = None, + ): + super().__init__(processor, chat_template, image_size, image_resize_algorithm) + self.image_token = "" # nosec + + self.image_token_id = processor.tokenizer.additional_special_tokens_ids[ + 
processor.tokenizer.additional_special_tokens.index(self.image_token) + ] + + def get_processing_strategy( processor: ProcessorMixin, chat_template, @@ -402,32 +428,43 @@ def get_processing_strategy( image_size: int | tuple[int, int] | None = None, image_resize_algorithm: Resampling | None = None, ): + processing_kwargs = { + "processor": processor, + "chat_template": chat_template, + "image_size": image_size, + "image_resize_algorithm": image_resize_algorithm, + } + + if chat_template_type in [None, "tokenizer_default"] and hasattr( + processor.tokenizer, "chat_template" + ): + processing_kwargs["chat_template"] = processor.tokenizer.chat_template + if chat_template_type == "qwen2_vl": return Qwen2VLProcessingStrategy( - processor, chat_template, image_size, image_resize_algorithm + **processing_kwargs, ) if chat_template_type == "gemma3": return Gemma3ProcessingStrategy( - processor, chat_template, image_size, image_resize_algorithm + **processing_kwargs, ) if chat_template_type == "gemma3n": return Gemma3nProcessingStrategy( - processor, chat_template, image_size, image_resize_algorithm - ) - if chat_template_type in [ - "llama3_2_vision", - "llama4", - "llava", - "mistral_v7_tekken", - "pixtral", - ]: - return ProcessingStrategy( - processor, chat_template, image_size, image_resize_algorithm + **processing_kwargs, ) if isinstance(processor, VoxtralProcessor): return VoxtralProcessingStrategy( - processor, chat_template, image_size, image_resize_algorithm + **processing_kwargs, ) - raise ValueError(f"Unsupported chat template type: {chat_template_type}") + if isinstance(processor, SmolVLMProcessor): + return SmolVLM2ProcessingStrategy( + **processing_kwargs, + ) + + # llama3_2_vision, llama4, llava + # mistral_v7_tekken, pixtral, lfm2vl + return ProcessingStrategy( + **processing_kwargs, + ) diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py index 8241dd385..f927b7fcb 100644 --- a/src/axolotl/prompt_strategies/chat_template.py +++ b/src/axolotl/prompt_strategies/chat_template.py @@ -129,13 +129,21 @@ class ChatTemplatePrompter(Prompter): images=images, return_tensors="pt", ) + if hasattr(batch, "to_dict"): + batch = batch.to_dict() + else: + batch = dict(batch) + # workaround since processor works in batches instead of single examples + out = {} for k, val in batch.items(): - if k in ["pixel_values"]: - batch[k] = val.tolist() + if hasattr(val, "tolist"): + out[k] = ( + val.tolist() if k == "pixel_values" else val.squeeze(0).tolist() + ) else: - batch[k] = val.squeeze().tolist() - return batch + out[k] = val + return out return self.tokenizer.apply_chat_template( conversation, @@ -433,10 +441,13 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): tokenized_prompt["attention_mask"] = [1] * len(input_ids) else: input_ids = tokenized_res["input_ids"] - tokenized_prompt = tokenized_res + tokenized_prompt = dict(tokenized_res) if not self.train_on_inputs: - user_prompt_len = len(prompt_ids) + if isinstance(prompt_ids, dict): + user_prompt_len = len(prompt_ids["input_ids"]) + else: + user_prompt_len = len(prompt_ids) labels = [-100] * user_prompt_len + input_ids[user_prompt_len:] else: labels = input_ids diff --git a/src/axolotl/utils/collators/mm_chat.py b/src/axolotl/utils/collators/mm_chat.py index 0075d4830..542918527 100644 --- a/src/axolotl/utils/collators/mm_chat.py +++ b/src/axolotl/utils/collators/mm_chat.py @@ -5,7 +5,6 @@ Collators for multi-modal chat messages and packing from dataclasses import dataclass from typing 
import Any, Optional, Union -import torch from torch import Tensor from transformers import PreTrainedTokenizerBase from transformers.data.data_collator import DataCollatorMixin @@ -42,62 +41,19 @@ class MultiModalChatDataCollator(DataCollatorMixin): examples = self.processing_strategy(examples) # Initialize batch - batch: dict[str, Any] = {} + messages = [ex["messages"] for ex in examples] - # Process each example - for example in examples: - # Apply chat template to process the example - # This method requires transformers>=4.49.0 - result = self.processing_strategy.processor.apply_chat_template( - example["messages"], - add_generation_prompt=False, - tokenize=True, - return_tensors="pt", - padding=True, - return_dict=True, - chat_template=self.processing_strategy.chat_template, - ) - - # TODO: Check if need handling for len(input_ids) > sequence_len - - # Add the processed tensors to our batch - for key in result.keys(): - if key not in batch: - batch[key] = [] - - batch[key].append(result[key].squeeze(0)) - - # Pad sequences to the same length - input_ids = torch.nn.utils.rnn.pad_sequence( - batch["input_ids"], - batch_first=True, - padding_value=self.tokenizer.pad_token_id, + batch = self.processing_strategy.processor.apply_chat_template( + messages, + add_generation_prompt=False, + tokenize=True, + return_tensors="pt", + padding=True, + return_dict=True, + chat_template=self.processing_strategy.chat_template, ) - attention_mask = torch.nn.utils.rnn.pad_sequence( - batch["attention_mask"], batch_first=True, padding_value=0 - ) - - # Create the final batch - final_batch = { - "input_ids": input_ids, - "attention_mask": attention_mask, - } - - for key, val in batch.items(): - if key in ["input_ids", "attention_mask"]: - continue - - if key in ["token_type_ids", "cross_attention_mask"]: - final_batch[key] = torch.nn.utils.rnn.pad_sequence( - val, batch_first=True, padding_value=0 - ) - else: - final_batch[key] = torch.stack(val) - # Process the labels - final_batch["labels"] = self.processing_strategy.process_labels( - final_batch["input_ids"] - ) + batch["labels"] = self.processing_strategy.process_labels(batch["input_ids"]) - return final_batch + return batch diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index 5931fe148..939ed5c1c 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -147,7 +147,11 @@ def require_hopper(test_case): def check_tensorboard( - temp_run_dir: str, tag: str, lt_val: float, assertion_err: str + temp_run_dir: str, + tag: str, + lt_val: float, + assertion_err: str, + rtol: float = 0.02, ) -> None: """ helper function to parse and check tensorboard logs @@ -157,6 +161,7 @@ def check_tensorboard( reader = SummaryReader(event_file) df = reader.scalars # pylint: disable=invalid-name df = df[(df.tag == tag)] # pylint: disable=invalid-name + lt_val = (1 + rtol) * lt_val if "%s" in assertion_err: assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1] else: From ecbe8b2b61bd24c8a6de662ad0ec3ca733feb4b1 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 15 Aug 2025 21:25:01 -0400 Subject: [PATCH 007/115] [GPT-OSS] improve FSDP shard merging and documentation for GPT-OSS (#3073) * improve fsdp shard merging * improve logging * update information on merging and inferencing GPT-OSS * cleanup readme * automate cleanup of FSDP prefix * import GRPO only if necessary * only modify config.json on rank0 * merge final checkpoint at end of training * prevent circular import * Fix saving for sharded state dict * devx, move merged to output dir * move 
import back to top * Fix stuck merge * fix conditionals from pr feedback and add test --- examples/gpt-oss/README.md | 33 ++++++++- .../gpt-oss-120b-fft-fsdp2-offload.yaml | 1 + src/axolotl/cli/merge_sharded_fsdp_weights.py | 27 ++++++- src/axolotl/core/trainers/__init__.py | 1 - src/axolotl/train.py | 73 +++++++++++-------- src/axolotl/utils/train.py | 45 ++++++++++++ tests/utils/test_train.py | 24 ++++++ 7 files changed, 170 insertions(+), 34 deletions(-) create mode 100644 src/axolotl/utils/train.py create mode 100644 tests/utils/test_train.py diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index 6dadb8230..9db5e9887 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -33,13 +33,44 @@ Note: Memory usage taken from `device_mem_reserved(gib)` from logs. ### Training 120B -On 8xH100s +On 8xH100s, make sure you have ~3TB of free disk space. With each checkpoint clocking in at ~720GB, along with the base +model, and final model output, you may need at least 3TB of free disk space to keep at least 2 checkpoints. ```bash # FFT SFT with offloading (8x80GB @ ~49GiB/GPU) axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml ``` +ERRATA: Transformers saves the model Architecture prefixed with `FSDP` which needs to be manually renamed in `config.json`. +See https://github.com/huggingface/transformers/pull/40207 for the status of this issue. + +```bash +sed -i 's/FSDPGptOssForCausalLM/GptOssForCausalLM/g' ./outputs/gpt-oss-out/config.json +``` + +When using SHARDED_STATE_DICT with FSDP, the final checkpoint should automatically merge the sharded weights to your +configured `output_dir`. However, if that step fails due to a disk space error, you can take an additional step to +merge the sharded weights. This step will automatically determine the last checkpoint directory and merge the sharded +weights to `{output_dir}/merged`. + +```bash +axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/ +``` + + +### Inferencing your fine-tuned model + +GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425 +for more information about using a special vllm-openai docker image for inferencing with vLLM. + +SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for infomation on installing +SGLang from source. Once you've installed SGLang, run the following command to launch a SGLang server: + +```bash +python3 -m sglang.launch_server --model ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-120b --host 0.0.0.0 --port 8888 --tp 8 +``` + ### Tool use GPT-OSS has a comprehensive tool understanding. Axolotl supports tool calling datasets for Supervised Fine-tuning. 
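A note on the `FSDP` architecture prefix addressed above: the automated cleanup later in this patch uses `name.lstrip("FSDP")`, which removes any leading characters from the set `{F, S, D, P}` rather than the literal prefix. It happens to work for `FSDPGptOssForCausalLM`, but it would over-strip an architecture name whose character after the prefix is also in that set. A prefix-safe sketch of the same `config.json` fix-up, assuming the output directory from the example above:

```python
# Sketch of a prefix-safe alternative to the sed one-liner above.
import json
from pathlib import Path

config_path = Path("./outputs/gpt-oss-out/config.json")  # assumed output dir
config = json.loads(config_path.read_text(encoding="utf-8"))

# str.removeprefix (Python 3.9+) strips the literal "FSDP" prefix only,
# unlike str.lstrip("FSDP"), which removes any leading F/S/D/P characters.
config["architectures"] = [
    name.removeprefix("FSDP") for name in config.get("architectures", [])
]
config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
```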
diff --git a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml index 4a9d51fdf..4b4fbd89b 100644 --- a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml @@ -20,6 +20,7 @@ datasets: dataset_prepared_path: last_run_prepared val_set_size: 0 output_dir: ./outputs/gpt-oss-out/ +save_total_limit: 2 # the 120B model can use up to 720GB of disk space per checkpoint, so let's only keep the last 2 sequence_len: 4096 sample_packing: true diff --git a/src/axolotl/cli/merge_sharded_fsdp_weights.py b/src/axolotl/cli/merge_sharded_fsdp_weights.py index c08d30ec8..c99f37fb1 100644 --- a/src/axolotl/cli/merge_sharded_fsdp_weights.py +++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py @@ -10,6 +10,7 @@ import fire import torch import torch.distributed.checkpoint as dist_cp import torch.distributed.checkpoint.format_utils as dist_cp_format_utils +from accelerate import PartialState from accelerate.utils import ( SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, @@ -23,6 +24,7 @@ from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner from axolotl.cli.config import load_cfg from axolotl.utils.logging import get_logger +from axolotl.utils.train import determine_last_checkpoint LOG = get_logger(__name__) @@ -143,7 +145,6 @@ def merge_fsdp_weights( ValueError: If torch version < 2.3.0, or if `checkpoint_dir` does not exist. """ checkpoint_dir_ = Path(checkpoint_dir) - from accelerate.state import PartialState if not is_torch_version(">=", "2.3.0"): raise ValueError("`merge_fsdp_weights` requires PyTorch >= 2.3.0`") @@ -180,7 +181,6 @@ def merge_fsdp_weights( if remove_checkpoint_dir: LOG.info(f"Removing old checkpoint directory {checkpoint_dir_}") shutil.rmtree(checkpoint_dir_) - state.wait_for_everyone() def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): @@ -195,11 +195,32 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): parsed_cfg = load_cfg(config, **kwargs) fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0" + if not fsdp_dir.exists(): + checkpoint_dir = determine_last_checkpoint(parsed_cfg, update=False) + if checkpoint_dir: + fsdp_dir = Path(checkpoint_dir) / "pytorch_model_fsdp_0" + if not fsdp_dir.exists(): + raise ValueError( + f"Could not find FSDP checkpoint `pytorch_model_fsdp_0` in {checkpoint_dir}" + ) + + output_path = str(Path(parsed_cfg.output_dir) / "merged") merge_fsdp_weights( checkpoint_dir=str(fsdp_dir), - output_path=str(Path(parsed_cfg.output_dir) / "merged"), + output_path=output_path, safe_serialization=True, ) + state = PartialState() + state.wait_for_everyone() + LOG.info( + f"FSDP SHARDED_STATE_DICT weights successfully merged to: {output_path}", + main_process_only=True, + ) + LOG.info( + "Merged weights are only the safetensors and doesn't include the model configuration " + f"or tokenizer which may be found in {parsed_cfg.output_dir}.", + main_process_only=True, + ) if __name__ == "__main__": diff --git a/src/axolotl/core/trainers/__init__.py b/src/axolotl/core/trainers/__init__.py index 5f97e387a..a9cda4efc 100644 --- a/src/axolotl/core/trainers/__init__.py +++ b/src/axolotl/core/trainers/__init__.py @@ -5,7 +5,6 @@ from .base import AxolotlTrainer from .dpo.trainer import AxolotlDPOTrainer -from .grpo.trainer import AxolotlGRPOSequenceParallelTrainer, AxolotlGRPOTrainer from .mamba import AxolotlMambaTrainer from .trl import ( AxolotlCPOTrainer, diff --git a/src/axolotl/train.py 
b/src/axolotl/train.py index e8a2cbabe..8005389f1 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -4,11 +4,14 @@ from __future__ import annotations import importlib import inspect +import json import os +import shutil import signal import sys import typing import weakref +from collections import OrderedDict from contextlib import ExitStack from pathlib import Path from typing import Any, Dict @@ -38,6 +41,7 @@ from axolotl.utils.distributed import cleanup_distributed from axolotl.utils.freeze import freeze_layers_except from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import RLType +from axolotl.utils.train import determine_last_checkpoint from axolotl.utils.trainer import setup_trainer try: @@ -46,7 +50,7 @@ except ImportError: BetterTransformer = None if typing.TYPE_CHECKING: - from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder + from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder LOG = get_logger(__name__) @@ -124,32 +128,6 @@ def setup_reference_model( return model_ref -def determine_resume_checkpoint(cfg: DictDefault) -> str | None: - """ - Determine the checkpoint to resume from based on configuration. - - Args: - cfg: Dictionary mapping `axolotl` config keys to values. - - Returns: - Path to the checkpoint to resume from, or `None` if not resuming. - """ - if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints: - possible_checkpoints = [ - str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*") - ] - if len(possible_checkpoints) > 0: - sorted_paths = sorted( - possible_checkpoints, - key=lambda path: int(path.split("-")[-1]), - ) - cfg.resume_from_checkpoint = sorted_paths[-1] - LOG.info( - f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}" - ) - return cfg.resume_from_checkpoint - - def setup_signal_handler( cfg: DictDefault, model: PreTrainedModel, safe_serialization: bool ): @@ -282,12 +260,49 @@ def save_trained_model( else: state_dict_type = cfg.fsdp_config.state_dict_type trainer.accelerator.state.fsdp_plugin.set_state_dict_type(state_dict_type) - trainer.save_model(cfg.output_dir) + trainer.save_model(cfg.output_dir) # only handles FULL_STATE_DICT if state_dict_type == "SHARDED_STATE_DICT": LOG.info( "The final model was saved with a sharded state dict. Please ensure you merge " "the sharded weights with `merge-sharded-fsdp-weights`." 
            )
+            checkpoint_dir = determine_last_checkpoint(cfg, update=False)
+            if (
+                not (Path(cfg.output_dir) / "model.safetensors.index.json").exists()
+                and checkpoint_dir
+            ):
+                # import here to prevent circular import
+                from axolotl.cli.merge_sharded_fsdp_weights import merge_fsdp_weights
+
+                fsdp_dir = Path(checkpoint_dir) / "pytorch_model_fsdp_0"
+                merged_path = str(Path(cfg.output_dir) / "merged")
+                merge_fsdp_weights(
+                    checkpoint_dir=str(fsdp_dir),
+                    output_path=merged_path,
+                    safe_serialization=True,
+                )
+                trainer.accelerator.wait_for_everyone()
+                if trainer.accelerator.is_main_process:
+                    # move all files in merged_path to cfg.output_dir
+                    for merged_file in Path(merged_path).iterdir():
+                        shutil.move(str(merged_file), cfg.output_dir)
+                    shutil.rmtree(merged_path)  # remove what should be an empty dir
+                # TODO(wing): see https://github.com/huggingface/transformers/pull/40207
+                # cleanup the FSDP prefix in the model config.json
+                if trainer.accelerator.is_main_process:
+                    with open(
+                        Path(cfg.output_dir) / "config.json", "r", encoding="utf-8"
+                    ) as config_file_io:
+                        # read the model config as an OrderedDict
+                        config = json.load(config_file_io, object_pairs_hook=OrderedDict)
+                    config["architectures"] = [
+                        name.removeprefix("FSDP") for name in config["architectures"]
+                    ]
+                    # write the updated model config back
+                    with open(
+                        os.path.join(cfg.output_dir, "config.json"), "w", encoding="utf-8"
+                    ) as config_file_io:
+                        json.dump(config, config_file_io, indent=2)
     elif cfg.deepspeed and is_deepspeed_zero3_enabled():
         # Copied over from: https://github.com/huggingface/accelerate/blob/5ae611118057232f441055f7ef9ba0b0f2b8d533/docs/source/usage_guides/deepspeed.md#saving-and-loading
         trainer.accelerator.wait_for_everyone()
@@ -564,7 +579,7 @@ def train(
     setup_model_card(cfg)
 
     # Execute the training
-    resume_from_checkpoint = determine_resume_checkpoint(cfg)
+    resume_from_checkpoint = determine_last_checkpoint(cfg)
     execute_training(cfg, trainer, resume_from_checkpoint)
 
     # clear cache
diff --git a/src/axolotl/utils/train.py b/src/axolotl/utils/train.py
new file mode 100644
index 000000000..1393459d9
--- /dev/null
+++ b/src/axolotl/utils/train.py
@@ -0,0 +1,45 @@
+"""Training utils for checkpoints"""
+
+from pathlib import Path
+
+from axolotl.utils.dict import DictDefault
+from axolotl.utils.logging import get_logger
+
+LOG = get_logger(__name__)
+
+
+def determine_last_checkpoint(cfg: DictDefault, update: bool = True) -> str | None:
+    """
+    Determine the checkpoint to resume from based on configuration.
+
+    Args:
+        cfg: Dictionary mapping `axolotl` config keys to values.
+        update: Whether to update the config with the determined checkpoint.
+
+    Returns:
+        Path to the checkpoint to resume from, or `None` if not resuming.
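+
+    Example (illustrative):
+        # with checkpoint-1 and checkpoint-20 under cfg.output_dir, this returns
+        # the path ending in "checkpoint-20"; update=False leaves cfg untouched
+        last_checkpoint = determine_last_checkpoint(cfg, update=False)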
+ """ + last_checkpoint = None + checkpoints = sorted( + ( + p + for p in Path(cfg.output_dir).glob("checkpoint-*") + if p.name.split("-")[-1].isdigit() + ), + key=lambda p: int(p.name.split("-")[-1]), + ) + if checkpoints: + last_checkpoint = str(checkpoints[-1]) + if not update: + return last_checkpoint + + if ( + cfg.resume_from_checkpoint is None + and cfg.auto_resume_from_checkpoints + and last_checkpoint is not None + ): + cfg.resume_from_checkpoint = last_checkpoint + LOG.info( + f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}" + ) + return cfg.resume_from_checkpoint diff --git a/tests/utils/test_train.py b/tests/utils/test_train.py new file mode 100644 index 000000000..a1f6f6088 --- /dev/null +++ b/tests/utils/test_train.py @@ -0,0 +1,24 @@ +"""test for train checkpoint utils""" + +import os + +from axolotl.utils.dict import DictDefault +from axolotl.utils.train import determine_last_checkpoint + + +def test_determine_last_checkpoint(temp_dir): + cfg = DictDefault( + output_dir=temp_dir, + ) + for cpt_idx in [1, 9, 10, 20]: + os.makedirs( + os.path.join(cfg.output_dir, f"checkpoint-{cpt_idx}"), exist_ok=True + ) + + last_checkpoint = determine_last_checkpoint(cfg, update=False) + assert last_checkpoint == os.path.join(cfg.output_dir, "checkpoint-20") + + cfg.resume_from_checkpoint = None + cfg.auto_resume_from_checkpoints = True + determine_last_checkpoint(cfg, update=True) + assert cfg.resume_from_checkpoint == os.path.join(cfg.output_dir, "checkpoint-20") From 0eef385b1ae3bc436c2b5f4230c24a6ad7372afa Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Mon, 18 Aug 2025 18:09:13 +0530 Subject: [PATCH 008/115] [feat] truncation support with excess_length_strategy (#3068) [skip ci] * feat:truncation support with excess_len * pre-commit * excess_length_strategy * requested changes * lint * added handle_long_seq_in_dataset in sft * comments improved --- src/axolotl/utils/data/sft.py | 6 ++-- src/axolotl/utils/data/utils.py | 53 +++++++++++++++++++++++++++-- src/axolotl/utils/schemas/config.py | 6 ++++ tests/test_packed_batch_sampler.py | 4 +-- 4 files changed, 61 insertions(+), 8 deletions(-) diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index 975f26e71..2ae7d9052 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -28,7 +28,7 @@ from axolotl.utils.data.shared import ( ) from axolotl.utils.data.utils import ( deduplicate_and_log_datasets, - drop_long_seq_in_dataset, + handle_long_seq_in_dataset, retry_on_request_exceptions, ) from axolotl.utils.data.wrappers import get_dataset_wrapper @@ -339,9 +339,9 @@ def _load_raw_datasets( if not cfg.skip_prepare_dataset: if split == "test" and cfg.eval_sequence_len: - dataset = drop_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg) + dataset = handle_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg) else: - dataset = drop_long_seq_in_dataset(dataset, cfg.sequence_len, cfg) + dataset = handle_long_seq_in_dataset(dataset, cfg.sequence_len, cfg) if cfg.sample_packing: dataset, _ = process_datasets_for_packing(cfg, dataset, None) diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py index c0efb7a42..856a609c7 100644 --- a/src/axolotl/utils/data/utils.py +++ b/src/axolotl/utils/data/utils.py @@ -148,7 +148,36 @@ def deduplicate_and_log_datasets( return dataset, other_dataset -def drop_long_seq_in_dataset( +def truncate_long_seq(sample, sequence_len=2048, min_sequence_len=2): + 
""" + Truncate samples whose sequence length is too long (> sequence_len) + or drop those too short (< min_sequence_len). + """ + min_sequence_len = min_sequence_len or 2 + + input_ids = sample["input_ids"] + results = [] + + # Batched (input_ids is a list of lists) + for i, seq in enumerate(input_ids): + length = len(seq) + if length < min_sequence_len: + results.append(False) + elif length > sequence_len: + sample["input_ids"][i] = seq[:sequence_len] + if "attention_mask" in sample: + sample["attention_mask"][i] = sample["attention_mask"][i][:sequence_len] + if "labels" in sample: + sample["labels"][i] = sample["labels"][i][:sequence_len] + if "position_ids" in sample: + sample["position_ids"][i] = sample["position_ids"][i][:sequence_len] + results.append(True) + else: + results.append(True) + return results + + +def handle_long_seq_in_dataset( dataset: Dataset, sequence_len: int, cfg: DictDefault ) -> Dataset: """Remove sequences longer than configured maximum from dataset. @@ -192,8 +221,21 @@ def drop_long_seq_in_dataset( if filter_map_kwargs: drop_long_kwargs["desc"] = f"Dropping Long Sequences (>{sequence_len})" + excess_length_strategy = (cfg.excess_length_strategy or "drop").lower() + if excess_length_strategy == "truncate": + process_fn = functools.partial( + truncate_long_seq, + sequence_len=sequence_len, + min_sequence_len=cfg.min_sample_len, + ) + drop_long_kwargs["desc"] = ( + f"Truncating/Filtering Sequences (target_len={sequence_len})" + ) + else: + process_fn = drop_long + dataset = dataset.filter( - drop_long, + process_fn, batched=True, **filter_map_kwargs, **drop_long_kwargs, @@ -201,6 +243,11 @@ def drop_long_seq_in_dataset( if prior_len: dropped = prior_len - len(dataset) if dropped: - LOG.warning(f"Dropped {dropped} long samples from dataset") + action = ( + "truncated/filtered" + if excess_length_strategy == "truncate" + else "dropped" + ) + LOG.warning(f"{action.title()} {dropped} samples from dataset") return dataset diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 21e99c048..a607b3dca 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -414,6 +414,12 @@ class AxolotlInputConfig( "description": "The maximum length of an input to train with, this should typically be less than 2048 as most models have a token/context limit of 2048" }, ) + excess_length_strategy: Literal["drop", "truncate"] | None = Field( + default=None, + json_schema_extra={ + "description": "What to do when a tokenized row exceeds sequence_len. 'drop' removes the row; 'truncate' slices tensors to sequence_len. Defaults to 'drop' for backward compatibility." 
+        },
+    )
     eval_sequence_len: int | None = Field(
         default=None,
         json_schema_extra={
diff --git a/tests/test_packed_batch_sampler.py b/tests/test_packed_batch_sampler.py
index 7cb645db7..47894a35b 100644
--- a/tests/test_packed_batch_sampler.py
+++ b/tests/test_packed_batch_sampler.py
@@ -8,7 +8,7 @@ from transformers import AutoTokenizer
 from axolotl.datasets import TokenizedPromptDataset
 from axolotl.prompt_strategies.completion import load
 from axolotl.utils.collators import V2BatchSamplerDataCollatorForSeq2Seq
-from axolotl.utils.data.utils import drop_long_seq_in_dataset
+from axolotl.utils.data.utils import handle_long_seq_in_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 
@@ -70,7 +70,7 @@ class TestBatchedSamplerPacking:
         )
 
         train_dataset = concatenate_datasets([dataset_wrapper])
-        train_dataset = drop_long_seq_in_dataset(train_dataset, cfg.sequence_len, cfg)
+        train_dataset = handle_long_seq_in_dataset(train_dataset, cfg.sequence_len, cfg)
         lengths = get_dataset_lengths(train_dataset)
         batch_sampler = MultipackBatchSampler(

From c10eb811fac677ee2d7c0c38a44d084fcc613cdc Mon Sep 17 00:00:00 2001
From: VED <146507396+ved1beta@users.noreply.github.com>
Date: Mon, 18 Aug 2025 18:14:37 +0530
Subject: [PATCH 009/115] data_parallel_size in VllmServeCliArgs (#3074)

* data_parallel_size in VllmServeCliArgs

* moved to 43
---
 src/axolotl/cli/args.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/src/axolotl/cli/args.py b/src/axolotl/cli/args.py
index 31d854d41..9bb544aff 100644
--- a/src/axolotl/cli/args.py
+++ b/src/axolotl/cli/args.py
@@ -40,6 +40,12 @@ class VllmServeCliArgs:
         default=None,
         metadata={"help": "Number of tensor parallel workers to use."},
     )
+    data_parallel_size: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "Number of data parallel workers to use for vLLM serving. This controls how many model replicas are used for parallel inference."
+        },
+    )
     host: Optional[str] = field(
         default=None,  # nosec B104
         metadata={"help": "Host address to run the server on."},

From 05cedbfb1e8a125adcfaa0a03d1b9b2a3fa97e80 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 19 Aug 2025 13:30:37 -0400
Subject: [PATCH 010/115] add baseten info for gpt-oss recipe (#3078)

* add baseten info for gpt-oss recipe

* incorporate PR review
---
 examples/gpt-oss/README.md | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md
index 9db5e9887..98f3ea892 100644
--- a/examples/gpt-oss/README.md
+++ b/examples/gpt-oss/README.md
@@ -41,6 +41,12 @@ model, and final model output, you may need at least 3TB of free disk space to k
 axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
 ```
 
+To simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we've partnered with [Baseten](https://baseten.co) to showcase multi-node
+training of the 120B model using Baseten Truss. You can read more about this recipe on
+[Baseten's blog](https://www.baseten.co/blog/how-to-fine-tune-gpt-oss-120b-with-baseten-and-axolotl/). The recipe can
+be found on their
+[GitHub](https://github.com/basetenlabs/ml-cookbook/tree/main/examples/oss-gpt-120b-axolotl/training).
+
 ERRATA: Transformers saves the model architecture prefixed with `FSDP`, which needs to be manually
 renamed in `config.json`. See https://github.com/huggingface/transformers/pull/40207 for the status of
 this issue.
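Until that upstream fix lands, the manual rename the ERRATA describes amounts to stripping the `FSDP` prefix from the `architectures` entry in the saved `config.json`. A minimal sketch, assuming the example output directory used by these recipes (`./outputs/gpt-oss-out`; adjust to your own `output_dir`):

```python
import json
from pathlib import Path

# assumed location of the final checkpoint; not a fixed path
config_path = Path("./outputs/gpt-oss-out") / "config.json"
config = json.loads(config_path.read_text(encoding="utf-8"))

# str.removeprefix drops only the literal "FSDP" prefix, whereas str.lstrip
# would strip any leading run of the characters F, S, D, and P
config["architectures"] = [
    name.removeprefix("FSDP") for name in config.get("architectures", [])
]
config_path.write_text(json.dumps(config, indent=2), encoding="utf-8")
```
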
From 050210e637a7ca2fdb65491eced13bd4d1ce5d10 Mon Sep 17 00:00:00 2001 From: goggle Date: Wed, 20 Aug 2025 09:25:20 +0900 Subject: [PATCH 011/115] fix: Sweep runs overwrite each other because output_dir from base config is reused (#3080) * refactor: improve output_dir handling in generate_config_files * fix typo * cli: harden sweep output_dir handling with base fallback - Ensure sweep permutations always resolve a valid output_dir - Default to ./model-out if neither permutation nor base config sets output_dir - Append sweepXXXX suffix consistently for each permutation - Prevent Path(None) TypeError and improve robustness of sweep config generation * fix typo * chore: lint --------- Co-authored-by: Wing Lian --- src/axolotl/cli/utils/sweeps.py | 3 ++- src/axolotl/cli/utils/train.py | 8 +++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/axolotl/cli/utils/sweeps.py b/src/axolotl/cli/utils/sweeps.py index d21664964..bb1368cf6 100644 --- a/src/axolotl/cli/utils/sweeps.py +++ b/src/axolotl/cli/utils/sweeps.py @@ -3,11 +3,12 @@ import random from copy import deepcopy from itertools import product +from typing import Any def generate_sweep_configs( base_config: dict[str, list], sweeps_config: dict[str, list] -) -> list[dict[str, list]]: +) -> list[dict[str, Any]]: """ Recursively generates all possible configurations by applying sweeps to the base config. diff --git a/src/axolotl/cli/utils/train.py b/src/axolotl/cli/utils/train.py index 31b0bcf58..b133d7271 100644 --- a/src/axolotl/cli/utils/train.py +++ b/src/axolotl/cli/utils/train.py @@ -4,6 +4,7 @@ import os import subprocess # nosec import sys import tempfile +from pathlib import Path from typing import Any, Iterator, Literal import yaml @@ -88,7 +89,12 @@ def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str, # Generate all possible configurations permutations = generate_sweep_configs(base_config, sweep_config) is_group = len(permutations) > 1 - for permutation in permutations: + base_output_dir = base_config.get("output_dir", "./model-out") + for idx, permutation in enumerate(permutations, start=1): + permutation_dir = Path(permutation.get("output_dir", base_output_dir)) + permutation_id = f"sweep{idx:04d}" + permutation["output_dir"] = str(permutation_dir / permutation_id) + # pylint: disable=consider-using-with temp_file = tempfile.NamedTemporaryFile( mode="w", From 06eaf6c448a52b9119ef48ad16ccb7286aac41c3 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 20 Aug 2025 08:52:26 -0400 Subject: [PATCH 012/115] misc fixes (#3085) --- examples/gpt-oss/README.md | 14 ++++++++++++++ .../gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml | 2 +- src/axolotl/cli/cloud/modal_.py | 2 +- src/axolotl/cli/inference.py | 2 +- src/axolotl/cli/preprocess.py | 3 ++- 6 files changed, 20 insertions(+), 5 deletions(-) diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index 98f3ea892..0aa04a71c 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -67,9 +67,23 @@ mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/ ### Inferencing your fine-tuned model +#### vLLM + GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425 for more information about using a special vllm-openai docker image for inferencing with vLLM. 
+Optionally, vLLM can be installed from nightly:
+
+```bash
+pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
+```
+
+and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
+
+```bash
+vllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888 --tensor-parallel-size 8
+```
+
+#### SGLang
+
 SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for information
 on installing SGLang from source. Once you've installed SGLang, run the following command to launch
 an SGLang server:
diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
index a6ba83433..1b142b6c3 100644
--- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
@@ -15,7 +15,7 @@ datasets:
     field_thinking: thinking
     template_thinking_key: thinking
 
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path: ./outputs/last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/gpt-oss-out/
 
diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
index aa658c863..bdbb70fae 100644
--- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
+++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
@@ -15,7 +15,7 @@ datasets:
     field_thinking: thinking
     template_thinking_key: thinking
 
-dataset_prepared_path: last_run_prepared
+dataset_prepared_path: ./outputs/last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/gpt-oss-out/
 
diff --git a/src/axolotl/cli/cloud/modal_.py b/src/axolotl/cli/cloud/modal_.py
index 240c6d894..0509cba69 100644
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
         return res
 
     def get_image(self):
-        docker_tag = "main-py3.11-cu124-2.6.0"
+        docker_tag = "main-py3.11-cu126-2.7.1"
         if self.config.docker_tag:
             docker_tag = self.config.docker_tag
         docker_image = f"axolotlai/axolotl:{docker_tag}"
diff --git a/src/axolotl/cli/inference.py b/src/axolotl/cli/inference.py
index 83b567b64..d03a91bc7 100644
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -64,7 +64,7 @@ def do_inference(
             importlib.import_module("axolotl.prompters"), prompter
         )
     elif cfg.chat_template:
-        chat_template_str = get_chat_template(cfg.chat_template)
+        chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer)
     elif cfg.datasets[0].type == "chat_template":
         chat_template_str = get_chat_template_from_config(
             cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py
index 5d692c315..4120062d8 100644
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -97,7 +97,8 @@ def do_cli(
     """
     # pylint: disable=duplicate-code
     os.environ["AXOLOTL_IS_PREPROCESS"] = "1"
-    parsed_cfg = load_cfg(config, **kwargs)
+    is_preprocess = kwargs.pop("is_preprocess", True)
+    parsed_cfg = load_cfg(config, is_preprocess=is_preprocess, **kwargs)
     parsed_cfg.is_preprocess = True
     parser = transformers.HfArgumentParser(PreprocessCliArgs)
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(

From 07fd22f39b89f09dfe0e32dce97923e48685e3cb Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Wed, 20 Aug 2025 15:17:48 -0400
Subject: [PATCH 013/115] better handling of lora w bias with fsdp2 and handling of files when saving model checkpoint (#3090)

---
src/axolotl/cli/cloud/modal_.py | 2 +- src/axolotl/monkeypatch/accelerate/fsdp2.py | 2 +- src/axolotl/train.py | 6 +++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/src/axolotl/cli/cloud/modal_.py b/src/axolotl/cli/cloud/modal_.py index 0509cba69..6d4f999b4 100644 --- a/src/axolotl/cli/cloud/modal_.py +++ b/src/axolotl/cli/cloud/modal_.py @@ -200,7 +200,7 @@ class ModalCloud(Cloud): if family in ["a10", "a10g"]: return modal.gpu.A10G(count=count) if family == "h100": - return modal.gpu.H100(count=count) + return f"H100:{count}" if family == "t4": return modal.gpu.T4(count=count) if family == "l4": diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py index efc388294..66d3d0d2d 100644 --- a/src/axolotl/monkeypatch/accelerate/fsdp2.py +++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py @@ -187,7 +187,7 @@ def _process_lora_module_for_fsdp(module, fsdp2_kwargs): # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to # wrap this. Therefore we must ensure the bias has the same dtype as the weight - if module.base_layer.bias is not None: + if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None: if module.base_layer.weight.dtype != module.base_layer.bias.dtype: log_bias_dtype_mismatch = True module.base_layer.bias.data = module.base_layer.bias.data.to( diff --git a/src/axolotl/train.py b/src/axolotl/train.py index 8005389f1..dd39cc228 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -253,7 +253,9 @@ def save_trained_model( # final model weights have already been saved by `ReLoRACallback.on_train_end` return - if trainer.is_fsdp_enabled or cfg.fsdp_config: + if ( # pylint: disable=too-many-nested-blocks + trainer.is_fsdp_enabled or cfg.fsdp_config + ): if cfg.fsdp_config or cfg.fsdp: if cfg.fsdp_config.final_state_dict_type: state_dict_type = cfg.fsdp_config.final_state_dict_type @@ -285,6 +287,8 @@ def save_trained_model( if trainer.accelerator.is_main_process: # move all files in merged_path to cfg.output_dir for merged_file in Path(merged_path).iterdir(): + if (Path(cfg.output_dir) / merged_file.name).exists(): + (Path(cfg.output_dir) / merged_file.name).unlink() shutil.move(str(merged_file), cfg.output_dir) shutil.rmtree(merged_path) # remove what should be an empty dir # TODO(wing):see https://github.com/huggingface/transformers/pull/40207 From 08e517ea4828fea0b17401274aeb505722f8f867 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Wed, 20 Aug 2025 22:14:13 -0400 Subject: [PATCH 014/115] Update .coderabbit.yaml (#3091) [skip ci] --- .coderabbit.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.coderabbit.yaml b/.coderabbit.yaml index 95c044f02..b7cf7d969 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -12,5 +12,6 @@ reviews: auto_review: enabled: true drafts: false + auto_incremental_review: true chat: auto_reply: true From 0fa752e58b593440ced0dd1cec0630f9b7b92664 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 21 Aug 2025 15:04:10 -0400 Subject: [PATCH 015/115] upgrade flash-attn to 2.8.3 for gpt-oss attn sink support (#3082) --- examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml | 2 +- examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml | 2 +- setup.py | 4 ++-- 6 files changed, 7 insertions(+), 7 deletions(-) diff --git 
a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml index 4b4fbd89b..62f3167e8 100644 --- a/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml @@ -44,7 +44,7 @@ bf16: true tf32: true flash_attention: true -attn_implementation: kernels-community/vllm-flash-attn3 +attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml index 440f0c509..ccb84e28e 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml @@ -40,7 +40,7 @@ bf16: true tf32: true flash_attention: true -attn_implementation: kernels-community/vllm-flash-attn3 +attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml index 1b142b6c3..69a3c434d 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml @@ -41,7 +41,7 @@ bf16: true tf32: true flash_attention: true -attn_implementation: kernels-community/vllm-flash-attn3 +attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true diff --git a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml index bdbb70fae..4a0f1ad70 100644 --- a/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml +++ b/examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml @@ -40,7 +40,7 @@ bf16: true tf32: true flash_attention: true -attn_implementation: kernels-community/vllm-flash-attn3 +attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true diff --git a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml index c4e1a982d..b6deacb1b 100644 --- a/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml +++ b/examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml @@ -53,7 +53,7 @@ bf16: true tf32: true flash_attention: true -attn_implementation: kernels-community/vllm-flash-attn3 +attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 gradient_checkpointing: true activation_offloading: true diff --git a/setup.py b/setup.py index de6f19e56..5aab9d7c0 100644 --- a/setup.py +++ b/setup.py @@ -118,9 +118,9 @@ def get_package_version(): extras_require = { - "flash-attn": ["flash-attn==2.8.2"], + "flash-attn": ["flash-attn==2.8.3"], "ring-flash-attn": [ - "flash-attn==2.8.2", + "flash-attn==2.8.3", "ring-flash-attn>=0.1.7", "yunchang==0.6.0", ], From ab4d604a8fa4ddd07bcc56128ee664b5207cb541 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 22 Aug 2025 07:26:30 -0400 Subject: [PATCH 016/115] upgrade peft for 0.17.1 (#3094) * upgrade peft to 0.17.1 * upgrade for transformers too --- requirements.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index c2552002f..c51c9d1fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,8 +13,8 @@ 
liger-kernel==0.6.1 packaging==23.2 huggingface_hub>=0.33.0 -peft==0.17.0 -transformers==4.55.2 +peft>=0.17.0 +transformers==4.55.3 tokenizers>=0.21.1 accelerate==1.10.0 datasets==4.0.0 From eea7a006e1fa805cd92543aa43e0abf7b9a06396 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Fri, 22 Aug 2025 14:29:10 -0400 Subject: [PATCH 017/115] make multipack sampler patch explicit (#3096) * make multipack sampler patch explicit * combining --- src/axolotl/common/datasets.py | 1 - src/axolotl/loaders/patch_manager.py | 8 +++ .../monkeypatch/data/batch_dataset_fetcher.py | 57 ++++++++++++++++++- tests/test_packed_batch_sampler.py | 24 +++++--- 4 files changed, 79 insertions(+), 11 deletions(-) diff --git a/src/axolotl/common/datasets.py b/src/axolotl/common/datasets.py index 761317dfb..0ff52ebe1 100644 --- a/src/axolotl/common/datasets.py +++ b/src/axolotl/common/datasets.py @@ -6,7 +6,6 @@ from dataclasses import dataclass from datasets import Dataset -import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs from axolotl.loaders import load_processor, load_tokenizer from axolotl.utils.data import prepare_datasets, prepare_preference_datasets diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 628d897d0..4959bd6ba 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -277,6 +277,14 @@ class PatchManager: has_remote_code=has_remote_code, ) + if self.cfg.sample_packing: + from axolotl.monkeypatch.data.batch_dataset_fetcher import ( + apply_multipack_dataloader_patch, + ) + + LOG.info("Applying multipack dataloader patch for sample packing...") + apply_multipack_dataloader_patch() + def _apply_fsdp2_bnb_patches(self): """Apply FSDP2 BNB patches.""" if ( diff --git a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py index df8d106fd..73bf37b61 100644 --- a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py +++ b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py @@ -1,4 +1,4 @@ -"""monkey patches for the dataset fetcher to handle batches of packed indexes""" +"""Monkey patches for the dataset fetcher to handle batches of packed indexes.""" # pylint: disable=protected-access @@ -6,10 +6,20 @@ import torch from torch.utils.data._utils.fetch import _BaseDatasetFetcher from torch.utils.data._utils.worker import _worker_loop +_ORIGINAL_MAP_DATASET_FETCHER = None +_ORIGINAL_WORKER_LOOP = None +_IS_PATCHED = False + class _MapDatasetFetcher(_BaseDatasetFetcher): + """ + Custom dataset fetcher that handles nested batch structures from + MultipackBatchSampler. 
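+
+    For example, a sampler batch of [[0, 3], [1, 2, 4]] (two packed bins) is
+    fetched and collated per inner list, yielding one collated pack per bin.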
+ """ + def fetch(self, possibly_batched_index): if isinstance(possibly_batched_index[0], list): + # Handle nested structure from MultipackBatchSampler data = [None for i in possibly_batched_index] for i, possibly_batched_index_ in enumerate(possibly_batched_index): if self.auto_collation: @@ -23,6 +33,7 @@ class _MapDatasetFetcher(_BaseDatasetFetcher): else: data[i] = self.dataset[possibly_batched_index_] else: + # Standard batch handling if self.auto_collation: if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__: data = self.dataset.__getitems__(possibly_batched_index) @@ -34,14 +45,54 @@ class _MapDatasetFetcher(_BaseDatasetFetcher): def patch_fetchers(): + """Apply patches to PyTorch's DataLoader components.""" torch.utils.data._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher def patched_worker_loop(*args, **kwargs): + """Worker loop that ensures patches are applied in worker processes.""" patch_fetchers() return _worker_loop(*args, **kwargs) -torch.utils.data._utils.worker._worker_loop = patched_worker_loop -patch_fetchers() +def apply_multipack_dataloader_patch(): + """ + This patch allows DataLoader to correctly process batches that contain multiple bins + of packed sequences. + """ + # pylint: disable=global-statement + global _ORIGINAL_MAP_DATASET_FETCHER, _ORIGINAL_WORKER_LOOP, _IS_PATCHED + + if _IS_PATCHED: + return + + # Store original implementations + _ORIGINAL_MAP_DATASET_FETCHER = torch.utils.data._utils.fetch._MapDatasetFetcher + _ORIGINAL_WORKER_LOOP = torch.utils.data._utils.worker._worker_loop + + # Apply patches + patch_fetchers() + torch.utils.data._utils.worker._worker_loop = patched_worker_loop + + _IS_PATCHED = True + + +def remove_multipack_dataloader_patch(): + """Remove the monkeypatch and restore original PyTorch DataLoader behavior.""" + # pylint: disable=global-statement + global _IS_PATCHED + + if not _IS_PATCHED: + return + + if _ORIGINAL_MAP_DATASET_FETCHER: + torch.utils.data._utils.fetch._MapDatasetFetcher = _ORIGINAL_MAP_DATASET_FETCHER + torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = ( + _ORIGINAL_MAP_DATASET_FETCHER + ) + + if _ORIGINAL_WORKER_LOOP: + torch.utils.data._utils.worker._worker_loop = _ORIGINAL_WORKER_LOOP + + _IS_PATCHED = False diff --git a/tests/test_packed_batch_sampler.py b/tests/test_packed_batch_sampler.py index 47894a35b..d839c6ea3 100644 --- a/tests/test_packed_batch_sampler.py +++ b/tests/test_packed_batch_sampler.py @@ -48,7 +48,13 @@ class TestBatchedSamplerPacking: max_seq_length, sequential, ): - import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401 + from axolotl.monkeypatch.data.batch_dataset_fetcher import ( + apply_multipack_dataloader_patch, + remove_multipack_dataloader_patch, + ) + + # Apply the patch for multipack handling + apply_multipack_dataloader_patch() dataset = dataset_winglian_tiny_shakespeare["train"] @@ -101,10 +107,14 @@ class TestBatchedSamplerPacking: for pack in batch: batch_idxs.extend(pack) - for batch in loader: - assert batch["input_ids"].numel() <= batch_size * max_seq_length - assert batch["input_ids"].shape[1] == max_seq_length + try: + for batch in loader: + assert batch["input_ids"].numel() <= batch_size * max_seq_length + assert batch["input_ids"].shape[1] == max_seq_length - original_idxs = set(range(len(train_dataset))) - assert original_idxs == set(batch_idxs) - assert len(batch_idxs) == len(set(batch_idxs)) + original_idxs = 
set(range(len(train_dataset))) + assert original_idxs == set(batch_idxs) + assert len(batch_idxs) == len(set(batch_idxs)) + finally: + # Clean up: remove the patch after the test + remove_multipack_dataloader_patch() From 79ddaebe9a6af7efefebdbb54772d11d09561786 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Sat, 23 Aug 2025 23:37:33 -0400 Subject: [PATCH 018/115] Add ruff, remove black, isort, flake8, pylint (#3092) * black, isort, flake8 -> ruff * remove unused * add back needed import * fix --- .bandit | 2 +- .flake8 | 5 - .isort.cfg | 4 - .pre-commit-config.yaml | 20 +- .pylintrc | 15 - cicd/multigpu.py | 4 +- cicd/single_gpu.py | 4 +- docs/scripts/generate_config_docs.py | 7 +- .../colab-axolotl-example.ipynb | 19872 ++++++++-------- pyproject.toml | 31 + scripts/chat_datasets.py | 6 +- scripts/unsloth_install.py | 5 +- src/axolotl/cli/art.py | 2 +- src/axolotl/cli/cloud/modal_.py | 9 +- src/axolotl/cli/config.py | 2 +- src/axolotl/cli/evaluate.py | 4 +- src/axolotl/cli/inference.py | 5 +- src/axolotl/cli/main.py | 2 - src/axolotl/cli/merge_sharded_fsdp_weights.py | 8 +- src/axolotl/cli/preprocess.py | 4 +- src/axolotl/cli/train.py | 2 +- src/axolotl/cli/utils/args.py | 4 +- src/axolotl/cli/utils/sweeps.py | 7 +- src/axolotl/cli/utils/train.py | 1 - src/axolotl/cli/vllm_serve.py | 3 +- src/axolotl/common/datasets.py | 1 + src/axolotl/convert.py | 4 +- src/axolotl/core/attention/flex_block_mask.py | 8 +- src/axolotl/core/builders/base.py | 14 +- src/axolotl/core/builders/causal.py | 6 +- src/axolotl/core/builders/rl.py | 6 +- src/axolotl/core/chat/format/chatml.py | 2 +- src/axolotl/core/chat/messages.py | 30 +- .../core/datasets/transforms/chat_builder.py | 26 +- src/axolotl/core/trainers/__init__.py | 1 - src/axolotl/core/trainers/base.py | 13 +- src/axolotl/core/trainers/dpo/trainer.py | 6 +- src/axolotl/core/trainers/grpo/__init__.py | 6 +- src/axolotl/core/trainers/grpo/trainer.py | 29 +- src/axolotl/core/trainers/mamba.py | 5 +- src/axolotl/core/trainers/mixins/__init__.py | 1 - .../mixins/activation_checkpointing.py | 2 +- .../trainers/mixins/distributed_parallel.py | 1 - src/axolotl/core/trainers/mixins/optimizer.py | 16 +- src/axolotl/core/trainers/mixins/scheduler.py | 10 +- src/axolotl/core/training_args_base.py | 1 - src/axolotl/datasets.py | 4 +- src/axolotl/evaluate.py | 2 +- src/axolotl/integrations/base.py | 25 +- src/axolotl/integrations/config.py | 22 +- .../cut_cross_entropy/__init__.py | 13 +- .../integrations/cut_cross_entropy/args.py | 1 + src/axolotl/integrations/grokfast/__init__.py | 8 +- .../integrations/grokfast/optimizer.py | 1 - src/axolotl/integrations/kd/__init__.py | 3 +- src/axolotl/integrations/kd/args.py | 5 +- src/axolotl/integrations/kd/callbacks.py | 4 +- src/axolotl/integrations/kd/chat_template.py | 8 +- src/axolotl/integrations/kd/collator.py | 26 +- .../kd/collator_online_teacher.py | 15 +- src/axolotl/integrations/kd/kernels/liger.py | 6 +- src/axolotl/integrations/kd/kernels/models.py | 3 +- .../kd/topk_logprob/forward_kl.py | 8 +- src/axolotl/integrations/kd/trainer.py | 1 - src/axolotl/integrations/liger/__init__.py | 1 + src/axolotl/integrations/liger/models/base.py | 3 +- .../integrations/liger/models/deepseekv2.py | 2 - .../integrations/liger/models/jamba.py | 2 - .../integrations/liger/models/llama4.py | 17 +- .../integrations/liger/models/qwen3.py | 12 +- .../integrations/liger/models/qwen3_moe.py | 14 +- src/axolotl/integrations/lm_eval/__init__.py | 3 +- src/axolotl/integrations/lm_eval/cli.py | 1 - 
src/axolotl/integrations/spectrum/__init__.py | 6 +- src/axolotl/integrations/spectrum/args.py | 1 + src/axolotl/kernels/geglu.py | 2 - src/axolotl/kernels/lora.py | 2 - src/axolotl/kernels/quantize.py | 2 - src/axolotl/kernels/swiglu.py | 2 - src/axolotl/loaders/__init__.py | 1 - src/axolotl/loaders/adapter.py | 12 +- src/axolotl/loaders/model.py | 23 +- src/axolotl/loaders/tokenizer.py | 10 +- src/axolotl/models/mamba/__init__.py | 2 +- src/axolotl/models/mamba/modeling_mamba.py | 3 +- src/axolotl/monkeypatch/accelerate/fsdp2.py | 7 +- .../accelerate/parallelism_config.py | 4 +- .../monkeypatch/attention/flex_attn.py | 26 +- src/axolotl/monkeypatch/attention/xformers.py | 12 +- .../monkeypatch/btlm_attn_hijack_flash.py | 8 +- .../monkeypatch/data/batch_dataset_fetcher.py | 2 - src/axolotl/monkeypatch/fsdp2_qlora.py | 13 +- .../gradient_checkpointing/__init__.py | 8 +- .../gradient_checkpointing/offload_cpu.py | 8 +- .../gradient_checkpointing/offload_disk.py | 8 +- .../monkeypatch/llama_attn_hijack_flash.py | 32 +- .../monkeypatch/llama_attn_hijack_xformers.py | 8 +- src/axolotl/monkeypatch/llama_expand_mask.py | 4 +- .../monkeypatch/llama_patch_multipack.py | 8 +- src/axolotl/monkeypatch/lora_kernels.py | 53 +- src/axolotl/monkeypatch/loss/chunked.py | 10 +- .../monkeypatch/mistral_attn_hijack_flash.py | 2 - src/axolotl/monkeypatch/mixtral/__init__.py | 6 +- .../monkeypatch/models/llama4/modeling.py | 14 +- src/axolotl/monkeypatch/multipack.py | 8 +- src/axolotl/monkeypatch/peft/utils.py | 16 +- src/axolotl/monkeypatch/relora.py | 9 +- src/axolotl/monkeypatch/ring_attn/__init__.py | 1 - .../monkeypatch/ring_attn/adapters/batch.py | 6 +- src/axolotl/monkeypatch/ring_attn/patch.py | 35 +- .../monkeypatch/stablelm_attn_hijack_flash.py | 36 +- src/axolotl/monkeypatch/tiled_mlp/patch.py | 8 +- src/axolotl/monkeypatch/trainer/lr.py | 2 +- .../monkeypatch/trainer_accelerator_args.py | 12 +- src/axolotl/monkeypatch/trainer_fsdp_optim.py | 16 +- .../transformers/trainer_loss_calc.py | 16 +- src/axolotl/monkeypatch/unsloth_.py | 46 +- src/axolotl/monkeypatch/xformers_/__init__.py | 4 +- src/axolotl/processing_strategies.py | 6 +- src/axolotl/prompt_strategies/__init__.py | 2 +- src/axolotl/prompt_strategies/alpaca_chat.py | 4 +- .../prompt_strategies/alpaca_w_system.py | 6 +- src/axolotl/prompt_strategies/base.py | 2 +- .../bradley_terry/__init__.py | 3 +- .../bradley_terry/chat_template.py | 2 - .../prompt_strategies/bradley_terry/llama3.py | 2 +- .../prompt_strategies/chat_template.py | 13 +- src/axolotl/prompt_strategies/completion.py | 12 +- src/axolotl/prompt_strategies/context_qa.py | 1 - src/axolotl/prompt_strategies/creative_acr.py | 4 +- .../prompt_strategies/dpo/chat_template.py | 4 +- src/axolotl/prompt_strategies/dpo/chatml.py | 14 +- src/axolotl/prompt_strategies/dpo/llama3.py | 15 +- .../prompt_strategies/dpo/passthrough.py | 8 +- .../prompt_strategies/dpo/user_defined.py | 2 +- src/axolotl/prompt_strategies/dpo/zephyr.py | 7 +- src/axolotl/prompt_strategies/input_output.py | 1 - src/axolotl/prompt_strategies/kto/chatml.py | 14 +- src/axolotl/prompt_strategies/kto/llama3.py | 14 +- .../prompt_strategies/kto/user_defined.py | 4 +- src/axolotl/prompt_strategies/llama2_chat.py | 4 +- .../prompt_strategies/messages/__init__.py | 4 +- .../prompt_strategies/messages/chat.py | 11 +- src/axolotl/prompt_strategies/metharme.py | 4 +- .../prompt_strategies/orpo/chat_template.py | 51 +- src/axolotl/prompt_strategies/pygmalion.py | 6 +- .../prompt_strategies/stepwise_supervised.py | 2 +- 
src/axolotl/prompt_strategies/user_defined.py | 18 +- src/axolotl/prompt_tokenizers.py | 14 +- src/axolotl/prompters.py | 8 +- src/axolotl/train.py | 12 +- src/axolotl/utils/__init__.py | 1 - src/axolotl/utils/callbacks/__init__.py | 95 +- src/axolotl/utils/callbacks/comet_.py | 6 +- src/axolotl/utils/callbacks/lisa.py | 4 +- src/axolotl/utils/callbacks/mlflow_.py | 7 +- src/axolotl/utils/callbacks/profiler.py | 44 +- src/axolotl/utils/callbacks/qat.py | 4 +- src/axolotl/utils/config/__init__.py | 6 +- src/axolotl/utils/ctx_managers/__init__.py | 1 - .../utils/ctx_managers/sequence_parallel.py | 2 +- src/axolotl/utils/data/pretraining.py | 3 +- src/axolotl/utils/data/rl.py | 1 - src/axolotl/utils/data/shared.py | 4 +- src/axolotl/utils/data/utils.py | 2 +- src/axolotl/utils/data/wrappers.py | 3 +- src/axolotl/utils/dict.py | 8 +- src/axolotl/utils/distributed.py | 10 +- src/axolotl/utils/environment.py | 2 +- src/axolotl/utils/lora.py | 1 + .../utils/mistral/mistral_tokenizer.py | 4 +- src/axolotl/utils/model_shard_quant.py | 14 +- src/axolotl/utils/optimizers/adopt.py | 9 +- src/axolotl/utils/samplers/multipack.py | 8 +- src/axolotl/utils/schedulers.py | 5 +- src/axolotl/utils/schemas/config.py | 9 +- src/axolotl/utils/schemas/datasets.py | 1 - src/axolotl/utils/schemas/enums.py | 2 - src/axolotl/utils/schemas/training.py | 6 +- src/axolotl/utils/schemas/validation.py | 42 +- src/axolotl/utils/tokenization.py | 2 +- src/axolotl/utils/trainer.py | 2 +- ...setuptools_axolotl_dynamic_dependencies.py | 1 - tests/cli/test_cli_evaluate.py | 3 - tests/cli/test_cli_inference.py | 2 - .../test_cli_merge_sharded_fsdp_weights.py | 2 - tests/cli/test_cli_train.py | 2 - tests/cli/test_utils.py | 34 +- tests/conftest.py | 37 +- tests/constants.py | 1 + tests/core/test_builders.py | 9 +- .../integrations/test_cut_cross_entropy.py | 4 - tests/e2e/integrations/test_fp8.py | 3 +- tests/e2e/integrations/test_hooks.py | 31 +- tests/e2e/integrations/test_kd.py | 4 +- tests/e2e/integrations/test_liger.py | 6 +- tests/e2e/kernels/test_geglu.py | 6 +- tests/e2e/kernels/test_lora.py | 4 +- tests/e2e/kernels/test_quantize.py | 2 - tests/e2e/kernels/test_swiglu.py | 8 +- tests/e2e/multigpu/solo/test_flex.py | 1 - tests/e2e/multigpu/solo/test_grpo.py | 2 +- tests/e2e/multigpu/test_eval.py | 2 - tests/e2e/multigpu/test_fp8_fsdp2.py | 14 +- tests/e2e/multigpu/test_fsdp1.py | 14 +- tests/e2e/multigpu/test_fsdp2.py | 14 +- tests/e2e/multigpu/test_gemma3.py | 1 - tests/e2e/multigpu/test_llama.py | 12 - tests/e2e/multigpu/test_ray.py | 3 - tests/e2e/multigpu/test_tp.py | 1 - .../lora_kernels/test_lora_kernel_patching.py | 25 +- tests/e2e/patched/test_4d_multipack_llama.py | 2 - .../patched/test_activation_checkpointing.py | 3 +- tests/e2e/patched/test_cli_integrations.py | 1 - tests/e2e/patched/test_fa_xentropy.py | 1 - tests/e2e/patched/test_falcon_samplepack.py | 2 - tests/e2e/patched/test_flattening.py | 1 - tests/e2e/patched/test_fsdp2_qlora.py | 15 +- tests/e2e/patched/test_fused_llama.py | 1 - tests/e2e/patched/test_llama_s2_attention.py | 2 - .../e2e/patched/test_lora_llama_multipack.py | 2 - tests/e2e/patched/test_mistral_samplepack.py | 2 - tests/e2e/patched/test_mixtral_samplepack.py | 2 - tests/e2e/patched/test_model_patches.py | 2 +- tests/e2e/patched/test_peft_embeddings.py | 1 - tests/e2e/patched/test_phi_multipack.py | 2 - tests/e2e/patched/test_resume.py | 1 - tests/e2e/patched/test_unsloth_qlora.py | 1 - tests/e2e/solo/test_flex.py | 1 - tests/e2e/solo/test_relora_llama.py | 7 +- 
tests/e2e/test_activation_offloading.py | 3 - tests/e2e/test_deepseekv3.py | 2 - tests/e2e/test_dpo.py | 7 - tests/e2e/test_embeddings_lr.py | 2 - tests/e2e/test_evaluate.py | 1 - tests/e2e/test_falcon.py | 3 - tests/e2e/test_gemma2.py | 2 - tests/e2e/test_gemma3_text.py | 2 - tests/e2e/test_imports.py | 8 +- tests/e2e/test_llama.py | 4 - tests/e2e/test_llama_pretrain.py | 1 - tests/e2e/test_llama_vision.py | 2 - tests/e2e/test_load_model.py | 14 +- tests/e2e/test_lora_llama.py | 1 - tests/e2e/test_mamba.py | 1 - tests/e2e/test_mistral.py | 2 - tests/e2e/test_mixtral.py | 5 - tests/e2e/test_optimizers.py | 7 - tests/e2e/test_packing_loss.py | 1 - tests/e2e/test_phi.py | 2 - tests/e2e/test_preprocess.py | 2 +- .../e2e/test_process_reward_model_smollm2.py | 1 - tests/e2e/test_qat.py | 2 - tests/e2e/test_quantization.py | 26 +- tests/e2e/test_qwen.py | 1 - tests/e2e/test_reward_model_smollm2.py | 1 - tests/e2e/test_save_first_step.py | 2 - tests/e2e/test_schedulers.py | 1 - tests/e2e/utils.py | 19 +- tests/hf_offline_utils.py | 2 +- tests/integrations/test_liger.py | 2 - tests/patched/test_validation.py | 18 +- tests/prompt_strategies/conftest.py | 7 +- tests/prompt_strategies/messages/test_chat.py | 7 +- tests/prompt_strategies/test_alpaca.py | 1 - ...est_chat_template_ds_schema_unification.py | 6 +- .../prompt_strategies/test_chat_templates.py | 91 +- .../test_chat_templates_advanced.py | 202 +- .../test_chat_templates_mistral.py | 2 +- .../test_chat_templates_thinking.py | 9 +- .../test_dpo_chat_templates.py | 6 - tests/prompt_strategies/test_stepwise.py | 1 - tests/test_chunked_xentropy.py | 2 +- tests/test_datasets.py | 1 - tests/test_dict.py | 44 +- tests/test_exact_deduplication.py | 14 +- tests/test_loaders.py | 27 +- tests/test_lora.py | 1 - tests/test_packed_batch_sampler.py | 2 +- tests/test_packed_dataset.py | 10 +- tests/test_packed_pretraining.py | 1 - tests/test_perplexity.py | 2 - tests/test_prompt_tokenizers.py | 8 +- tests/test_schedulers.py | 2 +- tests/test_validation_dataset.py | 1 - tests/utils/schemas/validation/test_fsdp.py | 1 - 286 files changed, 10979 insertions(+), 11435 deletions(-) delete mode 100644 .flake8 delete mode 100644 .isort.cfg delete mode 100644 .pylintrc diff --git a/.bandit b/.bandit index 82e88e814..b81428751 100644 --- a/.bandit +++ b/.bandit @@ -1,3 +1,3 @@ [bandit] exclude = tests -skips = B101,B615 +skips = B101,B615,B102,B110 diff --git a/.flake8 b/.flake8 deleted file mode 100644 index fd69af775..000000000 --- a/.flake8 +++ /dev/null @@ -1,5 +0,0 @@ -[flake8] -max-line-length = 88 - -select = C,E,F,W,B,B950 -extend-ignore = E203, E501, W503 diff --git a/.isort.cfg b/.isort.cfg deleted file mode 100644 index bf9afe319..000000000 --- a/.isort.cfg +++ /dev/null @@ -1,4 +0,0 @@ -[settings] -profile=black -known_third_party=wandb,comet_ml -known_local_folder=src,tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4c9268529..4c2861346 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -10,22 +10,12 @@ repos: - id: trailing-whitespace - id: no-commit-to-branch args: ['--branch', 'main'] -- repo: https://github.com/psf/black - rev: 25.1.0 +- repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.12.9 hooks: - - id: black -- repo: https://github.com/pycqa/isort - rev: 6.0.1 - hooks: - - id: isort -- repo: https://github.com/PyCQA/flake8 - rev: 7.3.0 - hooks: - - id: flake8 -- repo: https://github.com/pylint-dev/pylint - rev: v3.3.8 - hooks: - - id: pylint + - id: ruff + args: [--fix] + - id: ruff-format - 
repo: https://github.com/pre-commit/mirrors-mypy rev: v1.17.1 hooks: diff --git a/.pylintrc b/.pylintrc deleted file mode 100644 index 208dd32b6..000000000 --- a/.pylintrc +++ /dev/null @@ -1,15 +0,0 @@ -[MASTER] -init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())" - -[TYPECHECK] - -# List of members which are set dynamically and missed by Pylint inference -# system, and so shouldn't trigger E1101 when accessed. -generated-members=numpy.*, torch.* - - -[pylint.messages_control] -disable=missing-function-docstring, line-too-long, import-error, - too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods, - too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation, - too-many-positional-arguments, possibly-used-before-assignment diff --git a/cicd/multigpu.py b/cicd/multigpu.py index 2c067f143..5bd8d3c04 100644 --- a/cicd/multigpu.py +++ b/cicd/multigpu.py @@ -2,8 +2,6 @@ modal application to run axolotl gpu tests in Modal """ -# pylint: disable=duplicate-code - import os import pathlib import tempfile @@ -63,7 +61,7 @@ def run_cmd(cmd: str, run_folder: str): # Propagate errors from subprocess. if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec - exit(exit_code) # pylint: disable=consider-using-sys-exit + exit(exit_code) @app.function( diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index eb34e1748..0e2922e90 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -1,7 +1,5 @@ """Modal app to run axolotl GPU tests""" -# pylint: disable=duplicate-code - import os import pathlib import tempfile @@ -70,4 +68,4 @@ def run_cmd(cmd: str, run_folder: str): # Propagate errors from subprocess. 
if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec - exit(exit_code) # pylint: disable=consider-using-sys-exit + exit(exit_code) diff --git a/docs/scripts/generate_config_docs.py b/docs/scripts/generate_config_docs.py index e22da7d05..6efa2038b 100644 --- a/docs/scripts/generate_config_docs.py +++ b/docs/scripts/generate_config_docs.py @@ -47,7 +47,6 @@ class QuartoGenerator: """Check if a type is a Pydantic BaseModel.""" return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel) - # pylint: disable=too-many-return-statements def _extract_nested_type(self, field_type) -> Any: """Extract the actual type from complex type annotations.""" # Handle Annotated types (Python 3.9+) @@ -124,7 +123,6 @@ class QuartoGenerator: return field_type - # pylint: disable=too-many-return-statements def _extract_all_pydantic_models_from_type( self, field_type ) -> list[type[BaseModel]]: @@ -318,7 +316,6 @@ class QuartoGenerator: return all_groups - # pylint: disable=too-many-return-statements def _extract_field_groups_from_source( self, model_class: type[BaseModel] ) -> list[dict]: @@ -503,7 +500,7 @@ class QuartoGenerator: nested_schema = nested_model.model_json_schema() nested_properties = nested_schema.get("properties", {}) nested_required = nested_schema.get("required", []) - except Exception: # pylint: disable=broad-exception-caught + except Exception: # Fallback: use model fields directly nested_properties = {} nested_required = [] @@ -607,7 +604,7 @@ class QuartoGenerator: schema = model_class.model_json_schema() properties = schema.get("properties", {}) required = schema.get("required", []) - except Exception as e: # pylint: disable=broad-exception-caught + except Exception as e: print( f"Warning: Could not generate JSON schema ({e}). Using model fields instead." ) diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index 69881997e..30ef1c3de 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -1,9934 +1,9944 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "OPLSwmgdrB7g" - }, - "source": [ - "# Fine-Tune Qwen3 14B with Axolotl\n", - "\n", - "[\"Built](https://github.com/axolotl-ai-cloud/axolotl)\n", - "\n", - "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n", - "\n", - "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n", - "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n", - "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n", - "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "rVjKD7CbxIP3" - }, - "source": [ - "# Installation\n", - "\n", - "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "msOCO4NRmRLa" - }, - "outputs": [], - "source": [ - "%%capture\n", - "# This step can take ~5-10 minutes to install dependencies\n", - "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", - "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "N0OW0YeksDLr" - }, - "source": [ - "## Demo: Talk Like a Pirate\n", - "\n", - "In this demo, we are training the model ***to respond like a pirate***. This was chosen as a way to easily show how to train a model to respond in a certain style of your choosing (without being prompted) and is quite easy to validate within the scope of a Colab." - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8Du2fANTsNCK" - }, - "source": [ - "### Upload your own dataset or use a Huggingface dataset\n", - "\n", - "You can choose to use your own JSONL file from your own [Google Drive](https://drive.google.com/drive/home); for example downloading the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. JSONL datasets should be formatted similar to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n", - "\n", - "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "fGEEjyQ-r_IV" - }, - "outputs": [], - "source": [ - "# Default to HF dataset location\n", - "dataset_id = \"winglian/pirate-ultrachat-10k\"\n", - "uploaded = {}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "c5MyYqk7vIsG" - }, - "outputs": [], - "source": [ - "import os\n", - "# Optionally, upload your own JSONL to your Google Drive\n", - "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n", - "\n", - "# \"Select All\" permissions, or you may get the error:\n", - "# \"MessageError: Error: credential propagation was unsuccessful\"\n", - "if GOOGLE_DRIVE_PATH:\n", - " from google.colab import drive\n", - " # Mount your Google Drive\n", - " GOOGLE_DRIVE_MNT = \"/content/drive/\"\n", - " drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n", - " tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n", - " # make sure file exists\n", - " if not os.path.isfile(tmp_path):\n", - " raise ValueError(f\"File {tmp_path} does not exist\")\n", - " dataset_id = tmp_path\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U6pTk3A9xj1W" - }, - "source": [ - "# Configure for Supervised Fine-Tuning (SFT)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 151, - "referenced_widgets": [ - "388f618924274d21a066f098f4f1e744", - "7c95f85a2b1f47a1bd846d110c47bb3c", - "083f9cda8d754c168beee10d2f8955a2", - "62e1a65582f446a78612eaa804e08a7d", - "487a177d020f4605834878b2fdc7afa3", - "7fd44cf9ca6e4726bfd7ac21846d6a14", - "366a343b62fa47d8985a3bd464d99f9e", - "a0a11e929edd4189b79723d618522c33", - "e87ea87fcff247b5bbcc331ba79a8dc2", - "5e18768f7ad6434ba8b8b8a2e853e204", - "bb33aec33a6447078c31bfd728942994" - ] - }, - "id": "fdRioqytmTtX", - "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e" - }, - 
"outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", - "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", - "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n", - "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n", - "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "388f618924274d21a066f098f4f1e744", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "config.json: 0%| | 0.00/728 [00:00\"],\n", - " }\n", - " ],\n", - " dataloader_prefetch_factor = 8, # dataloader optimizations\n", - " dataloader_num_workers = 2,\n", - " dataloader_pin_memory = True,\n", - " )\n", - "\n", - "# validates the configuration\n", - "cfg = load_cfg(config)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "715UpvnSoBIS" - }, - "outputs": [], - "source": [ - "from axolotl.utils import patch_optimized_env\n", - "# speedup downloads from HF 🤗 and set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n", - "patch_optimized_env()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Vc6MC-hwyH-n" - }, - "source": [ - "# Datasets\n", - "\n", - "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. 
For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "b82aa8c57f7c422a9a9c90f333ed2a99", - "c0991cf63ee6458b96e9a75e7a88b61a", - "71c8af139cd248b1b51101fd46a93f35", - "1d5117195d4b49eb8f1a73b18419f7ce", - "3c21e4a511b4441192c03b7f1d0976e9", - "ed28e2e0410d4e0b855467e798e53d66", - "d93f134f802b4b69b575bdaf07dbd27c", - "d0e9dce55cec4c1ca619a0ccf209d924", - "4c727d40ef0443449afc31724ee79f0c", - "0dea5caa27384f5689e3cab51f558727", - "a6f48410b9964fefba0c3009a77dc838", - "95caff42f08a4c2aa14c867b8f37f231", - "de7c37ee83e24f0c889e84d07279c2ec", - "9d4897eefb5f48259ffb2d23e332f752", - "253017b0d0534e54ab44e181f6d7c82d", - "27beaf06e41b472abdb544a43c720c5a", - "34cf3df51fbc41cabfdbba153c007f0e", - "ac764024cf1c4e08ba7749afd2cd20ac", - "30a81da86f8043eca301e86a8651201a", - "e8b7a81040904c1e89e58978223b1737", - "1c6f1f10667545aaab958016ba7e2c94", - "e6e969610738449887259063967f82b0", - "a138859f19b74fc0928dc236ab5359db", - "9b42e08b3c9548818488268768a118b1", - "12b56912736849fea2ad8124456fdc5c", - "879c8ab5873847a8833bd74123be90a4", - "20352e5f58d24bb8b1f3940efd14fe4a", - "d955dcaa0e944e719f3a06139dd54a03", - "d3de2662c7964f1ba96e58da382af720", - "97e36007e1304e1583fd81bfb13f0edd", - "c65dc74c7d6f4bab8f7dd28455161dd8", - "ef223e8504b64e3592589880326aaf41", - "598da69727bd4fb8b1caf465ac736d7a", - "5f86cd894de94c3280fadc1e2fd0ee13", - "a20927bf5f2c41f58c1e31ac858ab36c", - "0a46ad75c198463d843fb35e813642cb", - "09007681cf8d42aeb8c1d2f6a74e470a", - "ebc80d1a55fa47f4a5ea2756588569ec", - "1811cda0644e4190a9469d1774435d82", - "35c811d2ae8e43f3b5cecbdd3cfa857f", - "b8e39e4dddc3497fbc29ae45c66da759", - "63b4e563e85c4f03b1b72beda9577bcc", - "b195f160ca20442fadd8b5aed0ee41af", - "ca65e32eb52f48c09a84b33cb18f22cd", - "7cd0b85ebd204b7aba908417811ce4e0", - "7baeab52d6694c32b1efd1ea1a0a7782", - "519a7b154022443db6703f04a9142bae", - "d4183e9715f34d249942b8271cca3bdf", - "da2347ac94764a3fa2743343cf0d3cd2", - "93a44a11aa4846fa8efc6c1413ef1627", - "a55060adc3564407ac81ad7297d34aaa", - "d02274afd47b462291c745f261209d42", - "0f417447a7bd4a33acca96fa37aec877", - "63580b6fb30642479fe3000915bf551a", - "8f726dbfb45d4528afa33e36a6313267", - "03b093d592ba4386aa61f7b8483da660", - "b8766a88716948cf968f4563531a76d9", - "6f3a28b912714c6e931003549664bfa3", - "16d1283741404b7bb319094c992fce01", - "2a5bb0e818ab47be8cf6465988328503", - "2b3a2659b12244bd8548320320016dbf", - "0cd7efffbb3c4c4b972e63749f61ab97", - "5ca240f31e6b44e3882c5eb37cd5a309", - "5eb06edeb58e4930b1affef2a59eae81", - "a4e5789584564049b83df7c6c54a3e08", - "ff3a94b146a948b6907f5d80c7157f99", - "258b7c635c1045329d4669e48c46ccd5", - "6f68ed9889f54ad2ae8a3b95ac263a83", - "80366349d81e4dcc892db6cd56e384f3", - "c73055099c084dca996159e23e162d0b", - "977f799afaac4a55b2dc1cffa7d5b63b", - "41f3b32c2f6b4034ae7a3b9124e28bc7", - "a10d0a76010f4e508c65a9b69ebc5156", - "f8ef805b776145c3bfa9ba8d90972058", - "cc587493c33c4f118d1b1170f85be24c", - "e40d1c1ac9494b3bade9858324e7ffdf", - "d65b6b060d9845779299491ac5599c31", - "0f6907ebbc6242c8bde059cef1e1bd29", - "5bdfd87fc6cd4f9dabef7cfee29c8060", - "64f54d4a744a4627a07c3c0120276f3b", - "65b75b9b8bc143cf997796af68ff6668", - "d6fe74e4255444368f8f90a62157d869", - "4d468f96ec924681ad65eb671674b93e", - "ad7599de524549c48bf2d3124ad4b299", - "0546d04aae644dde846c58a4afb598a6", - 
"897b77a56c09479bb11d7f2a30997e55", - "81c3db71ac704280ad030072655f1537", - "042e091f75694c47aee761e760e76773", - "ef0a3c7a6f14460fb4da096928ae249e", - "07fb3a2c8315494e97b447e672dfae06", - "ec030fc3c346426f9abc3a89892258d3", - "e3fb3fc6afe04b3c9b7ac61809ce78fa", - "c3be9109d63c485d9c0ef4f9bc0f9218", - "12815f401eba44658caa7b2e490137a8", - "30e02aa2d0d241979369e598287f2639", - "dfd2a2649b8341ef913207526708aff1", - "4f1977d7e4824ef1a14b65f0f42bba10", - "c6164e05a1914ae48083db9ad7f4ef7c", - "813621384dc748b0ad06775e22761c0b", - "dc892a596f6942d7973c616c38f0eebb", - "c84cc07789be48aebb322c23d355289e", - "bed8726b8069434687c75452e21f19e5", - "16a188a0b06d45f980dcf3933509fe0a", - "60c1a0d765c14a1d888317e6a507e4ea", - "0077aedc3d174560bce924ee89e9c006", - "00321cce58884f6f9b3855a21fcd9187", - "fa864b41586f4a7aa56aeafd1d84eb75", - "3225603166b54e7aab766b9964a2f660", - "349eee9f56d64f0cba6fc24ff2c50c9b", - "7e5d3774060e4589aa65982da5ea4ef4", - "7c2485c6cdfe463da6fdb35982a1070d", - "ad1236893754446881e153adc9d5c962", - "daee63fd167e4441a32324b51b00ad2b", - "fe41858c6bd04c58840112b67c19a336", - "d262c82138024169b9f3aa034ca756fa", - "62e302ebdad64aada0ffe64ae1c873f3", - "bd1b0dfed6d34d16af33a4a58330f5ec", - "d07c8b97d3314f1c852e44bdd40f61ed", - "ebb69a2c3d0a4299a484698287b3087c", - "e5a82df528bb4e408797a3b6c2758f4a", - "f113ebd8c1c34806bea4dd7ed3035173" - ] - }, - "id": "KQQhgK8FoDfF", - "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "tokenizer_config.json: 0%| | 0.00/9.68k [00:00\u001b[39m\n", - "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n", - "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n", - "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n", - "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n", - "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n", - "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. 
Please pre-process your dataset.\u001b[39m\n", - "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "7cd0b85ebd204b7aba908417811ce4e0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "train.jsonl: 0%| | 0.00/27.3M [00:00system\\n' }}\n", - " {%- if messages[0].role == 'system' %}\n", - " {{- messages[0].content + '\\n\\n' }}\n", - " {%- endif %}\n", - " {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n", - " {%- for tool in tools %}\n", - " {{- \"\\n\" }}\n", - " {{- tool | tojson }}\n", - " {%- endfor %}\n", - " {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n", - "{%- else %}\n", - " {%- if messages[0].role == 'system' %}\n", - " {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n", - " {%- endif %}\n", - "{%- endif %}\n", - "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n", - "{%- for message in messages[::-1] %}\n", - " {%- set index = (messages|length - 1) - loop.index0 %}\n", - " {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n", - " {%- set ns.multi_step_tool = false %}\n", - " {%- set ns.last_query_index = index %}\n", - " {%- endif %}\n", - "{%- endfor %}\n", - "{%- for message in messages %}\n", - " {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n", - " {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n", - " {%- elif message.role == \"assistant\" %}\n", - " {%- set content = message.content %}\n", - " {%- set reasoning_content = '' %}\n", - " {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n", - " {%- set reasoning_content = message.reasoning_content %}\n", - " {%- else %}\n", - " {%- if '' in message.content %}\n", - " {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n", - " {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n", - " {%- endif %}\n", - " {%- endif %}\n", - " {%- if loop.index0 > ns.last_query_index %}\n", - " {%- if loop.last or (not loop.last and reasoning_content) %}\n", - " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n", - " {%- else %}\n", - " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", - " {%- endif %}\n", - " {%- else %}\n", - " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", - " {%- endif %}\n", - " {%- if message.tool_calls %}\n", - " {%- for tool_call in message.tool_calls %}\n", - " {%- if (loop.first and content) or (not loop.first) %}\n", - " {{- '\\n' }}\n", - " {%- endif %}\n", - " {%- if tool_call.function %}\n", - " {%- set tool_call = tool_call.function %}\n", - " {%- endif %}\n", - " {{- '\\n{\"name\": \"' }}\n", - " {{- tool_call.name }}\n", - " {{- '\", \"arguments\": ' }}\n", - " {%- if tool_call.arguments is string %}\n", - " {{- tool_call.arguments }}\n", - " {%- else %}\n", - " {{- tool_call.arguments | tojson }}\n", - " {%- endif %}\n", - " {{- '}\\n' }}\n", 
- " {%- endfor %}\n", - " {%- endif %}\n", - " {{- '<|im_end|>\\n' }}\n", - " {%- elif message.role == \"tool\" %}\n", - " {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n", - " {{- '<|im_start|>user' }}\n", - " {%- endif %}\n", - " {{- '\\n\\n' }}\n", - " {{- message.content }}\n", - " {{- '\\n' }}\n", - " {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n", - " {{- '<|im_end|>\\n' }}\n", - " {%- endif %}\n", - " {%- endif %}\n", - "{%- endfor %}\n", - "{%- if add_generation_prompt %}\n", - " {{- '<|im_start|>assistant\\n' }}\n", - " {%- if enable_thinking is defined and enable_thinking is false %}\n", - " {{- '\\n\\n\\n\\n' }}\n", - " {%- endif %}\n", - "{%- endif %}\n", - "---\u001b[39m\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "258b7c635c1045329d4669e48c46ccd5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Tokenizing Prompts (num_proc=2): 0%| | 0/9985 [00:00\n", - " \n", - " \n", - " [25/25 09:25, Epoch 0/1]\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
- "Step  Training Loss\n",
- "   1       1.092300\n",
- "   2       1.554200\n",
- "   3       1.041400\n",
- "   4       1.733800\n",
- "   5       1.430000\n",
- "   6       1.258500\n",
- "   7       1.343600\n",
- "   8       1.101700\n",
- "   9       1.086500\n",
- "  10       0.813200\n",
- "  11       0.689600\n",
- "  12       0.826700\n",
- "  13       1.541800\n",
- "  14       0.948000\n",
- "  15       1.357000\n",
- "  16       1.085800\n",
- "  17       1.516800\n",
- "  18       1.146800\n",
- "  19       0.834800\n",
- "  20       0.968000\n",
- "  21       1.388800\n",
- "  22       1.511500\n",
- "  23       1.338500\n",
- "  24       1.206600\n",
- "  25       1.504600\n",
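If you prefer to inspect these losses programmatically rather than reading the rendered table, they can be pulled from the returned trainer after training; a sketch, assuming the `trainer` returned by `train()` exposes the Hugging Face `Trainer.state.log_history` list:

```python
# Each log entry is a dict; training-loss entries carry a "loss" key.
losses = [(log["step"], log["loss"]) for log in trainer.state.log_history if "loss" in log]
for step, loss in losses:
    print(f"step {step:>3}: loss {loss:.4f}")
```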

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[2025-05-07 22:12:42,746] [INFO] [axolotl.callbacks.on_step_end:128] [PID:1336] [RANK:0] cuda memory usage while training: 9.768GB (+3.287GB cache, +0.646GB misc)\u001b[39m\n", - "[2025-05-07 22:21:46,859] [INFO] [axolotl.train.save_trained_model:231] [PID:1336] [RANK:0] Training completed! Saving pre-trained model to ./outputs/qwen-sft-pirate-rrr.\u001b[39m\n" - ] - } - ], - "source": [ - "from axolotl.train import train\n", - "\n", - "# just train the first 25 steps for demo.\n", - "# This is sufficient to align the model as we've used packing to maximize the trainable samples per step.\n", - "cfg.max_steps = 25\n", - "model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "j1b9ypF78eCb" - }, - "source": [ - "# Inferencing the trained model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "r3_vHhif8YEs", - "outputId": "e5050605-f6c9-421c-98f9-bde56a281eae" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.\n", - "\n", - "The Pythagorean theorem be a real gem of a mathematical trick that helps ye find the length of a side of a right triangle. Now, a right triangle be a triangle with a right angle, which be that little corner that looks like a square. \n", - "\n", - "The theorem be named after a clever fellow named Pythagoras, who be a mathematician from ancient Greece. He discovered that if ye have a right triangle, the square of the length of the hypotenuse (that be the side opposite the right angle) be equal to the sum of the squares of the other two sides. 
\n", - "\n", - "In other words, if ye have a triangle with sides of length a, b, and c (\n" - ] - } - ], - "source": [ - "import torch\n", - "from transformers import TextStreamer\n", - "\n", - "messages = [\n", - " {\n", - " \"role\": \"user\",\n", - " \"content\": \"Explain the Pythagorean theorem to me.\",\n", - " },\n", - "]\n", - "\n", - "prompt = tokenizer.apply_chat_template(\n", - " messages,\n", - " add_generation_prompt=True,\n", - " tokenize=False,\n", - " enable_thinking = False,\n", - ")\n", - "\n", - "outputs = model.generate(\n", - " **tokenizer(prompt, return_tensors = \"pt\").to(\"cuda\"),\n", - " max_new_tokens = 192,\n", - " temperature = 1.0, top_p = 0.8, top_k = 32,\n", - " streamer = TextStreamer(tokenizer, skip_prompt = True),\n", - ")\n" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "HoGwT2JRSIjA" - }, - "source": [ - "# Saving your trained model\n", - "\n", - "Axolotl automatically saves checkpoints to the `output_dir` path.\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "5BmSbiy6NaaS", - "outputId": "f5e1d913-7d55-42d2-8340-f9f1b0bc2b38" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "total 506M\n", - "-rw-r--r-- 1 root root 845 May 7 22:21 adapter_config.json\n", - "-rw-r--r-- 1 root root 491M May 7 22:21 adapter_model.safetensors\n", - "-rw-r--r-- 1 root root 707 May 7 22:11 added_tokens.json\n", - "drwxr-xr-x 2 root root 4.0K May 7 22:17 checkpoint-13\n", - "drwxr-xr-x 2 root root 4.0K May 7 22:21 checkpoint-25\n", - "-rw-r--r-- 1 root root 1.2K May 7 22:11 config.json\n", - "-rw-r--r-- 1 root root 1.6M May 7 22:11 merges.txt\n", - "-rw-r--r-- 1 root root 2.6K May 7 22:21 README.md\n", - "-rw-r--r-- 1 root root 613 May 7 22:11 special_tokens_map.json\n", - "-rw-r--r-- 1 root root 9.5K May 7 22:11 tokenizer_config.json\n", - "-rw-r--r-- 1 root root 11M May 7 22:11 tokenizer.json\n", - "-rw-r--r-- 1 root root 2.7M May 7 22:11 vocab.json\n" - ] - } - ], - "source": [ - "# Show the saved checkpoints in the output_dir\n", - "!ls -lh \"./outputs/qwen-sft-pirate-rrr\"" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "_PCIFWxuOZd6" - }, - "source": [ - "Setting `hub_model_id: ` in the original config would have automatically uploaded the model to HuggingFace Hub (e.g. `hub_model_id: username/model_id`)\n", - "\n", - "If you prefer to manually upload the training artifacts, we can still upload the entire final checkpoint to HuggingFace from the CLI." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 955, - "referenced_widgets": [ - "c12ea43372ac4d57bb9605f1a429b397", - "86816687746246b4a6105e8010384e25", - "6f05e9bebf7b40c9835808e77de6c236", - "c7433acd3c4841e6958ae8f7e87b1808", - "19c1e38389fa46c7b7e2152a56e1df34", - "0e067d8db8ed48308a718d5f57683fd1", - "131065f118274a1586ac38e39ed84ef0", - "8640ac440fbc4644b9a3af7ba3ae7183", - "5cea7996f02040b187ece0bb2d6a8d1f", - "2e257c8be2da40b4bb67a9e4ab6811f3", - "56e3768bef5a4b9db4168c5c17f509c2", - "62c028fdef904dedb9cdeca2b3bda725", - "a7cf477e80fc43e0ad82c7997b076dce", - "835bcc28a5564fb9b3d651bc8e32dc46", - "9f1c9a0695384bdaa6f8b847ef89bee8", - "b1bea589efa14258a9982071b87938bf", - "590eef89881545aa8bbef9a8bbe7fb00", - "4b1f04ff63d14a118fdd15814dff50e4", - "39789237703c4a418134243055c9cbf5", - "a3a945817f684328b34651fe052393ec" - ] - }, - "id": "2yw8pLvlSMl8", - "outputId": "6e489ab2-4abe-4e28-84ca-959f912433a4" - }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "c12ea43372ac4d57bb9605f1a429b397", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "VBox(children=(HTML(value='

\n", - " sys.exit(main())\n", - " ^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n", - " service.run()\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n", - " print(self._upload())\n", - " ^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n", - " return self.api.upload_folder(\n", - " ^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", - " return fn(self, *args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n", - " commit_info = self.create_commit(\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", - " return fn(self, *args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n", - " self.preupload_lfs_files(\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n", - " _upload_xet_files(**upload_kwargs, create_pr=create_pr) # type: ignore [arg-type]\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", - " return fn(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^\n", - " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n", - " with progress_cm as progress:\n", - " File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n", - " def __exit__(self, exc_type, exc_value, traceback):\n", - "\n", - "KeyboardInterrupt\n", - "^C\n" - ] - } - ], - "source": [ - "from huggingface_hub import notebook_login\n", - "# remove the partial epoch checkpoints\n", - "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n", - "\n", - "# HF Notebook login widget\n", - "notebook_login()\n", - "\n", - "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n", - "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\"" - ] - } - ], - "metadata": { - "accelerator": "GPU", - "colab": { - "gpuType": "T4", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "name": "python" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "00321cce58884f6f9b3855a21fcd9187": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - 
"_view_name": "StyleView", - "description_width": "" - } - }, - "004d9177a6a14118a5930dc3cc13147b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a80410b919e442c49aea15acc1ce1a72", - "IPY_MODEL_c6e00f5224364822bc4239b176686919", - "IPY_MODEL_ec11d1e5ae7b42c883d9b1f38a65356e" - ], - "layout": "IPY_MODEL_734185351eb543fa9a00a881dcbb9fe7" - } - }, - "0077aedc3d174560bce924ee89e9c006": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "03a3c744d716431488163b4358b80f92": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "03b093d592ba4386aa61f7b8483da660": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, 
- "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_b8766a88716948cf968f4563531a76d9", - "IPY_MODEL_6f3a28b912714c6e931003549664bfa3", - "IPY_MODEL_16d1283741404b7bb319094c992fce01" - ], - "layout": "IPY_MODEL_2a5bb0e818ab47be8cf6465988328503" - } - }, - "042e091f75694c47aee761e760e76773": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0546d04aae644dde846c58a4afb598a6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "054c8dffadba48c6b895a6cc62448ecc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "07fb3a2c8315494e97b447e672dfae06": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8", - "placeholder": "​", - "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639", - "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%" - } - }, - "083f9cda8d754c168beee10d2f8955a2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": 
"FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a0a11e929edd4189b79723d618522c33", - "max": 728, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e87ea87fcff247b5bbcc331ba79a8dc2", - "value": 728 - } - }, - "09007681cf8d42aeb8c1d2f6a74e470a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af", - "placeholder": "​", - "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd", - "value": " 11.4M/11.4M [00:00<00:00, 21.8MB/s]" - } - }, - "0a46ad75c198463d843fb35e813642cb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b8e39e4dddc3497fbc29ae45c66da759", - "max": 11422654, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_63b4e563e85c4f03b1b72beda9577bcc", - "value": 11422654 - } - }, - "0aa8ab56b85f4171a79c3bc210594025": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0b4c9753a7cb4354b8e5f187e6e1ad7c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0cd7efffbb3c4c4b972e63749f61ab97": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0dea5caa27384f5689e3cab51f558727": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0e067d8db8ed48308a718d5f57683fd1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf", - "placeholder": "​", - "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00", - "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.
" - } - }, - "0e50870ed0c643e0b6c18cc5d7ddae7f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_bfcdbba993b74972a9e3e575f86908ff", - "placeholder": "​", - "style": "IPY_MODEL_6ebb2ec171414e47a14765505f64bb3c", - "value": " 3.84G/3.84G [00:09<00:00, 664MB/s]" - } - }, - "0e936d9dbf9c4fdd86bbfe9730dedc47": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "0f417447a7bd4a33acca96fa37aec877": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "0f480e3a0b0a45d2a2d2dec3cad923f3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "0f6907ebbc6242c8bde059cef1e1bd29": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_5bdfd87fc6cd4f9dabef7cfee29c8060", - "IPY_MODEL_64f54d4a744a4627a07c3c0120276f3b", - "IPY_MODEL_65b75b9b8bc143cf997796af68ff6668" - ], - "layout": 
"IPY_MODEL_d6fe74e4255444368f8f90a62157d869" - } - }, - "114dece49dba437c8572ef94b23c3b1e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "12815f401eba44658caa7b2e490137a8": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "12b56912736849fea2ad8124456fdc5c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_97e36007e1304e1583fd81bfb13f0edd", - "max": 1671853, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_c65dc74c7d6f4bab8f7dd28455161dd8", - "value": 1671853 - } - }, - "131065f118274a1586ac38e39ed84ef0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": "center", - "align_self": null, - "border": null, - "bottom": null, - "display": "flex", - "flex": null, - "flex_flow": "column", - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "50%" - } - }, - "158c8b85dbf34de6a94b4e35e2fc7d5a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "16a188a0b06d45f980dcf3933509fe0a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_349eee9f56d64f0cba6fc24ff2c50c9b", - "placeholder": "​", - "style": "IPY_MODEL_7e5d3774060e4589aa65982da5ea4ef4", - "value": " 9985/9985 [00:04<00:00, 2604.11 examples/s]" - } - }, - "16d1283741404b7bb319094c992fce01": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_a4e5789584564049b83df7c6c54a3e08", - "placeholder": "​", - "style": "IPY_MODEL_ff3a94b146a948b6907f5d80c7157f99", - "value": " 9985/0 
[00:00<00:00, 50763.46 examples/s]" - } - }, - "1811cda0644e4190a9469d1774435d82": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "18357b321ce44d7b8bd9d1c886f69275": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e366ae3fceec4566b9ed303d6c5f90af", - "placeholder": "​", - "style": "IPY_MODEL_5dd7d150dbe04f08b165ce7f2c27cd11", - "value": "model-00008-of-00008.safetensors: 100%" - } - }, - "19127c7bb1554ccbac877059f9a82db0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e400cbf14bcc446a9d33b210cd93550b", - "max": 3963750880, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_71002199df6b40c9a1ac40df5fb27a1b", - "value": 3963750502 - } - }, - "19c1e38389fa46c7b7e2152a56e1df34": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ButtonModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ButtonView", - "button_style": "", - "description": "Login", - "disabled": false, - "icon": "", - "layout": "IPY_MODEL_835bcc28a5564fb9b3d651bc8e32dc46", - "style": "IPY_MODEL_9f1c9a0695384bdaa6f8b847ef89bee8", - "tooltip": "" - } - }, - "1bec6297c90242a88672d195bc09d429": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - 
"_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1c6f1f10667545aaab958016ba7e2c94": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1d5117195d4b49eb8f1a73b18419f7ce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0dea5caa27384f5689e3cab51f558727", - "placeholder": "​", - "style": "IPY_MODEL_a6f48410b9964fefba0c3009a77dc838", - "value": " 9.68k/9.68k [00:00<00:00, 812kB/s]" - } - }, - "1f7d30f71bbd4547a9150d21da071055": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - 
"flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "200df5e79b9244849e589ecb0250a520": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f4a1795dc7514a718f478245f521f0ba", - "placeholder": "​", - "style": "IPY_MODEL_5e746eb25bbe416fb585fa24e79f5177", - "value": "model-00002-of-00008.safetensors: 100%" - } - }, - "20352e5f58d24bb8b1f3940efd14fe4a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "253017b0d0534e54ab44e181f6d7c82d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1c6f1f10667545aaab958016ba7e2c94", - "placeholder": "​", - "style": "IPY_MODEL_e6e969610738449887259063967f82b0", - "value": " 2.78M/2.78M [00:00<00:00, 17.8MB/s]" - } - }, - "258b7c635c1045329d4669e48c46ccd5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - 
"_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_6f68ed9889f54ad2ae8a3b95ac263a83", - "IPY_MODEL_80366349d81e4dcc892db6cd56e384f3", - "IPY_MODEL_c73055099c084dca996159e23e162d0b" - ], - "layout": "IPY_MODEL_977f799afaac4a55b2dc1cffa7d5b63b" - } - }, - "279937fe03bc4e4eb25b472d7e9df163": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b634bb73cfa743d09a5999101b840976", - "max": 1912371880, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_742b1030acfd414bbd9d5327b7e3826d", - "value": 1912371698 - } - }, - "27beaf06e41b472abdb544a43c720c5a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2860e3bb3baf4f7da058465850e800c5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_3efd18ea8eaa41918894883da9541bfa", - "IPY_MODEL_e09f1bcbb9d94c09be53e5e1303642c2", - "IPY_MODEL_82177df57a494de8900c14c2f5185175" - ], - "layout": "IPY_MODEL_ccfcdc95baf646f8aeb3d516742383f2" - } - }, - "2a51b36be41745468e4c2d7a21b1c0d2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - 
"align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2a5bb0e818ab47be8cf6465988328503": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2b3a2659b12244bd8548320320016dbf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2e257c8be2da40b4bb67a9e4ab6811f3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - 
"_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "2e2b0c1599c341a198f632f46a40c90e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_be724f04b03942b2a033a7e8898bb4fd", - "placeholder": "​", - "style": "IPY_MODEL_fcbab4d8dced41a18dfccce81e3a45a0", - "value": "model-00005-of-00008.safetensors: 100%" - } - }, - "3036608c71904ce9ae4bb2a9fa8802d9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5ca6be24acb548cea130bd58e9954c7c", - "placeholder": "​", - "style": "IPY_MODEL_5cfb02ee044b4011a378efa8b54a370f", - "value": " 3.96G/3.96G [00:10<00:00, 531MB/s]" - } - }, - "30a81da86f8043eca301e86a8651201a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": 
null, - "visibility": null, - "width": null - } - }, - "30e02aa2d0d241979369e598287f2639": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "3225603166b54e7aab766b9964a2f660": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "33b3b1d0295646edaac7b4822761aeb0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "349eee9f56d64f0cba6fc24ff2c50c9b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "34c9c0137b504cd799c6bd6de69507c2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, 
- "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "34cf3df51fbc41cabfdbba153c007f0e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "35c811d2ae8e43f3b5cecbdd3cfa857f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "35cc989ca3374e7dba0cb166febc4bde": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "366a343b62fa47d8985a3bd464d99f9e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "37de928300e34184881039378bd75e7f": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - 
"border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "388f618924274d21a066f098f4f1e744": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7c95f85a2b1f47a1bd846d110c47bb3c", - "IPY_MODEL_083f9cda8d754c168beee10d2f8955a2", - "IPY_MODEL_62e1a65582f446a78612eaa804e08a7d" - ], - "layout": "IPY_MODEL_487a177d020f4605834878b2fdc7afa3" - } - }, - "39789237703c4a418134243055c9cbf5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3aaecbf540f54a2db9ab0931e3b1fe57": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - 
"justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3c21e4a511b4441192c03b7f1d0976e9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "3efd18ea8eaa41918894883da9541bfa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8f5bd719974e41c3a8dd9a5b0d3d71e6", - "placeholder": "​", - "style": "IPY_MODEL_b87c84de30e84b3abf4871461fb9cbd3", - "value": "Loading checkpoint shards: 100%" - } - }, - "41f3b32c2f6b4034ae7a3b9124e28bc7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - 
} - }, - "4471ff62258549fba9514bb67050f965": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9cd5211b5d8b457aa0002f1d17b80028", - "IPY_MODEL_19127c7bb1554ccbac877059f9a82db0", - "IPY_MODEL_f4667818b9d34a09891cd727a429a610" - ], - "layout": "IPY_MODEL_9ed02dc43412471a9ab47f3620ccf3a5" - } - }, - "4540927d98f54466b434ba4c0edf045d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "487a177d020f4605834878b2fdc7afa3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4b1f04ff63d14a118fdd15814dff50e4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "LabelModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "LabelModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "LabelView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_39789237703c4a418134243055c9cbf5", - "placeholder": "​", - "style": "IPY_MODEL_a3a945817f684328b34651fe052393ec", - "value": "Connecting..." 
- } - }, - "4b27c267393640f28f6eae0875bd2ed9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4c727d40ef0443449afc31724ee79f0c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4d05314858354e729d76094b3b0ce761": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c42acf646f344a88b8c11f81e67f7206", - "IPY_MODEL_7be6f04c284e4326bb4ff3d301e7b3c6", - "IPY_MODEL_ffdbb12a2f2c4d14911685e7683e0ef0" - ], - "layout": "IPY_MODEL_bee3501b2a17427784a717e50a85e7fa" - } - }, - "4d468f96ec924681ad65eb671674b93e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - 
"overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "4f1977d7e4824ef1a14b65f0f42bba10": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "4fd114abe9f5494ab59858949f5055f1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "500e272208a246089613bf788a165271": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_200df5e79b9244849e589ecb0250a520", - "IPY_MODEL_cc94432d08464affa3e58b560bdad194", - "IPY_MODEL_3036608c71904ce9ae4bb2a9fa8802d9" - ], - "layout": "IPY_MODEL_adacfdcc1b0140efac56918e9ccf064e" - } - }, - "519a7b154022443db6703f04a9142bae": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d02274afd47b462291c745f261209d42", - "max": 27341251, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0f417447a7bd4a33acca96fa37aec877", - "value": 27341251 - } - }, - "56e3768bef5a4b9db4168c5c17f509c2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "590eef89881545aa8bbef9a8bbe7fb00": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "598da69727bd4fb8b1caf465ac736d7a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": 
"1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5bdfd87fc6cd4f9dabef7cfee29c8060": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4d468f96ec924681ad65eb671674b93e", - "placeholder": "​", - "style": "IPY_MODEL_ad7599de524549c48bf2d3124ad4b299", - "value": "Dropping Long Sequences (num_proc=2): 100%" - } - }, - "5ca240f31e6b44e3882c5eb37cd5a309": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": "20px" - } - }, - "5ca6be24acb548cea130bd58e9954c7c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"5cea7996f02040b187ece0bb2d6a8d1f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5cfb02ee044b4011a378efa8b54a370f": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5dd7d150dbe04f08b165ce7f2c27cd11": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5e18768f7ad6434ba8b8b8a2e853e204": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "5e5e15b0569b474c9620083b3ec6af55": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "5e746eb25bbe416fb585fa24e79f5177": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - 
"description_width": "" - } - }, - "5eb06edeb58e4930b1affef2a59eae81": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "5f86cd894de94c3280fadc1e2fd0ee13": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a20927bf5f2c41f58c1e31ac858ab36c", - "IPY_MODEL_0a46ad75c198463d843fb35e813642cb", - "IPY_MODEL_09007681cf8d42aeb8c1d2f6a74e470a" - ], - "layout": "IPY_MODEL_ebc80d1a55fa47f4a5ea2756588569ec" - } - }, - "60c1a0d765c14a1d888317e6a507e4ea": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "62c028fdef904dedb9cdeca2b3bda725": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - 
"overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "62e1a65582f446a78612eaa804e08a7d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5e18768f7ad6434ba8b8b8a2e853e204", - "placeholder": "​", - "style": "IPY_MODEL_bb33aec33a6447078c31bfd728942994", - "value": " 728/728 [00:00<00:00, 20.3kB/s]" - } - }, - "62e302ebdad64aada0ffe64ae1c873f3": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "63580b6fb30642479fe3000915bf551a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "63b4e563e85c4f03b1b72beda9577bcc": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "64f54d4a744a4627a07c3c0120276f3b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_0546d04aae644dde846c58a4afb598a6", - "max": 9985, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_897b77a56c09479bb11d7f2a30997e55", - "value": 9985 - } - }, - "65b75b9b8bc143cf997796af68ff6668": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_81c3db71ac704280ad030072655f1537", - "placeholder": "​", - "style": "IPY_MODEL_042e091f75694c47aee761e760e76773", - "value": " 9985/9985 [00:02<00:00, 3977.47 examples/s]" - } - }, - "67da6c4260574869aa24c3cbc1bc1654": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6932489232ec4ab18a160b1e7fbcdfe1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - 
"grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6ebb2ec171414e47a14765505f64bb3c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "6f05e9bebf7b40c9835808e77de6c236": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "PasswordModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "PasswordModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "PasswordView", - "continuous_update": true, - "description": "Token:", - "description_tooltip": null, - "disabled": false, - "layout": "IPY_MODEL_2e257c8be2da40b4bb67a9e4ab6811f3", - "placeholder": "​", - "style": "IPY_MODEL_56e3768bef5a4b9db4168c5c17f509c2", - "value": "" - } - }, - "6f3a28b912714c6e931003549664bfa3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_5ca240f31e6b44e3882c5eb37cd5a309", - "max": 1, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_5eb06edeb58e4930b1affef2a59eae81", - "value": 1 - } - }, - "6f68ed9889f54ad2ae8a3b95ac263a83": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_41f3b32c2f6b4034ae7a3b9124e28bc7", - "placeholder": "​", - "style": "IPY_MODEL_a10d0a76010f4e508c65a9b69ebc5156", - "value": "Tokenizing Prompts (num_proc=2): 100%" - } - }, - "704f2f5a9b1c49d5a75a0025a5dda11b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", 
- "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "71002199df6b40c9a1ac40df5fb27a1b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "71c8af139cd248b1b51101fd46a93f35": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d0e9dce55cec4c1ca619a0ccf209d924", - "max": 9675, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4c727d40ef0443449afc31724ee79f0c", - "value": 9675 - } - }, - "734185351eb543fa9a00a881dcbb9fe7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "735d4f225b24414294fc1b213c61223c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - 
"min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "742b1030acfd414bbd9d5327b7e3826d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "77304d1a46b3468a98483e02ec0ac4a4": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "7baeab52d6694c32b1efd1ea1a0a7782": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_93a44a11aa4846fa8efc6c1413ef1627", - "placeholder": "​", - "style": "IPY_MODEL_a55060adc3564407ac81ad7297d34aaa", - "value": "train.jsonl: 100%" - } - }, - "7be6f04c284e4326bb4ff3d301e7b3c6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_9503a45960984adc97b58e16c50662e0", - "max": 3963750880, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_da6e93f3e4984780b930fe7a706983ea", - "value": 3963750502 - } - }, - "7c2485c6cdfe463da6fdb35982a1070d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - 
"_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_ad1236893754446881e153adc9d5c962", - "IPY_MODEL_daee63fd167e4441a32324b51b00ad2b", - "IPY_MODEL_fe41858c6bd04c58840112b67c19a336" - ], - "layout": "IPY_MODEL_d262c82138024169b9f3aa034ca756fa" - } - }, - "7c95f85a2b1f47a1bd846d110c47bb3c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_7fd44cf9ca6e4726bfd7ac21846d6a14", - "placeholder": "​", - "style": "IPY_MODEL_366a343b62fa47d8985a3bd464d99f9e", - "value": "config.json: 100%" - } - }, - "7cd0b85ebd204b7aba908417811ce4e0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_7baeab52d6694c32b1efd1ea1a0a7782", - "IPY_MODEL_519a7b154022443db6703f04a9142bae", - "IPY_MODEL_d4183e9715f34d249942b8271cca3bdf" - ], - "layout": "IPY_MODEL_da2347ac94764a3fa2743343cf0d3cd2" - } - }, - "7e5d3774060e4589aa65982da5ea4ef4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "7fd44cf9ca6e4726bfd7ac21846d6a14": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - 
"width": null - } - }, - "80366349d81e4dcc892db6cd56e384f3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f8ef805b776145c3bfa9ba8d90972058", - "max": 9985, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_cc587493c33c4f118d1b1170f85be24c", - "value": 9985 - } - }, - "813621384dc748b0ad06775e22761c0b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "81c3db71ac704280ad030072655f1537": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "82177df57a494de8900c14c2f5185175": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_67da6c4260574869aa24c3cbc1bc1654", - "placeholder": "​", - "style": "IPY_MODEL_94b9088614464f60a203de39dbcae853", - "value": " 8/8 [01:47<00:00, 11.64s/it]" - } - }, - "823f1c78f15043e38bbd4dca3932a86a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": 
"1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_03a3c744d716431488163b4358b80f92", - "max": 239, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_a5434ee714f9498d83870544b67c0cb7", - "value": 239 - } - }, - "835bcc28a5564fb9b3d651bc8e32dc46": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8640ac440fbc4644b9a3af7ba3ae7183": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "86816687746246b4a6105e8010384e25": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8640ac440fbc4644b9a3af7ba3ae7183", - "placeholder": "​", - "style": "IPY_MODEL_5cea7996f02040b187ece0bb2d6a8d1f", - "value": "

Copy a token from your Hugging Face\ntokens page and paste it below.
Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file.
" - } - }, - "879c8ab5873847a8833bd74123be90a4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ef223e8504b64e3592589880326aaf41", - "placeholder": "​", - "style": "IPY_MODEL_598da69727bd4fb8b1caf465ac736d7a", - "value": " 1.67M/1.67M [00:00<00:00, 19.0MB/s]" - } - }, - "897b77a56c09479bb11d7f2a30997e55": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "8bc9d8ba866c442b9118d9630009939c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8c4d4fc5a30f4e7cb3be53fe2adda33d": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - 
"overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8f5bd719974e41c3a8dd9a5b0d3d71e6": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "8f726dbfb45d4528afa33e36a6313267": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9327977822be4b1294f80e876552e305": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_37de928300e34184881039378bd75e7f", - "placeholder": "​", - "style": "IPY_MODEL_0e936d9dbf9c4fdd86bbfe9730dedc47", - "value": " 3.96G/3.96G [00:13<00:00, 273MB/s]" - } - }, - "936d04b5fe1b4c63bf0b080e423d051b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - 
"object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "93a44a11aa4846fa8efc6c1413ef1627": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "94b9088614464f60a203de39dbcae853": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9503a45960984adc97b58e16c50662e0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "95caff42f08a4c2aa14c867b8f37f231": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": 
"1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_de7c37ee83e24f0c889e84d07279c2ec", - "IPY_MODEL_9d4897eefb5f48259ffb2d23e332f752", - "IPY_MODEL_253017b0d0534e54ab44e181f6d7c82d" - ], - "layout": "IPY_MODEL_27beaf06e41b472abdb544a43c720c5a" - } - }, - "977f799afaac4a55b2dc1cffa7d5b63b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "97e36007e1304e1583fd81bfb13f0edd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9858cb74a09748a39e8149baac96702c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9b42e08b3c9548818488268768a118b1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - 
"_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d955dcaa0e944e719f3a06139dd54a03", - "placeholder": "​", - "style": "IPY_MODEL_d3de2662c7964f1ba96e58da382af720", - "value": "merges.txt: 100%" - } - }, - "9cd5211b5d8b457aa0002f1d17b80028": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_6932489232ec4ab18a160b1e7fbcdfe1", - "placeholder": "​", - "style": "IPY_MODEL_4540927d98f54466b434ba4c0edf045d", - "value": "model-00007-of-00008.safetensors: 100%" - } - }, - "9d4897eefb5f48259ffb2d23e332f752": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_30a81da86f8043eca301e86a8651201a", - "max": 2776833, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_e8b7a81040904c1e89e58978223b1737", - "value": 2776833 - } - }, - "9e333ed3b5014069ac1dd969255dd591": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "9ed02dc43412471a9ab47f3620ccf3a5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "9f1c9a0695384bdaa6f8b847ef89bee8": { - "model_module": 
"@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ButtonStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ButtonStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "button_color": null, - "font_weight": "" - } - }, - "9f56a2d9979c4bd8928c644c22c3ecdf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a0a11e929edd4189b79723d618522c33": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a10d0a76010f4e508c65a9b69ebc5156": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a138859f19b74fc0928dc236ab5359db": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_9b42e08b3c9548818488268768a118b1", - "IPY_MODEL_12b56912736849fea2ad8124456fdc5c", - "IPY_MODEL_879c8ab5873847a8833bd74123be90a4" - ], - "layout": "IPY_MODEL_20352e5f58d24bb8b1f3940efd14fe4a" - } - }, - "a1959759c5424da9961fb2a308d4dee4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_3aaecbf540f54a2db9ab0931e3b1fe57", - "placeholder": "​", - "style": "IPY_MODEL_9e333ed3b5014069ac1dd969255dd591", - "value": " 239/239 [00:00<00:00, 30.9kB/s]" - } - }, - "a20927bf5f2c41f58c1e31ac858ab36c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1811cda0644e4190a9469d1774435d82", - "placeholder": "​", - "style": "IPY_MODEL_35c811d2ae8e43f3b5cecbdd3cfa857f", - "value": "tokenizer.json: 100%" - } - }, - "a3a945817f684328b34651fe052393ec": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a44f630e099e43899f20a77084ae60cd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ed5ca967ad5342929e578ac6aa4dc4c0", - "placeholder": "​", - "style": "IPY_MODEL_af401d117d5047629d3a6e2361757b62", - "value": "model-00001-of-00008.safetensors: 100%" - } - }, - "a4e5789584564049b83df7c6c54a3e08": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "a5434ee714f9498d83870544b67c0cb7": { - "model_module": "@jupyter-widgets/controls", - 
"model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "a55060adc3564407ac81ad7297d34aaa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a6f48410b9964fefba0c3009a77dc838": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a7cf477e80fc43e0ad82c7997b076dce": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "a80410b919e442c49aea15acc1ce1a72": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fa1282ccc7544e4f818e2f03ccffe4a5", - "placeholder": "​", - "style": "IPY_MODEL_bbbf575d2a4b4c6ea8389be79b2a6039", - "value": "model.safetensors.index.json: 100%" - } - }, - "ab93eabd7cea4b94b4b7a387f101e8a1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, 
- "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ac764024cf1c4e08ba7749afd2cd20ac": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ad1236893754446881e153adc9d5c962": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_62e302ebdad64aada0ffe64ae1c873f3", - "placeholder": "​", - "style": "IPY_MODEL_bd1b0dfed6d34d16af33a4a58330f5ec", - "value": "Saving the dataset (1/1 shards): 100%" - } - }, - "ad7599de524549c48bf2d3124ad4b299": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "adacfdcc1b0140efac56918e9ccf064e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "af401d117d5047629d3a6e2361757b62": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b191ac001a2e4962bc9a245fcdf26e6b": { - "model_module": 
"@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b195f160ca20442fadd8b5aed0ee41af": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b1bea589efa14258a9982071b87938bf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, 
- "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b5b65414154544aa8a71b1a39164aad7": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b634bb73cfa743d09a5999101b840976": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "b82aa8c57f7c422a9a9c90f333ed2a99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c0991cf63ee6458b96e9a75e7a88b61a", - "IPY_MODEL_71c8af139cd248b1b51101fd46a93f35", - "IPY_MODEL_1d5117195d4b49eb8f1a73b18419f7ce" - ], - "layout": "IPY_MODEL_3c21e4a511b4441192c03b7f1d0976e9" - } - }, - "b8766a88716948cf968f4563531a76d9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": 
"1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2b3a2659b12244bd8548320320016dbf", - "placeholder": "​", - "style": "IPY_MODEL_0cd7efffbb3c4c4b972e63749f61ab97", - "value": "Generating train split: " - } - }, - "b87c84de30e84b3abf4871461fb9cbd3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "b8e39e4dddc3497fbc29ae45c66da759": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bb33aec33a6447078c31bfd728942994": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bbbf575d2a4b4c6ea8389be79b2a6039": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "bca2c7185b6749fd899c06a2ba4c5e46": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": 
"IPY_MODEL_0f480e3a0b0a45d2a2d2dec3cad923f3", - "placeholder": "​", - "style": "IPY_MODEL_fcb30372e7404c5d8a1ad4df91e6c7b2", - "value": " 1.91G/1.91G [00:05<00:00, 444MB/s]" - } - }, - "bd1b0dfed6d34d16af33a4a58330f5ec": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "be724f04b03942b2a033a7e8898bb4fd": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bed8726b8069434687c75452e21f19e5": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fa864b41586f4a7aa56aeafd1d84eb75", - "max": 9985, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_3225603166b54e7aab766b9964a2f660", - "value": 9985 - } - }, - "bee3501b2a17427784a717e50a85e7fa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, 
- "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bfcdbba993b74972a9e3e575f86908ff": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bff139df987d4a62abec6456cb27f3d4": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c1f9c267ba3f40039cdb5eb3267e8043", - "max": 3963750880, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_33b3b1d0295646edaac7b4822761aeb0", - "value": 3963750502 - } - }, - "c0892a1881de4eb4bfabc6a68f87ae99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_158c8b85dbf34de6a94b4e35e2fc7d5a", - "placeholder": "​", - "style": "IPY_MODEL_0b4c9753a7cb4354b8e5f187e6e1ad7c", - "value": " 3.96G/3.96G [00:15<00:00, 564MB/s]" - } - }, - "c0991cf63ee6458b96e9a75e7a88b61a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ed28e2e0410d4e0b855467e798e53d66", - "placeholder": "​", - 
"style": "IPY_MODEL_d93f134f802b4b69b575bdaf07dbd27c", - "value": "tokenizer_config.json: 100%" - } - }, - "c12ea43372ac4d57bb9605f1a429b397": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "VBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "VBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "VBoxView", - "box_style": "", - "children": [], - "layout": "IPY_MODEL_131065f118274a1586ac38e39ed84ef0" - } - }, - "c1314f241a434c41b45d84dc4d3b30f8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "c1f9c267ba3f40039cdb5eb3267e8043": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c33ced495f70464aa4a3a91922090853": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - 
"top": null, - "visibility": null, - "width": null - } - }, - "c3725c7f79fe415fbd1ea336f0cc9cf1": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b191ac001a2e4962bc9a245fcdf26e6b", - "max": 3841788544, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_054c8dffadba48c6b895a6cc62448ecc", - "value": 3841788178 - } - }, - "c3be9109d63c485d9c0ef4f9bc0f9218": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c42acf646f344a88b8c11f81e67f7206": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8bc9d8ba866c442b9118d9630009939c", - "placeholder": "​", - "style": "IPY_MODEL_9f56a2d9979c4bd8928c644c22c3ecdf", - "value": "model-00003-of-00008.safetensors: 100%" - } - }, - "c6164e05a1914ae48083db9ad7f4ef7c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, 
- "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "c65dc74c7d6f4bab8f7dd28455161dd8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "c6e00f5224364822bc4239b176686919": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_2a51b36be41745468e4c2d7a21b1c0d2", - "max": 36514, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4fd114abe9f5494ab59858949f5055f1", - "value": 36514 - } - }, - "c73055099c084dca996159e23e162d0b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e40d1c1ac9494b3bade9858324e7ffdf", - "placeholder": "​", - "style": "IPY_MODEL_d65b6b060d9845779299491ac5599c31", - "value": " 9985/9985 [01:04<00:00, 189.08 examples/s]" - } - }, - "c7433acd3c4841e6958ae8f7e87b1808": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "CheckboxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "CheckboxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "CheckboxView", - "description": "Add token as git credential?", - "description_tooltip": null, - "disabled": false, - "indent": true, - "layout": "IPY_MODEL_62c028fdef904dedb9cdeca2b3bda725", - "style": "IPY_MODEL_a7cf477e80fc43e0ad82c7997b076dce", - "value": false - } - }, - "c84cc07789be48aebb322c23d355289e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": 
"IPY_MODEL_0077aedc3d174560bce924ee89e9c006", - "placeholder": "​", - "style": "IPY_MODEL_00321cce58884f6f9b3855a21fcd9187", - "value": "Add position_id column (Sample Packing) (num_proc=2): 100%" - } - }, - "ca65e32eb52f48c09a84b33cb18f22cd": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "cc587493c33c4f118d1b1170f85be24c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "cc94432d08464affa3e58b560bdad194": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_b5b65414154544aa8a71b1a39164aad7", - "max": 3963750816, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_f0a58fbd0fca4340890041f99fa2f8c8", - "value": 3963750438 - } - }, - "ccfcdc95baf646f8aeb3d516742383f2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "cdebbc55a1164c018546c2ac6f8c620c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - 
"_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_a44f630e099e43899f20a77084ae60cd", - "IPY_MODEL_c3725c7f79fe415fbd1ea336f0cc9cf1", - "IPY_MODEL_0e50870ed0c643e0b6c18cc5d7ddae7f" - ], - "layout": "IPY_MODEL_c33ced495f70464aa4a3a91922090853" - } - }, - "d02274afd47b462291c745f261209d42": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d07c8b97d3314f1c852e44bdd40f61ed": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d0e9dce55cec4c1ca619a0ccf209d924": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": 
null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d1f9b10c130542f094c8fd3d1e23b5e9": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d262c82138024169b9f3aa034ca756fa": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d3de2662c7964f1ba96e58da382af720": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d4183e9715f34d249942b8271cca3bdf": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": 
"HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_63580b6fb30642479fe3000915bf551a", - "placeholder": "​", - "style": "IPY_MODEL_8f726dbfb45d4528afa33e36a6313267", - "value": " 27.3M/27.3M [00:00<00:00, 31.0MB/s]" - } - }, - "d43c6df07ddb466587807d6dbe1ff614": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_8c4d4fc5a30f4e7cb3be53fe2adda33d", - "placeholder": "​", - "style": "IPY_MODEL_e90658f4bcb642baa78426012f863152", - "value": "model-00004-of-00008.safetensors: 100%" - } - }, - "d65b6b060d9845779299491ac5599c31": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d6fe74e4255444368f8f90a62157d869": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "d93f134f802b4b69b575bdaf07dbd27c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "d955dcaa0e944e719f3a06139dd54a03": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - 
"state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "da2347ac94764a3fa2743343cf0d3cd2": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "da6e93f3e4984780b930fe7a706983ea": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "daee63fd167e4441a32324b51b00ad2b": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d07c8b97d3314f1c852e44bdd40f61ed", - "max": 9985, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ebb69a2c3d0a4299a484698287b3087c", - "value": 9985 - } - }, - 
"dc892a596f6942d7973c616c38f0eebb": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_c84cc07789be48aebb322c23d355289e", - "IPY_MODEL_bed8726b8069434687c75452e21f19e5", - "IPY_MODEL_16a188a0b06d45f980dcf3933509fe0a" - ], - "layout": "IPY_MODEL_60c1a0d765c14a1d888317e6a507e4ea" - } - }, - "dd0e646fad3f4a89ba23b39d162bd8d9": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_d43c6df07ddb466587807d6dbe1ff614", - "IPY_MODEL_e0e8b840b8ea4d0d9db09afe99fa287d", - "IPY_MODEL_9327977822be4b1294f80e876552e305" - ], - "layout": "IPY_MODEL_77304d1a46b3468a98483e02ec0ac4a4" - } - }, - "de7c37ee83e24f0c889e84d07279c2ec": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_34cf3df51fbc41cabfdbba153c007f0e", - "placeholder": "​", - "style": "IPY_MODEL_ac764024cf1c4e08ba7749afd2cd20ac", - "value": "vocab.json: 100%" - } - }, - "dfd2a2649b8341ef913207526708aff1": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e09f1bcbb9d94c09be53e5e1303642c2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - 
"_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_e7d8e4fe58384e93a106de546068c65e", - "max": 8, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_0aa8ab56b85f4171a79c3bc210594025", - "value": 8 - } - }, - "e0e8b840b8ea4d0d9db09afe99fa287d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_f7434f3e03124a1c938a39af79d7fa59", - "max": 3963750880, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_c1314f241a434c41b45d84dc4d3b30f8", - "value": 3963750502 - } - }, - "e21e180307e5485cbbe908672fd6639a": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_2e2b0c1599c341a198f632f46a40c90e", - "IPY_MODEL_bff139df987d4a62abec6456cb27f3d4", - "IPY_MODEL_ebe1cc366d324ad59b264c8b3c431441" - ], - "layout": "IPY_MODEL_114dece49dba437c8572ef94b23c3b1e" - } - }, - "e366ae3fceec4566b9ed303d6c5f90af": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e3fb3fc6afe04b3c9b7ac61809ce78fa": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - 
"description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_c6164e05a1914ae48083db9ad7f4ef7c", - "placeholder": "​", - "style": "IPY_MODEL_813621384dc748b0ad06775e22761c0b", - "value": " 9985/9985 [00:03<00:00, 3622.89 examples/s]" - } - }, - "e400cbf14bcc446a9d33b210cd93550b": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e40d1c1ac9494b3bade9858324e7ffdf": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e575d87a7efe4ec7b1efde489839d4a6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e5a82df528bb4e408797a3b6c2758f4a": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - 
"_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e6e969610738449887259063967f82b0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "e7d8e4fe58384e93a106de546068c65e": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "e87ea87fcff247b5bbcc331ba79a8dc2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "e8b7a81040904c1e89e58978223b1737": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - 
"description_width": "" - } - }, - "e90658f4bcb642baa78426012f863152": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "eb1c9535e6a546098b760528b2ea387c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_18357b321ce44d7b8bd9d1c886f69275", - "IPY_MODEL_279937fe03bc4e4eb25b472d7e9df163", - "IPY_MODEL_bca2c7185b6749fd899c06a2ba4c5e46" - ], - "layout": "IPY_MODEL_1f7d30f71bbd4547a9150d21da071055" - } - }, - "ebb69a2c3d0a4299a484698287b3087c": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "ebc80d1a55fa47f4a5ea2756588569ec": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ebe1cc366d324ad59b264c8b3c431441": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_fba7aa824b38467ab3061b226114cdec", - "placeholder": "​", - "style": "IPY_MODEL_f3075dccbd2747b4a7913b66f44f2596", - "value": " 3.96G/3.96G 
[00:13<00:00, 398MB/s]" - } - }, - "ec030fc3c346426f9abc3a89892258d3": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_dfd2a2649b8341ef913207526708aff1", - "max": 9985, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_4f1977d7e4824ef1a14b65f0f42bba10", - "value": 9985 - } - }, - "ec11d1e5ae7b42c883d9b1f38a65356e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_936d04b5fe1b4c63bf0b080e423d051b", - "placeholder": "​", - "style": "IPY_MODEL_f1cef8e8dc2646fb9fd09f3b09081074", - "value": " 36.5k/36.5k [00:00<00:00, 4.32MB/s]" - } - }, - "ed28e2e0410d4e0b855467e798e53d66": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ed5ca967ad5342929e578ac6aa4dc4c0": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": 
null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "edc99591b9c747b689b94d0052fec14c": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ef0a3c7a6f14460fb4da096928ae249e": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_07fb3a2c8315494e97b447e672dfae06", - "IPY_MODEL_ec030fc3c346426f9abc3a89892258d3", - "IPY_MODEL_e3fb3fc6afe04b3c9b7ac61809ce78fa" - ], - "layout": "IPY_MODEL_c3be9109d63c485d9c0ef4f9bc0f9218" - } - }, - "ef223e8504b64e3592589880326aaf41": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - 
"f0a58fbd0fca4340890041f99fa2f8c8": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "ProgressStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "f113ebd8c1c34806bea4dd7ed3035173": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f1cef8e8dc2646fb9fd09f3b09081074": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f3075dccbd2747b4a7913b66f44f2596": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "f365820a3d3c42b2948abfe32065de14": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_735d4f225b24414294fc1b213c61223c", - "placeholder": "​", - "style": "IPY_MODEL_5e5e15b0569b474c9620083b3ec6af55", - "value": "generation_config.json: 100%" - } - }, - "f4667818b9d34a09891cd727a429a610": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_4b27c267393640f28f6eae0875bd2ed9", - "placeholder": "​", - "style": "IPY_MODEL_9858cb74a09748a39e8149baac96702c", - "value": " 3.96G/3.96G [00:11<00:00, 457MB/s]" - } - }, - "f4a1795dc7514a718f478245f521f0ba": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": 
"@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f60a2bdb6b6b4e0e8c3508580e247132": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "FloatProgressModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "danger", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_edc99591b9c747b689b94d0052fec14c", - "max": 3963750880, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_35cc989ca3374e7dba0cb166febc4bde", - "value": 3963750502 - } - }, - "f7434f3e03124a1c938a39af79d7fa59": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "f8ef805b776145c3bfa9ba8d90972058": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": 
null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fa1282ccc7544e4f818e2f03ccffe4a5": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fa864b41586f4a7aa56aeafd1d84eb75": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fba7aa824b38467ab3061b226114cdec": { - "model_module": "@jupyter-widgets/base", - "model_module_version": "1.2.0", - "model_name": "LayoutModel", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - 
"align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "fcb30372e7404c5d8a1ad4df91e6c7b2": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fcbab4d8dced41a18dfccce81e3a45a0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "fd4f333f7ece4450b04e1a9af1f9d2f6": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_d1f9b10c130542f094c8fd3d1e23b5e9", - "placeholder": "​", - "style": "IPY_MODEL_e575d87a7efe4ec7b1efde489839d4a6", - "value": "model-00006-of-00008.safetensors: 100%" - } - }, - "fe18bba7f3fb4c31bf840541f36b3425": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_fd4f333f7ece4450b04e1a9af1f9d2f6", - "IPY_MODEL_f60a2bdb6b6b4e0e8c3508580e247132", - "IPY_MODEL_c0892a1881de4eb4bfabc6a68f87ae99" - ], - "layout": "IPY_MODEL_1bec6297c90242a88672d195bc09d429" - } - }, - "fe41858c6bd04c58840112b67c19a336": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - 
"description_tooltip": null, - "layout": "IPY_MODEL_e5a82df528bb4e408797a3b6c2758f4a", - "placeholder": "​", - "style": "IPY_MODEL_f113ebd8c1c34806bea4dd7ed3035173", - "value": " 9985/9985 [00:00<00:00, 44264.88 examples/s]" - } - }, - "fea1b70fb46745feb5111b3929175b5d": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HBoxModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_f365820a3d3c42b2948abfe32065de14", - "IPY_MODEL_823f1c78f15043e38bbd4dca3932a86a", - "IPY_MODEL_a1959759c5424da9961fb2a308d4dee4" - ], - "layout": "IPY_MODEL_34c9c0137b504cd799c6bd6de69507c2" - } - }, - "ff3a94b146a948b6907f5d80c7157f99": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "DescriptionStyleModel", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "ffdbb12a2f2c4d14911685e7683e0ef0": { - "model_module": "@jupyter-widgets/controls", - "model_module_version": "1.5.0", - "model_name": "HTMLModel", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_ab93eabd7cea4b94b4b7a387f101e8a1", - "placeholder": "​", - "style": "IPY_MODEL_704f2f5a9b1c49d5a75a0025a5dda11b", - "value": " 3.96G/3.96G [00:12<00:00, 656MB/s]" - } - } - } - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "OPLSwmgdrB7g" + }, + "source": [ + "# Fine-Tune Qwen3 14B with Axolotl\n", + "\n", + "[\"Built](https://github.com/axolotl-ai-cloud/axolotl)\n", + "\n", + "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n", + "\n", + "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n", + "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n", + "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n", + "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "markdown", + "metadata": { + "id": "rVjKD7CbxIP3" + }, + "source": [ + "# Installation\n", + "\n", + "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "msOCO4NRmRLa" + }, + "outputs": [], + "source": [ + "%%capture\n", + "# This step can take ~5-10 minutes to install dependencies\n", + "!pip install --no-build-isolation \"axolotl[flash-attn]>=0.9.1\"\n", + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "N0OW0YeksDLr" + }, + "source": [ + "## Demo: Talk Like a Pirate\n", + "\n", + "In this demo, we train the model ***to respond like a pirate***. We chose this task because it is an easy way to show how to train a model to respond in a style of your choosing (without being prompted to do so), and the result is simple to validate within the scope of a Colab." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8Du2fANTsNCK" + }, + "source": [ + "### Upload your own dataset or use a Hugging Face dataset\n", + "\n", + "You can use your own JSONL file from [Google Drive](https://drive.google.com/drive/home); for example, download the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. JSONL datasets should be formatted similarly to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n", + "\n", + "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "fGEEjyQ-r_IV" + }, + "outputs": [], + "source": [ + "# Default to HF dataset location\n", + "dataset_id = \"winglian/pirate-ultrachat-10k\"\n", + "uploaded = {}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "c5MyYqk7vIsG" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "# Optionally, upload your own JSONL to your Google Drive\n", + "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab Notebooks/train.jsonl\"\n", + "\n", + "# Grant \"Select All\" permissions when prompted, or you may get the error:\n", + "# \"MessageError: Error: credential propagation was unsuccessful\"\n", + "if GOOGLE_DRIVE_PATH:\n", + " from google.colab import drive\n", + "\n", + " # Mount your Google Drive\n", + " GOOGLE_DRIVE_MNT = \"/content/drive/\"\n", + " drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n", + " tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n", + " # make sure file exists\n", + " if not os.path.isfile(tmp_path):\n", + " raise ValueError(f\"File {tmp_path} does not exist\")\n", + " dataset_id = tmp_path" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U6pTk3A9xj1W" + }, + "source": [ + "# Configure for Supervised Fine-Tuning (SFT)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 151, + "referenced_widgets": [ + "388f618924274d21a066f098f4f1e744", + "7c95f85a2b1f47a1bd846d110c47bb3c", + "083f9cda8d754c168beee10d2f8955a2", + "62e1a65582f446a78612eaa804e08a7d", + "487a177d020f4605834878b2fdc7afa3", + "7fd44cf9ca6e4726bfd7ac21846d6a14", + "366a343b62fa47d8985a3bd464d99f9e", + "a0a11e929edd4189b79723d618522c33", + "e87ea87fcff247b5bbcc331ba79a8dc2", + "5e18768f7ad6434ba8b8b8a2e853e204", + "bb33aec33a6447078c31bfd728942994" + ] + }, + "id": "fdRioqytmTtX", + "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e" +
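To make the expected JSONL layout concrete, here is a minimal sketch of writing one record in the OpenAI-style messages format referenced in the dataset section above. The file path and the conversation content are hypothetical, not part of the notebook:

```python
import json

# One conversation per line, in the OpenAI chat "messages" format.
# The path and example content below are hypothetical.
record = {
    "messages": [
        {"role": "user", "content": "Where be the treasure?"},
        {"role": "assistant", "content": "Arr, it be buried on yonder isle, matey!"},
    ]
}

with open("train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record) + "\n")
```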
}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", + "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n", + "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n", + "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n", + "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "388f618924274d21a066f098f4f1e744", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/728 [00:00\"],\n", + " }\n", + " ],\n", + " dataloader_prefetch_factor=8, # dataloader optimizations\n", + " dataloader_num_workers=2,\n", + " dataloader_pin_memory=True,\n", + ")\n", + "\n", + "# validates the configuration\n", + "cfg = load_cfg(config)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "715UpvnSoBIS" + }, + "outputs": [], + "source": [ + "from axolotl.utils import patch_optimized_env\n", + "\n", + "# speedup downloads from HF 🤗 and set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n", + "patch_optimized_env()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Vc6MC-hwyH-n" + }, + "source": [ + "# Datasets\n", + "\n", + "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. 
For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 1000, + "referenced_widgets": [ + "b82aa8c57f7c422a9a9c90f333ed2a99", + "c0991cf63ee6458b96e9a75e7a88b61a", + "71c8af139cd248b1b51101fd46a93f35", + "1d5117195d4b49eb8f1a73b18419f7ce", + "3c21e4a511b4441192c03b7f1d0976e9", + "ed28e2e0410d4e0b855467e798e53d66", + "d93f134f802b4b69b575bdaf07dbd27c", + "d0e9dce55cec4c1ca619a0ccf209d924", + "4c727d40ef0443449afc31724ee79f0c", + "0dea5caa27384f5689e3cab51f558727", + "a6f48410b9964fefba0c3009a77dc838", + "95caff42f08a4c2aa14c867b8f37f231", + "de7c37ee83e24f0c889e84d07279c2ec", + "9d4897eefb5f48259ffb2d23e332f752", + "253017b0d0534e54ab44e181f6d7c82d", + "27beaf06e41b472abdb544a43c720c5a", + "34cf3df51fbc41cabfdbba153c007f0e", + "ac764024cf1c4e08ba7749afd2cd20ac", + "30a81da86f8043eca301e86a8651201a", + "e8b7a81040904c1e89e58978223b1737", + "1c6f1f10667545aaab958016ba7e2c94", + "e6e969610738449887259063967f82b0", + "a138859f19b74fc0928dc236ab5359db", + "9b42e08b3c9548818488268768a118b1", + "12b56912736849fea2ad8124456fdc5c", + "879c8ab5873847a8833bd74123be90a4", + "20352e5f58d24bb8b1f3940efd14fe4a", + "d955dcaa0e944e719f3a06139dd54a03", + "d3de2662c7964f1ba96e58da382af720", + "97e36007e1304e1583fd81bfb13f0edd", + "c65dc74c7d6f4bab8f7dd28455161dd8", + "ef223e8504b64e3592589880326aaf41", + "598da69727bd4fb8b1caf465ac736d7a", + "5f86cd894de94c3280fadc1e2fd0ee13", + "a20927bf5f2c41f58c1e31ac858ab36c", + "0a46ad75c198463d843fb35e813642cb", + "09007681cf8d42aeb8c1d2f6a74e470a", + "ebc80d1a55fa47f4a5ea2756588569ec", + "1811cda0644e4190a9469d1774435d82", + "35c811d2ae8e43f3b5cecbdd3cfa857f", + "b8e39e4dddc3497fbc29ae45c66da759", + "63b4e563e85c4f03b1b72beda9577bcc", + "b195f160ca20442fadd8b5aed0ee41af", + "ca65e32eb52f48c09a84b33cb18f22cd", + "7cd0b85ebd204b7aba908417811ce4e0", + "7baeab52d6694c32b1efd1ea1a0a7782", + "519a7b154022443db6703f04a9142bae", + "d4183e9715f34d249942b8271cca3bdf", + "da2347ac94764a3fa2743343cf0d3cd2", + "93a44a11aa4846fa8efc6c1413ef1627", + "a55060adc3564407ac81ad7297d34aaa", + "d02274afd47b462291c745f261209d42", + "0f417447a7bd4a33acca96fa37aec877", + "63580b6fb30642479fe3000915bf551a", + "8f726dbfb45d4528afa33e36a6313267", + "03b093d592ba4386aa61f7b8483da660", + "b8766a88716948cf968f4563531a76d9", + "6f3a28b912714c6e931003549664bfa3", + "16d1283741404b7bb319094c992fce01", + "2a5bb0e818ab47be8cf6465988328503", + "2b3a2659b12244bd8548320320016dbf", + "0cd7efffbb3c4c4b972e63749f61ab97", + "5ca240f31e6b44e3882c5eb37cd5a309", + "5eb06edeb58e4930b1affef2a59eae81", + "a4e5789584564049b83df7c6c54a3e08", + "ff3a94b146a948b6907f5d80c7157f99", + "258b7c635c1045329d4669e48c46ccd5", + "6f68ed9889f54ad2ae8a3b95ac263a83", + "80366349d81e4dcc892db6cd56e384f3", + "c73055099c084dca996159e23e162d0b", + "977f799afaac4a55b2dc1cffa7d5b63b", + "41f3b32c2f6b4034ae7a3b9124e28bc7", + "a10d0a76010f4e508c65a9b69ebc5156", + "f8ef805b776145c3bfa9ba8d90972058", + "cc587493c33c4f118d1b1170f85be24c", + "e40d1c1ac9494b3bade9858324e7ffdf", + "d65b6b060d9845779299491ac5599c31", + "0f6907ebbc6242c8bde059cef1e1bd29", + "5bdfd87fc6cd4f9dabef7cfee29c8060", + "64f54d4a744a4627a07c3c0120276f3b", + "65b75b9b8bc143cf997796af68ff6668", + "d6fe74e4255444368f8f90a62157d869", + "4d468f96ec924681ad65eb671674b93e", + "ad7599de524549c48bf2d3124ad4b299", + "0546d04aae644dde846c58a4afb598a6", + 
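To make the masking described above concrete, here is a minimal sketch of the idea (an illustration, not Axolotl's internal implementation): prompt tokens receive the label -100, which PyTorch's cross-entropy loss ignores, so only the response tokens contribute to the training loss.

```python
IGNORE_INDEX = -100  # label value ignored by torch.nn.CrossEntropyLoss


def mask_prompt(prompt_ids: list[int], response_ids: list[int]) -> dict:
    """Build input_ids/labels so the loss is computed only on the response."""
    input_ids = prompt_ids + response_ids
    labels = [IGNORE_INDEX] * len(prompt_ids) + list(response_ids)
    return {"input_ids": input_ids, "labels": labels}


# The token ids below are made up for illustration.
example = mask_prompt([101, 2054, 2003], [2009, 2001, 102])
assert example["labels"][:3] == [IGNORE_INDEX, IGNORE_INDEX, IGNORE_INDEX]
```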
"897b77a56c09479bb11d7f2a30997e55", + "81c3db71ac704280ad030072655f1537", + "042e091f75694c47aee761e760e76773", + "ef0a3c7a6f14460fb4da096928ae249e", + "07fb3a2c8315494e97b447e672dfae06", + "ec030fc3c346426f9abc3a89892258d3", + "e3fb3fc6afe04b3c9b7ac61809ce78fa", + "c3be9109d63c485d9c0ef4f9bc0f9218", + "12815f401eba44658caa7b2e490137a8", + "30e02aa2d0d241979369e598287f2639", + "dfd2a2649b8341ef913207526708aff1", + "4f1977d7e4824ef1a14b65f0f42bba10", + "c6164e05a1914ae48083db9ad7f4ef7c", + "813621384dc748b0ad06775e22761c0b", + "dc892a596f6942d7973c616c38f0eebb", + "c84cc07789be48aebb322c23d355289e", + "bed8726b8069434687c75452e21f19e5", + "16a188a0b06d45f980dcf3933509fe0a", + "60c1a0d765c14a1d888317e6a507e4ea", + "0077aedc3d174560bce924ee89e9c006", + "00321cce58884f6f9b3855a21fcd9187", + "fa864b41586f4a7aa56aeafd1d84eb75", + "3225603166b54e7aab766b9964a2f660", + "349eee9f56d64f0cba6fc24ff2c50c9b", + "7e5d3774060e4589aa65982da5ea4ef4", + "7c2485c6cdfe463da6fdb35982a1070d", + "ad1236893754446881e153adc9d5c962", + "daee63fd167e4441a32324b51b00ad2b", + "fe41858c6bd04c58840112b67c19a336", + "d262c82138024169b9f3aa034ca756fa", + "62e302ebdad64aada0ffe64ae1c873f3", + "bd1b0dfed6d34d16af33a4a58330f5ec", + "d07c8b97d3314f1c852e44bdd40f61ed", + "ebb69a2c3d0a4299a484698287b3087c", + "e5a82df528bb4e408797a3b6c2758f4a", + "f113ebd8c1c34806bea4dd7ed3035173" + ] + }, + "id": "KQQhgK8FoDfF", + "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "tokenizer_config.json: 0%| | 0.00/9.68k [00:00\u001b[39m\n", + "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n", + "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n", + "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n", + "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n", + "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n", + "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. 
Please pre-process your dataset.\u001b[39m\n", + "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "7cd0b85ebd204b7aba908417811ce4e0", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "train.jsonl: 0%| | 0.00/27.3M [00:00system\\n' }}\n", + " {%- if messages[0].role == 'system' %}\n", + " {{- messages[0].content + '\\n\\n' }}\n", + " {%- endif %}\n", + " {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within XML tags:\\n\" }}\n", + " {%- for tool in tools %}\n", + " {{- \"\\n\" }}\n", + " {{- tool | tojson }}\n", + " {%- endfor %}\n", + " {{- \"\\n\\n\\nFor each function call, return a json object with function name and arguments within XML tags:\\n\\n{\\\"name\\\": , \\\"arguments\\\": }\\n<|im_end|>\\n\" }}\n", + "{%- else %}\n", + " {%- if messages[0].role == 'system' %}\n", + " {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n", + " {%- endif %}\n", + "{%- endif %}\n", + "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n", + "{%- for message in messages[::-1] %}\n", + " {%- set index = (messages|length - 1) - loop.index0 %}\n", + " {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('') and message.content.endswith('')) %}\n", + " {%- set ns.multi_step_tool = false %}\n", + " {%- set ns.last_query_index = index %}\n", + " {%- endif %}\n", + "{%- endfor %}\n", + "{%- for message in messages %}\n", + " {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n", + " {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n", + " {%- elif message.role == \"assistant\" %}\n", + " {%- set content = message.content %}\n", + " {%- set reasoning_content = '' %}\n", + " {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n", + " {%- set reasoning_content = message.reasoning_content %}\n", + " {%- else %}\n", + " {%- if '
' in message.content %}\n", + " {%- set content = message.content.split('')[-1].lstrip('\\n') %}\n", + " {%- set reasoning_content = message.content.split('')[0].rstrip('\\n').split('')[-1].lstrip('\\n') %}\n", + " {%- endif %}\n", + " {%- endif %}\n", + " {%- if loop.index0 > ns.last_query_index %}\n", + " {%- if loop.last or (not loop.last and reasoning_content) %}\n", + " {{- '<|im_start|>' + message.role + '\\n\\n' + reasoning_content.strip('\\n') + '\\n\\n\\n' + content.lstrip('\\n') }}\n", + " {%- else %}\n", + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", + " {%- endif %}\n", + " {%- else %}\n", + " {{- '<|im_start|>' + message.role + '\\n' + content }}\n", + " {%- endif %}\n", + " {%- if message.tool_calls %}\n", + " {%- for tool_call in message.tool_calls %}\n", + " {%- if (loop.first and content) or (not loop.first) %}\n", + " {{- '\\n' }}\n", + " {%- endif %}\n", + " {%- if tool_call.function %}\n", + " {%- set tool_call = tool_call.function %}\n", + " {%- endif %}\n", + " {{- '\\n{\"name\": \"' }}\n", + " {{- tool_call.name }}\n", + " {{- '\", \"arguments\": ' }}\n", + " {%- if tool_call.arguments is string %}\n", + " {{- tool_call.arguments }}\n", + " {%- else %}\n", + " {{- tool_call.arguments | tojson }}\n", + " {%- endif %}\n", + " {{- '}\\n' }}\n", + " {%- endfor %}\n", + " {%- endif %}\n", + " {{- '<|im_end|>\\n' }}\n", + " {%- elif message.role == \"tool\" %}\n", + " {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n", + " {{- '<|im_start|>user' }}\n", + " {%- endif %}\n", + " {{- '\\n\\n' }}\n", + " {{- message.content }}\n", + " {{- '\\n' }}\n", + " {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n", + " {{- '<|im_end|>\\n' }}\n", + " {%- endif %}\n", + " {%- endif %}\n", + "{%- endfor %}\n", + "{%- if add_generation_prompt %}\n", + " {{- '<|im_start|>assistant\\n' }}\n", + " {%- if enable_thinking is defined and enable_thinking is false %}\n", + " {{- '\\n\\n\\n\\n' }}\n", + " {%- endif %}\n", + "{%- endif %}\n", + "---\u001b[39m\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "258b7c635c1045329d4669e48c46ccd5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Tokenizing Prompts (num_proc=2): 0%| | 0/9985 [00:00\n", + " \n", + " \n", + " [25/25 09:25, Epoch 0/1]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
| Step | Training Loss |
|-----:|--------------:|
|    1 |      1.092300 |
|    2 |      1.554200 |
|    3 |      1.041400 |
|    4 |      1.733800 |
|    5 |      1.430000 |
|    6 |      1.258500 |
|    7 |      1.343600 |
|    8 |      1.101700 |
|    9 |      1.086500 |
|   10 |      0.813200 |
|   11 |      0.689600 |
|   12 |      0.826700 |
|   13 |      1.541800 |
|   14 |      0.948000 |
|   15 |      1.357000 |
|   16 |      1.085800 |
|   17 |      1.516800 |
|   18 |      1.146800 |
|   19 |      0.834800 |
|   20 |      0.968000 |
|   21 |      1.388800 |
|   22 |      1.511500 |
|   23 |      1.338500 |
|   24 |      1.206600 |
|   25 |      1.504600 |
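The run above finishes in only 25 optimizer steps because, as the training cell below notes, sample packing concatenates several short conversations into each fixed-length sequence. A greedy sketch of the idea (illustrative only, not Axolotl's implementation; the `seq_len` value is an assumption):

```python
def pack(samples: list[list[int]], seq_len: int = 4096) -> list[list[int]]:
    """Greedily concatenate tokenized samples into sequences of <= seq_len."""
    packed: list[list[int]] = []
    current: list[int] = []
    for ids in samples:
        # Flush the current sequence once the next sample would overflow it.
        if current and len(current) + len(ids) > seq_len:
            packed.append(current)
            current = []
        current.extend(ids)
    if current:
        packed.append(current)
    return packed


# Ten short samples of 1000 tokens pack into 3 sequences instead of 10.
print(len(pack([[0] * 1000 for _ in range(10)], seq_len=4096)))  # -> 3
```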
" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[2025-05-07 22:12:42,746] [INFO] [axolotl.callbacks.on_step_end:128] [PID:1336] [RANK:0] cuda memory usage while training: 9.768GB (+3.287GB cache, +0.646GB misc)\u001b[39m\n", + "[2025-05-07 22:21:46,859] [INFO] [axolotl.train.save_trained_model:231] [PID:1336] [RANK:0] Training completed! Saving pre-trained model to ./outputs/qwen-sft-pirate-rrr.\u001b[39m\n" + ] + } + ], + "source": [ + "from axolotl.train import train\n", + "\n", + "# just train the first 25 steps for demo.\n", + "# This is sufficient to align the model as we've used packing to maximize the trainable samples per step.\n", + "cfg.max_steps = 25\n", + "model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "j1b9ypF78eCb" + }, + "source": [ + "# Inferencing the trained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "r3_vHhif8YEs", + "outputId": "e5050605-f6c9-421c-98f9-bde56a281eae" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.\n", + "\n", + "The Pythagorean theorem be a real gem of a mathematical trick that helps ye find the length of a side of a right triangle. Now, a right triangle be a triangle with a right angle, which be that little corner that looks like a square. \n", + "\n", + "The theorem be named after a clever fellow named Pythagoras, who be a mathematician from ancient Greece. He discovered that if ye have a right triangle, the square of the length of the hypotenuse (that be the side opposite the right angle) be equal to the sum of the squares of the other two sides. 
\n", + "\n", + "In other words, if ye have a triangle with sides of length a, b, and c (\n" + ] + } + ], + "source": [ + "from transformers import TextStreamer\n", + "\n", + "messages = [\n", + " {\n", + " \"role\": \"user\",\n", + " \"content\": \"Explain the Pythagorean theorem to me.\",\n", + " },\n", + "]\n", + "\n", + "prompt = tokenizer.apply_chat_template(\n", + " messages,\n", + " add_generation_prompt=True,\n", + " tokenize=False,\n", + " enable_thinking=False,\n", + ")\n", + "\n", + "outputs = model.generate(\n", + " **tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\"),\n", + " max_new_tokens=192,\n", + " temperature=1.0,\n", + " top_p=0.8,\n", + " top_k=32,\n", + " streamer=TextStreamer(tokenizer, skip_prompt=True),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "HoGwT2JRSIjA" + }, + "source": [ + "# Saving your trained model\n", + "\n", + "Axolotl automatically saves checkpoints to the `output_dir` path.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "5BmSbiy6NaaS", + "outputId": "f5e1d913-7d55-42d2-8340-f9f1b0bc2b38" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 506M\n", + "-rw-r--r-- 1 root root 845 May 7 22:21 adapter_config.json\n", + "-rw-r--r-- 1 root root 491M May 7 22:21 adapter_model.safetensors\n", + "-rw-r--r-- 1 root root 707 May 7 22:11 added_tokens.json\n", + "drwxr-xr-x 2 root root 4.0K May 7 22:17 checkpoint-13\n", + "drwxr-xr-x 2 root root 4.0K May 7 22:21 checkpoint-25\n", + "-rw-r--r-- 1 root root 1.2K May 7 22:11 config.json\n", + "-rw-r--r-- 1 root root 1.6M May 7 22:11 merges.txt\n", + "-rw-r--r-- 1 root root 2.6K May 7 22:21 README.md\n", + "-rw-r--r-- 1 root root 613 May 7 22:11 special_tokens_map.json\n", + "-rw-r--r-- 1 root root 9.5K May 7 22:11 tokenizer_config.json\n", + "-rw-r--r-- 1 root root 11M May 7 22:11 tokenizer.json\n", + "-rw-r--r-- 1 root root 2.7M May 7 22:11 vocab.json\n" + ] + } + ], + "source": [ + "# Show the saved checkpoints in the output_dir\n", + "!ls -lh \"./outputs/qwen-sft-pirate-rrr\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_PCIFWxuOZd6" + }, + "source": [ + "Setting `hub_model_id: ` in the original config would have automatically uploaded the model to HuggingFace Hub (e.g. `hub_model_id: username/model_id`)\n", + "\n", + "If you prefer to manually upload the training artifacts, we can still upload the entire final checkpoint to HuggingFace from the CLI." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 955, + "referenced_widgets": [ + "c12ea43372ac4d57bb9605f1a429b397", + "86816687746246b4a6105e8010384e25", + "6f05e9bebf7b40c9835808e77de6c236", + "c7433acd3c4841e6958ae8f7e87b1808", + "19c1e38389fa46c7b7e2152a56e1df34", + "0e067d8db8ed48308a718d5f57683fd1", + "131065f118274a1586ac38e39ed84ef0", + "8640ac440fbc4644b9a3af7ba3ae7183", + "5cea7996f02040b187ece0bb2d6a8d1f", + "2e257c8be2da40b4bb67a9e4ab6811f3", + "56e3768bef5a4b9db4168c5c17f509c2", + "62c028fdef904dedb9cdeca2b3bda725", + "a7cf477e80fc43e0ad82c7997b076dce", + "835bcc28a5564fb9b3d651bc8e32dc46", + "9f1c9a0695384bdaa6f8b847ef89bee8", + "b1bea589efa14258a9982071b87938bf", + "590eef89881545aa8bbef9a8bbe7fb00", + "4b1f04ff63d14a118fdd15814dff50e4", + "39789237703c4a418134243055c9cbf5", + "a3a945817f684328b34651fe052393ec" + ] + }, + "id": "2yw8pLvlSMl8", + "outputId": "6e489ab2-4abe-4e28-84ca-959f912433a4" + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c12ea43372ac4d57bb9605f1a429b397", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "VBox(children=(HTML(value='

\n", + " sys.exit(main())\n", + " ^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n", + " service.run()\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n", + " print(self._upload())\n", + " ^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n", + " return self.api.upload_folder(\n", + " ^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", + " return fn(self, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n", + " commit_info = self.create_commit(\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n", + " return fn(self, *args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n", + " self.preupload_lfs_files(\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n", + " _upload_xet_files(**upload_kwargs, create_pr=create_pr) # type: ignore [arg-type]\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n", + " return fn(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^\n", + " File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n", + " with progress_cm as progress:\n", + " File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n", + " def __exit__(self, exc_type, exc_value, traceback):\n", + "\n", + "KeyboardInterrupt\n", + "^C\n" + ] + } + ], + "source": [ + "from huggingface_hub import notebook_login\n", + "\n", + "# remove the partial epoch checkpoints\n", + "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n", + "\n", + "# HF Notebook login widget\n", + "notebook_login()\n", + "\n", + "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n", + "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\"" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "00321cce58884f6f9b3855a21fcd9187": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": 
"1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "004d9177a6a14118a5930dc3cc13147b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a80410b919e442c49aea15acc1ce1a72", + "IPY_MODEL_c6e00f5224364822bc4239b176686919", + "IPY_MODEL_ec11d1e5ae7b42c883d9b1f38a65356e" + ], + "layout": "IPY_MODEL_734185351eb543fa9a00a881dcbb9fe7" + } + }, + "0077aedc3d174560bce924ee89e9c006": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03a3c744d716431488163b4358b80f92": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "03b093d592ba4386aa61f7b8483da660": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_b8766a88716948cf968f4563531a76d9", + "IPY_MODEL_6f3a28b912714c6e931003549664bfa3", + "IPY_MODEL_16d1283741404b7bb319094c992fce01" + ], + "layout": "IPY_MODEL_2a5bb0e818ab47be8cf6465988328503" + } + }, + "042e091f75694c47aee761e760e76773": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0546d04aae644dde846c58a4afb598a6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "054c8dffadba48c6b895a6cc62448ecc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "07fb3a2c8315494e97b447e672dfae06": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8", + "placeholder": "​", + "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639", + "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%" + } + }, + "083f9cda8d754c168beee10d2f8955a2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", 
+ "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a0a11e929edd4189b79723d618522c33", + "max": 728, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e87ea87fcff247b5bbcc331ba79a8dc2", + "value": 728 + } + }, + "09007681cf8d42aeb8c1d2f6a74e470a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af", + "placeholder": "​", + "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd", + "value": " 11.4M/11.4M [00:00<00:00, 21.8MB/s]" + } + }, + "0a46ad75c198463d843fb35e813642cb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b8e39e4dddc3497fbc29ae45c66da759", + "max": 11422654, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_63b4e563e85c4f03b1b72beda9577bcc", + "value": 11422654 + } + }, + "0aa8ab56b85f4171a79c3bc210594025": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0b4c9753a7cb4354b8e5f187e6e1ad7c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0cd7efffbb3c4c4b972e63749f61ab97": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0dea5caa27384f5689e3cab51f558727": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": 
"LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0e067d8db8ed48308a718d5f57683fd1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf", + "placeholder": "​", + "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00", + "value": "\nPro Tip: If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks.
" + } + }, + "0e50870ed0c643e0b6c18cc5d7ddae7f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_bfcdbba993b74972a9e3e575f86908ff", + "placeholder": "​", + "style": "IPY_MODEL_6ebb2ec171414e47a14765505f64bb3c", + "value": " 3.84G/3.84G [00:09<00:00, 664MB/s]" + } + }, + "0e936d9dbf9c4fdd86bbfe9730dedc47": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "0f417447a7bd4a33acca96fa37aec877": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "0f480e3a0b0a45d2a2d2dec3cad923f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0f6907ebbc6242c8bde059cef1e1bd29": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_5bdfd87fc6cd4f9dabef7cfee29c8060", + "IPY_MODEL_64f54d4a744a4627a07c3c0120276f3b", + "IPY_MODEL_65b75b9b8bc143cf997796af68ff6668" + ], + "layout": 
"IPY_MODEL_d6fe74e4255444368f8f90a62157d869" + } + }, + "114dece49dba437c8572ef94b23c3b1e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12815f401eba44658caa7b2e490137a8": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "12b56912736849fea2ad8124456fdc5c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_97e36007e1304e1583fd81bfb13f0edd", + "max": 1671853, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c65dc74c7d6f4bab8f7dd28455161dd8", + "value": 1671853 + } + }, + "131065f118274a1586ac38e39ed84ef0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": "center", + "align_self": null, + "border": null, + "bottom": null, + "display": "flex", + "flex": null, + "flex_flow": "column", + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "50%" + } + }, + "158c8b85dbf34de6a94b4e35e2fc7d5a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "16a188a0b06d45f980dcf3933509fe0a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_349eee9f56d64f0cba6fc24ff2c50c9b", + "placeholder": "​", + "style": "IPY_MODEL_7e5d3774060e4589aa65982da5ea4ef4", + "value": " 9985/9985 [00:04<00:00, 2604.11 examples/s]" + } + }, + "16d1283741404b7bb319094c992fce01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a4e5789584564049b83df7c6c54a3e08", + "placeholder": "​", + "style": "IPY_MODEL_ff3a94b146a948b6907f5d80c7157f99", + "value": " 9985/0 
[00:00<00:00, 50763.46 examples/s]" + } + }, + "1811cda0644e4190a9469d1774435d82": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "18357b321ce44d7b8bd9d1c886f69275": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e366ae3fceec4566b9ed303d6c5f90af", + "placeholder": "​", + "style": "IPY_MODEL_5dd7d150dbe04f08b165ce7f2c27cd11", + "value": "model-00008-of-00008.safetensors: 100%" + } + }, + "19127c7bb1554ccbac877059f9a82db0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e400cbf14bcc446a9d33b210cd93550b", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_71002199df6b40c9a1ac40df5fb27a1b", + "value": 3963750502 + } + }, + "19c1e38389fa46c7b7e2152a56e1df34": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ButtonView", + "button_style": "", + "description": "Login", + "disabled": false, + "icon": "", + "layout": "IPY_MODEL_835bcc28a5564fb9b3d651bc8e32dc46", + "style": "IPY_MODEL_9f1c9a0695384bdaa6f8b847ef89bee8", + "tooltip": "" + } + }, + "1bec6297c90242a88672d195bc09d429": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + 
"_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1c6f1f10667545aaab958016ba7e2c94": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "1d5117195d4b49eb8f1a73b18419f7ce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0dea5caa27384f5689e3cab51f558727", + "placeholder": "​", + "style": "IPY_MODEL_a6f48410b9964fefba0c3009a77dc838", + "value": " 9.68k/9.68k [00:00<00:00, 812kB/s]" + } + }, + "1f7d30f71bbd4547a9150d21da071055": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + 
"flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "200df5e79b9244849e589ecb0250a520": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f4a1795dc7514a718f478245f521f0ba", + "placeholder": "​", + "style": "IPY_MODEL_5e746eb25bbe416fb585fa24e79f5177", + "value": "model-00002-of-00008.safetensors: 100%" + } + }, + "20352e5f58d24bb8b1f3940efd14fe4a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "253017b0d0534e54ab44e181f6d7c82d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1c6f1f10667545aaab958016ba7e2c94", + "placeholder": "​", + "style": "IPY_MODEL_e6e969610738449887259063967f82b0", + "value": " 2.78M/2.78M [00:00<00:00, 17.8MB/s]" + } + }, + "258b7c635c1045329d4669e48c46ccd5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + 
"_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_6f68ed9889f54ad2ae8a3b95ac263a83", + "IPY_MODEL_80366349d81e4dcc892db6cd56e384f3", + "IPY_MODEL_c73055099c084dca996159e23e162d0b" + ], + "layout": "IPY_MODEL_977f799afaac4a55b2dc1cffa7d5b63b" + } + }, + "279937fe03bc4e4eb25b472d7e9df163": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b634bb73cfa743d09a5999101b840976", + "max": 1912371880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_742b1030acfd414bbd9d5327b7e3826d", + "value": 1912371698 + } + }, + "27beaf06e41b472abdb544a43c720c5a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2860e3bb3baf4f7da058465850e800c5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_3efd18ea8eaa41918894883da9541bfa", + "IPY_MODEL_e09f1bcbb9d94c09be53e5e1303642c2", + "IPY_MODEL_82177df57a494de8900c14c2f5185175" + ], + "layout": "IPY_MODEL_ccfcdc95baf646f8aeb3d516742383f2" + } + }, + "2a51b36be41745468e4c2d7a21b1c0d2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + 
"align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2a5bb0e818ab47be8cf6465988328503": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2b3a2659b12244bd8548320320016dbf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e257c8be2da40b4bb67a9e4ab6811f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + 
"_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "2e2b0c1599c341a198f632f46a40c90e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_be724f04b03942b2a033a7e8898bb4fd", + "placeholder": "​", + "style": "IPY_MODEL_fcbab4d8dced41a18dfccce81e3a45a0", + "value": "model-00005-of-00008.safetensors: 100%" + } + }, + "3036608c71904ce9ae4bb2a9fa8802d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5ca6be24acb548cea130bd58e9954c7c", + "placeholder": "​", + "style": "IPY_MODEL_5cfb02ee044b4011a378efa8b54a370f", + "value": " 3.96G/3.96G [00:10<00:00, 531MB/s]" + } + }, + "30a81da86f8043eca301e86a8651201a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": 
null, + "visibility": null, + "width": null + } + }, + "30e02aa2d0d241979369e598287f2639": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "3225603166b54e7aab766b9964a2f660": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "33b3b1d0295646edaac7b4822761aeb0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "349eee9f56d64f0cba6fc24ff2c50c9b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34c9c0137b504cd799c6bd6de69507c2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "34cf3df51fbc41cabfdbba153c007f0e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "35c811d2ae8e43f3b5cecbdd3cfa857f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "35cc989ca3374e7dba0cb166febc4bde": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "366a343b62fa47d8985a3bd464d99f9e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "37de928300e34184881039378bd75e7f": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + 
"border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "388f618924274d21a066f098f4f1e744": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7c95f85a2b1f47a1bd846d110c47bb3c", + "IPY_MODEL_083f9cda8d754c168beee10d2f8955a2", + "IPY_MODEL_62e1a65582f446a78612eaa804e08a7d" + ], + "layout": "IPY_MODEL_487a177d020f4605834878b2fdc7afa3" + } + }, + "39789237703c4a418134243055c9cbf5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3aaecbf540f54a2db9ab0931e3b1fe57": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + 
"justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3c21e4a511b4441192c03b7f1d0976e9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "3efd18ea8eaa41918894883da9541bfa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8f5bd719974e41c3a8dd9a5b0d3d71e6", + "placeholder": "​", + "style": "IPY_MODEL_b87c84de30e84b3abf4871461fb9cbd3", + "value": "Loading checkpoint shards: 100%" + } + }, + "41f3b32c2f6b4034ae7a3b9124e28bc7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + 
} + }, + "4471ff62258549fba9514bb67050f965": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9cd5211b5d8b457aa0002f1d17b80028", + "IPY_MODEL_19127c7bb1554ccbac877059f9a82db0", + "IPY_MODEL_f4667818b9d34a09891cd727a429a610" + ], + "layout": "IPY_MODEL_9ed02dc43412471a9ab47f3620ccf3a5" + } + }, + "4540927d98f54466b434ba4c0edf045d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "487a177d020f4605834878b2fdc7afa3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4b1f04ff63d14a118fdd15814dff50e4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "LabelModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "LabelModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "LabelView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_39789237703c4a418134243055c9cbf5", + "placeholder": "​", + "style": "IPY_MODEL_a3a945817f684328b34651fe052393ec", + "value": "Connecting..." 
+ } + }, + "4b27c267393640f28f6eae0875bd2ed9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4c727d40ef0443449afc31724ee79f0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4d05314858354e729d76094b3b0ce761": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c42acf646f344a88b8c11f81e67f7206", + "IPY_MODEL_7be6f04c284e4326bb4ff3d301e7b3c6", + "IPY_MODEL_ffdbb12a2f2c4d14911685e7683e0ef0" + ], + "layout": "IPY_MODEL_bee3501b2a17427784a717e50a85e7fa" + } + }, + "4d468f96ec924681ad65eb671674b93e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "4f1977d7e4824ef1a14b65f0f42bba10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "4fd114abe9f5494ab59858949f5055f1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "500e272208a246089613bf788a165271": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_200df5e79b9244849e589ecb0250a520", + "IPY_MODEL_cc94432d08464affa3e58b560bdad194", + "IPY_MODEL_3036608c71904ce9ae4bb2a9fa8802d9" + ], + "layout": "IPY_MODEL_adacfdcc1b0140efac56918e9ccf064e" + } + }, + "519a7b154022443db6703f04a9142bae": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d02274afd47b462291c745f261209d42", + "max": 27341251, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0f417447a7bd4a33acca96fa37aec877", + "value": 27341251 + } + }, + "56e3768bef5a4b9db4168c5c17f509c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "590eef89881545aa8bbef9a8bbe7fb00": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "598da69727bd4fb8b1caf465ac736d7a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": 
"1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5bdfd87fc6cd4f9dabef7cfee29c8060": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4d468f96ec924681ad65eb671674b93e", + "placeholder": "​", + "style": "IPY_MODEL_ad7599de524549c48bf2d3124ad4b299", + "value": "Dropping Long Sequences (num_proc=2): 100%" + } + }, + "5ca240f31e6b44e3882c5eb37cd5a309": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": "20px" + } + }, + "5ca6be24acb548cea130bd58e9954c7c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"5cea7996f02040b187ece0bb2d6a8d1f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5cfb02ee044b4011a378efa8b54a370f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5dd7d150dbe04f08b165ce7f2c27cd11": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e18768f7ad6434ba8b8b8a2e853e204": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5e5e15b0569b474c9620083b3ec6af55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "5e746eb25bbe416fb585fa24e79f5177": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + 
"description_width": "" + } + }, + "5eb06edeb58e4930b1affef2a59eae81": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5f86cd894de94c3280fadc1e2fd0ee13": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a20927bf5f2c41f58c1e31ac858ab36c", + "IPY_MODEL_0a46ad75c198463d843fb35e813642cb", + "IPY_MODEL_09007681cf8d42aeb8c1d2f6a74e470a" + ], + "layout": "IPY_MODEL_ebc80d1a55fa47f4a5ea2756588569ec" + } + }, + "60c1a0d765c14a1d888317e6a507e4ea": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "62c028fdef904dedb9cdeca2b3bda725": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + 
"overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "62e1a65582f446a78612eaa804e08a7d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5e18768f7ad6434ba8b8b8a2e853e204", + "placeholder": "​", + "style": "IPY_MODEL_bb33aec33a6447078c31bfd728942994", + "value": " 728/728 [00:00<00:00, 20.3kB/s]" + } + }, + "62e302ebdad64aada0ffe64ae1c873f3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63580b6fb30642479fe3000915bf551a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "63b4e563e85c4f03b1b72beda9577bcc": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "64f54d4a744a4627a07c3c0120276f3b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0546d04aae644dde846c58a4afb598a6", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_897b77a56c09479bb11d7f2a30997e55", + "value": 9985 + } + }, + "65b75b9b8bc143cf997796af68ff6668": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_81c3db71ac704280ad030072655f1537", + "placeholder": "​", + "style": "IPY_MODEL_042e091f75694c47aee761e760e76773", + "value": " 9985/9985 [00:02<00:00, 3977.47 examples/s]" + } + }, + "67da6c4260574869aa24c3cbc1bc1654": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6932489232ec4ab18a160b1e7fbcdfe1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + 
"grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6ebb2ec171414e47a14765505f64bb3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "6f05e9bebf7b40c9835808e77de6c236": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "PasswordModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "PasswordModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "PasswordView", + "continuous_update": true, + "description": "Token:", + "description_tooltip": null, + "disabled": false, + "layout": "IPY_MODEL_2e257c8be2da40b4bb67a9e4ab6811f3", + "placeholder": "​", + "style": "IPY_MODEL_56e3768bef5a4b9db4168c5c17f509c2", + "value": "" + } + }, + "6f3a28b912714c6e931003549664bfa3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5ca240f31e6b44e3882c5eb37cd5a309", + "max": 1, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5eb06edeb58e4930b1affef2a59eae81", + "value": 1 + } + }, + "6f68ed9889f54ad2ae8a3b95ac263a83": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_41f3b32c2f6b4034ae7a3b9124e28bc7", + "placeholder": "​", + "style": "IPY_MODEL_a10d0a76010f4e508c65a9b69ebc5156", + "value": "Tokenizing Prompts (num_proc=2): 100%" + } + }, + "704f2f5a9b1c49d5a75a0025a5dda11b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", 
+ "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "71002199df6b40c9a1ac40df5fb27a1b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "71c8af139cd248b1b51101fd46a93f35": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d0e9dce55cec4c1ca619a0ccf209d924", + "max": 9675, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4c727d40ef0443449afc31724ee79f0c", + "value": 9675 + } + }, + "734185351eb543fa9a00a881dcbb9fe7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "735d4f225b24414294fc1b213c61223c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + 
"min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "742b1030acfd414bbd9d5327b7e3826d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "77304d1a46b3468a98483e02ec0ac4a4": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "7baeab52d6694c32b1efd1ea1a0a7782": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_93a44a11aa4846fa8efc6c1413ef1627", + "placeholder": "​", + "style": "IPY_MODEL_a55060adc3564407ac81ad7297d34aaa", + "value": "train.jsonl: 100%" + } + }, + "7be6f04c284e4326bb4ff3d301e7b3c6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9503a45960984adc97b58e16c50662e0", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_da6e93f3e4984780b930fe7a706983ea", + "value": 3963750502 + } + }, + "7c2485c6cdfe463da6fdb35982a1070d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + 
"_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_ad1236893754446881e153adc9d5c962", + "IPY_MODEL_daee63fd167e4441a32324b51b00ad2b", + "IPY_MODEL_fe41858c6bd04c58840112b67c19a336" + ], + "layout": "IPY_MODEL_d262c82138024169b9f3aa034ca756fa" + } + }, + "7c95f85a2b1f47a1bd846d110c47bb3c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_7fd44cf9ca6e4726bfd7ac21846d6a14", + "placeholder": "​", + "style": "IPY_MODEL_366a343b62fa47d8985a3bd464d99f9e", + "value": "config.json: 100%" + } + }, + "7cd0b85ebd204b7aba908417811ce4e0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_7baeab52d6694c32b1efd1ea1a0a7782", + "IPY_MODEL_519a7b154022443db6703f04a9142bae", + "IPY_MODEL_d4183e9715f34d249942b8271cca3bdf" + ], + "layout": "IPY_MODEL_da2347ac94764a3fa2743343cf0d3cd2" + } + }, + "7e5d3774060e4589aa65982da5ea4ef4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "7fd44cf9ca6e4726bfd7ac21846d6a14": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + 
"width": null + } + }, + "80366349d81e4dcc892db6cd56e384f3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f8ef805b776145c3bfa9ba8d90972058", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_cc587493c33c4f118d1b1170f85be24c", + "value": 9985 + } + }, + "813621384dc748b0ad06775e22761c0b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "81c3db71ac704280ad030072655f1537": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "82177df57a494de8900c14c2f5185175": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_67da6c4260574869aa24c3cbc1bc1654", + "placeholder": "​", + "style": "IPY_MODEL_94b9088614464f60a203de39dbcae853", + "value": " 8/8 [01:47<00:00, 11.64s/it]" + } + }, + "823f1c78f15043e38bbd4dca3932a86a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_03a3c744d716431488163b4358b80f92", + "max": 239, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_a5434ee714f9498d83870544b67c0cb7", + "value": 239 + } + }, + "835bcc28a5564fb9b3d651bc8e32dc46": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8640ac440fbc4644b9a3af7ba3ae7183": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "86816687746246b4a6105e8010384e25": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8640ac440fbc4644b9a3af7ba3ae7183", + "placeholder": "​", + "style": "IPY_MODEL_5cea7996f02040b187ece0bb2d6a8d1f", + "value": "
<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svg\nalt='Hugging Face'> <br> Copy a token from <a\nhref=\"https://huggingface.co/settings/tokens\" target=\"_blank\">your Hugging Face\ntokens page</a> and paste it below. <br> Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. </center>
" + } + }, + "879c8ab5873847a8833bd74123be90a4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ef223e8504b64e3592589880326aaf41", + "placeholder": "​", + "style": "IPY_MODEL_598da69727bd4fb8b1caf465ac736d7a", + "value": " 1.67M/1.67M [00:00<00:00, 19.0MB/s]" + } + }, + "897b77a56c09479bb11d7f2a30997e55": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "8bc9d8ba866c442b9118d9630009939c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8c4d4fc5a30f4e7cb3be53fe2adda33d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f5bd719974e41c3a8dd9a5b0d3d71e6": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8f726dbfb45d4528afa33e36a6313267": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9327977822be4b1294f80e876552e305": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_37de928300e34184881039378bd75e7f", + "placeholder": "​", + "style": "IPY_MODEL_0e936d9dbf9c4fdd86bbfe9730dedc47", + "value": " 3.96G/3.96G [00:13<00:00, 273MB/s]" + } + }, + "936d04b5fe1b4c63bf0b080e423d051b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + 
"object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "93a44a11aa4846fa8efc6c1413ef1627": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "94b9088614464f60a203de39dbcae853": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9503a45960984adc97b58e16c50662e0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95caff42f08a4c2aa14c867b8f37f231": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": 
"1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_de7c37ee83e24f0c889e84d07279c2ec", + "IPY_MODEL_9d4897eefb5f48259ffb2d23e332f752", + "IPY_MODEL_253017b0d0534e54ab44e181f6d7c82d" + ], + "layout": "IPY_MODEL_27beaf06e41b472abdb544a43c720c5a" + } + }, + "977f799afaac4a55b2dc1cffa7d5b63b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "97e36007e1304e1583fd81bfb13f0edd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9858cb74a09748a39e8149baac96702c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9b42e08b3c9548818488268768a118b1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + 
"_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d955dcaa0e944e719f3a06139dd54a03", + "placeholder": "​", + "style": "IPY_MODEL_d3de2662c7964f1ba96e58da382af720", + "value": "merges.txt: 100%" + } + }, + "9cd5211b5d8b457aa0002f1d17b80028": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6932489232ec4ab18a160b1e7fbcdfe1", + "placeholder": "​", + "style": "IPY_MODEL_4540927d98f54466b434ba4c0edf045d", + "value": "model-00007-of-00008.safetensors: 100%" + } + }, + "9d4897eefb5f48259ffb2d23e332f752": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_30a81da86f8043eca301e86a8651201a", + "max": 2776833, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_e8b7a81040904c1e89e58978223b1737", + "value": 2776833 + } + }, + "9e333ed3b5014069ac1dd969255dd591": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9ed02dc43412471a9ab47f3620ccf3a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9f1c9a0695384bdaa6f8b847ef89bee8": { + "model_module": 
"@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ButtonStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ButtonStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "button_color": null, + "font_weight": "" + } + }, + "9f56a2d9979c4bd8928c644c22c3ecdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a0a11e929edd4189b79723d618522c33": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a10d0a76010f4e508c65a9b69ebc5156": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a138859f19b74fc0928dc236ab5359db": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_9b42e08b3c9548818488268768a118b1", + "IPY_MODEL_12b56912736849fea2ad8124456fdc5c", + "IPY_MODEL_879c8ab5873847a8833bd74123be90a4" + ], + "layout": "IPY_MODEL_20352e5f58d24bb8b1f3940efd14fe4a" + } + }, + "a1959759c5424da9961fb2a308d4dee4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_3aaecbf540f54a2db9ab0931e3b1fe57", + "placeholder": "​", + "style": "IPY_MODEL_9e333ed3b5014069ac1dd969255dd591", + "value": " 239/239 [00:00<00:00, 30.9kB/s]" + } + }, + "a20927bf5f2c41f58c1e31ac858ab36c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_1811cda0644e4190a9469d1774435d82", + "placeholder": "​", + "style": "IPY_MODEL_35c811d2ae8e43f3b5cecbdd3cfa857f", + "value": "tokenizer.json: 100%" + } + }, + "a3a945817f684328b34651fe052393ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a44f630e099e43899f20a77084ae60cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed5ca967ad5342929e578ac6aa4dc4c0", + "placeholder": "​", + "style": "IPY_MODEL_af401d117d5047629d3a6e2361757b62", + "value": "model-00001-of-00008.safetensors: 100%" + } + }, + "a4e5789584564049b83df7c6c54a3e08": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a5434ee714f9498d83870544b67c0cb7": { + "model_module": "@jupyter-widgets/controls", + 
"model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "a55060adc3564407ac81ad7297d34aaa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a6f48410b9964fefba0c3009a77dc838": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a7cf477e80fc43e0ad82c7997b076dce": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "a80410b919e442c49aea15acc1ce1a72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa1282ccc7544e4f818e2f03ccffe4a5", + "placeholder": "​", + "style": "IPY_MODEL_bbbf575d2a4b4c6ea8389be79b2a6039", + "value": "model.safetensors.index.json: 100%" + } + }, + "ab93eabd7cea4b94b4b7a387f101e8a1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, 
+ "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ac764024cf1c4e08ba7749afd2cd20ac": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ad1236893754446881e153adc9d5c962": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_62e302ebdad64aada0ffe64ae1c873f3", + "placeholder": "​", + "style": "IPY_MODEL_bd1b0dfed6d34d16af33a4a58330f5ec", + "value": "Saving the dataset (1/1 shards): 100%" + } + }, + "ad7599de524549c48bf2d3124ad4b299": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "adacfdcc1b0140efac56918e9ccf064e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "af401d117d5047629d3a6e2361757b62": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b191ac001a2e4962bc9a245fcdf26e6b": { + "model_module": 
"@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b195f160ca20442fadd8b5aed0ee41af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1bea589efa14258a9982071b87938bf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, 
+ "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b5b65414154544aa8a71b1a39164aad7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b634bb73cfa743d09a5999101b840976": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b82aa8c57f7c422a9a9c90f333ed2a99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c0991cf63ee6458b96e9a75e7a88b61a", + "IPY_MODEL_71c8af139cd248b1b51101fd46a93f35", + "IPY_MODEL_1d5117195d4b49eb8f1a73b18419f7ce" + ], + "layout": "IPY_MODEL_3c21e4a511b4441192c03b7f1d0976e9" + } + }, + "b8766a88716948cf968f4563531a76d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": 
"1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2b3a2659b12244bd8548320320016dbf", + "placeholder": "​", + "style": "IPY_MODEL_0cd7efffbb3c4c4b972e63749f61ab97", + "value": "Generating train split: " + } + }, + "b87c84de30e84b3abf4871461fb9cbd3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b8e39e4dddc3497fbc29ae45c66da759": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bb33aec33a6447078c31bfd728942994": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bbbf575d2a4b4c6ea8389be79b2a6039": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "bca2c7185b6749fd899c06a2ba4c5e46": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_0f480e3a0b0a45d2a2d2dec3cad923f3", + "placeholder": "​", + "style": "IPY_MODEL_fcb30372e7404c5d8a1ad4df91e6c7b2", + "value": " 1.91G/1.91G [00:05<00:00, 444MB/s]" + } + }, + "bd1b0dfed6d34d16af33a4a58330f5ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "be724f04b03942b2a033a7e8898bb4fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bed8726b8069434687c75452e21f19e5": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fa864b41586f4a7aa56aeafd1d84eb75", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_3225603166b54e7aab766b9964a2f660", + "value": 9985 + } + }, + "bee3501b2a17427784a717e50a85e7fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, 
+ "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bfcdbba993b74972a9e3e575f86908ff": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "bff139df987d4a62abec6456cb27f3d4": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c1f9c267ba3f40039cdb5eb3267e8043", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_33b3b1d0295646edaac7b4822761aeb0", + "value": 3963750502 + } + }, + "c0892a1881de4eb4bfabc6a68f87ae99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_158c8b85dbf34de6a94b4e35e2fc7d5a", + "placeholder": "​", + "style": "IPY_MODEL_0b4c9753a7cb4354b8e5f187e6e1ad7c", + "value": " 3.96G/3.96G [00:15<00:00, 564MB/s]" + } + }, + "c0991cf63ee6458b96e9a75e7a88b61a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ed28e2e0410d4e0b855467e798e53d66", + "placeholder": "​", + 
"style": "IPY_MODEL_d93f134f802b4b69b575bdaf07dbd27c", + "value": "tokenizer_config.json: 100%" + } + }, + "c12ea43372ac4d57bb9605f1a429b397": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "VBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "VBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "VBoxView", + "box_style": "", + "children": [], + "layout": "IPY_MODEL_131065f118274a1586ac38e39ed84ef0" + } + }, + "c1314f241a434c41b45d84dc4d3b30f8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c1f9c267ba3f40039cdb5eb3267e8043": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c33ced495f70464aa4a3a91922090853": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + 
"top": null, + "visibility": null, + "width": null + } + }, + "c3725c7f79fe415fbd1ea336f0cc9cf1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b191ac001a2e4962bc9a245fcdf26e6b", + "max": 3841788544, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_054c8dffadba48c6b895a6cc62448ecc", + "value": 3841788178 + } + }, + "c3be9109d63c485d9c0ef4f9bc0f9218": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c42acf646f344a88b8c11f81e67f7206": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8bc9d8ba866c442b9118d9630009939c", + "placeholder": "​", + "style": "IPY_MODEL_9f56a2d9979c4bd8928c644c22c3ecdf", + "value": "model-00003-of-00008.safetensors: 100%" + } + }, + "c6164e05a1914ae48083db9ad7f4ef7c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, 
+ "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c65dc74c7d6f4bab8f7dd28455161dd8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c6e00f5224364822bc4239b176686919": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_2a51b36be41745468e4c2d7a21b1c0d2", + "max": 36514, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4fd114abe9f5494ab59858949f5055f1", + "value": 36514 + } + }, + "c73055099c084dca996159e23e162d0b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e40d1c1ac9494b3bade9858324e7ffdf", + "placeholder": "​", + "style": "IPY_MODEL_d65b6b060d9845779299491ac5599c31", + "value": " 9985/9985 [01:04<00:00, 189.08 examples/s]" + } + }, + "c7433acd3c4841e6958ae8f7e87b1808": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "CheckboxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "CheckboxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "CheckboxView", + "description": "Add token as git credential?", + "description_tooltip": null, + "disabled": false, + "indent": true, + "layout": "IPY_MODEL_62c028fdef904dedb9cdeca2b3bda725", + "style": "IPY_MODEL_a7cf477e80fc43e0ad82c7997b076dce", + "value": false + } + }, + "c84cc07789be48aebb322c23d355289e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": 
"IPY_MODEL_0077aedc3d174560bce924ee89e9c006", + "placeholder": "​", + "style": "IPY_MODEL_00321cce58884f6f9b3855a21fcd9187", + "value": "Add position_id column (Sample Packing) (num_proc=2): 100%" + } + }, + "ca65e32eb52f48c09a84b33cb18f22cd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "cc587493c33c4f118d1b1170f85be24c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cc94432d08464affa3e58b560bdad194": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b5b65414154544aa8a71b1a39164aad7", + "max": 3963750816, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_f0a58fbd0fca4340890041f99fa2f8c8", + "value": 3963750438 + } + }, + "ccfcdc95baf646f8aeb3d516742383f2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "cdebbc55a1164c018546c2ac6f8c620c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + 
"_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a44f630e099e43899f20a77084ae60cd", + "IPY_MODEL_c3725c7f79fe415fbd1ea336f0cc9cf1", + "IPY_MODEL_0e50870ed0c643e0b6c18cc5d7ddae7f" + ], + "layout": "IPY_MODEL_c33ced495f70464aa4a3a91922090853" + } + }, + "d02274afd47b462291c745f261209d42": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d07c8b97d3314f1c852e44bdd40f61ed": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d0e9dce55cec4c1ca619a0ccf209d924": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": 
null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d1f9b10c130542f094c8fd3d1e23b5e9": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d262c82138024169b9f3aa034ca756fa": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d3de2662c7964f1ba96e58da382af720": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d4183e9715f34d249942b8271cca3bdf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": 
"HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_63580b6fb30642479fe3000915bf551a", + "placeholder": "​", + "style": "IPY_MODEL_8f726dbfb45d4528afa33e36a6313267", + "value": " 27.3M/27.3M [00:00<00:00, 31.0MB/s]" + } + }, + "d43c6df07ddb466587807d6dbe1ff614": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_8c4d4fc5a30f4e7cb3be53fe2adda33d", + "placeholder": "​", + "style": "IPY_MODEL_e90658f4bcb642baa78426012f863152", + "value": "model-00004-of-00008.safetensors: 100%" + } + }, + "d65b6b060d9845779299491ac5599c31": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d6fe74e4255444368f8f90a62157d869": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d93f134f802b4b69b575bdaf07dbd27c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "d955dcaa0e944e719f3a06139dd54a03": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + 
"state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da2347ac94764a3fa2743343cf0d3cd2": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "da6e93f3e4984780b930fe7a706983ea": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "daee63fd167e4441a32324b51b00ad2b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d07c8b97d3314f1c852e44bdd40f61ed", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_ebb69a2c3d0a4299a484698287b3087c", + "value": 9985 + } + }, + 
"dc892a596f6942d7973c616c38f0eebb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_c84cc07789be48aebb322c23d355289e", + "IPY_MODEL_bed8726b8069434687c75452e21f19e5", + "IPY_MODEL_16a188a0b06d45f980dcf3933509fe0a" + ], + "layout": "IPY_MODEL_60c1a0d765c14a1d888317e6a507e4ea" + } + }, + "dd0e646fad3f4a89ba23b39d162bd8d9": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_d43c6df07ddb466587807d6dbe1ff614", + "IPY_MODEL_e0e8b840b8ea4d0d9db09afe99fa287d", + "IPY_MODEL_9327977822be4b1294f80e876552e305" + ], + "layout": "IPY_MODEL_77304d1a46b3468a98483e02ec0ac4a4" + } + }, + "de7c37ee83e24f0c889e84d07279c2ec": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_34cf3df51fbc41cabfdbba153c007f0e", + "placeholder": "​", + "style": "IPY_MODEL_ac764024cf1c4e08ba7749afd2cd20ac", + "value": "vocab.json: 100%" + } + }, + "dfd2a2649b8341ef913207526708aff1": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e09f1bcbb9d94c09be53e5e1303642c2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + 
"_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_e7d8e4fe58384e93a106de546068c65e", + "max": 8, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_0aa8ab56b85f4171a79c3bc210594025", + "value": 8 + } + }, + "e0e8b840b8ea4d0d9db09afe99fa287d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_f7434f3e03124a1c938a39af79d7fa59", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c1314f241a434c41b45d84dc4d3b30f8", + "value": 3963750502 + } + }, + "e21e180307e5485cbbe908672fd6639a": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_2e2b0c1599c341a198f632f46a40c90e", + "IPY_MODEL_bff139df987d4a62abec6456cb27f3d4", + "IPY_MODEL_ebe1cc366d324ad59b264c8b3c431441" + ], + "layout": "IPY_MODEL_114dece49dba437c8572ef94b23c3b1e" + } + }, + "e366ae3fceec4566b9ed303d6c5f90af": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e3fb3fc6afe04b3c9b7ac61809ce78fa": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + 
"description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_c6164e05a1914ae48083db9ad7f4ef7c", + "placeholder": "​", + "style": "IPY_MODEL_813621384dc748b0ad06775e22761c0b", + "value": " 9985/9985 [00:03<00:00, 3622.89 examples/s]" + } + }, + "e400cbf14bcc446a9d33b210cd93550b": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e40d1c1ac9494b3bade9858324e7ffdf": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e575d87a7efe4ec7b1efde489839d4a6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e5a82df528bb4e408797a3b6c2758f4a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e6e969610738449887259063967f82b0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e7d8e4fe58384e93a106de546068c65e": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e87ea87fcff247b5bbcc331ba79a8dc2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "e8b7a81040904c1e89e58978223b1737": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + 
"description_width": "" + } + }, + "e90658f4bcb642baa78426012f863152": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "eb1c9535e6a546098b760528b2ea387c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_18357b321ce44d7b8bd9d1c886f69275", + "IPY_MODEL_279937fe03bc4e4eb25b472d7e9df163", + "IPY_MODEL_bca2c7185b6749fd899c06a2ba4c5e46" + ], + "layout": "IPY_MODEL_1f7d30f71bbd4547a9150d21da071055" + } + }, + "ebb69a2c3d0a4299a484698287b3087c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "ebc80d1a55fa47f4a5ea2756588569ec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ebe1cc366d324ad59b264c8b3c431441": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_fba7aa824b38467ab3061b226114cdec", + "placeholder": "​", + "style": "IPY_MODEL_f3075dccbd2747b4a7913b66f44f2596", + "value": " 3.96G/3.96G 
[00:13<00:00, 398MB/s]" + } + }, + "ec030fc3c346426f9abc3a89892258d3": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_dfd2a2649b8341ef913207526708aff1", + "max": 9985, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_4f1977d7e4824ef1a14b65f0f42bba10", + "value": 9985 + } + }, + "ec11d1e5ae7b42c883d9b1f38a65356e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_936d04b5fe1b4c63bf0b080e423d051b", + "placeholder": "​", + "style": "IPY_MODEL_f1cef8e8dc2646fb9fd09f3b09081074", + "value": " 36.5k/36.5k [00:00<00:00, 4.32MB/s]" + } + }, + "ed28e2e0410d4e0b855467e798e53d66": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ed5ca967ad5342929e578ac6aa4dc4c0": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": 
null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "edc99591b9c747b689b94d0052fec14c": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "ef0a3c7a6f14460fb4da096928ae249e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_07fb3a2c8315494e97b447e672dfae06", + "IPY_MODEL_ec030fc3c346426f9abc3a89892258d3", + "IPY_MODEL_e3fb3fc6afe04b3c9b7ac61809ce78fa" + ], + "layout": "IPY_MODEL_c3be9109d63c485d9c0ef4f9bc0f9218" + } + }, + "ef223e8504b64e3592589880326aaf41": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + 
"f0a58fbd0fca4340890041f99fa2f8c8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "f113ebd8c1c34806bea4dd7ed3035173": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f1cef8e8dc2646fb9fd09f3b09081074": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f3075dccbd2747b4a7913b66f44f2596": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f365820a3d3c42b2948abfe32065de14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_735d4f225b24414294fc1b213c61223c", + "placeholder": "​", + "style": "IPY_MODEL_5e5e15b0569b474c9620083b3ec6af55", + "value": "generation_config.json: 100%" + } + }, + "f4667818b9d34a09891cd727a429a610": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4b27c267393640f28f6eae0875bd2ed9", + "placeholder": "​", + "style": "IPY_MODEL_9858cb74a09748a39e8149baac96702c", + "value": " 3.96G/3.96G [00:11<00:00, 457MB/s]" + } + }, + "f4a1795dc7514a718f478245f521f0ba": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": 
"@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f60a2bdb6b6b4e0e8c3508580e247132": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "danger", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_edc99591b9c747b689b94d0052fec14c", + "max": 3963750880, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_35cc989ca3374e7dba0cb166febc4bde", + "value": 3963750502 + } + }, + "f7434f3e03124a1c938a39af79d7fa59": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "f8ef805b776145c3bfa9ba8d90972058": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": 
null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa1282ccc7544e4f818e2f03ccffe4a5": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fa864b41586f4a7aa56aeafd1d84eb75": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fba7aa824b38467ab3061b226114cdec": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + 
"align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fcb30372e7404c5d8a1ad4df91e6c7b2": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fcbab4d8dced41a18dfccce81e3a45a0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "fd4f333f7ece4450b04e1a9af1f9d2f6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_d1f9b10c130542f094c8fd3d1e23b5e9", + "placeholder": "​", + "style": "IPY_MODEL_e575d87a7efe4ec7b1efde489839d4a6", + "value": "model-00006-of-00008.safetensors: 100%" + } + }, + "fe18bba7f3fb4c31bf840541f36b3425": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_fd4f333f7ece4450b04e1a9af1f9d2f6", + "IPY_MODEL_f60a2bdb6b6b4e0e8c3508580e247132", + "IPY_MODEL_c0892a1881de4eb4bfabc6a68f87ae99" + ], + "layout": "IPY_MODEL_1bec6297c90242a88672d195bc09d429" + } + }, + "fe41858c6bd04c58840112b67c19a336": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + 
"description_tooltip": null, + "layout": "IPY_MODEL_e5a82df528bb4e408797a3b6c2758f4a", + "placeholder": "​", + "style": "IPY_MODEL_f113ebd8c1c34806bea4dd7ed3035173", + "value": " 9985/9985 [00:00<00:00, 44264.88 examples/s]" + } + }, + "fea1b70fb46745feb5111b3929175b5d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_f365820a3d3c42b2948abfe32065de14", + "IPY_MODEL_823f1c78f15043e38bbd4dca3932a86a", + "IPY_MODEL_a1959759c5424da9961fb2a308d4dee4" + ], + "layout": "IPY_MODEL_34c9c0137b504cd799c6bd6de69507c2" + } + }, + "ff3a94b146a948b6907f5d80c7157f99": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ffdbb12a2f2c4d14911685e7683e0ef0": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_ab93eabd7cea4b94b4b7a387f101e8a1", + "placeholder": "​", + "style": "IPY_MODEL_704f2f5a9b1c49d5a75a0025a5dda11b", + "value": " 3.96G/3.96G [00:12<00:00, 656MB/s]" + } + } + } + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/pyproject.toml b/pyproject.toml index 36138c65d..932219d9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -26,3 +26,34 @@ include-package-data = true [tool.setuptools.cmdclass] build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand" + +[tool.ruff] +line-length = 88 +target-version = "py310" + +[tool.ruff.lint] +select = ["E", "F", "W", "C90", "B"] +ignore = [ + "E203", # Whitespace before ':' + "E501", # Line too long + "C901", # Too complex + "B019", # Use of functools.cache on methods + "E722", # Bare except + "F821", # Undefined name (for dynamic exec) +] + +[tool.ruff.lint.isort] +known-third-party = ["wandb", "comet_ml"] +known-local-folder = ["src", "tests"] +# Black-compatible isort settings +force-single-line = false +combine-as-imports = true +split-on-trailing-comma = true + +[tool.ruff.format] +# Use black's formatting style exactly +quote-style = "double" +indent-style = "space" +skip-magic-trailing-comma = false +line-ending = "auto" +docstring-code-format = false diff --git a/scripts/chat_datasets.py b/scripts/chat_datasets.py index 1a85fcef9..0c1e0bd03 100644 --- a/scripts/chat_datasets.py +++ b/scripts/chat_datasets.py @@ -27,7 +27,7 @@ def parse_dataset(dataset=None, split="train"): break if not field_messages: raise ValueError( - f'No conversation field found in dataset: {", ".join(feature_keys)}' + f"No conversation field found in dataset: {', '.join(feature_keys)}" 
         )
 
     ds_cfg["field_messages"] = field_messages
@@ -40,7 +40,7 @@ def parse_dataset(dataset=None, split="train"):
             break
     if not message_property_mappings["role"]:
         raise ValueError(
-            f'No role field found in messages: {", ".join(message_fields)}'
+            f"No role field found in messages: {', '.join(message_fields)}"
         )
 
     for key in ["content", "text", "value"]:
@@ -49,7 +49,7 @@ def parse_dataset(dataset=None, split="train"):
             break
     if not message_property_mappings["content"]:
         raise ValueError(
-            f'No content field found in messages: {", ".join(message_fields)}'
+            f"No content field found in messages: {', '.join(message_fields)}"
         )
 
     ds_cfg["message_property_mappings"] = message_property_mappings
diff --git a/scripts/unsloth_install.py b/scripts/unsloth_install.py
index acbd05e90..c0e5bbe70 100644
--- a/scripts/unsloth_install.py
+++ b/scripts/unsloth_install.py
@@ -1,11 +1,10 @@
 # noqa
-# pylint: skip-file
 import sys
 
 try:
     import torch
-except ImportError:
-    raise ImportError("Install torch via `pip install torch`")
+except ImportError as error:
+    raise ImportError("Install torch via `pip install torch`") from error
 from packaging.version import Version as V
 
 use_uv = "--uv" in sys.argv[1:]
diff --git a/src/axolotl/cli/art.py b/src/axolotl/cli/art.py
index 2051784e9..81dbb9831 100644
--- a/src/axolotl/cli/art.py
+++ b/src/axolotl/cli/art.py
@@ -22,7 +22,7 @@ HAS_PRINTED_LOGO = False
 
 def print_axolotl_text_art():
     """Prints axolotl ASCII art."""
-    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
+    global HAS_PRINTED_LOGO
     if HAS_PRINTED_LOGO:
         return
     if is_main_process():
diff --git a/src/axolotl/cli/cloud/modal_.py b/src/axolotl/cli/cloud/modal_.py
index 6d4f999b4..7f953372d 100644
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -41,7 +41,7 @@ def run_cmd(cmd: str, run_folder: str, volumes=None):
     if exit_code := subprocess.call(  # nosec B603
         cmd.split(), cwd=run_folder, env=new_env
     ):
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+        exit(exit_code)
 
     # Commit writes to volume.
     if volumes:
@@ -130,7 +130,6 @@ class ModalCloud(Cloud):
         res = []
         if self.config.secrets:
             for key in self.config.get("secrets", []):
-                # pylint: disable=duplicate-code
                 if isinstance(key, str):
                     if val := os.environ.get(key, ""):
                         res.append(modal.Secret.from_dict({key: val}))
@@ -177,8 +176,8 @@ class ModalCloud(Cloud):
         with self.app.run(detach=True):
             modal_fn.remote(
                 config_yaml,
-                volumes={k: v[0] for k, v in self.volumes.items()},
                 *args,
+                volumes={k: v[0] for k, v in self.volumes.items()},
                 **kwargs,
             )
 
@@ -187,7 +186,7 @@ class ModalCloud(Cloud):
             return int(self.config.timeout)
         return 60 * 60 * 24  # 24 hours
 
-    def get_train_gpu(self):  # pylint: disable=too-many-return-statements
+    def get_train_gpu(self):
         count = self.config.gpu_count or 1
 
         family = self.config.gpu.lower() or "l40s"
@@ -277,7 +276,7 @@ def _train(
     launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
     launcher_args: list[str] | None = None,
     volumes=None,
-    **kwargs,  # pylint: disable=unused-argument
+    **kwargs,
 ):
     Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
     with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
diff --git a/src/axolotl/cli/config.py b/src/axolotl/cli/config.py
index 0f1245aed..20e341a0b 100644
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -210,7 +210,7 @@ def load_cfg(
     try:
         device_props = torch.cuda.get_device_properties("cuda")
         gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
-    except:  # pylint: disable=bare-except  # noqa: E722
+    except:
        gpu_version = None
 
     prepare_plugins(cfg)
diff --git a/src/axolotl/cli/evaluate.py b/src/axolotl/cli/evaluate.py
index 9dd3b0083..1a73937a2 100644
--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -28,7 +28,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
         cfg: Dictionary mapping `axolotl` config keys to values.
         cli_args: CLI arguments.
     """
-    # pylint: disable=duplicate-code
+
     check_accelerate_default_config()
     if int(os.getenv("LOCAL_RANK", "0")) == 0:
         check_user_token()
@@ -49,7 +49,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
     """
-    # pylint: disable=duplicate-code
+
     parsed_cfg = load_cfg(config, **kwargs)
     parser = HfArgumentParser(TrainerCliArgs)
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
diff --git a/src/axolotl/cli/inference.py b/src/axolotl/cli/inference.py
index d03a91bc7..06b64292f 100644
--- a/src/axolotl/cli/inference.py
+++ b/src/axolotl/cli/inference.py
@@ -35,7 +35,7 @@ def get_multi_line_input() -> str:
 
     instruction = ""
     for line in sys.stdin:
-        instruction += line  # pylint: disable=consider-using-join
+        instruction += line
 
     return instruction
 
@@ -167,7 +167,6 @@ def do_inference_gradio(
         if not instruction:
             return
         if prompter_module:
-            # pylint: disable=stop-iteration-return
             prompt: str = next(
                 prompter_module().build_prompt(instruction=instruction.strip("\n"))
             )
@@ -252,7 +251,7 @@ def do_cli(
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
""" - # pylint: disable=duplicate-code + parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs) parsed_cfg.sample_packing = False parser = transformers.HfArgumentParser(InferenceCliArgs) diff --git a/src/axolotl/cli/main.py b/src/axolotl/cli/main.py index e63392802..acfa81389 100644 --- a/src/axolotl/cli/main.py +++ b/src/axolotl/cli/main.py @@ -1,7 +1,5 @@ """Click CLI definitions for various axolotl commands.""" -# pylint: disable=redefined-outer-name - import os import subprocess # nosec B404 from typing import Literal, Optional diff --git a/src/axolotl/cli/merge_sharded_fsdp_weights.py b/src/axolotl/cli/merge_sharded_fsdp_weights.py index c99f37fb1..43142d79e 100644 --- a/src/axolotl/cli/merge_sharded_fsdp_weights.py +++ b/src/axolotl/cli/merge_sharded_fsdp_weights.py @@ -32,7 +32,7 @@ LOG = get_logger(__name__) class BFloat16CastPlanner(_EmptyStateDictLoadPlanner): """A custom planner to cast tensors to bfloat16 on the fly during loading.""" - def commit_tensor(self, read_item, tensor): # pylint: disable=unused-argument + def commit_tensor(self, read_item, tensor): tensor.copy_(tensor.to(torch.bfloat16)) @@ -59,10 +59,10 @@ def _distributed_checkpoint_to_merged_weights( state_dict: Dict = {} save_path_ = Path(save_path) save_path_.mkdir(exist_ok=True) - dist_cp_format_utils._load_state_dict( # pylint: disable=protected-access + dist_cp_format_utils._load_state_dict( state_dict, storage_reader=dist_cp.FileSystemReader(checkpoint_dir), - planner=BFloat16CastPlanner(), # pylint: disable=protected-access + planner=BFloat16CastPlanner(), no_dist=True, ) @@ -191,7 +191,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): config: Path to `axolotl` config YAML file. kwargs: Additional keyword arguments to override config file values. """ - # pylint: disable=duplicate-code + parsed_cfg = load_cfg(config, **kwargs) fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0" diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py index 4120062d8..ff4551c64 100644 --- a/src/axolotl/cli/preprocess.py +++ b/src/axolotl/cli/preprocess.py @@ -73,7 +73,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None: AutoModelForCausalLM.from_pretrained( model_name, trust_remote_code=True ) - except Exception as exc: # pylint: disable=broad-exception-caught,unused-variable # nosec B110 # noqa F841 + except Exception: # nosec B110 pass # fmt: on @@ -95,7 +95,7 @@ def do_cli( config: Path to `axolotl` config YAML file. kwargs: Additional keyword arguments to override config file values. """ - # pylint: disable=duplicate-code + os.environ["AXOLOTL_IS_PREPROCESS"] = "1" is_preprocess = kwargs.pop("is_preprocess", True) parsed_cfg = load_cfg(config, is_preprocess=is_preprocess, **kwargs) diff --git a/src/axolotl/cli/train.py b/src/axolotl/cli/train.py index 7f0b0bdd2..5e766de37 100644 --- a/src/axolotl/cli/train.py +++ b/src/axolotl/cli/train.py @@ -59,7 +59,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): config: Path to `axolotl` config YAML file. kwargs: Additional keyword arguments to override config file values. 
""" - # pylint: disable=duplicate-code + parsed_cfg = load_cfg(config, **kwargs) parser = HfArgumentParser(TrainerCliArgs) parsed_cli_args, _ = parser.parse_args_into_dataclasses( diff --git a/src/axolotl/cli/utils/args.py b/src/axolotl/cli/utils/args.py index 3aea1a378..0aec737b8 100644 --- a/src/axolotl/cli/utils/args.py +++ b/src/axolotl/cli/utils/args.py @@ -65,7 +65,7 @@ def add_options_from_dataclass(config_class: Type[Any]) -> Callable: for field in reversed(dataclasses.fields(config_class)): field_type = _strip_optional_type(field.type) - if field_type == bool: + if field_type is bool: field_name = field.name.replace("_", "-") option_name = f"--{field_name}/--no-{field_name}" function = click.option( @@ -103,7 +103,7 @@ def add_options_from_config(config_class: Type[BaseModel]) -> Callable: for name, field in reversed(config_class.model_fields.items()): field_type = _strip_optional_type(field.annotation) - if field_type == bool: + if field_type is bool: field_name = name.replace("_", "-") option_name = f"--{field_name}/--no-{field_name}" function = click.option( diff --git a/src/axolotl/cli/utils/sweeps.py b/src/axolotl/cli/utils/sweeps.py index bb1368cf6..2a0aa1367 100644 --- a/src/axolotl/cli/utils/sweeps.py +++ b/src/axolotl/cli/utils/sweeps.py @@ -49,7 +49,10 @@ def generate_sweep_configs( new_config = {} # new_config = deepcopy(base_config) # Combine regular parameters with paired parameters - full_combo = {**dict(zip(param_names, reg_combo)), **paired_set} + full_combo = { + **dict(zip(param_names, reg_combo, strict=False)), + **paired_set, + } for param_name, param_value in full_combo.items(): new_config[param_name] = param_value print(new_config) @@ -58,7 +61,7 @@ def generate_sweep_configs( # If no paired values, just use regular combinations # new_config = deepcopy(base_config) new_config = {} - for param_name, param_value in zip(param_names, reg_combo): + for param_name, param_value in zip(param_names, reg_combo, strict=False): new_config[param_name] = param_value print(new_config) all_combinations.append(new_config) diff --git a/src/axolotl/cli/utils/train.py b/src/axolotl/cli/utils/train.py index b133d7271..6ce7d8df3 100644 --- a/src/axolotl/cli/utils/train.py +++ b/src/axolotl/cli/utils/train.py @@ -95,7 +95,6 @@ def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str, permutation_id = f"sweep{idx:04d}" permutation["output_dir"] = str(permutation_dir / permutation_id) - # pylint: disable=consider-using-with temp_file = tempfile.NamedTemporaryFile( mode="w", suffix=".yaml", diff --git a/src/axolotl/cli/vllm_serve.py b/src/axolotl/cli/vllm_serve.py index cf687bea2..ea454fc96 100644 --- a/src/axolotl/cli/vllm_serve.py +++ b/src/axolotl/cli/vllm_serve.py @@ -39,7 +39,7 @@ def do_vllm_serve( model = cfg.base_model serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve") - vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main") + vllm_serve_main = __import__(serve_module, fromlist=["main"]).main tensor_parallel_size = 1 data_parallel_size = 1 @@ -68,7 +68,6 @@ def do_vllm_serve( cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False ) - # pylint: disable=unexpected-keyword-arg vllm_script_args = AxolotlScriptArguments( model=model, tensor_parallel_size=tensor_parallel_size, diff --git a/src/axolotl/common/datasets.py b/src/axolotl/common/datasets.py index 0ff52ebe1..e7433e3c2 100644 --- a/src/axolotl/common/datasets.py +++ b/src/axolotl/common/datasets.py @@ -6,6 +6,7 @@ from dataclasses import 
 
 from datasets import Dataset
 
+import axolotl.monkeypatch.data.batch_dataset_fetcher  # noqa: F401
 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
 from axolotl.loaders import load_processor, load_tokenizer
 from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
diff --git a/src/axolotl/convert.py b/src/axolotl/convert.py
index d1bdb34db..9e09b37dc 100644
--- a/src/axolotl/convert.py
+++ b/src/axolotl/convert.py
@@ -67,9 +67,7 @@ class JsonToJsonlConverter:
         self.json_parser = json_parser
         self.jsonl_serializer = jsonl_serializer
 
-    def convert(
-        self, input_file_path, output_file_path
-    ):  # pylint: disable=unused-argument
+    def convert(self, input_file_path, output_file_path):
         content = self.file_reader.read(input_file_path)
         data = self.json_parser.parse(content)
         # data = [r for r in data if r["conversations"]]  # vicuna cleaned has rows with empty conversations
diff --git a/src/axolotl/core/attention/flex_block_mask.py b/src/axolotl/core/attention/flex_block_mask.py
index fb9820f35..37149983c 100644
--- a/src/axolotl/core/attention/flex_block_mask.py
+++ b/src/axolotl/core/attention/flex_block_mask.py
@@ -84,9 +84,7 @@ def create_causal_mask(
     batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
     if attention_mask is not None:
 
-        def causal_doc_mask_mod(
-            batch_idx, head_idx, q_idx, kv_idx
-        ):  # pylint: disable=unused-argument
+        def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
             """
             Defines the logic of a block causal mask by combining both a standard
             causal mask and a block diagonal document mask.
@@ -103,9 +101,7 @@ def create_causal_mask(
         mask_factory_function = causal_doc_mask_mod
     else:
         mask_factory_function = causal_mask_function
-    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[
-        config._attn_implementation  # pylint: disable=protected-access
-    ]
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
 
     # Do not allow skip if we are compiling (this is to match BC)
     allow_is_causal_skip = (
diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py
index e1f649715..44699e6ac 100644
--- a/src/axolotl/core/builders/base.py
+++ b/src/axolotl/core/builders/base.py
@@ -44,7 +44,7 @@ from axolotl.utils.schemas.enums import CustomSupportedOptimizers
 LOG = logging.getLogger(__name__)
 
 with suppress(ImportError):
-    import torch._dynamo  # pylint: disable=ungrouped-imports
+    import torch._dynamo
 
 
 class TrainerBuilderBase(abc.ABC):
@@ -260,14 +260,14 @@ class TrainerBuilderBase(abc.ABC):
             adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
 
         if self.cfg.optimizer == "muon":
-            from axolotl.contribs.mit.muon import (  # pylint: disable=no-name-in-module
+            from axolotl.contribs.mit.muon import (
                 MuonOptimizerFactory,
             )
 
             optimizer_cls = MuonOptimizerFactory
             optimizer_kwargs.update(adam_kwargs)
         elif self.cfg.optimizer == "dion":
-            from axolotl.contribs.mit.dion import (  # pylint: disable=no-name-in-module
+            from axolotl.contribs.mit.dion import (
                 DionOptimizerFactory,
             )
 
@@ -414,12 +414,8 @@ class TrainerBuilderBase(abc.ABC):
 
     def _configure_torch_compile(self, training_args_kwargs: dict):
         if self.cfg.torch_compile and getattr(torch, "_dynamo", None):
-            torch._dynamo.config.suppress_errors = (  # pylint: disable=protected-access
-                True
-            )
-            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
-                256
-            )
+            torch._dynamo.config.suppress_errors = True
+            torch._dynamo.config.accumulated_cache_size_limit = 256
             training_args_kwargs["torch_compile"] = self.cfg.torch_compile
             if self.cfg.torch_compile_backend:
                 training_args_kwargs["torch_compile_backend"] = (
diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py
index e5bc21762..94b0db851 100644
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -344,16 +344,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             training_args_cls = AxolotlPRMConfig
         else:
             training_args_cls = AxolotlTrainingArguments
-        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
+        training_args = training_args_cls(
            **training_arguments_kwargs,
         )
         training_args = self.hook_post_create_training_args(training_args)
 
         # unset run_name so wandb sets up experiment names
         if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
-                None
-            )
+            training_args.run_name = None
 
         data_collator_kwargs = {
             "padding": True,  # True/"longest" is the default
diff --git a/src/axolotl/core/builders/rl.py b/src/axolotl/core/builders/rl.py
index bc7816807..a6e8355f4 100644
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -168,16 +168,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         if plugin_training_args:
             training_args_kwargs.update(plugin_training_args)
 
-        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
+        training_args = training_args_cls(
             logging_first_step=True,
             **training_args_kwargs,
         )
 
         # unset run_name so wandb sets up experiment names
         if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
-                None
-            )
+            training_args.run_name = None
 
         return training_args, trainer_kwargs
 
diff --git a/src/axolotl/core/chat/format/chatml.py b/src/axolotl/core/chat/format/chatml.py
index 04c398fe8..deb8a9997 100644
--- a/src/axolotl/core/chat/format/chatml.py
+++ b/src/axolotl/core/chat/format/chatml.py
@@ -10,7 +10,7 @@ from .shared import wrap_tools
 
 def format_message(
     message: Messages,
-    message_index: Optional[int] = None,  # pylint: disable=unused-argument
+    message_index: Optional[int] = None,
 ) -> Messages:
     if message.is_chat_formatted:
         return message
diff --git a/src/axolotl/core/chat/messages.py b/src/axolotl/core/chat/messages.py
index 923b177c1..912a12ca1 100644
--- a/src/axolotl/core/chat/messages.py
+++ b/src/axolotl/core/chat/messages.py
@@ -15,11 +15,11 @@ class MessageRoles(str, Enum):
     Message roles for the system, user, assistant, and tools
     """
 
-    system = "system"  # pylint: disable=invalid-name
-    user = "user"  # pylint: disable=invalid-name
-    assistant = "assistant"  # pylint: disable=invalid-name
-    tool = "tool"  # pylint: disable=invalid-name
-    ipython = (  # pylint: disable=invalid-name
+    system = "system"
+    user = "user"
+    assistant = "assistant"
+    tool = "tool"
+    ipython = (
         # for responses from builtin tools
         "ipython"
     )
@@ -30,12 +30,12 @@ class MessageContentTypes(str, Enum):
     Message content types for text, image, audio, tool calls, and tool responses
     """
 
-    special_token = "special_token"  # pylint: disable=invalid-name  # nosec B105
-    text = "text"  # pylint: disable=invalid-name
-    image = "image"  # pylint: disable=invalid-name
-    audio = "audio"  # pylint: disable=invalid-name
-    tool_call = "tool_call"  # pylint: disable=invalid-name  # to differentiate regular responses from tool calls from the assistant
-    tool_response = "tool_response"  # pylint: disable=invalid-name
+    special_token = "special_token"  # nosec B105
= "text" + image = "image" + audio = "audio" + tool_call = "tool_call" + tool_response = "tool_response" class SpecialToken(str, Enum): @@ -43,8 +43,8 @@ class SpecialToken(str, Enum): Special tokens for beginning of string and end of string """ - bos_token = "bos_token" # pylint: disable=invalid-name # nosec B105 - eos_token = "eos_token" # pylint: disable=invalid-name # nosec B105 + bos_token = "bos_token" # nosec B105 + eos_token = "eos_token" # nosec B105 class ToolCallFunction(BaseModel): @@ -73,7 +73,7 @@ class ToolCallContents(BaseModel): name: str arguments: dict[str, Union[str, int]] - id: Optional[str] = None # pylint: disable=invalid-name + id: Optional[str] = None def __str__(self) -> str: data = {"name": self.name, "arguments": self.arguments} @@ -89,7 +89,7 @@ class ToolResponseContents(BaseModel): name: str content: Union[str, dict[str, Union[str, int, float]]] - id: Optional[str] = None # pylint: disable=invalid-name + id: Optional[str] = None def __str__(self) -> str: data = {"name": self.name, "content": self.content} diff --git a/src/axolotl/core/datasets/transforms/chat_builder.py b/src/axolotl/core/datasets/transforms/chat_builder.py index 692fe3ebb..8f2013027 100644 --- a/src/axolotl/core/datasets/transforms/chat_builder.py +++ b/src/axolotl/core/datasets/transforms/chat_builder.py @@ -1,23 +1,17 @@ """ -This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat. +This module contains a function that builds a transform that takes a row from the +dataset and converts it to a Chat. """ -from typing import Any, Mapping, Union +from typing import Any, Mapping -def chat_message_transform_builder( # pylint: disable=dangerous-default-value +def chat_message_transform_builder( train_on_inputs=False, conversations_field: str = "conversations", - message_field_role: Union[str, list[str]] = ["role", "from"], # commonly "role" - message_field_content: Union[str, list[str]] = [ - "value", - "text", - "content", - ], # commonly "content" - message_field_training: Union[str, list[str]] = [ - "train", - "weight", - ], # commonly "weight" + message_field_role: str | list[str] | None = None, # commonly "role" + message_field_content: str | list[str] | None = None, # commonly "content" + message_field_training: str | list[str] | None = None, # commonly "weight" ): """Builds a transform that takes a row from the dataset and converts it to a Chat @@ -39,6 +33,12 @@ def chat_message_transform_builder( # pylint: disable=dangerous-default-value A function that takes a list of conversations and returns a list of messages. 
""" + if message_field_training is None: + message_field_training = ["train", "weight"] + if message_field_content is None: + message_field_content = ["value", "text", "content"] + if message_field_role is None: + message_field_role = ["role", "from"] message_field_role = ( [message_field_role] if isinstance(message_field_role, str) diff --git a/src/axolotl/core/trainers/__init__.py b/src/axolotl/core/trainers/__init__.py index a9cda4efc..22d8b64f6 100644 --- a/src/axolotl/core/trainers/__init__.py +++ b/src/axolotl/core/trainers/__init__.py @@ -1,6 +1,5 @@ """Init for axolotl.core.trainers""" -# pylint: disable=unused-import # flake8: noqa from .base import AxolotlTrainer diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index 0f9f6e4c4..4b8861790 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -1,7 +1,5 @@ """Module for customized trainers""" -# pylint: disable=too-many-lines - from __future__ import annotations import os @@ -285,9 +283,9 @@ class AxolotlTrainer( # fmt: off if dataloader_key is not None and self.args.dataloader_persistent_workers: if hasattr(self, "_eval_dataloaders"): - self._eval_dataloaders[dataloader_key] = dataloader # type: ignore # pylint: disable=access-member-before-definition + self._eval_dataloaders[dataloader_key] = dataloader # type: ignore else: - self._eval_dataloaders = {dataloader_key: dataloader} # pylint: disable=attribute-defined-outside-init + self._eval_dataloaders = {dataloader_key: dataloader} # fmt: on return self.accelerator.prepare(dataloader) @@ -443,7 +441,7 @@ class AxolotlTrainer( model, inputs, return_outputs=False, - num_items_in_batch=None, # pylint: disable=unused-argument + num_items_in_batch=None, ): concat_inputs = AxolotlTrainer.orpo_concatenate_inputs( inputs, @@ -524,9 +522,7 @@ class AxolotlTrainer( accelerator_config = self.args.accelerator_config.to_dict() use_configured_state = accelerator_config.get("use_configured_state", False) if not use_configured_state: - AcceleratorState._reset_state( # pylint: disable=protected-access - reset_partial_state=True - ) + AcceleratorState._reset_state(reset_partial_state=True) super().create_accelerator_and_postprocess() @@ -540,7 +536,6 @@ class AxolotlTrainer( ): self.accelerator.state.fsdp_plugin.limit_all_gathers = True - # pylint: disable=unused-argument def additional_accelerator_args( self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs ) -> dict[str, Any]: diff --git a/src/axolotl/core/trainers/dpo/trainer.py b/src/axolotl/core/trainers/dpo/trainer.py index b3067bb46..b04505d89 100644 --- a/src/axolotl/core/trainers/dpo/trainer.py +++ b/src/axolotl/core/trainers/dpo/trainer.py @@ -101,11 +101,11 @@ class AxolotlDPOTrainer( ) -> dict[str, torch.Tensor]: if self.args.dpo_norm_loss: # fmt: off - loss_type: str = self.loss_type # type: ignore[has-type] # pylint: disable=access-member-before-definition + loss_type: str = self.loss_type # type: ignore[has-type] # fmt: on # concatenated_forward handles avg token logprob for ipo case already - self.loss_type = "ipo" # pylint: disable=attribute-defined-outside-init + self.loss_type = "ipo" res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model) - self.loss_type = loss_type # pylint: disable=attribute-defined-outside-init + self.loss_type = loss_type return res return super().concatenated_forward(model, batch, is_ref_model=is_ref_model) diff --git a/src/axolotl/core/trainers/grpo/__init__.py 
index 4106a2a7d..7eda7a0ba 100644
--- a/src/axolotl/core/trainers/grpo/__init__.py
+++ b/src/axolotl/core/trainers/grpo/__init__.py
@@ -128,9 +128,7 @@ class GRPOStrategy:
         return grpo_args_kwargs
 
     @classmethod
-    def set_trainer_args(
-        cls, cfg: DictDefault
-    ) -> list[Any]:  # pylint: disable=unused-argument
+    def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
         trainer_args = []
         if cfg.trl and cfg.trl.reward_funcs:
             reward_funcs = []
@@ -151,7 +149,7 @@ class GRPOStrategy:
         return trainer_kwargs
 
     @classmethod
-    def get_collator(cls, *args, **kwargs):  # pylint: disable=unused-argument
+    def get_collator(cls, *args, **kwargs):
         # No data collation is needed in GRPO, handled by trl's trainer __init__
         return None
 
diff --git a/src/axolotl/core/trainers/grpo/trainer.py b/src/axolotl/core/trainers/grpo/trainer.py
index 49caa6406..f9f5a695b 100644
--- a/src/axolotl/core/trainers/grpo/trainer.py
+++ b/src/axolotl/core/trainers/grpo/trainer.py
@@ -1,7 +1,5 @@
 """Axolotl GRPO trainers (with and without sequence parallelism handling)"""
 
-# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
-
 import warnings
 from functools import partial
 from typing import Any
@@ -52,7 +50,6 @@ from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, Optimizer
 from axolotl.monkeypatch.ring_attn import get_ring_attn_group
 
 if is_peft_available():
-    # pylint: disable=unused-import
     from peft import PeftConfig
 
 
@@ -253,7 +250,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
     def get_train_dataloader(self) -> DataLoader:
         """Get dataloader for training"""
         train_dataset = self.train_dataset
-        # pylint: disable=access-member-before-definition
+
         data_collator = self.data_collator  # type: ignore
 
         # Handle dataset preprocessing
@@ -266,7 +263,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
                 train_dataset, description="training"
             )
         else:
-            self.data_collator = self._get_collator_with_removed_columns(  # pylint: disable=attribute-defined-outside-init
+            self.data_collator = self._get_collator_with_removed_columns(
                 data_collator,
                 description="training",
             )
@@ -308,10 +305,10 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
         # Generate completions using either vLLM or regular generation
         if self.args.use_vllm:
             # First, have main process load weights if needed
-            # pylint: disable=access-member-before-definition
+
             if self.state.global_step != self._last_loaded_step:  # type: ignore[has-type]
                 self._move_model_to_vllm()
-                # pylint: disable=attribute-defined-outside-init
+
                 self._last_loaded_step = self.state.global_step
 
             # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
@@ -333,8 +330,9 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
                 # Extract prompts from this SP group, accounting for num_generations duplicates
                 # We only need prompts from one rank in each SP group
                 group_prompts = all_prompts_text[
-                    group_leader_rank
-                    * len(prompts_text) : (group_leader_rank + 1)
+                    group_leader_rank * len(prompts_text) : (
+                        group_leader_rank + 1
+                    )
                     * len(prompts_text) : self.num_generations
                 ]
 
@@ -485,7 +483,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
         )
         if is_conversational(inputs[0]):
             completions = []
-            for prompt, completion in zip(prompts, completions_text):
+            for prompt, completion in zip(prompts, completions_text, strict=False):
                 bootstrap = (
                     prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
                )
@@ -503,6 +501,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): self.reward_funcs, self.reward_processing_classes, self.reward_func_names, + strict=False, ) ): with profiling_context(self, reward_func_name): @@ -511,14 +510,17 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): ): # Module instead of PretrainedModel for compat with compiled models if is_conversational(inputs[0]): messages = [ - {"messages": p + c} for p, c in zip(prompts, completions) + {"messages": p + c} + for p, c in zip(prompts, completions, strict=False) ] texts = [ apply_chat_template(x, reward_processing_class)["text"] for x in messages ] else: - texts = [p + c for p, c in zip(prompts, completions)] + texts = [ + p + c for p, c in zip(prompts, completions, strict=False) + ] reward_inputs = reward_processing_class( text=texts, return_tensors="pt", @@ -564,7 +566,8 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer): row_reward_kwargs["completion"] = completions[nan_row_idx] warnings.warn( f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. " - "Please ensure that at least one reward function returns a valid reward." + "Please ensure that at least one reward function returns a valid reward.", + stacklevel=2, ) # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the diff --git a/src/axolotl/core/trainers/mamba.py b/src/axolotl/core/trainers/mamba.py index b475b26d9..dedda1b29 100644 --- a/src/axolotl/core/trainers/mamba.py +++ b/src/axolotl/core/trainers/mamba.py @@ -5,7 +5,6 @@ import torch from axolotl.core.trainers.base import AxolotlTrainer -# pylint: disable=too-many-ancestors class AxolotlMambaTrainer(AxolotlTrainer): """Mamba specific trainer to handle loss calculation""" @@ -15,8 +14,8 @@ class AxolotlMambaTrainer(AxolotlTrainer): self, model, inputs, - return_outputs=False, # pylint: disable=unused-argument - num_items_in_batch=None, # pylint: disable=unused-argument + return_outputs=False, + num_items_in_batch=None, ): input_ids = inputs.pop("input_ids") lm_logits = model(input_ids).logits diff --git a/src/axolotl/core/trainers/mixins/__init__.py b/src/axolotl/core/trainers/mixins/__init__.py index b54577765..5fced1692 100644 --- a/src/axolotl/core/trainers/mixins/__init__.py +++ b/src/axolotl/core/trainers/mixins/__init__.py @@ -1,6 +1,5 @@ """Init for axolotl.core.trainers.mixins""" -# pylint: disable=unused-import # flake8: noqa from .activation_checkpointing import ActivationOffloadingMixin diff --git a/src/axolotl/core/trainers/mixins/activation_checkpointing.py b/src/axolotl/core/trainers/mixins/activation_checkpointing.py index 1bfdb49f7..b61c45fee 100644 --- a/src/axolotl/core/trainers/mixins/activation_checkpointing.py +++ b/src/axolotl/core/trainers/mixins/activation_checkpointing.py @@ -92,7 +92,7 @@ def get_lora_act_offloading_ctx_manager( `contextlib.ContextDecorator`: Activation offloading context manager for the model. 
""" - # pylint: disable=unnecessary-dunder-call + activations_handling_ctx = OffloadActivations( use_pin_memory=use_pin_memory, use_streams=use_streams, diff --git a/src/axolotl/core/trainers/mixins/distributed_parallel.py b/src/axolotl/core/trainers/mixins/distributed_parallel.py index d163e4eb5..77aee5236 100644 --- a/src/axolotl/core/trainers/mixins/distributed_parallel.py +++ b/src/axolotl/core/trainers/mixins/distributed_parallel.py @@ -26,7 +26,6 @@ class DistributedParallelMixin(Trainer): self.accelerator.distributed_type == "FSDP" and self.accelerator.state.fsdp_plugin is None ): - # pylint: disable=protected-access # handle Context Parallelism without FSDP self.accelerator.state.distributed_type = "MULTI_GPU" self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU" diff --git a/src/axolotl/core/trainers/mixins/optimizer.py b/src/axolotl/core/trainers/mixins/optimizer.py index a9a9a3992..850442c60 100644 --- a/src/axolotl/core/trainers/mixins/optimizer.py +++ b/src/axolotl/core/trainers/mixins/optimizer.py @@ -70,11 +70,11 @@ class OptimizerMixin(Trainer): } ) if params["embeddings"]: - lr = optimizer_kwargs["lr"] # pylint: disable=invalid-name + lr = optimizer_kwargs["lr"] if self.args.embedding_lr_scale: - lr *= self.args.embedding_lr_scale # pylint: disable=invalid-name + lr *= self.args.embedding_lr_scale elif self.args.embedding_lr: - lr = self.args.embedding_lr # pylint: disable=invalid-name + lr = self.args.embedding_lr optimizer_grouped_parameters.append( { "params": list(params["embeddings"].values()), @@ -143,7 +143,7 @@ class OptimizerMixin(Trainer): loraplus_lr_embedding = getattr( self.args, "loraplus_lr_embedding", 1e-6 ) - self.optimizer = create_loraplus_optimizer( # pylint: disable=attribute-defined-outside-init + self.optimizer = create_loraplus_optimizer( opt_model, optimizer_cls, loraplus_lr_ratio=loraplus_lr_ratio, @@ -185,17 +185,15 @@ class OptimizerMixin(Trainer): p.data_ptr(): p.numel() for p in module.parameters() }.values() ) - LOG.info(f"skipped {module}: {skipped/2**20}M params") + LOG.info(f"skipped {module}: {skipped / 2**20}M params") manager.register_module_override( module, "weight", {"optim_bits": 32} ) LOG.debug(f"bitsandbytes: will optimize {module} in fp32") - LOG.info(f"skipped: {skipped/2**20}M params") + LOG.info(f"skipped: {skipped / 2**20}M params") if is_sagemaker_mp_enabled(): - self.optimizer = smp.DistributedOptimizer( # pylint: disable=attribute-defined-outside-init - self.optimizer - ) + self.optimizer = smp.DistributedOptimizer(self.optimizer) return self.optimizer diff --git a/src/axolotl/core/trainers/mixins/scheduler.py b/src/axolotl/core/trainers/mixins/scheduler.py index 399bf5947..fc2b0e59d 100644 --- a/src/axolotl/core/trainers/mixins/scheduler.py +++ b/src/axolotl/core/trainers/mixins/scheduler.py @@ -46,7 +46,7 @@ class SchedulerMixin(Trainer): ) # fmt: off - if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition + if self.lr_scheduler is None: # type: ignore # fmt: on plugin_manager = PluginManager.get_instance() lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler( @@ -90,7 +90,7 @@ class SchedulerMixin(Trainer): LOG.warning( "Both cosine quadratic warmup and min lr detected. 
Using quadratic warmup.") - self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init + self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( optimizer, num_warmup_steps=self.args.get_warmup_steps(num_training_steps), num_training_steps=num_training_steps, @@ -98,7 +98,7 @@ class SchedulerMixin(Trainer): elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr: assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0" assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0" - self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant( # pylint: disable=attribute-defined-outside-init + self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant( optimizer, num_warmup_steps=self.args.get_warmup_steps(num_training_steps), num_training_steps=num_training_steps, @@ -107,7 +107,7 @@ class SchedulerMixin(Trainer): ) elif self.args.cosine_min_lr_ratio and use_cosine_min_lr: assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0" - self.lr_scheduler = get_cosine_schedule_with_min_lr( # pylint: disable=attribute-defined-outside-init + self.lr_scheduler = get_cosine_schedule_with_min_lr( optimizer, num_warmup_steps=self.args.get_warmup_steps(num_training_steps), num_training_steps=num_training_steps, @@ -133,7 +133,7 @@ class SchedulerMixin(Trainer): ) if not self.lr_scheduler: super().create_scheduler(num_training_steps, optimizer) - self.lr_scheduler = JaggedLRRestartScheduler( # pylint: disable=attribute-defined-outside-init + self.lr_scheduler = JaggedLRRestartScheduler( optimizer, self.lr_scheduler, self.args.jagged_restart_steps, diff --git a/src/axolotl/core/training_args_base.py b/src/axolotl/core/training_args_base.py index fd0859ae9..a9cc7d224 100644 --- a/src/axolotl/core/training_args_base.py +++ b/src/axolotl/core/training_args_base.py @@ -14,7 +14,6 @@ class AxolotlTrainingMixins: Mixin class for the Axolotl training args. """ - # pylint: disable=duplicate-code model_type: Optional[str] = field( default=None, metadata={"help": "HF model configuration model_type."} ) diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py index c9d006ac8..b8f9484bc 100644 --- a/src/axolotl/datasets.py +++ b/src/axolotl/datasets.py @@ -26,7 +26,7 @@ class TokenizedPromptDataset(Dataset): keep_in_memory: Whether to keep the tokenized dataset in memory. """ - def __init__( # pylint: disable=super-init-not-called + def __init__( self, prompt_tokenizer: PromptTokenizingStrategy, dataset: Dataset, @@ -99,7 +99,7 @@ class ConstantLengthDataset(IterableDataset): seq_length: Length of token sequences to return. 
""" - def __init__( # pylint: disable=super-init-not-called + def __init__( self, tokenizer, datasets, diff --git a/src/axolotl/evaluate.py b/src/axolotl/evaluate.py index 2b5869939..e4496bee6 100644 --- a/src/axolotl/evaluate.py +++ b/src/axolotl/evaluate.py @@ -79,7 +79,7 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f model, tokenizer, _, processor = setup_model_and_tokenizer(cfg) # Get datasets - # pylint: disable=duplicate-code + train_dataset = dataset_meta.train_dataset eval_dataset = dataset_meta.eval_dataset total_num_steps = dataset_meta.total_num_steps diff --git a/src/axolotl/integrations/base.py b/src/axolotl/integrations/base.py index 94ee8d4b1..8edee18a3 100644 --- a/src/axolotl/integrations/base.py +++ b/src/axolotl/integrations/base.py @@ -76,7 +76,7 @@ class BasePlugin: def __init__(self): """Initializes the BasePlugin.""" - def register(self, cfg: dict): # pylint: disable=unused-argument + def register(self, cfg: dict): """Registers the plugin with the given configuration as an unparsed dict. Args: @@ -104,14 +104,13 @@ class BasePlugin: dataset_meta: The metadata for the training dataset. """ - def pre_model_load(self, cfg: DictDefault): # pylint: disable=unused-argument + def pre_model_load(self, cfg: DictDefault): """Performs actions before the model is loaded. Args: cfg: The configuration for the plugin. """ - # pylint: disable=unused-argument def post_model_build(self, cfg: DictDefault, model: PreTrainedModel): """Performs actions after the model is built/loaded, but before any adapters are applied. @@ -119,7 +118,6 @@ class BasePlugin: cfg: The configuration for the plugin. """ - # pylint: disable=unused-argument def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel): """Performs actions before LoRA weights are loaded. @@ -128,7 +126,6 @@ class BasePlugin: model: The loaded model. """ - # pylint: disable=unused-argument def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): """Performs actions after LoRA weights are loaded. @@ -137,7 +134,6 @@ class BasePlugin: model: The loaded model. """ - # pylint: disable=unused-argument def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): """Performs actions after the model is loaded. @@ -146,7 +142,6 @@ class BasePlugin: model: The loaded model. """ - # pylint: disable=unused-argument def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None: """Returns a custom class for the trainer. @@ -157,7 +152,6 @@ class BasePlugin: The first non-`None` trainer class returned by a plugin. """ - # pylint: disable=unused-argument def post_trainer_create(self, cfg: DictDefault, trainer: Trainer): """Performs actions after the trainer is created. @@ -166,7 +160,7 @@ class BasePlugin: trainer: The trainer object for training. """ - def get_training_args(self, cfg: DictDefault): # pylint: disable=unused-argument): + def get_training_args(self, cfg: DictDefault): """ Returns custom training arguments to set on TrainingArgs. @@ -177,9 +171,7 @@ class BasePlugin: object: dict containing the training arguments. """ - def get_collator_cls_and_kwargs( - self, cfg: DictDefault, is_eval: bool = False - ): # pylint: disable=unused-argument): + def get_collator_cls_and_kwargs(self, cfg: DictDefault, is_eval: bool = False): """ Returns a custom class for the collator. @@ -191,7 +183,6 @@ class BasePlugin: class: The class for the collator. 
""" - # pylint: disable=unused-argument def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None: """Creates and returns an optimizer for training. @@ -203,7 +194,6 @@ class BasePlugin: The created optimizer. """ - # pylint: disable=unused-argument def create_lr_scheduler( self, cfg: DictDefault, @@ -223,7 +213,6 @@ class BasePlugin: The created learning rate scheduler. """ - # pylint: disable=unused-argument def add_callbacks_pre_trainer( self, cfg: DictDefault, model: PreTrainedModel ) -> list[Callable]: @@ -238,7 +227,6 @@ class BasePlugin: """ return [] - # pylint: disable=unused-argument def add_callbacks_post_trainer( self, cfg: DictDefault, trainer: Trainer ) -> list[Callable]: @@ -254,7 +242,6 @@ class BasePlugin: """ return [] - # pylint: disable=unused-argument def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): """Performs actions after training is complete. @@ -263,7 +250,7 @@ class BasePlugin: model: The loaded model. """ - def post_train_unload(self, cfg: DictDefault): # pylint: disable=unused-argument + def post_train_unload(self, cfg: DictDefault): """Performs actions after training is complete and the model is unloaded. Args: @@ -311,7 +298,7 @@ def load_plugin(plugin_name: str) -> BasePlugin: return plugin -class PluginManager: # pylint: disable=too-many-public-methods +class PluginManager: """The `PluginManager` class is responsible for loading and managing plugins. It should be a singleton so it can be accessed from anywhere in the codebase. diff --git a/src/axolotl/integrations/config.py b/src/axolotl/integrations/config.py index f5fc07e9e..2217b2819 100644 --- a/src/axolotl/integrations/config.py +++ b/src/axolotl/integrations/config.py @@ -50,15 +50,9 @@ def merge_input_args(): dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n pass\n" namespace: Dict[Any, Any] = {} - exec( # pylint: disable=exec-used # nosec B102 - dynamic_input, globals(), namespace - ) - AxolotlInputConfig = namespace[ # pylint: disable=invalid-name - "AxolotlInputConfig" - ] - AxolotlConfigWCapabilities = namespace[ # pylint: disable=invalid-name - "AxolotlConfigWCapabilities" - ] + exec(dynamic_input, globals(), namespace) # nosec B102 + AxolotlInputConfig = namespace["AxolotlInputConfig"] + AxolotlConfigWCapabilities = namespace["AxolotlConfigWCapabilities"] return AxolotlConfigWCapabilities, AxolotlInputConfig return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase @@ -74,7 +68,7 @@ def merge_training_args() -> Type: Returns: tuple: A tuple containing the newly created classes, AxolotlTrainingMixins. 
""" - # pylint: disable=duplicate-code + from axolotl.core.training_args_base import ( AxolotlTrainingMixins as AxolotlTrainingMixinsBase, ) @@ -93,11 +87,7 @@ def merge_training_args() -> Type: namespace: Dict[Any, Any] = {} local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase} - exec( # pylint: disable=exec-used # nosec B102 - dynamic_input, {**globals(), **local_vars}, namespace - ) - AxolotlTrainingMixins = namespace[ # pylint: disable=invalid-name - "AxolotlTrainingMixins" - ] + exec(dynamic_input, {**globals(), **local_vars}, namespace) # nosec B102 + AxolotlTrainingMixins = namespace["AxolotlTrainingMixins"] return AxolotlTrainingMixins return AxolotlTrainingMixinsBase diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index 4689cc9a8..6dd7c97e1 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -18,6 +18,7 @@ Module for the Plugin for Cut Cross Entropy integration with Axolotl. Cut Cross Entropy is an optimized implementation of cross entropy loss from Apple's ML team. """ + import importlib from functools import partial @@ -28,7 +29,7 @@ from axolotl.utils import get_pytorch_version from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix from axolotl.utils.logging import get_logger -from .args import CutCrossEntropyArgs # pylint: disable=unused-import. # noqa: F401 +from .args import CutCrossEntropyArgs as CutCrossEntropyArgs LOG = get_logger(__name__) @@ -106,9 +107,7 @@ class CutCrossEntropyPlugin(BasePlugin): """ from cut_cross_entropy.transformers.patch import PATCH_FNS - def patch_generic( - maybe_model, patch_options, model_type: str - ): # pylint: disable=unused-argument + def patch_generic(maybe_model, patch_options, model_type: str): import cut_cross_entropy.transformers.llama from cut_cross_entropy.transformers.llama import cce_forward @@ -121,12 +120,10 @@ class CutCrossEntropyPlugin(BasePlugin): ) model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") - cut_cross_entropy.transformers.llama._PATCH_OPTS = ( # pylint: disable=protected-access - patch_options - ) + cut_cross_entropy.transformers.llama._PATCH_OPTS = patch_options model_cls.forward = cce_forward - # pylint: disable=duplicate-code + except (ImportError, AttributeError) as e: raise RuntimeError( f"Could not import ForCausalLM class for model_type: {model_type}. " diff --git a/src/axolotl/integrations/cut_cross_entropy/args.py b/src/axolotl/integrations/cut_cross_entropy/args.py index 22852479a..3eeb9fac7 100644 --- a/src/axolotl/integrations/cut_cross_entropy/args.py +++ b/src/axolotl/integrations/cut_cross_entropy/args.py @@ -15,6 +15,7 @@ """ Module for handling Cut Cross Entropy input arguments. """ + from typing import Optional from pydantic import BaseModel, model_validator diff --git a/src/axolotl/integrations/grokfast/__init__.py b/src/axolotl/integrations/grokfast/__init__.py index 234d27226..df8cf2cf3 100644 --- a/src/axolotl/integrations/grokfast/__init__.py +++ b/src/axolotl/integrations/grokfast/__init__.py @@ -7,7 +7,7 @@ from transformers.trainer_callback import TrainerCallback from axolotl.utils.logging import get_logger from ..base import BasePlugin -from .args import GrokfastArgs # pylint: disable=unused-import. 
# noqa: F401 +from .args import GrokfastArgs as GrokfastArgs from .optimizer import gradfilter_ema LOG = get_logger(__name__) @@ -24,12 +24,10 @@ class GrokfastCallbackHandler(TrainerCallback): self.alpha = alpha self.lamb = lamb - def on_train_begin(self, *args_, **kwargs): # pylint: disable=unused-argument + def on_train_begin(self, *args_, **kwargs): self.grads = None - def on_pre_optimizer_step( - self, args_, state, control, **kwargs - ): # pylint: disable=unused-argument + def on_pre_optimizer_step(self, args_, state, control, **kwargs): model = kwargs.pop("model") self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb) return control diff --git a/src/axolotl/integrations/grokfast/optimizer.py b/src/axolotl/integrations/grokfast/optimizer.py index 38cda2c93..c83ef43bc 100644 --- a/src/axolotl/integrations/grokfast/optimizer.py +++ b/src/axolotl/integrations/grokfast/optimizer.py @@ -1,7 +1,6 @@ # Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee # Reference: https://github.com/ironjr/grokfast -# pylint: skip-file from collections import deque from typing import Dict, Literal, Optional diff --git a/src/axolotl/integrations/kd/__init__.py b/src/axolotl/integrations/kd/__init__.py index 4c8535a0a..b1a990553 100644 --- a/src/axolotl/integrations/kd/__init__.py +++ b/src/axolotl/integrations/kd/__init__.py @@ -15,6 +15,7 @@ """ Plugin init to add KD support to Axolotl. """ + from typing import Any from transformers import Trainer @@ -22,7 +23,7 @@ from transformers import Trainer from axolotl.integrations.base import BasePlugin from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback -from .args import KDArgs # pylint: disable=unused-import. # noqa: F401 +from .args import KDArgs as KDArgs class KDPlugin(BasePlugin): diff --git a/src/axolotl/integrations/kd/args.py b/src/axolotl/integrations/kd/args.py index 758bc8917..425d8ddf6 100644 --- a/src/axolotl/integrations/kd/args.py +++ b/src/axolotl/integrations/kd/args.py @@ -15,6 +15,7 @@ """ Plugin args for KD support. 
""" + from dataclasses import dataclass from enum import Enum @@ -26,8 +27,8 @@ class InferenceServerType(str, Enum): Online inferences server types to handle different request args """ - vllm = "vllm" # pylint: disable=invalid-name - sglang = "sglang" # pylint: disable=invalid-name + vllm = "vllm" + sglang = "sglang" class KDArgs(BaseModel): diff --git a/src/axolotl/integrations/kd/callbacks.py b/src/axolotl/integrations/kd/callbacks.py index 911c3d517..c73d8a8bb 100644 --- a/src/axolotl/integrations/kd/callbacks.py +++ b/src/axolotl/integrations/kd/callbacks.py @@ -19,9 +19,7 @@ class KDTemperatureSchedulerCallback(TrainerCallback): self.trainer = trainer - def on_step_end( - self, args, state, control, **kwargs - ): # pylint: disable=unused-argument + def on_step_end(self, args, state, control, **kwargs): # cosine decay temperature over the max steps progress = state.global_step / state.max_steps diff --git a/src/axolotl/integrations/kd/chat_template.py b/src/axolotl/integrations/kd/chat_template.py index 6376ecb09..04f0f24a4 100644 --- a/src/axolotl/integrations/kd/chat_template.py +++ b/src/axolotl/integrations/kd/chat_template.py @@ -15,6 +15,7 @@ """ Chat template prompt strategy loader with KD support """ + import logging from typing import Any, Dict @@ -192,7 +193,6 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD): """ Transform logprobs to target format for KD training """ - # pylint: disable=duplicate-code logprobs = sample.pop(self.logprobs_field) target_seq_len = len(logprobs) @@ -240,7 +240,7 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD): target_mask.append([1] * top_k) for token_pos_logprobs, pos_target_token_ids in zip( - logprobs, sample["target_token_ids"] + logprobs, sample["target_token_ids"], strict=False ): # Convert to a tensor for easier manipulation position_logprobs_tensor = torch.tensor( @@ -299,7 +299,7 @@ class KDStrategyLoader(StrategyLoader): Load ChatTemplateStrategy with KD support using StrategyLoader. """ - def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument + def _get_strategy_cls(self, cfg): return ChatTemplateStrategyWithKD def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]): @@ -319,7 +319,7 @@ class KDStrategyLoaderV2(KDStrategyLoader): Load KD chat template datasets with pre-tokenized logprob data """ - def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument + def _get_strategy_cls(self, cfg): return ChatTemplateStrategyWithKDv2 diff --git a/src/axolotl/integrations/kd/collator.py b/src/axolotl/integrations/kd/collator.py index 0cc745b78..675485d9d 100644 --- a/src/axolotl/integrations/kd/collator.py +++ b/src/axolotl/integrations/kd/collator.py @@ -37,7 +37,6 @@ class DataCollatorForKD(DataCollatorForSeq2Seq): target_logprobs. It also creates a teacher_mask to indicate which entries are valid. 
""" - # pylint: disable=duplicate-code tokenizer: PreTrainedTokenizerBase model: Optional[Any] = None padding: Union[bool, str, PaddingStrategy] = True @@ -72,7 +71,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq): // self.pad_to_multiple_of ) * self.pad_to_multiple_of - for f in features: # pylint: disable=invalid-name + for f in features: remainder = [pad_token_id] * (max_len - len(f[feature_name])) if isinstance(f[feature_name], list): f[feature_name] = ( @@ -101,7 +100,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq): if has_teacher_data: # Extract and remove from features - for f in features: # pylint: disable=invalid-name + for f in features: target_logprobs_list.append(f.pop("target_logprobs")) target_token_ids_list.append(f.pop("target_token_ids")) target_mask_list.append(f.pop("target_mask")) @@ -117,24 +116,25 @@ class DataCollatorForKD(DataCollatorForSeq2Seq): padded_teacher_mask_list = [] for t_logprobs, t_ids, t_mask in zip( - target_logprobs_list, target_token_ids_list, target_mask_list + target_logprobs_list, + target_token_ids_list, + target_mask_list, + strict=False, ): t_logprobs_padded = [] t_ids_padded = [] t_mask_padded = [] - for lp, ids, mask in zip( # pylint: disable=invalid-name - t_logprobs, t_ids, t_mask - ): + for lp, ids, mask in zip(t_logprobs, t_ids, t_mask, strict=False): lp_len = len(lp) if lp_len < max_k: # Use -1e9 for padding logprobs and 0 for token_ids pad_len = max_k - lp_len - lp = lp + [-1e9] * pad_len # pylint: disable=invalid-name + lp = lp + [-1e9] * pad_len ids = ids + [0] * pad_len mask = mask + [0] * pad_len else: - lp = lp[:max_k] # pylint: disable=invalid-name + lp = lp[:max_k] ids = ids[:max_k] mask = mask[:max_k] @@ -216,9 +216,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD): # We want to produce a single "merged" feature dict for each sub-batch. out_features = [{} for _ in features] - for i, sub_features in enumerate( # pylint: disable=too-many-nested-blocks - features - ): + for i, sub_features in enumerate(features): # sub_features is a list of dicts, each dict = one sequence’s features # We'll merge them into out_features[i]. # @@ -255,9 +253,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD): if field_name in feat and isinstance( feat[field_name], (list, torch.Tensor) ): - if isinstance( - feat[field_name][0], (dict, str) - ): # pylint: disable=too-many-nested-blocks + if isinstance(feat[field_name][0], (dict, str)): continue arr = np.array(feat[field_name]) arrays.append(arr) diff --git a/src/axolotl/integrations/kd/collator_online_teacher.py b/src/axolotl/integrations/kd/collator_online_teacher.py index 584ace481..54e55a5e7 100644 --- a/src/axolotl/integrations/kd/collator_online_teacher.py +++ b/src/axolotl/integrations/kd/collator_online_teacher.py @@ -144,7 +144,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): } for sequence_data, seq_input_ids, seq_labels in zip( - api_data, batch_input_ids, labels + api_data, batch_input_ids, labels, strict=False ): current_target_logprobs = [] current_target_token_ids = [] @@ -165,7 +165,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): assert len(seq_input_ids) == len(input_top_logprobs) for i, _, label in zip( - range(len(seq_input_ids)), seq_input_ids, seq_labels + range(len(seq_input_ids)), seq_input_ids, seq_labels, strict=False ): if i < len(input_top_logprobs) and input_top_logprobs[i] is None: # this is always the case for the first token. 
@@ -202,7 +202,8 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids pos_logprobs_raw, pos_token_ids, _ = [ - list(row) for row in zip(*pos_top_logprobs_data) + list(row) + for row in zip(*pos_top_logprobs_data, strict=False) ] # Ensure correct length (top_k) @@ -317,7 +318,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): } for sequence_data, seq_input_ids, seq_labels in zip( - choices, batch_input_ids, labels + choices, batch_input_ids, labels, strict=False ): # seq_input_ids: List[int] # seq_labels: List[int] @@ -342,7 +343,9 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): seq_len = len(seq_input_ids) - for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels): + for i, _, label in zip( + range(seq_len), seq_input_ids, seq_labels, strict=False + ): if i < len(input_top_logprobs) and input_top_logprobs[i] is None: # this is always the case for the first token. # there is never logprob data for the first token since that's a true input @@ -424,7 +427,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq): list(range(self.kd_online_topk)) ) current_target_mask.append([0] * self.kd_online_topk) - for i in range(max(0, seq_len - len(current_target_logprobs))): + for _ in range(max(0, seq_len - len(current_target_logprobs))): current_target_logprobs.append( [-float("inf")] * self.kd_online_topk ) diff --git a/src/axolotl/integrations/kd/kernels/liger.py b/src/axolotl/integrations/kd/kernels/liger.py index 6356643c2..61ef3e10a 100644 --- a/src/axolotl/integrations/kd/kernels/liger.py +++ b/src/axolotl/integrations/kd/kernels/liger.py @@ -197,7 +197,7 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase): compute_ce_loss: bool = True, normalize_topk: bool = True, ): - CHUNK_SIZE = chunk_size # pylint: disable=invalid-name + CHUNK_SIZE = chunk_size grad_weight_acc = torch.zeros_like(student_lm_head_weight) grad_inputs_list = [] grad_bias_acc = ( @@ -298,8 +298,8 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase): accumulate_chunk_grads_compiled = accumulate_chunk_grads # Use the same chunking logic as LigerFusedLinearDistillationBase.forward - B, N, D = student_input.shape # pylint: disable=invalid-name - K = target_token_ids.shape[-1] # pylint: disable=invalid-name + B, N, D = student_input.shape + K = target_token_ids.shape[-1] student_input_flat = student_input.reshape(-1, student_input.shape[-1]) target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1]) diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py index 4319f5f7d..f7b468669 100644 --- a/src/axolotl/integrations/kd/kernels/models.py +++ b/src/axolotl/integrations/kd/kernels/models.py @@ -40,10 +40,9 @@ def kldiv_forward_llama_like( output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, cache_position: Optional[torch.LongTensor] = None, - logits_to_keep: Union[int, torch.Tensor] = 0, # pylint: disable=unused-argument + logits_to_keep: Union[int, torch.Tensor] = 0, **kwargs: Unpack[TransformersKwargs], # type: ignore[misc] ) -> CausalLMOutputWithPast: - # pylint: disable=duplicate-code output_attentions = ( output_attentions if output_attentions is not None diff --git a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py index 74184455f..b79ba26f3 100644 --- 
a/src/axolotl/integrations/kd/topk_logprob/forward_kl.py +++ b/src/axolotl/integrations/kd/topk_logprob/forward_kl.py @@ -15,6 +15,7 @@ """ loss for top_k KL divergence """ + import torch from torch import nn @@ -117,7 +118,6 @@ class ChunkedTopKKDLoss(nn.Module): target_mask: torch.Tensor, # [B, seq_len, K] num_items_in_batch: int = -1, # optional batch size for normalization ) -> torch.Tensor: - # 1. Split along the "token" dimension (dim=1). student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1) token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1) @@ -131,7 +131,11 @@ class ChunkedTopKKDLoss(nn.Module): # 2. Loop over each chunk and compute a chunk-specific loss. for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip( - student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks + student_logits_chunks, + token_ids_chunks, + logprobs_chunks, + mask_chunks, + strict=False, ): # We pass num_items_in_batch=-1 so that the kd_loss # will average over *this chunk's* valid tokens only. diff --git a/src/axolotl/integrations/kd/trainer.py b/src/axolotl/integrations/kd/trainer.py index c454b2a2c..7ec43333a 100644 --- a/src/axolotl/integrations/kd/trainer.py +++ b/src/axolotl/integrations/kd/trainer.py @@ -21,7 +21,6 @@ from axolotl.core.trainers.base import AxolotlTrainer from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss -# pylint: disable=too-many-ancestors class AxolotlKDTrainer(AxolotlTrainer): """ Custom trainer subclass for Knowledge Distillation (KD) diff --git a/src/axolotl/integrations/liger/__init__.py b/src/axolotl/integrations/liger/__init__.py index 86d56be80..c20f4545c 100644 --- a/src/axolotl/integrations/liger/__init__.py +++ b/src/axolotl/integrations/liger/__init__.py @@ -18,6 +18,7 @@ Module for the Plugin for LIGER integration with Axolotl. Liger Kernel is the collection of Triton-native kernels for LLM Training. It is designed to be performant, correct, and light-weight. """ + from .args import LigerArgs from .plugin import LigerPlugin diff --git a/src/axolotl/integrations/liger/models/base.py b/src/axolotl/integrations/liger/models/base.py index f3cf4299a..a9dbe9412 100644 --- a/src/axolotl/integrations/liger/models/base.py +++ b/src/axolotl/integrations/liger/models/base.py @@ -41,7 +41,6 @@ def lce_forward( This is useful when using packed tensor format (single dimension for batch and sequence length). """ - # pylint: disable=duplicate-code output_attentions = ( output_attentions if output_attentions is not None @@ -181,7 +180,7 @@ def patch_lce_forward( model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM") model_cls.forward = lce_forward - # pylint: disable=duplicate-code + except (ImportError, AttributeError) as e: raise RuntimeError( f"Could not import ForCausalLM class for model_type: {model_type}. 
" diff --git a/src/axolotl/integrations/liger/models/deepseekv2.py b/src/axolotl/integrations/liger/models/deepseekv2.py index 2f0d2a704..99adce4a7 100644 --- a/src/axolotl/integrations/liger/models/deepseekv2.py +++ b/src/axolotl/integrations/liger/models/deepseekv2.py @@ -2,8 +2,6 @@ DeepseekV2 model with LigerFusedLinearCrossEntropyLoss """ -# pylint: disable=duplicate-code - from typing import List, Optional, Tuple, Union import torch diff --git a/src/axolotl/integrations/liger/models/jamba.py b/src/axolotl/integrations/liger/models/jamba.py index d25529970..78689e40c 100644 --- a/src/axolotl/integrations/liger/models/jamba.py +++ b/src/axolotl/integrations/liger/models/jamba.py @@ -2,8 +2,6 @@ Jamba model with LigerFusedLinearCrossEntropyLoss """ -# pylint: disable=duplicate-code - from typing import Optional, Tuple, Union import torch diff --git a/src/axolotl/integrations/liger/models/llama4.py b/src/axolotl/integrations/liger/models/llama4.py index 689823bb6..e51140265 100644 --- a/src/axolotl/integrations/liger/models/llama4.py +++ b/src/axolotl/integrations/liger/models/llama4.py @@ -46,7 +46,6 @@ def lce_forward( Returns: """ - # pylint: disable=duplicate-code output_attentions = ( output_attentions if output_attentions is not None @@ -78,9 +77,7 @@ def lce_forward( hidden_states = outputs[0] if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1: - raise Exception( # pylint: disable=broad-exception-raised - "Liger Kernel does not support pretraining_tp!!" - ) + raise Exception("Liger Kernel does not support pretraining_tp!!") logits = None loss = None @@ -128,7 +125,7 @@ def apply_liger_kernel_to_llama4( rms_norm: bool = False, glu_activation: bool = False, layer_norm: bool = False, - **kwargs, # pylint: disable=unused-argument + **kwargs, ) -> None: """ Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3) @@ -144,15 +141,15 @@ def apply_liger_kernel_to_llama4( layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False. """ - import transformers.models.llama4.modeling_llama4 # noqa: F401 # pylint: disable=unused-import + import transformers.models.llama4.modeling_llama4 # noqa: F401 from liger_kernel.transformers.functional import liger_cross_entropy from liger_kernel.transformers.layer_norm import LigerLayerNorm from liger_kernel.transformers.rms_norm import LigerRMSNorm from liger_kernel.transformers.swiglu import LigerSwiGLUMLP - assert not ( - cross_entropy and fused_linear_cross_entropy - ), "cross_entropy and fused_linear_cross_entropy cannot both be True." + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+ ) modeling_llama4 = sys.modules["transformers.models.llama4.modeling_llama4"] @@ -165,7 +162,7 @@ def apply_liger_kernel_to_llama4( # clone config to avoid modifying the original config = deepcopy(config) if intermediate_size: - setattr(config, "intermediate_size", intermediate_size) + config.intermediate_size = intermediate_size return LigerSwiGLUMLP(config, **kwargs) modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper diff --git a/src/axolotl/integrations/liger/models/qwen3.py b/src/axolotl/integrations/liger/models/qwen3.py index 1dc19eaf9..b008755da 100644 --- a/src/axolotl/integrations/liger/models/qwen3.py +++ b/src/axolotl/integrations/liger/models/qwen3.py @@ -43,7 +43,6 @@ def lce_forward( Returns: """ - # pylint: disable=duplicate-code output_attentions = ( output_attentions if output_attentions is not None @@ -113,9 +112,8 @@ def apply_liger_kernel_to_qwen3( rms_norm: bool = False, glu_activation: bool = False, layer_norm: bool = False, - **kwargs, # pylint: disable=unused-argument + **kwargs, ) -> None: - # pylint: disable=duplicate-code """ Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3) @@ -130,15 +128,15 @@ def apply_liger_kernel_to_qwen3( layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False. """ - import transformers.models.qwen3.modeling_qwen3 # noqa: F401 # pylint: disable=unused-import + import transformers.models.qwen3.modeling_qwen3 # noqa: F401 from liger_kernel.transformers.functional import liger_cross_entropy from liger_kernel.transformers.layer_norm import LigerLayerNorm from liger_kernel.transformers.rms_norm import LigerRMSNorm from liger_kernel.transformers.swiglu import LigerSwiGLUMLP - assert not ( - cross_entropy and fused_linear_cross_entropy - ), "cross_entropy and fused_linear_cross_entropy cannot both be True." + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"] diff --git a/src/axolotl/integrations/liger/models/qwen3_moe.py b/src/axolotl/integrations/liger/models/qwen3_moe.py index 89bdc5bcc..40bee110c 100644 --- a/src/axolotl/integrations/liger/models/qwen3_moe.py +++ b/src/axolotl/integrations/liger/models/qwen3_moe.py @@ -45,7 +45,6 @@ def lce_forward( Returns: """ - # pylint: disable=duplicate-code output_attentions = ( output_attentions if output_attentions is not None @@ -135,9 +134,8 @@ def apply_liger_kernel_to_qwen3_moe( rms_norm: bool = False, glu_activation: bool = False, layer_norm: bool = False, - **kwargs, # pylint: disable=unused-argument + **kwargs, ) -> None: - # pylint: disable=duplicate-code """ Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3) @@ -152,15 +150,15 @@ def apply_liger_kernel_to_qwen3_moe( layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False. """ - import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401 # pylint: disable=unused-import + import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401 from liger_kernel.transformers.functional import liger_cross_entropy from liger_kernel.transformers.layer_norm import LigerLayerNorm from liger_kernel.transformers.rms_norm import LigerRMSNorm from liger_kernel.transformers.swiglu import LigerSwiGLUMLP - assert not ( - cross_entropy and fused_linear_cross_entropy - ), "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+ assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." + ) modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"] @@ -174,7 +172,7 @@ def apply_liger_kernel_to_qwen3_moe( # clone config to avoid modifying the original config = deepcopy(config) if intermediate_size: - setattr(config, "intermediate_size", intermediate_size) + config.intermediate_size = intermediate_size return LigerSwiGLUMLP(config, **kwargs) modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper diff --git a/src/axolotl/integrations/lm_eval/__init__.py b/src/axolotl/integrations/lm_eval/__init__.py index 8db4dc634..0ab6b8697 100644 --- a/src/axolotl/integrations/lm_eval/__init__.py +++ b/src/axolotl/integrations/lm_eval/__init__.py @@ -7,7 +7,7 @@ import subprocess # nosec from axolotl.integrations.base import BasePlugin from axolotl.integrations.lm_eval.cli import build_lm_eval_command -from .args import LMEvalArgs # pylint: disable=unused-import. # noqa: F401 +from .args import LMEvalArgs as LMEvalArgs class LMEvalPlugin(BasePlugin): @@ -20,7 +20,6 @@ class LMEvalPlugin(BasePlugin): def post_train_unload(self, cfg): if cfg.lm_eval_post_train: - # pylint: disable=duplicate-code for lm_eval_args in build_lm_eval_command( cfg.lm_eval_tasks, bfloat16=cfg.bfloat16 or cfg.bf16, diff --git a/src/axolotl/integrations/lm_eval/cli.py b/src/axolotl/integrations/lm_eval/cli.py index 19608e1d9..ead82dcb7 100644 --- a/src/axolotl/integrations/lm_eval/cli.py +++ b/src/axolotl/integrations/lm_eval/cli.py @@ -99,7 +99,6 @@ def lm_eval(config: str, cloud: Optional[str] = None): with open(config, encoding="utf-8") as file: cfg: DictDefault = DictDefault(yaml.safe_load(file)) - # pylint: disable=duplicate-code for lm_eval_args in build_lm_eval_command( cfg.lm_eval_tasks, bfloat16=cfg.bfloat16 or cfg.bf16, diff --git a/src/axolotl/integrations/spectrum/__init__.py b/src/axolotl/integrations/spectrum/__init__.py index 9f66aef97..5e8f9128d 100644 --- a/src/axolotl/integrations/spectrum/__init__.py +++ b/src/axolotl/integrations/spectrum/__init__.py @@ -23,7 +23,7 @@ import requests from axolotl.integrations.base import BasePlugin from axolotl.utils.logging import get_logger -from .args import SpectrumArgs # pylint: disable=unused-import. # noqa: F401 +from .args import SpectrumArgs as SpectrumArgs LOG = get_logger(__name__) @@ -46,7 +46,7 @@ def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5): "^lm_head.weight$", "^model.embed_tokens.weight$", ] - for layer_type, layer_names in top_layers_by_type.items(): + for _, layer_names in top_layers_by_type.items(): for layer_name in layer_names: unfrozen_parameters.append(layer_name) return unfrozen_parameters @@ -84,7 +84,7 @@ class SpectrumPlugin(BasePlugin): snr_data = json.load(fin) except FileNotFoundError: pass - except Exception as exc: # pylint: disable=broad-exception-caught + except Exception as exc: LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}") if not snr_data: diff --git a/src/axolotl/integrations/spectrum/args.py b/src/axolotl/integrations/spectrum/args.py index df5756038..be6ca4bfc 100644 --- a/src/axolotl/integrations/spectrum/args.py +++ b/src/axolotl/integrations/spectrum/args.py @@ -15,6 +15,7 @@ """ Module for handling Spectrum input arguments. 
""" + from typing import Optional from pydantic import BaseModel, model_validator diff --git a/src/axolotl/kernels/geglu.py b/src/axolotl/kernels/geglu.py index 6acbea0d4..ee3260ebd 100644 --- a/src/axolotl/kernels/geglu.py +++ b/src/axolotl/kernels/geglu.py @@ -5,8 +5,6 @@ See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202). Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation. """ -# pylint: disable=invalid-name,unnecessary-lambda-assignment,duplicate-code - import torch import triton import triton.language as tl diff --git a/src/axolotl/kernels/lora.py b/src/axolotl/kernels/lora.py index fb45f2aa7..c3356fb90 100644 --- a/src/axolotl/kernels/lora.py +++ b/src/axolotl/kernels/lora.py @@ -7,8 +7,6 @@ See "LoRA: Low-Rank Adaptation of Large Language Models" Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation. """ -# pylint: disable=invalid-name - from typing import Callable import torch diff --git a/src/axolotl/kernels/quantize.py b/src/axolotl/kernels/quantize.py index b61603fbc..d094f2381 100644 --- a/src/axolotl/kernels/quantize.py +++ b/src/axolotl/kernels/quantize.py @@ -1,7 +1,5 @@ """Dequantization utilities for `bitsandbytes` integration.""" -# pylint: disable=invalid-name,global-statement - import ctypes import bitsandbytes as bnb diff --git a/src/axolotl/kernels/swiglu.py b/src/axolotl/kernels/swiglu.py index 43a798edc..b13bcd350 100644 --- a/src/axolotl/kernels/swiglu.py +++ b/src/axolotl/kernels/swiglu.py @@ -99,7 +99,6 @@ def _swiglu_bwd_kernel( tl.store(up_ptr + offsets, grad_up, mask=mask) # grad wrt up -# pylint: disable=unnecessary-lambda-assignment def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor: """ SwiGLU forward pass. Computes SwiGLU activation: `x * sigmoid(x) * up`, where @@ -128,7 +127,6 @@ def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor: return out -# pylint: disable=unnecessary-lambda-assignment def swiglu_backward( grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: diff --git a/src/axolotl/loaders/__init__.py b/src/axolotl/loaders/__init__.py index 3eef75e58..ae99bf16d 100644 --- a/src/axolotl/loaders/__init__.py +++ b/src/axolotl/loaders/__init__.py @@ -1,6 +1,5 @@ """Init for axolotl.loaders module""" -# pylint: disable=unused-import # flake8: noqa from .adapter import load_adapter, load_lora diff --git a/src/axolotl/loaders/adapter.py b/src/axolotl/loaders/adapter.py index db28206b6..867e6901c 100644 --- a/src/axolotl/loaders/adapter.py +++ b/src/axolotl/loaders/adapter.py @@ -28,14 +28,12 @@ LOG = get_logger(__name__) def setup_quantized_meta_for_peft(model: torch.nn.Module): """Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device""" - def temp_to_method(self, *args, **kwargs): # pylint: disable=unused-argument + def temp_to_method(self, *args, **kwargs): return self for param in model.parameters(): if isinstance(param, Params4bit): - param.quant_state._orig_to = ( # pylint: disable=protected-access - param.quant_state.to - ) + param.quant_state._orig_to = param.quant_state.to param.quant_state.to = types.MethodType(temp_to_method, param.quant_state) @@ -43,10 +41,8 @@ def setup_quantized_peft_meta_for_training(model: torch.nn.Module): """Replaces dummy `quant_state.to` method with the original function to allow training to continue""" for param in model.parameters(): if isinstance(param, Params4bit) and 
hasattr(param.quant_state, "_orig_to"): - param.quant_state.to = ( - param.quant_state._orig_to # pylint: disable=protected-access - ) - param.quant_state._orig_to = None # pylint: disable=protected-access + param.quant_state.to = param.quant_state._orig_to + param.quant_state._orig_to = None def find_all_linear_names(model): diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py index 53ae428a2..a9507d685 100644 --- a/src/axolotl/loaders/model.py +++ b/src/axolotl/loaders/model.py @@ -102,7 +102,7 @@ class ModelLoader: *, inference: bool = False, reference_model: bool = False, - **kwargs, # pylint: disable=unused-argument + **kwargs, ): """Initializes the ModelLoader. @@ -134,7 +134,7 @@ class ModelLoader: # Init model config self.model_config = load_model_config(cfg) - self.auto_model_loader = AutoModelForCausalLM # pylint: disable=invalid-name + self.auto_model_loader = AutoModelForCausalLM # Initialize the patch manager self.patch_manager = PatchManager( @@ -607,27 +607,19 @@ class ModelLoader: self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation elif self.cfg.flex_attention: self.model_kwargs["attn_implementation"] = "flex_attention" - self.model_config._attn_implementation = ( # pylint: disable=protected-access - "flex_attention" - ) + self.model_config._attn_implementation = "flex_attention" elif self.cfg.flash_attention: if not self.cfg.sample_packing and self.cfg.s2_attention: pass self.model_kwargs["attn_implementation"] = "flash_attention_2" - self.model_config._attn_implementation = ( # pylint: disable=protected-access - "flash_attention_2" - ) + self.model_config._attn_implementation = "flash_attention_2" elif self.cfg.sdp_attention: self.model_kwargs["attn_implementation"] = "sdpa" - self.model_config._attn_implementation = ( # pylint: disable=protected-access - "sdpa" - ) + self.model_config._attn_implementation = "sdpa" elif self.cfg.eager_attention: self.model_kwargs["attn_implementation"] = "eager" - self.model_config._attn_implementation = ( # pylint: disable=protected-access - "eager" - ) + self.model_config._attn_implementation = "eager" if self.cfg.low_cpu_mem_usage: self.model_kwargs["low_cpu_mem_usage"] = True @@ -767,7 +759,7 @@ class ModelLoader: ) elif self.model_type == "MambaLMHeadModel": # FIXME this is janky at best and hacked together to make it work - MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name + MambaLMHeadModel = fix_mamba_attn_for_loss() self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"] self.model_kwargs["device"] = torch.cuda.current_device() @@ -816,7 +808,6 @@ class ModelLoader: if is_deepspeed_zero3_enabled(): skip_move_to_device = True - # pylint: disable=protected-access if self.cfg.tensor_parallel_size > 1: # workaround for upstream 4.54.0 not setting _tp_size or _device_mesh # TODO(wing): remove once 4.54.1 is released diff --git a/src/axolotl/loaders/tokenizer.py b/src/axolotl/loaders/tokenizer.py index 0a486d023..dcc255938 100644 --- a/src/axolotl/loaders/tokenizer.py +++ b/src/axolotl/loaders/tokenizer.py @@ -50,7 +50,7 @@ def modify_tokenizer_files( tokenizer_dir = os.path.join(output_dir, "tokenizer") os.makedirs(tokenizer_dir, exist_ok=True) - if is_local_main_process(): # pylint: disable=too-many-nested-blocks + if is_local_main_process(): # Load the tokenizer temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True) @@ -73,9 +73,9 @@ def modify_tokenizer_files( for token_id, new_value in token_id_mappings.items(): token_id_str = 
str(token_id) if token_id_str in config_data["added_tokens_decoder"]: - config_data["added_tokens_decoder"][token_id_str][ - "content" - ] = new_value + config_data["added_tokens_decoder"][token_id_str]["content"] = ( + new_value + ) else: raise ValueError( f"Token ID {token_id_str} not found in added_tokens_decoder" @@ -215,7 +215,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer: for k, val in special_tokens.items(): # check if new special token is not already in tokenizer and # is adapter training to make sure lora_modules_to_save is set - # pylint: disable=too-many-boolean-expressions + if ( (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val) and (len(tokenizer.encode(val, add_special_tokens=False)) > 2) diff --git a/src/axolotl/models/mamba/__init__.py b/src/axolotl/models/mamba/__init__.py index fee88e3a4..d6bb40d99 100644 --- a/src/axolotl/models/mamba/__init__.py +++ b/src/axolotl/models/mamba/__init__.py @@ -21,4 +21,4 @@ def fix_mamba_attn_for_loss(): from .modeling_mamba import MambaLMHeadModel as MambaLMHeadModelFixed mixer_seq_simple.MambaLMHeadModel = MambaLMHeadModelFixed - return mixer_seq_simple.MambaLMHeadModel # pylint: disable=invalid-name + return mixer_seq_simple.MambaLMHeadModel diff --git a/src/axolotl/models/mamba/modeling_mamba.py b/src/axolotl/models/mamba/modeling_mamba.py index 70e9c88c8..2cfe11544 100644 --- a/src/axolotl/models/mamba/modeling_mamba.py +++ b/src/axolotl/models/mamba/modeling_mamba.py @@ -1,4 +1,3 @@ -# pylint: skip-file import os from collections import namedtuple from functools import partial @@ -112,7 +111,7 @@ class MambaLMHeadModel(nn.Module, GenerationMixin): self, save_directory: Union[str, os.PathLike], state_dict: Optional[dict] = None, - safe_serialization: Optional[bool] = None, # pylint: disable=unused-argument + safe_serialization: Optional[bool] = None, ): if state_dict is None: state_dict = self.state_dict() diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py index 66d3d0d2d..3b38a33b7 100644 --- a/src/axolotl/monkeypatch/accelerate/fsdp2.py +++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py @@ -130,9 +130,9 @@ def get_state_dict(self, model, unwrap=True): "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`." 
) state_dict = ( - model._consolidated_16bit_state_dict() # pylint: disable=protected-access + model._consolidated_16bit_state_dict() if tp_sharding - else model._zero3_consolidated_16bit_state_dict() # pylint: disable=protected-access + else model._zero3_consolidated_16bit_state_dict() ) else: raise ValueError( @@ -231,8 +231,7 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module: ) is_type_fsdp = isinstance(model, FSDPModule) or ( - is_compiled_module(model) - and isinstance(model._orig_mod, FSDPModule) # pylint: disable=protected-access + is_compiled_module(model) and isinstance(model._orig_mod, FSDPModule) ) if is_type_fsdp: return model diff --git a/src/axolotl/monkeypatch/accelerate/parallelism_config.py b/src/axolotl/monkeypatch/accelerate/parallelism_config.py index e3cafc87d..b2157fb6b 100644 --- a/src/axolotl/monkeypatch/accelerate/parallelism_config.py +++ b/src/axolotl/monkeypatch/accelerate/parallelism_config.py @@ -2,7 +2,6 @@ workaround to allow parallelism config for pure CP """ -# pylint: disable=protected-access import os import warnings @@ -30,7 +29,7 @@ def _validate_accelerator(self, accelerator): allow_parallelism_config = False if ( - self.cp_size > 1 # pylint: disable=chained-comparison + self.cp_size > 1 and self.dp_shard_size <= 1 and os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true" ): @@ -55,6 +54,7 @@ def _validate_accelerator(self, accelerator): warnings.warn( "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings), UserWarning, + stacklevel=2, ) diff --git a/src/axolotl/monkeypatch/attention/flex_attn.py b/src/axolotl/monkeypatch/attention/flex_attn.py index 98aead832..f59b8abe2 100644 --- a/src/axolotl/monkeypatch/attention/flex_attn.py +++ b/src/axolotl/monkeypatch/attention/flex_attn.py @@ -65,11 +65,9 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs): return self._compiled_flex_attention transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention - setattr( - sys.modules["transformers.integrations.flex_attention"], - "WrappedFlexAttention", - WrappedFlexAttention, - ) + sys.modules[ + "transformers.integrations.flex_attention" + ].WrappedFlexAttention = WrappedFlexAttention def patch_flex_make_mask(): @@ -144,9 +142,7 @@ def patch_flex_make_mask(): # computation prior to the softmax. For sample packing, we need both the # logic for both causal mask and document mask. See PyTorch's official # blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods - def causal_mask_mod( - batch_idx, head_idx, q_idx, kv_idx - ): # pylint: disable=unused-argument + def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx): """ Defines the logic of a block causal mask by combining both a standard causal mask and a block diagonal document mask. 
@@ -198,14 +194,12 @@ def patch_flex_make_mask(): for n in tuple(sys.modules): if ".modeling_" in n: if hasattr(sys.modules[n], "make_flex_block_causal_mask"): - sys.modules[n].make_flex_block_causal_mask = ( - patched_make_flex_block_causal_mask - ) - setattr( - sys.modules[n], - "make_flex_block_causal_mask", - patched_make_flex_block_causal_mask, - ) + sys.modules[ + n + ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask + sys.modules[ + n + ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask transformers.integrations.flex_attention.make_flex_block_causal_mask = ( patched_make_flex_block_causal_mask diff --git a/src/axolotl/monkeypatch/attention/xformers.py b/src/axolotl/monkeypatch/attention/xformers.py index 5901963f0..eca95797a 100644 --- a/src/axolotl/monkeypatch/attention/xformers.py +++ b/src/axolotl/monkeypatch/attention/xformers.py @@ -23,15 +23,15 @@ def xformers_attention_forward( value: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.LongTensor] = None, - dropout: float = 0.0, # pylint: disable=unused-argument - scaling: Optional[float] = None, # pylint: disable=unused-argument - sliding_window: Optional[int] = None, # pylint: disable=unused-argument - softcap: Optional[float] = None, # pylint: disable=unused-argument + dropout: float = 0.0, + scaling: Optional[float] = None, + sliding_window: Optional[int] = None, + softcap: Optional[float] = None, cu_seq_lens_q: Optional[torch.LongTensor] = None, cu_seq_lens_k: Optional[torch.LongTensor] = None, max_length_q: Optional[int] = None, - max_length_k: Optional[int] = None, # pylint: disable=unused-argument - **kwargs, # pylint: disable=unused-argument + max_length_k: Optional[int] = None, + **kwargs, ): # Get dimensions # query: [batch, heads, seq_len, hidden_dim] diff --git a/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py b/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py index 589980c8b..2c5077392 100644 --- a/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/btlm_attn_hijack_flash.py @@ -25,9 +25,7 @@ def replace_btlm_attn_with_flash_attn(model_name="cerebras/btlm-3b-8k-base"): ".configuration_btlm", ".modeling_btlm" ) modeling_btlm = importlib.import_module(module_name) - modeling_btlm.BTLMAttention._attn = ( # pylint: disable=protected-access - flashattn_attn - ) + modeling_btlm.BTLMAttention._attn = flashattn_attn def flashattn_attn( @@ -35,9 +33,9 @@ def flashattn_attn( query: torch.Tensor, key: Optional[torch.Tensor] = None, value: Optional[torch.Tensor] = None, - attention_mask: Optional[torch.Tensor] = None, # pylint: disable=unused-argument + attention_mask: Optional[torch.Tensor] = None, head_mask: Optional[torch.Tensor] = None, - position_bias: Optional[torch.Tensor] = None, # pylint: disable=unused-argument + position_bias: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: softmax_scale = ( 1 / (key.size(-1) ** self.attn_scale_power) if self.scale_attn_weights else None diff --git a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py index 73bf37b61..c426344a6 100644 --- a/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py +++ b/src/axolotl/monkeypatch/data/batch_dataset_fetcher.py @@ -1,7 +1,5 @@ """Monkey patches for the dataset fetcher to handle batches of packed indexes.""" -# pylint: disable=protected-access - import torch from torch.utils.data._utils.fetch import _BaseDatasetFetcher from 
torch.utils.data._utils.worker import _worker_loop diff --git a/src/axolotl/monkeypatch/fsdp2_qlora.py b/src/axolotl/monkeypatch/fsdp2_qlora.py index 5a4332fff..04d0d1971 100644 --- a/src/axolotl/monkeypatch/fsdp2_qlora.py +++ b/src/axolotl/monkeypatch/fsdp2_qlora.py @@ -15,7 +15,6 @@ from axolotl.utils.logging import get_logger LOG = get_logger(__name__) -# pylint: disable=protected-access def apply_init_sharded_param_patch(): """Apply patch to FSDPParam._init_sharded_param to support Params4bit.""" from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam @@ -66,14 +65,14 @@ def apply_init_sharded_param_patch(): if item in patched_source: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( # nosec B102 f"from {module_name} import ({', '.join(items_to_import)})", globals(), ) - exec(patched_source, globals()) # pylint: disable=exec-used # nosec B102 + exec(patched_source, globals()) # nosec B102 # Replace the method - FSDPParam._init_sharded_param = patched_init_sharded_param # pylint: disable=undefined-variable # noqa: F821 + FSDPParam._init_sharded_param = patched_init_sharded_param LOG.info("Successfully applied FSDP _init_sharded_param patch") else: LOG.warning("Could not find target code for _init_sharded_param patching") @@ -131,14 +130,14 @@ def apply_init_unsharded_param_patch(): if item in patched_source: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( # nosec B102 f"from {module_name} import ({', '.join(items_to_import)})", globals(), ) - exec(patched_source, globals()) # pylint: disable=exec-used # nosec B102 + exec(patched_source, globals()) # nosec B102 # Replace the method - FSDPParam.init_unsharded_param = patched_init_unsharded_param # pylint: disable=undefined-variable # noqa: F821 + FSDPParam.init_unsharded_param = patched_init_unsharded_param LOG.info("Successfully applied FSDP init_unsharded_param patch") else: LOG.warning("Could not find target code for patching") diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py b/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py index 3b090d5e5..b58bbb67c 100644 --- a/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py +++ b/src/axolotl/monkeypatch/gradient_checkpointing/__init__.py @@ -25,9 +25,7 @@ else: return False -def hf_grad_checkpoint_offload_wrapper( - decoder_layer, *args, use_reentrant=None -): # pylint: disable=unused-argument +def hf_grad_checkpoint_offload_wrapper(decoder_layer, *args, use_reentrant=None): if uses_gc_layers(decoder_layer): return CPU_Offloaded_Gradient_Checkpointer.apply( decoder_layer, @@ -44,9 +42,7 @@ def hf_grad_checkpoint_offload_wrapper( ) -def hf_grad_checkpoint_disk_offload_wrapper( - decoder_layer, *args, use_reentrant=None -): # pylint: disable=unused-argument +def hf_grad_checkpoint_disk_offload_wrapper(decoder_layer, *args, use_reentrant=None): if uses_gc_layers(decoder_layer): return Disco.apply( decoder_layer, diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py index bbcfb91e6..8d06f172d 100644 --- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py +++ b/src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py @@ -35,9 +35,7 @@ else: torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda") -class CPU_Offloaded_Gradient_Checkpointer( # pylint: disable=invalid-name - torch.autograd.Function -): +class 
CPU_Offloaded_Gradient_Checkpointer(torch.autograd.Function): """ Saves VRAM by smartly offloading to RAM. Tiny hit to performance, since we mask the movement via non blocking calls. @@ -66,6 +64,4 @@ class CPU_Offloaded_Gradient_Checkpointer( # pylint: disable=invalid-name return ( None, hidden_states.grad, - ) + ( - None, - ) * len(ctx.args) + ) + (None,) * len(ctx.args) diff --git a/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py b/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py index 792d3c6ef..220799fbf 100644 --- a/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py +++ b/src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py @@ -62,9 +62,9 @@ class DiskOffloadManager: # Track tensor paths and their status self.tensor_paths: deque = deque() # Ordered history of tensor paths (LIFO) - self.file_locks: Dict[str, threading.Lock] = ( - {} - ) # Maps file_path -> threading.Lock() + self.file_locks: Dict[ + str, threading.Lock + ] = {} # Maps file_path -> threading.Lock() # Maps file_path -> status ("saving", "ready", "prefetching", "loaded", "deleted") self.file_status: Dict[str, str] = {} @@ -236,7 +236,7 @@ class DiskOffloadManager: self.tensor_paths.append(file_path) # Acquire semaphore to limit concurrent save operations - self.save_semaphore.acquire() # pylint: disable=consider-using-with + self.save_semaphore.acquire() # Queue tensor for saving in background self.save_queue.put((tensor.detach(), file_path)) diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py index 1316b5374..3953cb138 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_flash.py @@ -2,6 +2,7 @@ # copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py +import importlib.util import warnings from typing import Optional, Tuple @@ -19,7 +20,7 @@ from axolotl.monkeypatch.utils import set_module_name from axolotl.utils.logging import get_logger try: - from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports + from flash_attn.flash_attn_interface import ( flash_attn_varlen_qkvpacked_func, ) except ImportError: @@ -32,12 +33,7 @@ LOG = get_logger(__name__) def is_xformers_available() -> bool: - try: - import xformers # pylint: disable=unused-import # noqa: F401 - - return True - except ImportError: - return False + return importlib.util.find_spec("xformers") is not None def is_xformers_swiglu_available() -> bool: @@ -83,7 +79,7 @@ def patch_fa_llama_cross_entropy(): num_items_in_batch: int = None, ignore_index: int = -100, **kwargs, - ): # pylint: disable=unused-argument + ): reduction = "sum" if num_items_in_batch is not None else "mean" loss, _ = flash_attn_cross_entropy_loss( source, target, ignore_index=ignore_index @@ -120,9 +116,7 @@ def replace_llama_attn_with_flash_attn( rms_norm: Optional[bool] = False, use_shifted_sparse_attn: Optional[bool] = False, ): - transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access - _prepare_decoder_attention_mask - ) + transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask if use_shifted_sparse_attn: transformers.models.llama.modeling_llama.LlamaAttention.forward = ( flashattn_forward_with_s2attn @@ -145,7 +139,7 @@ def _prepare_decoder_attention_mask( input_shape, inputs_embeds, past_key_values_length, -): # 
pylint: disable=unused-argument +): # [bsz, seq_len] return attention_mask @@ -161,9 +155,9 @@ def flashattn_forward_with_s2attn( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument - cu_seqlens: Optional[torch.Tensor] = None, # pylint: disable=unused-argument - max_seqlen: Optional[torch.Tensor] = None, # pylint: disable=unused-argument + padding_mask: Optional[torch.LongTensor] = None, + cu_seqlens: Optional[torch.Tensor] = None, + max_seqlen: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel @@ -176,7 +170,8 @@ def flashattn_forward_with_s2attn( """ if output_attentions: warnings.warn( - "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." + "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.", + stacklevel=2, ) bsz, q_len, _ = hidden_states.size() @@ -198,7 +193,6 @@ def flashattn_forward_with_s2attn( ) # [bsz, q_len, nh, hd] # [bsz, nh, q_len, hd] - # pylint: disable=duplicate-code cos, sin = self.rotary_emb(value_states, position_ids=position_ids) query_states, key_states = apply_rotary_pos_emb( @@ -244,9 +238,7 @@ def flashattn_forward_with_s2attn( .permute(0, 3, 1, 2, 4, 5) .reshape(bsz * 2, q_len, 3, self.num_heads // 2, self.head_dim) ) - x = rearrange( # pylint: disable=invalid-name - qkv, "b s three h d -> b s (three h d)" - ) + x = rearrange(qkv, "b s three h d -> b s (three h d)") x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask) cu_q_len_tmp = torch.arange( 0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py index 28223eee3..332242e2c 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py @@ -32,10 +32,9 @@ def xformers_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, - padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument - **kwargs, # pylint: disable=unused-argument + padding_mask: Optional[torch.LongTensor] = None, + **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - # pylint: disable=duplicate-code bsz, q_len, _ = hidden_states.size() if not hasattr(self, "pretraining_tp"): @@ -102,7 +101,8 @@ def xformers_forward( if output_attentions: warnings.warn( - "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead." 
+ "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.", + stacklevel=2, ) # diff --git a/src/axolotl/monkeypatch/llama_expand_mask.py b/src/axolotl/monkeypatch/llama_expand_mask.py index 0277c212a..5cfb7818e 100644 --- a/src/axolotl/monkeypatch/llama_expand_mask.py +++ b/src/axolotl/monkeypatch/llama_expand_mask.py @@ -21,6 +21,4 @@ def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] def hijack_expand_mask(): import transformers - transformers.models.llama.modeling_llama._expand_mask = ( # pylint: disable=protected-access - _expand_mask - ) + transformers.models.llama.modeling_llama._expand_mask = _expand_mask diff --git a/src/axolotl/monkeypatch/llama_patch_multipack.py b/src/axolotl/monkeypatch/llama_patch_multipack.py index cfd525367..8d234881f 100644 --- a/src/axolotl/monkeypatch/llama_patch_multipack.py +++ b/src/axolotl/monkeypatch/llama_patch_multipack.py @@ -12,15 +12,15 @@ def hijack_llama_prepare_4d_mask(): from transformers import modeling_attn_mask_utils from transformers.models.llama import modeling_llama - modeling_llama._prepare_4d_causal_attention_mask_for_sdpa = ( # pylint: disable=protected-access + modeling_llama._prepare_4d_causal_attention_mask_for_sdpa = ( patched_prepare_4d_causal_attention_mask_for_sdpa ) - modeling_attn_mask_utils._prepare_4d_causal_attention_mask_for_sdpa = ( # pylint: disable=protected-access + modeling_attn_mask_utils._prepare_4d_causal_attention_mask_for_sdpa = ( patched_prepare_4d_causal_attention_mask_for_sdpa ) - modeling_llama._prepare_4d_causal_attention_mask = ( # pylint: disable=protected-access + modeling_llama._prepare_4d_causal_attention_mask = ( patched_prepare_4d_causal_attention_mask ) - modeling_attn_mask_utils._prepare_4d_causal_attention_mask = ( # pylint: disable=protected-access + modeling_attn_mask_utils._prepare_4d_causal_attention_mask = ( patched_prepare_4d_causal_attention_mask ) diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py index be1e1f2ff..ef5174ba2 100644 --- a/src/axolotl/monkeypatch/lora_kernels.py +++ b/src/axolotl/monkeypatch/lora_kernels.py @@ -30,48 +30,36 @@ QKV_PATCHES = [ query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2) key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) -""".lstrip( - "\n" - ), +""".lstrip("\n"), """ query_states, key_states, value_states = self.apply_qkv(hidden_states) query_states = query_states.view(hidden_shape).transpose(1, 2) key_states = key_states.view(hidden_shape).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) -""".lstrip( - "\n" - ), +""".lstrip("\n"), ), ( """ query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2) value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2) -""".lstrip( - "\n" - ), +""".lstrip("\n"), """ query_states, key_states, value_states = self.apply_qkv(hidden_states) query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2) key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2) value_states = value_states.view(hidden_shape).transpose(1, 2) -""".lstrip( - "\n" - ), +""".lstrip("\n"), ), ] ORIGINAL_O_CODE = """ attn_output = self.o_proj(attn_output) -""".lstrip( - "\n" -) +""".lstrip("\n") PATCHED_O_CODE = """ 
attn_output = self.apply_o(attn_output) -""".lstrip( - "\n" -) +""".lstrip("\n") SUPPORTED_ACTIVATIONS = ["silu", "gelu"] APPLY_FN_MAPPING = { @@ -176,7 +164,6 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]: ) from e -# pylint: disable=protected-access def patch_self_attn_lora(cfg: DictDefault): """ Given an `axolotl` config, this method patches the inferred attention class forward @@ -203,9 +190,9 @@ def patch_self_attn_lora(cfg: DictDefault): attention_cls._original_forward = self_attn_forward self_attn_forward, _ = detab_code(self_attn_forward) - assert any( - qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES - ), "Original QKV code not found" + assert any(qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES), ( + "Original QKV code not found" + ) assert ORIGINAL_O_CODE in self_attn_forward, "Original O code not found" for qkv_orig, qkv_patched in QKV_PATCHES: @@ -231,16 +218,14 @@ def patch_self_attn_lora(cfg: DictDefault): if item in self_attn_forward: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( f"from {module_name} import ({', '.join(items_to_import)})", globals(), ) - exec(self_attn_forward, globals()) # pylint: disable=exec-used # nosec B102 + exec(self_attn_forward, globals()) LOG.info(f"Patched attention class with LoRA optims: {attention_cls.__name__}") - attention_cls.forward = ( - axolotl_attn_forward # pylint: disable=undefined-variable # noqa: F821 - ) + attention_cls.forward = axolotl_attn_forward def find_self_attn_in_layer( @@ -277,9 +262,13 @@ def find_mlp_in_layer( layer.feedforward.experts.gate_projs, layer.feedforward.experts.up_projs, layer.feedforward.experts.down_projs, + strict=False, ): - yield gate_proj, up_proj, down_proj, FakeMLP( - gate_proj, up_proj, down_proj + yield ( + gate_proj, + up_proj, + down_proj, + FakeMLP(gate_proj, up_proj, down_proj), ) @@ -337,9 +326,9 @@ def apply_lora_kernel_patches( # Get active LoRA adapter config if hasattr(model, "active_adapters"): - assert ( - len(model.active_adapters) == 1 - ), "Axolotl currently does not support LoRA Triton kernels for multiple adapters" + assert len(model.active_adapters) == 1, ( + "Axolotl currently does not support LoRA Triton kernels for multiple adapters" + ) active_adapter = model.active_adapters[0] else: active_adapter = model.active_adapter diff --git a/src/axolotl/monkeypatch/loss/chunked.py b/src/axolotl/monkeypatch/loss/chunked.py index 0a9d0de82..26a52f898 100644 --- a/src/axolotl/monkeypatch/loss/chunked.py +++ b/src/axolotl/monkeypatch/loss/chunked.py @@ -25,7 +25,7 @@ class CEWithChunkedOutputLoss(torch.nn.Module): self, logits: torch.Tensor, labels: torch.Tensor, - normalize: bool = True, # pylint: disable=unused-argument + normalize: bool = True, ) -> torch.Tensor: """ Upcast logits to fp32 and compute cross entropy loss. 
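The class above avoids materializing fp32 logits for the full sequence by upcasting and reducing one chunk at a time, as the next hunk shows. A minimal sketch of that idea (this helper and its name are illustrative, not the class's exact implementation):

import torch
import torch.nn.functional as F

def chunked_cross_entropy(
    logits: torch.Tensor,  # [batch, seq_len, vocab]
    labels: torch.Tensor,  # [batch, seq_len]
    num_chunks: int = 8,
    ignore_index: int = -100,
) -> torch.Tensor:
    total_loss = logits.new_zeros((), dtype=torch.float32)
    for logits_chunk, labels_chunk in zip(
        logits.chunk(num_chunks, dim=1), labels.chunk(num_chunks, dim=1), strict=False
    ):
        # Upcast only the current chunk to fp32 before the softmax.
        total_loss = total_loss + F.cross_entropy(
            logits_chunk.float().reshape(-1, logits_chunk.size(-1)),
            labels_chunk.reshape(-1),
            ignore_index=ignore_index,
            reduction="sum",
        )
    return total_loss

Summing per-chunk losses and normalizing by the token count afterwards corresponds to the `reduction="sum"` path taken when `num_items_in_batch` is set.
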
@@ -63,7 +63,7 @@ class CEWithChunkedOutputLoss(torch.nn.Module): # compute one chunk at a time total_loss = 0.0 - for logits_chunk, labels_chunk in zip(logits, labels): + for logits_chunk, labels_chunk in zip(logits, labels, strict=False): total_loss += self.compute_cross_entropy(logits_chunk, labels_chunk) if reduction == "sum": @@ -88,9 +88,9 @@ def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100): num_items_in_batch: int = None, ignore_index: int = -100, **kwargs, - ): # pylint: disable=unused-argument + ): reduction = "sum" if num_items_in_batch is not None else "mean" - logit_chunks = [ # pylint: disable=unnecessary-comprehension + logit_chunks = [ chunk for chunk in source.chunk(loss_fn_ce.num_output_chunks, dim=1) ] loss = loss_fn_ce(logit_chunks, target, reduction=reduction) @@ -101,7 +101,7 @@ def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100): def for_causal_lm_chunked_loss( logits, labels, - vocab_size: int = None, # pylint: disable=unused-argument + vocab_size: int = None, num_items_in_batch: Optional[int] = None, ignore_index: int = -100, shift_labels: Optional[torch.Tensor] = None, diff --git a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py index e1be424a3..0994da91c 100644 --- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py @@ -1,7 +1,5 @@ """Flash attention monkey patch for mistral model""" -# pylint: disable=duplicate-code - from functools import partial import transformers diff --git a/src/axolotl/monkeypatch/mixtral/__init__.py b/src/axolotl/monkeypatch/mixtral/__init__.py index 5b8054000..b353b12cf 100644 --- a/src/axolotl/monkeypatch/mixtral/__init__.py +++ b/src/axolotl/monkeypatch/mixtral/__init__.py @@ -31,14 +31,12 @@ def patch_mixtral_moe_forward_zero3() -> None: topk_weight = topk_weight.to(hidden_states.dtype) hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0) - y = torch.empty_like(hidden_states) # pylint: disable=invalid-name + y = torch.empty_like(hidden_states) flat_topk_idx = topk_idx.view(-1) for i in range(self.num_experts): expert = self.experts[i] y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i]) - y = ( # pylint: disable=invalid-name - y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1) - ).sum(dim=1) + y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1) final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim) return final_hidden_states, router_logits diff --git a/src/axolotl/monkeypatch/models/llama4/modeling.py b/src/axolotl/monkeypatch/models/llama4/modeling.py index 4127793e7..0fc8f5699 100644 --- a/src/axolotl/monkeypatch/models/llama4/modeling.py +++ b/src/axolotl/monkeypatch/models/llama4/modeling.py @@ -95,18 +95,12 @@ def patch_llama4_linearized_modeling(): old_lamma_4_text_experts = modeling_llama4.Llama4TextExperts modeling_llama4.Llama4TextExperts = Llama4TextExperts - setattr( - sys.modules["transformers.models.llama4"], - "Llama4TextExperts", - Llama4TextExperts, - ) + sys.modules["transformers.models.llama4"].Llama4TextExperts = Llama4TextExperts def unpatch(): modeling_llama4.Llama4TextExperts = old_lamma_4_text_experts - setattr( - sys.modules["transformers.models.llama4"], - "Llama4TextExperts", - old_lamma_4_text_experts, - ) + sys.modules[ + "transformers.models.llama4" + ].Llama4TextExperts = old_lamma_4_text_experts return unpatch diff --git 
a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index 7df9877d7..e4f9ca2be 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -49,9 +49,7 @@ def patch_for_multipack(model_type, model_name=None, has_remote_code=False): assert hasattr( transformers.modeling_flash_attention_utils, "_get_unpad_data" ), "transformers api changed for _get_unpad_data for flash attention" - transformers.modeling_flash_attention_utils._get_unpad_data = ( # pylint: disable=protected-access - get_unpad_data - ) + transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data if model_type == "mixtral" and is_deepspeed_zero3_enabled(): patch_mixtral_moe_forward_zero3() @@ -67,6 +65,4 @@ def patch_remote(model_name): module_name = ".".join(parts) modeling_arch = importlib.import_module(module_name) if hasattr(modeling_arch, "_get_unpad_data"): - modeling_arch._get_unpad_data = ( # pylint: disable=protected-access - get_unpad_data - ) + modeling_arch._get_unpad_data = get_unpad_data diff --git a/src/axolotl/monkeypatch/peft/utils.py b/src/axolotl/monkeypatch/peft/utils.py index 0c571fbd2..d1011f5eb 100644 --- a/src/axolotl/monkeypatch/peft/utils.py +++ b/src/axolotl/monkeypatch/peft/utils.py @@ -49,9 +49,7 @@ def patch_peft_prep_code(): prep_code = get_peft_prep_code() except OSError: return - peft.utils.other._original_create_accelerator_and_postprocess = ( # pylint: disable=protected-access - prep_code - ) + peft.utils.other._original_create_accelerator_and_postprocess = prep_code prep_code, _ = detab_code(prep_code) if ORIGINAL_PREPARE_CODE not in prep_code: return @@ -68,11 +66,15 @@ def patch_peft_prep_code(): if item in prep_code: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( "from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")", globals(), ) - exec(prep_code, globals()) # pylint: disable=exec-used # nosec B102 + exec(prep_code, globals()) LOG.info("patching prepare_model_for_kbit_training to allow for overrides") - peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821 - axolotl.loaders.model.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821 + peft.utils.other.prepare_model_for_kbit_training = ( + fixed_prepare_model_for_kbit_training + ) + axolotl.loaders.model.prepare_model_for_kbit_training = ( + fixed_prepare_model_for_kbit_training + ) diff --git a/src/axolotl/monkeypatch/relora.py b/src/axolotl/monkeypatch/relora.py index 0028a0cf6..a01d850b3 100644 --- a/src/axolotl/monkeypatch/relora.py +++ b/src/axolotl/monkeypatch/relora.py @@ -91,9 +91,9 @@ class ReLoRACallback(TrainerCallback): if not os.path.exists(self.last_full_model): self.last_full_model = str(Path(snapshot_download(cfg.base_model))) - assert os.path.exists( - self.last_full_model - ), "for ReLORA base_model must be a local path" + assert os.path.exists(self.last_full_model), ( + "for ReLORA base_model must be a local path" + ) self.num_lora_restarts = 0 self.need_full_save = False @@ -293,7 +293,6 @@ def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraL key_list = [key for key, _ in model.model.named_modules() if "lora" not in key] for key in key_list: try: - # pylint: disable=protected-access _parent, target, _target_name = 
peft.utils._get_submodules(model.model, key) except AttributeError: continue @@ -341,7 +340,7 @@ def merge_and_save( modules = find_lora_modules(model) if not quantized: - for module_name, target in modules.items(): + for _, target in modules.items(): active_adapter = target.active_adapter if isinstance(active_adapter, list): active_adapter = active_adapter[0] diff --git a/src/axolotl/monkeypatch/ring_attn/__init__.py b/src/axolotl/monkeypatch/ring_attn/__init__.py index 736378b16..1c14776c9 100644 --- a/src/axolotl/monkeypatch/ring_attn/__init__.py +++ b/src/axolotl/monkeypatch/ring_attn/__init__.py @@ -1,6 +1,5 @@ """Init for ring attention monkeypatch module""" -# pylint: disable=unused-import # flake8: noqa from .patch import ( diff --git a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py index 607b4dd71..74d33ed4a 100644 --- a/src/axolotl/monkeypatch/ring_attn/adapters/batch.py +++ b/src/axolotl/monkeypatch/ring_attn/adapters/batch.py @@ -7,8 +7,6 @@ Our implementation closely follows the structure of that module, but we've minif somewhat to support only the latest versions of transformers. """ -# pylint: disable=protected-access,cyclic-import - import os from typing import Callable @@ -20,7 +18,7 @@ from ring_flash_attn import ring_flash_attn_func from ring_flash_attn.adapters.hf_adapter import check_params from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or_equal -try: # pylint: disable=duplicate-code +try: from transformers.modeling_flash_attention_utils import _flash_supports_window except ImportError: try: @@ -59,7 +57,7 @@ def create_flash_attn_forward_varlen_llama3( """ # transformers 4.48+ - # pylint: disable=unused-argument + def _flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, diff --git a/src/axolotl/monkeypatch/ring_attn/patch.py b/src/axolotl/monkeypatch/ring_attn/patch.py index ea0f9dd02..e1fd10b3a 100644 --- a/src/axolotl/monkeypatch/ring_attn/patch.py +++ b/src/axolotl/monkeypatch/ring_attn/patch.py @@ -15,7 +15,7 @@ import torch import torch.distributed as dist from torch.distributed import DeviceMesh -try: # pylint: disable=duplicate-code +try: from transformers.modeling_flash_attention_utils import _flash_supports_window except ImportError: try: @@ -43,7 +43,7 @@ def get_ring_attn_group() -> dist.ProcessGroup: def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None): """Setter for ring attention group on this rank.""" - global RING_ATTN_GROUP # pylint: disable=global-statement + global RING_ATTN_GROUP RING_ATTN_GROUP = ring_attn_group @@ -57,29 +57,24 @@ def create_ring_flash_attention_forward( query_states: torch.Tensor, key_states: torch.Tensor, value_states: torch.Tensor, - attention_mask: torch.Tensor, # pylint: disable=unused-argument + attention_mask: torch.Tensor, query_length: int, is_causal: bool, dropout: float = 0.0, - position_ids: Optional[torch.Tensor] = None, # pylint: disable=unused-argument + position_ids: Optional[torch.Tensor] = None, softmax_scale: Optional[float] = None, sliding_window: Optional[int] = None, use_top_left_mask: bool = False, softcap: Optional[float] = None, deterministic: bool = None, - cu_seq_lens_q: Optional[ - torch.LongTensor - ] = None, # pylint: disable=unused-argument - cu_seq_lens_k: Optional[ - torch.LongTensor - ] = None, # pylint: disable=unused-argument - max_length_q: Optional[int] = None, # pylint: disable=unused-argument - max_length_k: Optional[int] = None, # pylint: 
disable=unused-argument - target_dtype: Optional[torch.dtype] = None, # pylint: disable=unused-argument - attn_implementation: Optional[str] = None, # pylint: disable=unused-argument - **kwargs, # pylint: disable=unused-argument + cu_seq_lens_q: Optional[torch.LongTensor] = None, + cu_seq_lens_k: Optional[torch.LongTensor] = None, + max_length_q: Optional[int] = None, + max_length_k: Optional[int] = None, + target_dtype: Optional[torch.dtype] = None, + attn_implementation: Optional[str] = None, + **kwargs, ): - # pylint: disable=duplicate-code if not use_top_left_mask: causal = is_causal else: @@ -101,9 +96,9 @@ def create_ring_flash_attention_forward( if deterministic is None: deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1" flash_kwargs["deterministic"] = deterministic - assert ( - softcap is None - ), "llama3_flash_attn_varlen_func does not support softcap yet." + assert softcap is None, ( + "llama3_flash_attn_varlen_func does not support softcap yet." + ) # flash_kwargs["softcap"] = softcap flash_kwargs["group"] = process_group @@ -193,7 +188,7 @@ def register_ring_attn_from_device_mesh( # fmt: off import ring_flash_attn.adapters.hf_adapter - from ring_flash_attn.adapters.hf_adapter import ( # isort: skip # pylint: disable=unused-import + from ring_flash_attn.adapters.hf_adapter import ( # isort: skip create_ring_flash_attention_forward as create_ring_flash_attention_forward_orig, ) diff --git a/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py b/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py index 85454fe2e..0fa6d6424 100644 --- a/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py +++ b/src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py @@ -16,8 +16,8 @@ # This code is based off the following work: # https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py # https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py -# pylint: disable=duplicate-code """PyTorch StableLM Epoch model.""" + import importlib import math from typing import Optional, Tuple, Union @@ -26,7 +26,7 @@ import torch import torch.utils.checkpoint from accelerate import init_empty_weights from einops import rearrange -from flash_attn.flash_attn_interface import ( # pylint: disable=ungrouped-imports +from flash_attn.flash_attn_interface import ( flash_attn_varlen_qkvpacked_func, ) from torch import nn @@ -49,27 +49,21 @@ def replace_stablelm_attn_with_flash_attn(model_name="stabilityai/stablelm-3b-4e ".configuration_stablelm_epoch", ".modeling_stablelm_epoch" ) modeling_stablelm = importlib.import_module(module_name) - modeling_stablelm.Attention.forward = ( # pylint: disable=protected-access - flashattn_attn - ) - modeling_stablelm.StableLMEpochModel.forward = ( # pylint: disable=protected-access - stablelm_model_forward - ) - modeling_stablelm.DecoderLayer.forward = ( # pylint: disable=protected-access - decoder_layer_forward - ) + modeling_stablelm.Attention.forward = flashattn_attn + modeling_stablelm.StableLMEpochModel.forward = stablelm_model_forward + modeling_stablelm.DecoderLayer.forward = decoder_layer_forward def rotate_half(x: torch.Tensor): """Rotates half the hidden dims of the input.""" - # pylint: disable=invalid-name + x1, x2 = torch.chunk(x, 2, dim=-1) return torch.cat((-x2, x1), dim=-1) def apply_rotary_pos_emb(q, k, cos, sin, position_ids): # The first two dimensions of cos and sin are always 1, so we can `squeeze` them. 
- # pylint: disable=invalid-name + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] cos = cos[position_ids].unsqueeze(1) # [batch_size, 1, seq_len, dim] @@ -99,7 +93,7 @@ def flashattn_attn( attention_mask: torch.FloatTensor, position_ids: torch.LongTensor, past_key_value: Optional[Tuple[torch.Tensor]] = None, - output_attentions: Optional[bool] = False, # pylint: disable=unused-argument + output_attentions: Optional[bool] = False, use_cache: Optional[bool] = False, cu_seqlens: Optional[torch.Tensor] = None, max_seqlen: Optional[torch.Tensor] = None, @@ -216,7 +210,6 @@ def decoder_layer_forward( ) -> Union[ Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]] ]: - # pylint: disable=duplicate-code residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -263,7 +256,6 @@ def stablelm_model_forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPast]: - # pylint: disable=duplicate-code output_attentions = ( output_attentions if output_attentions is not None @@ -326,13 +318,11 @@ def stablelm_model_forward( dtype=torch.bool, device=inputs_embeds.device, ) - attention_mask = ( - self._prepare_decoder_attention_mask( # pylint: disable=protected-access - attention_mask, - (batch_size, seq_length), - inputs_embeds, - past_key_values_length, - ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, + (batch_size, seq_length), + inputs_embeds, + past_key_values_length, ) hidden_states = inputs_embeds diff --git a/src/axolotl/monkeypatch/tiled_mlp/patch.py b/src/axolotl/monkeypatch/tiled_mlp/patch.py index 419c73104..7cdc6d3a3 100644 --- a/src/axolotl/monkeypatch/tiled_mlp/patch.py +++ b/src/axolotl/monkeypatch/tiled_mlp/patch.py @@ -40,7 +40,6 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None): is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1 def tiled_mlp_forward(self, x): - # pylint: disable=protected-access input_shape = x.shape seqlen = input_shape[-2] hidden = input_shape[-1] @@ -79,14 +78,13 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None): return down_res mlp_cls.forward = tiled_mlp_forward - mlp_cls._compute_params = [] # pylint: disable=protected-access - mlp_cls._tiled_mlp_dist_impl = None # pylint: disable=protected-access + mlp_cls._compute_params = [] + mlp_cls._tiled_mlp_dist_impl = None LOG.info( f"Successfully monkey-patched TiledMLP for model_type: {model_type}", main_process_only=True, ) except (ImportError, AttributeError) as e: raise RuntimeError( - f"Could not import MLP class for model_type: {model_type}. " - f"Error: {str(e)}" + f"Could not import MLP class for model_type: {model_type}. 
Error: {str(e)}" ) from e diff --git a/src/axolotl/monkeypatch/trainer/lr.py b/src/axolotl/monkeypatch/trainer/lr.py index 9afc23c46..c33674cee 100644 --- a/src/axolotl/monkeypatch/trainer/lr.py +++ b/src/axolotl/monkeypatch/trainer/lr.py @@ -39,4 +39,4 @@ def _get_learning_rate(self): def patch_trainer_get_lr(): from transformers.trainer import Trainer - Trainer._get_learning_rate = _get_learning_rate # pylint: disable=protected-access + Trainer._get_learning_rate = _get_learning_rate diff --git a/src/axolotl/monkeypatch/trainer_accelerator_args.py b/src/axolotl/monkeypatch/trainer_accelerator_args.py index 819a66255..9fc6e38c6 100644 --- a/src/axolotl/monkeypatch/trainer_accelerator_args.py +++ b/src/axolotl/monkeypatch/trainer_accelerator_args.py @@ -47,9 +47,7 @@ def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool): create_code = get_create_accelerate_code() except OSError: return - Trainer._original_create_accelerator_and_postprocess = ( # pylint: disable=protected-access - create_code - ) + Trainer._original_create_accelerator_and_postprocess = create_code create_code, _ = detab_code(create_code) if ORIGINAL_TRAINER_CODE not in create_code: return @@ -72,12 +70,14 @@ def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool): if item in create_code: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( "from transformers.trainer import (" + ", ".join(x for x in items_to_import) + ")", globals(), ) - exec(create_code, globals()) # pylint: disable=exec-used # nosec B102 + exec(create_code, globals()) LOG.info("patching create_accelerator_and_postprocess to allow for overrides") - Trainer.create_accelerator_and_postprocess = fixed_create_accelerator_and_postprocess # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821 + Trainer.create_accelerator_and_postprocess = ( + fixed_create_accelerator_and_postprocess + ) diff --git a/src/axolotl/monkeypatch/trainer_fsdp_optim.py b/src/axolotl/monkeypatch/trainer_fsdp_optim.py index 1c2511524..692f754d7 100644 --- a/src/axolotl/monkeypatch/trainer_fsdp_optim.py +++ b/src/axolotl/monkeypatch/trainer_fsdp_optim.py @@ -23,9 +23,7 @@ PATCHED_TRAINER_CODE = """ def get_training_loop_code() -> str: - training_loop = inspect.getsource( - Trainer._inner_training_loop # pylint: disable=protected-access - ) + training_loop = inspect.getsource(Trainer._inner_training_loop) return training_loop @@ -44,9 +42,7 @@ def patch_training_loop_for_fsdp(): training_loop = get_training_loop_code() except OSError: return - Trainer._original_inner_training_loop = ( # pylint: disable=protected-access - training_loop - ) + Trainer._original_inner_training_loop = training_loop training_loop, _ = detab_code(training_loop) if ORIGINAL_TRAINER_CODE not in training_loop: return @@ -66,14 +62,12 @@ def patch_training_loop_for_fsdp(): if item in training_loop: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( "from transformers.trainer import (" + ", ".join(x for x in items_to_import) + ")", globals(), ) - exec(training_loop, globals()) # pylint: disable=exec-used # nosec B102 + exec(training_loop, globals()) LOG.info("patching _inner_training_loop for fsdp optimizer save") - Trainer._inner_training_loop = ( # pylint: disable=protected-access - _fixed_inner_training_loop # pylint: disable=undefined-variable # noqa: F821 - ) + Trainer._inner_training_loop = _fixed_inner_training_loop diff --git 
a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py index 75f4158b3..012c699fa 100644 --- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py +++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py @@ -52,7 +52,6 @@ def check_evaluation_loop_is_fsdp2_patchable() -> bool: return ORIGINAL_FSDP2_CODE in evaluation_loop_source -# pylint: disable=protected-access def patch_evaluation_loop(patch_fsdp2: bool): """Patch the evaluation_loop method.""" # Check if already patched @@ -101,16 +100,14 @@ def patch_evaluation_loop(patch_fsdp2: bool): items_to_import.append(item) # Execute the imports and patched method - exec( # pylint: disable=exec-used # nosec B102 + exec( f"from {module_name} import ({', '.join(items_to_import)})", globals(), ) - exec(evaluation_loop_source, globals()) # pylint: disable=exec-used # nosec B102 + exec(evaluation_loop_source, globals()) LOG.info("Patched Trainer.evaluation_loop with nanmean loss calculation") - Trainer.evaluation_loop = ( - axolotl_evaluation_loop # pylint: disable=undefined-variable # noqa: F821 - ) + Trainer.evaluation_loop = axolotl_evaluation_loop def check_maybe_log_save_evaluate_is_patchable() -> bool: @@ -118,7 +115,6 @@ def check_maybe_log_save_evaluate_is_patchable() -> bool: return ORIGINAL_MAYBE_CODE in maybe_log_source -# pylint: disable=protected-access def patch_maybe_log_save_evaluate(): """Patch the _maybe_log_save_evaluate method.""" # Check if already patched @@ -155,11 +151,11 @@ def patch_maybe_log_save_evaluate(): items_to_import.append(item) # Execute the imports and patched method - exec( # pylint: disable=exec-used # nosec B102 + exec( f"from {module_name} import ({', '.join(items_to_import)})", globals(), ) - exec(maybe_log_source, globals()) # pylint: disable=exec-used # nosec B102 + exec(maybe_log_source, globals()) LOG.info("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation") - Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate # pylint: disable=undefined-variable # noqa: F821 + Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate diff --git a/src/axolotl/monkeypatch/unsloth_.py b/src/axolotl/monkeypatch/unsloth_.py index 146047e95..59f32c6f5 100644 --- a/src/axolotl/monkeypatch/unsloth_.py +++ b/src/axolotl/monkeypatch/unsloth_.py @@ -17,27 +17,19 @@ ORIGINAL_QKV_CODE = """ query_states = self.q_proj(hidden_states) key_states = self.k_proj(hidden_states) value_states = self.v_proj(hidden_states) -""".lstrip( - "\n" -) +""".lstrip("\n") PATCHED_QKV_CODE = """ query_states, key_states, value_states = self.apply_qkv(self, hidden_states) -""".lstrip( - "\n" -) +""".lstrip("\n") ORIGINAL_O_CODE = """ attn_output = self.o_proj(attn_output) -""".lstrip( - "\n" -) +""".lstrip("\n") PATCHED_O_CODE = """ attn_output = self.apply_o(self, attn_output) -""".lstrip( - "\n" -) +""".lstrip("\n") def original_apply_qkv(self, hidden_states): @@ -66,13 +58,13 @@ def check_self_attn_is_patchable() -> bool: def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None: from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss - def UnslothForCausalLMLoss( # pylint: disable=invalid-name + def UnslothForCausalLMLoss( logits, labels, - vocab_size: int, # pylint: disable=unused-argument + vocab_size: int, num_items_in_batch: int = None, - ignore_index: int = -100, # pylint: disable=unused-argument - **kwargs, # pylint: disable=unused-argument + ignore_index: int = -100, + **kwargs, ): # Upcast 
to float if we need to compute the loss to avoid potential precision issues logits = logits.float() @@ -93,18 +85,16 @@ def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None: raise ValueError("Unsupported model type") -self_attn_lora_patched = False # pylint: disable=invalid-name +self_attn_lora_patched = False def patch_self_attn_lora(): - global self_attn_lora_patched # pylint: disable=global-statement + global self_attn_lora_patched if self_attn_lora_patched: # prevent patching multiple times return self_attn_forward = get_self_attn_code() - LlamaFlashAttention2._original_forward = ( # pylint: disable=protected-access - self_attn_forward - ) + LlamaFlashAttention2._original_forward = self_attn_forward self_attn_forward, _ = detab_code(self_attn_forward) assert ORIGINAL_QKV_CODE in self_attn_forward, "Original qkv code not found" assert ORIGINAL_O_CODE in self_attn_forward, "Original o code not found" @@ -125,27 +115,25 @@ def patch_self_attn_lora(): if item in self_attn_forward: items_to_import.append(item) - exec( # pylint: disable=exec-used # nosec B102 + exec( "from transformers.models.llama.modeling_llama import (" + ", ".join(x for x in items_to_import) + ")", globals(), ) - exec(self_attn_forward, globals()) # pylint: disable=exec-used # nosec B102 + exec(self_attn_forward, globals()) self_attn_lora_patched = True LOG.info("patching unsloth attn lora") - LlamaFlashAttention2.forward = ( - unsloth_attn_forward # pylint: disable=undefined-variable # noqa: F821 - ) + LlamaFlashAttention2.forward = unsloth_attn_forward def integrate_rope_embeddings(): import transformers.models.llama.modeling_llama from unsloth.kernels.rope_embedding import fast_rope_embedding - def apply_rotary_pos_emb( # pylint: disable=unused-argument - q, # pylint: disable=invalid-name - k, # pylint: disable=invalid-name + def apply_rotary_pos_emb( + q, + k, cos, sin, position_ids=None, diff --git a/src/axolotl/monkeypatch/xformers_/__init__.py b/src/axolotl/monkeypatch/xformers_/__init__.py index a052ea49e..6f5b43f77 100644 --- a/src/axolotl/monkeypatch/xformers_/__init__.py +++ b/src/axolotl/monkeypatch/xformers_/__init__.py @@ -36,7 +36,7 @@ class FusedMLP(torch.nn.Module): self.swiglu.w3.weight.data = down_proj.weight.data def _post_training(self, model, name): - w1, w2 = torch.split( # pylint: disable=invalid-name + w1, w2 = torch.split( self.swiglu.w12.weight.data, self.config.intermediate_size, dim=0 ) @@ -48,5 +48,5 @@ class FusedMLP(torch.nn.Module): set_module_name(model, name, new_mlp) - def forward(self, x: torch.Tensor) -> torch.Tensor: # pylint: disable=invalid-name + def forward(self, x: torch.Tensor) -> torch.Tensor: return self.swiglu(x) diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 31597d5a6..4b06eb4c8 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -156,9 +156,9 @@ class ProcessingStrategy: image_value = load_image(image_value) if self.image_size is not None: - assert hasattr( - image_value, "resize" - ), "Image does not have a resize method" + assert hasattr(image_value, "resize"), ( + "Image does not have a resize method" + ) if isinstance(self.image_size, tuple): image_value = image_value.resize( diff --git a/src/axolotl/prompt_strategies/__init__.py b/src/axolotl/prompt_strategies/__init__.py index cf936481e..d9936b9ae 100644 --- a/src/axolotl/prompt_strategies/__init__.py +++ b/src/axolotl/prompt_strategies/__init__.py @@ -48,6 +48,6 @@ def load(strategy, tokenizer, cfg, 
ds_cfg, processor=None): return func(tokenizer, cfg, **load_kwargs) except ModuleNotFoundError: return None - except Exception as exc: # pylint: disable=broad-exception-caught + except Exception as exc: LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}") raise exc diff --git a/src/axolotl/prompt_strategies/alpaca_chat.py b/src/axolotl/prompt_strategies/alpaca_chat.py index 975fee889..391ba6072 100644 --- a/src/axolotl/prompt_strategies/alpaca_chat.py +++ b/src/axolotl/prompt_strategies/alpaca_chat.py @@ -39,7 +39,7 @@ class AlpacaChatPrompter(AlpacaPrompter): system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n" system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n" - def __init__(self): # pylint: disable=super-init-not-called + def __init__(self): self.prompt_style = PromptStyle.CHAT.value self.match_prompt_style() @@ -54,7 +54,7 @@ class NoSystemPrompter(AlpacaPrompter): turn_format = "{instruction} {input} " turn_no_input_format = "{instruction} " - def __init__(self): # pylint: disable=super-init-not-called + def __init__(self): pass diff --git a/src/axolotl/prompt_strategies/alpaca_w_system.py b/src/axolotl/prompt_strategies/alpaca_w_system.py index 6873c8e08..808ba517e 100644 --- a/src/axolotl/prompt_strategies/alpaca_w_system.py +++ b/src/axolotl/prompt_strategies/alpaca_w_system.py @@ -22,10 +22,9 @@ class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy): ) def tokenize_prompt(self, prompt): - # pylint: disable=duplicate-code ( instruction, - input, # pylint: disable=redefined-builtin + input, response, system, ) = self.parse_instruction_fields(prompt) @@ -64,7 +63,7 @@ class SystemDataPrompter(AlpacaPrompter): self, system: str, instruction: str, - input: Union[None, str] = None, # pylint: disable=redefined-builtin + input: Union[None, str] = None, output: Union[None, str] = None, ) -> Generator[str, None, None]: # returns the full prompt from instruction and optional input @@ -93,7 +92,6 @@ class OpenOrcaSystemDataPrompter(SystemDataPrompter): """ def match_prompt_style(self): - # pylint: disable=duplicate-code if self.prompt_style == PromptStyle.INSTRUCT.value: self.turn_format = "### Human:\n{instruction}\n### Additional Context:\n{input}\n### Assistant:\n" self.turn_no_input_format = "### Human:\n{instruction}\n### Assistant:\n" diff --git a/src/axolotl/prompt_strategies/base.py b/src/axolotl/prompt_strategies/base.py index 370a51a95..45a3ffda9 100644 --- a/src/axolotl/prompt_strategies/base.py +++ b/src/axolotl/prompt_strategies/base.py @@ -29,6 +29,6 @@ def load(strategy, cfg, module_base=None, **kwargs): mod = importlib.import_module(strategy, module_base) func = getattr(mod, load_fn) return func(cfg, **kwargs) - except Exception: # pylint: disable=broad-exception-caught + except Exception: LOG.warning(f"unable to load strategy {strategy}") return None diff --git a/src/axolotl/prompt_strategies/bradley_terry/__init__.py b/src/axolotl/prompt_strategies/bradley_terry/__init__.py index 7530aee19..7336edc71 100644 --- a/src/axolotl/prompt_strategies/bradley_terry/__init__.py +++ b/src/axolotl/prompt_strategies/bradley_terry/__init__.py @@ -10,7 +10,6 @@ LOG = get_logger(__name__) def load(strategy, tokenizer, cfg, ds_cfg): - # pylint: 
disable=duplicate-code try: load_fn = "load" if strategy.split(".")[-1].startswith("load_"): @@ -30,6 +29,6 @@ def load(strategy, tokenizer, cfg, ds_cfg): return func(tokenizer, cfg, **load_kwargs) except ModuleNotFoundError: return None - except Exception as exc: # pylint: disable=broad-exception-caught + except Exception as exc: LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}") return None diff --git a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py index e655f85a1..fd0d76f51 100644 --- a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py +++ b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py @@ -34,7 +34,6 @@ class BTChatTemplateStrategy(ChatTemplateStrategy): max_length = self.prompter.max_length - # pylint: disable=duplicate-code prompt["messages"] = [] if prompt["system"]: prompt["messages"].append({"role": "system", "content": prompt["system"]}) @@ -52,7 +51,6 @@ class BTChatTemplateStrategy(ChatTemplateStrategy): :max_length ] - # pylint: disable=duplicate-code prompt["messages"] = [] if prompt["system"]: prompt["messages"].append({"role": "system", "content": prompt["system"]}) diff --git a/src/axolotl/prompt_strategies/bradley_terry/llama3.py b/src/axolotl/prompt_strategies/bradley_terry/llama3.py index 1d586fd5f..5548d882e 100644 --- a/src/axolotl/prompt_strategies/bradley_terry/llama3.py +++ b/src/axolotl/prompt_strategies/bradley_terry/llama3.py @@ -6,7 +6,7 @@ chatml transforms for datasets with system, input, chosen, rejected to match lla def icr( cfg, **kwargs, -): # pylint: disable=possibly-unused-variable,unused-argument +): """ chatml transforms for datasets with system, input, chosen, rejected ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py index f927b7fcb..cb3e3dfb1 100644 --- a/src/axolotl/prompt_strategies/chat_template.py +++ b/src/axolotl/prompt_strategies/chat_template.py @@ -2,8 +2,6 @@ HF Chat Templates prompt strategy """ -# pylint: disable=too-many-lines - from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Set, Union @@ -402,9 +400,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): feature_names = list(prompt.keys()) # Process each prompt individually - for row in zip(*prompt.values()): + for row in zip(*prompt.values(), strict=False): tokenized_prompt = self._tokenize_single_prompt( - dict(zip(feature_names, row)) + dict(zip(feature_names, row, strict=False)) ) for key, val in tokenized_prompt.items(): res[key].append(val) @@ -431,9 +429,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): add_generation_prompt=True, images=images, ) - tokenized_res = self.prompter.build_prompt( - turns, images=images - ) # type: ignore + tokenized_res = self.prompter.build_prompt(turns, images=images) # type: ignore tokenized_prompt = {} if isinstance(tokenized_res, list): input_ids = prompt_ids + tokenized_res[len(prompt_ids) :] @@ -613,7 +609,6 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): """ Locate the starting and ending indices of the specified turn in a conversation. 
""" - # pylint: disable=too-many-return-statements if turn_idx >= len(turns): raise ValueError(f"Turn index {turn_idx} out of range") @@ -850,7 +845,7 @@ class MistralStrategy(ChatTemplateStrategy): split_thinking: bool | None = False, ): # Call the parent's parent __init__ (PromptTokenizingStrategy) to skip ChatTemplateStrategy's validation - # pylint: disable=non-parent-init-called,super-init-not-called + PromptTokenizingStrategy.__init__( self, prompter, tokenizer, train_on_inputs, sequence_len ) diff --git a/src/axolotl/prompt_strategies/completion.py b/src/axolotl/prompt_strategies/completion.py index 62a4b90b2..f43f25793 100644 --- a/src/axolotl/prompt_strategies/completion.py +++ b/src/axolotl/prompt_strategies/completion.py @@ -42,8 +42,8 @@ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): def tokenize_prompt(self, prompt): res = defaultdict(lambda: []) feature_names = list(prompt.keys()) - for row in zip(*prompt.values()): - prompt_row = dict(zip(feature_names, row)) + for row in zip(*prompt.values(), strict=False): + prompt_row = dict(zip(feature_names, row, strict=False)) ( instruction, _, @@ -59,9 +59,7 @@ class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy): return dict(res) - def _build_full_prompt( - self, instruction, input, response - ): # pylint: disable=redefined-builtin + def _build_full_prompt(self, instruction, input, response): return next(iter(self.prompter.build_prompt(instruction, input, response))) @@ -73,8 +71,8 @@ class CompletionPrompter: def build_prompt( self, instruction: str, - input=None, # pylint: disable=redefined-builtin, unused-argument - output=None, # pylint: disable=unused-argument + input=None, + output=None, ) -> Generator[str, None, None]: yield instruction diff --git a/src/axolotl/prompt_strategies/context_qa.py b/src/axolotl/prompt_strategies/context_qa.py index aac44e0b2..09e96d26e 100644 --- a/src/axolotl/prompt_strategies/context_qa.py +++ b/src/axolotl/prompt_strategies/context_qa.py @@ -86,7 +86,6 @@ class ContextV2Prompter(AlpacaPrompter): system_no_input_prompt = "" def match_prompt_style(self): - # pylint: disable=duplicate-code self.turn_format = "{instruction}\n{input}" self.turn_no_input_format = "{instruction}" self.system_format = "{system}" diff --git a/src/axolotl/prompt_strategies/creative_acr.py b/src/axolotl/prompt_strategies/creative_acr.py index ea67034b3..3e016e30e 100644 --- a/src/axolotl/prompt_strategies/creative_acr.py +++ b/src/axolotl/prompt_strategies/creative_acr.py @@ -134,9 +134,7 @@ class CreativePrompterBase: def build_prompt( self, instruction: str, - input: Union[ # pylint: disable=redefined-builtin, unused-argument - None, str - ] = None, + input: Union[None, str] = None, output: Union[None, str] = None, ) -> Generator[str, None, None]: if self.system_prompt: diff --git a/src/axolotl/prompt_strategies/dpo/chat_template.py b/src/axolotl/prompt_strategies/dpo/chat_template.py index 786770885..85c4d2182 100644 --- a/src/axolotl/prompt_strategies/dpo/chat_template.py +++ b/src/axolotl/prompt_strategies/dpo/chat_template.py @@ -6,9 +6,7 @@ from axolotl.utils.chat_templates import extract_chat_template_args, get_chat_te from axolotl.utils.schemas.utils import handle_legacy_message_fields_logic -def default( - cfg, dataset_idx=0, **kwargs -): # pylint: disable=possibly-unused-variable,unused-argument +def default(cfg, dataset_idx=0, **kwargs): ds_cfg = cfg["datasets"][dataset_idx] ds_cfg = handle_legacy_message_fields_logic(ds_cfg) diff --git 
a/src/axolotl/prompt_strategies/dpo/chatml.py b/src/axolotl/prompt_strategies/dpo/chatml.py
index 34a54aaa0..8614708eb 100644
--- a/src/axolotl/prompt_strategies/dpo/chatml.py
+++ b/src/axolotl/prompt_strategies/dpo/chatml.py
@@ -6,7 +6,7 @@ DPO strategies for chatml
 def default(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     def transform_fn(sample):
         if "prompt" in sample.keys():
             prompt_key = "prompt"
@@ -46,7 +46,7 @@ def default(
 def argilla_chat(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     """
     for argilla/dpo-mix-7k conversations
     """
@@ -65,7 +65,7 @@ def argilla_chat(
 def icr(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     """
     chatml transforms for datasets with system, input, chosen, rejected
     ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
@@ -88,7 +88,7 @@ def icr(
     return transform_fn
 
 
-def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def intel(cfg, **kwargs):
     """
     For Intel Orca DPO Pairs
     """
@@ -110,9 +110,7 @@ def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-arg
     return transform_fn
 
 
-def prompt_pairs(
-    cfg, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
+def prompt_pairs(cfg, **kwargs):
     def transform_fn(sample):
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
@@ -130,7 +128,7 @@ def prompt_pairs(
     return transform_fn
 
 
-def ultra(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def ultra(cfg, **kwargs):
     """
     for ultrafeedback binarized conversations
     """
diff --git a/src/axolotl/prompt_strategies/dpo/llama3.py b/src/axolotl/prompt_strategies/dpo/llama3.py
index eed420017..c13ff55e4 100644
--- a/src/axolotl/prompt_strategies/dpo/llama3.py
+++ b/src/axolotl/prompt_strategies/dpo/llama3.py
@@ -6,9 +6,8 @@ DPO strategies for llama-3 chat template
 def default(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     def transform_fn(sample):
-        # pylint: disable=duplicate-code
         if "prompt" in sample.keys():
             prompt_key = "prompt"
         elif "input" in sample.keys():
@@ -47,7 +46,7 @@ def default(
 def argilla_chat(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     """
     for argilla/dpo-mix-7k conversations
     """
@@ -66,7 +65,7 @@ def argilla_chat(
 def icr(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     """
     chatml transforms for datasets with system, input, chosen, rejected
     ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
@@ -89,7 +88,7 @@ def icr(
     return transform_fn
 
 
-def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def intel(cfg, **kwargs):
     """
     For Intel Orca DPO Pairs
     """
@@ -111,9 +110,7 @@ def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-arg
     return transform_fn
 
 
-def prompt_pairs(
-    cfg, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
+def prompt_pairs(cfg, **kwargs):
     def transform_fn(sample):
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
@@ -131,7 +128,7 @@ def prompt_pairs(
     return transform_fn
 
 
-def ultra(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def ultra(cfg, **kwargs):
     """
     for ultrafeedback binarized conversations
     """
diff --git a/src/axolotl/prompt_strategies/dpo/passthrough.py b/src/axolotl/prompt_strategies/dpo/passthrough.py
index 1fcb838db..52b5ceac1 100644
--- a/src/axolotl/prompt_strategies/dpo/passthrough.py
+++ b/src/axolotl/prompt_strategies/dpo/passthrough.py
@@ -3,12 +3,8 @@ DPO prompt strategies passthrough/zero-processing strategy
 """
 
 
-def default(
-    cfg, dataset_idx=0, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
-    def transform_fn(
-        sample, tokenizer=None
-    ):  # pylint: disable=possibly-unused-variable,unused-argument
+def default(cfg, dataset_idx=0, **kwargs):
+    def transform_fn(sample, tokenizer=None):
         return sample
 
     return transform_fn
diff --git a/src/axolotl/prompt_strategies/dpo/user_defined.py b/src/axolotl/prompt_strategies/dpo/user_defined.py
index cdd9b8c9c..0bcb1d94c 100644
--- a/src/axolotl/prompt_strategies/dpo/user_defined.py
+++ b/src/axolotl/prompt_strategies/dpo/user_defined.py
@@ -3,7 +3,7 @@ User-defined DPO strategies
 """
 
 
-def default(cfg, dataset_idx=0, **kwargs):  # pylint: disable=unused-argument
+def default(cfg, dataset_idx=0, **kwargs):
     ds_cfg = cfg["datasets"][dataset_idx]["type"]
     if not isinstance(ds_cfg, dict):
         raise ValueError(
diff --git a/src/axolotl/prompt_strategies/dpo/zephyr.py b/src/axolotl/prompt_strategies/dpo/zephyr.py
index 9eb895009..781227181 100644
--- a/src/axolotl/prompt_strategies/dpo/zephyr.py
+++ b/src/axolotl/prompt_strategies/dpo/zephyr.py
@@ -3,14 +3,11 @@ DPO strategies for zephyr
 """
 
 
-def nectar(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def nectar(cfg, **kwargs):
     def transform_fn(sample):
         data = {}
         data["prompt"] = (
-            "<|system|>\n\n"
-            "<|user|>\n"
-            f"{sample['prompt']}\n"
-            "<|assistant|>\n"
+            f"<|system|>\n\n<|user|>\n{sample['prompt']}\n<|assistant|>\n"
         )
         answers = sorted(sample["answers"], key=lambda x: x["rank"])
         data["chosen"] = answers[-1]["answer"]
diff --git a/src/axolotl/prompt_strategies/input_output.py b/src/axolotl/prompt_strategies/input_output.py
index 8be745b20..c84eecffc 100644
--- a/src/axolotl/prompt_strategies/input_output.py
+++ b/src/axolotl/prompt_strategies/input_output.py
@@ -16,7 +16,6 @@ class RawInputOutputStrategy(PromptTokenizingStrategy):
         self.eos_token = self.tokenizer.eos_token
 
     def tokenize_prompt(self, prompt):
-        # pylint: disable=duplicate-code
         input_ids = []
         labels = []
         for label, text in self.prompter.build_prompt(prompt["segments"]):
diff --git a/src/axolotl/prompt_strategies/kto/chatml.py b/src/axolotl/prompt_strategies/kto/chatml.py
index 97ae59ed5..945940f3f 100644
--- a/src/axolotl/prompt_strategies/kto/chatml.py
+++ b/src/axolotl/prompt_strategies/kto/chatml.py
@@ -2,13 +2,11 @@
 KTO strategies for chatml
 """
 
-# pylint: disable=duplicate-code
-
 
 def argilla(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     def transform_fn(sample):
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
@@ -28,7 +26,7 @@ def argilla(
 def argilla_chat(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     """
     for argilla/kto-mix-15k conversations
     """
@@ -43,7 +41,7 @@ def argilla_chat(
     return transform_fn
 
 
-def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def intel(cfg, **kwargs):
     """
     For Intel Orca KTO
     ex: argilla/distilabel-intel-orca-kto
@@ -65,9 +63,7 @@ def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-arg
     return transform_fn
 
 
-def prompt_pairs(
-    cfg, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
+def prompt_pairs(cfg, **kwargs):
     def transform_fn(sample):
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
@@ -84,7 +80,7 @@ def prompt_pairs(
     return transform_fn
 
 
-def ultra(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def ultra(cfg, **kwargs):
     """
     for ultrafeedback binarized conversations
     ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto
diff --git a/src/axolotl/prompt_strategies/kto/llama3.py b/src/axolotl/prompt_strategies/kto/llama3.py
index fde3c2ed4..9061f6f5e 100644
--- a/src/axolotl/prompt_strategies/kto/llama3.py
+++ b/src/axolotl/prompt_strategies/kto/llama3.py
@@ -2,13 +2,11 @@
 KTO strategies for llama-3 chat template
 """
 
-# pylint: disable=duplicate-code
-
 
 def argilla(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     def transform_fn(sample):
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
@@ -28,7 +26,7 @@ def argilla(
 def argilla_chat(
     cfg,
     **kwargs,
-):  # pylint: disable=possibly-unused-variable,unused-argument
+):
     """
     for argilla/kto-mix-15k conversations
     """
@@ -43,7 +41,7 @@ def argilla_chat(
     return transform_fn
 
 
-def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def intel(cfg, **kwargs):
     """
     For Intel Orca KTO
     ex: argilla/distilabel-intel-orca-kto
@@ -65,9 +63,7 @@ def intel(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-arg
     return transform_fn
 
 
-def prompt_pairs(
-    cfg, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
+def prompt_pairs(cfg, **kwargs):
     def transform_fn(sample):
         if "system" in sample and sample["system"]:
             sample["prompt"] = (
@@ -84,7 +80,7 @@ def prompt_pairs(
     return transform_fn
 
 
-def ultra(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def ultra(cfg, **kwargs):
     """
     for ultrafeedback binarized conversations
     ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto
diff --git a/src/axolotl/prompt_strategies/kto/user_defined.py b/src/axolotl/prompt_strategies/kto/user_defined.py
index 7c68a3000..e26683cde 100644
--- a/src/axolotl/prompt_strategies/kto/user_defined.py
+++ b/src/axolotl/prompt_strategies/kto/user_defined.py
@@ -2,10 +2,8 @@
 User-defined KTO strategies
 """
 
-# pylint: disable=duplicate-code
-
 
-def default(cfg, dataset_idx=0, **kwargs):  # pylint: disable=unused-argument
+def default(cfg, dataset_idx=0, **kwargs):
     ds_cfg = cfg["datasets"][dataset_idx]["type"]
     if not isinstance(ds_cfg, dict):
         raise ValueError(
diff --git a/src/axolotl/prompt_strategies/llama2_chat.py b/src/axolotl/prompt_strategies/llama2_chat.py
index eef2e1d4d..9eff062ec 100644
--- a/src/axolotl/prompt_strategies/llama2_chat.py
+++ b/src/axolotl/prompt_strategies/llama2_chat.py
@@ -153,7 +153,7 @@ class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy):
         }
 
 
-class Llama2ChatPrompter:  # pylint: disable=too-few-public-methods
+class Llama2ChatPrompter:
     """
     A prompter that generates prompts for Llama2 models.
     """
@@ -190,7 +190,7 @@ class Llama2ChatPrompter:  # pylint: disable=too-few-public-methods
             # Skip the first one if it is not from human
             source = source[1:]
 
-        conv.messages = []  # pylint: disable=R0801
+        conv.messages = []
         for j, sentence in enumerate(source):
             role = roles[sentence["from"]]
             assert role == conv.roles[j % 2], ALTERNATING_ASSERTION_FAILED_ROLE
diff --git a/src/axolotl/prompt_strategies/messages/__init__.py b/src/axolotl/prompt_strategies/messages/__init__.py
index 6eae9dfd8..2c920a568 100644
--- a/src/axolotl/prompt_strategies/messages/__init__.py
+++ b/src/axolotl/prompt_strategies/messages/__init__.py
@@ -11,7 +11,7 @@ LOG = get_logger(__name__)
 def load(tokenizer, cfg, ds_cfg, processor=None):
     try:
         strategy = ds_cfg.get("input_transform", "chat")
-        # pylint: disable=duplicate-code
+
         load_fn = "load"
         if strategy.split(".")[-1].startswith("load_"):
             load_fn = strategy.split(".")[-1]
@@ -29,6 +29,6 @@ def load(tokenizer, cfg, ds_cfg, processor=None):
         return func(tokenizer, cfg, **load_kwargs)
     except ModuleNotFoundError:
         return None
-    except Exception as exc:  # pylint: disable=broad-exception-caught
+    except Exception as exc:
         LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
         raise exc
diff --git a/src/axolotl/prompt_strategies/messages/chat.py b/src/axolotl/prompt_strategies/messages/chat.py
index eaed2396a..854d25e42 100644
--- a/src/axolotl/prompt_strategies/messages/chat.py
+++ b/src/axolotl/prompt_strategies/messages/chat.py
@@ -19,7 +19,7 @@ class ChatMessageDatasetWrappingStrategy(DatasetWrappingStrategy):
         processor,
         message_transform=None,
         formatter=None,
-        **kwargs,  # pylint: disable=unused-argument
+        **kwargs,
     ):
         """
         :param processor: tokenizer or image processor
@@ -35,7 +35,7 @@ class ChatMessageDatasetWrappingStrategy(DatasetWrappingStrategy):
         dataset,
         process_count: Optional[int] = None,
         keep_in_memory: Optional[bool] = False,
-        **kwargs,  # pylint: disable=unused-argument
+        **kwargs,
     ):
         self.dataset = TokenizedChatDataset(
             dataset,
@@ -72,9 +72,10 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
         builder_kwargs["message_field_training"] = message_field_training
 
     chat_template = ds_cfg.get("chat_template", cfg.get("chat_template", "chatml"))
-    format_message = (
-        lambda x: x  # noqa E731 # pylint: disable=unnecessary-lambda-assignment
-    )
+
+    def format_message(x):
+        return x
+
     if chat_template == "chatml":
         from axolotl.core.chat.format.chatml import format_message  # noqa F811
     if chat_template.startswith("llama3"):
diff --git a/src/axolotl/prompt_strategies/metharme.py b/src/axolotl/prompt_strategies/metharme.py
index 66da72389..35f1ef3b3 100644
--- a/src/axolotl/prompt_strategies/metharme.py
+++ b/src/axolotl/prompt_strategies/metharme.py
@@ -10,8 +10,6 @@ LOG = get_logger(__name__)
 
 IGNORE_TOKEN_ID = -100
 
-# pylint: disable=duplicate-code
-
 
 class MetharmePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
     """
@@ -66,7 +64,7 @@ class MetharmePrompter(AlpacaPrompter):
     turn_format = "{instruction}"
     turn_no_input_format = "{instruction}"
 
-    def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
+    def __init__(self, *args, **kwargs):
         pass
 
 
diff --git a/src/axolotl/prompt_strategies/orpo/chat_template.py b/src/axolotl/prompt_strategies/orpo/chat_template.py
index fdee28ea1..b655bc970 100644
--- a/src/axolotl/prompt_strategies/orpo/chat_template.py
+++ b/src/axolotl/prompt_strategies/orpo/chat_template.py
@@ -23,9 +23,7 @@ class MessageList(BaseModel):
     messages: List[Message]
 
 
-def load(
-    tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, **kwargs
-):  # pylint: disable=possibly-unused-variable,unused-argument
+def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, **kwargs):
    """
    chatml transforms for datasets with system, input, chosen, rejected
    """
@@ -219,29 +217,38 @@ class ORPOPrompter(Prompter):
         for message in message_list.messages:
             conversation.append(message.model_dump())
             if message.role == "system":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=False,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), False
+                yield (
+                    self.tokenizer.apply_chat_template(
+                        conversation,
+                        add_generation_prompt=False,
+                        chat_template=self.chat_template,
+                        tokenize=False,
+                    ),
+                    False,
+                )
             if message.role == "user":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=True,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), False
+                yield (
+                    self.tokenizer.apply_chat_template(
+                        conversation,
+                        add_generation_prompt=True,
+                        chat_template=self.chat_template,
+                        tokenize=False,
+                    ),
+                    False,
+                )
             if message.role == "assistant":
-                yield self.tokenizer.apply_chat_template(
-                    conversation,
-                    add_generation_prompt=False,
-                    chat_template=self.chat_template,
-                    tokenize=False,
-                ), True
+                yield (
+                    self.tokenizer.apply_chat_template(
+                        conversation,
+                        add_generation_prompt=False,
+                        chat_template=self.chat_template,
+                        tokenize=False,
+                    ),
+                    True,
+                )
 
 
-def argilla(cfg, **kwargs):  # pylint: disable=possibly-unused-variable,unused-argument
+def argilla(cfg, **kwargs):
     dataset_parser = ORPODatasetParsingStrategy()
 
     def transform_fn(sample, tokenizer=None):
diff --git a/src/axolotl/prompt_strategies/pygmalion.py b/src/axolotl/prompt_strategies/pygmalion.py
index 51f92f397..8c53a5f27 100644
--- a/src/axolotl/prompt_strategies/pygmalion.py
+++ b/src/axolotl/prompt_strategies/pygmalion.py
@@ -69,7 +69,6 @@ class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
                 LOG.warning(f"unknown role in conversation: {role}")
                 res = defaultdict(lambda: [])
 
-            # pylint: disable=duplicate-code
             result, current_len = parse_tokenized_to_result(
                 result,
                 current_len,
@@ -89,7 +88,10 @@ class PygmalionPrompter:
         pass
 
     def build_prompt(
-        self, source, *args, **kwargs  # pylint: disable=unused-argument
+        self,
+        source,
+        *args,
+        **kwargs,
     ) -> Generator[Tuple[str, str], None, None]:
         for msg in source:
             yield msg["role"], msg["value"]
diff --git a/src/axolotl/prompt_strategies/stepwise_supervised.py b/src/axolotl/prompt_strategies/stepwise_supervised.py
index 8be7c35e3..9175126e7 100644
--- a/src/axolotl/prompt_strategies/stepwise_supervised.py
+++ b/src/axolotl/prompt_strategies/stepwise_supervised.py
@@ -66,7 +66,7 @@ class StepwiseSupervisedPromptTokenizingStrategy:
         # Create step-wise labels
         labels = [
             [IGNORE_INDEX] * (len(completion) - 1) + [label]  # type: ignore
-            for completion, label in zip(completions_ids, labels)
+            for completion, label in zip(completions_ids, labels, strict=False)
         ]
 
         # Join all steps
diff --git a/src/axolotl/prompt_strategies/user_defined.py b/src/axolotl/prompt_strategies/user_defined.py
index e20e80c3a..0bff514e7 100644
--- a/src/axolotl/prompt_strategies/user_defined.py
+++ b/src/axolotl/prompt_strategies/user_defined.py
@@ -83,16 +83,12 @@
         cfg.sequence_len,
     )
-    setattr(
-        strat,
-        "parse_instruction_fields",
-        partial(
-            parse_instruction_fields,
-            ds_cfg.field_instruction,
-            ds_cfg.field_input,
-            ds_cfg.field_output,
-            ds_cfg.field_system,
-            system_prompt,
-        ),
+    strat.parse_instruction_fields = partial(  # type: ignore[method-assign]
+        parse_instruction_fields,
+        ds_cfg.field_instruction,
+        ds_cfg.field_input,
+        ds_cfg.field_output,
+        ds_cfg.field_system,
+        system_prompt,
     )
 
     return strat
diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py
index 9ca645de3..2bf9ec763 100644
--- a/src/axolotl/prompt_tokenizers.py
+++ b/src/axolotl/prompt_tokenizers.py
@@ -118,7 +118,7 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
     def tokenize_prompt(self, prompt):
         (
             instruction,
-            input,  # pylint: disable=redefined-builtin
+            input,
             response,
         ) = self.parse_instruction_fields(prompt)
         user_prompt = next(
@@ -144,7 +144,10 @@ class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
         return tokenized_prompt
 
     def _build_full_prompt(
-        self, instruction, input, response  # pylint: disable=redefined-builtin
+        self,
+        instruction,
+        input,
+        response,
     ):
         return next(
             iter(
@@ -257,10 +260,9 @@ class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
         raise NotImplementedError
 
     def tokenize_prompt(self, prompt):
-        # pylint: disable=duplicate-code
         (
             instruction,
-            input,  # pylint: disable=redefined-builtin
+            input,
             output,
             reflection,
             corrected,
@@ -287,9 +289,7 @@ class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
 
         return tokenized_full_prompt
 
-    def _build_full_prompt(
-        self, instruction, input, output, reflection, corrected
-    ):  # pylint: disable=redefined-builtin
+    def _build_full_prompt(self, instruction, input, output, reflection, corrected):
         return next(
             iter(
                 self.prompter.build_prompt(
diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index d29da075e..9543996f7 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -46,7 +46,6 @@ class AlpacaPrompter(Prompter):
         self.match_prompt_style()
 
     def match_prompt_style(self):
-        # pylint: disable=duplicate-code
         if self.prompt_style == PromptStyle.INSTRUCT.value:
             self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
             self.turn_no_input_format = (
@@ -93,7 +92,7 @@ class AlpacaPrompter(Prompter):
     def build_prompt(
         self,
         instruction: str,
-        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        input: Union[None, str] = None,
         output: Union[None, str] = None,
     ) -> Generator[str, None, None]:
         yield self._build_result(instruction, input, output)
@@ -218,7 +217,7 @@ class ReflectAlpacaPrompter(Prompter):
     def _build_result(
         self,
         instruction: str,
-        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        input: Union[None, str] = None,
         output: Union[None, str] = None,
         reflection: Union[None, str] = None,
         corrected: Union[None, str] = None,
@@ -242,12 +241,11 @@ class ReflectAlpacaPrompter(Prompter):
     def build_prompt(
         self,
         instruction: str,
-        input: Union[None, str] = None,  # pylint: disable=redefined-builtin
+        input: Union[None, str] = None,
         output: Union[None, str] = None,
         reflection: Union[None, str] = None,
         corrected: Union[None, str] = None,
     ) -> Generator[str, None, None]:
-        # pylint: disable=duplicate-code
         yield self._build_result(
             instruction,
             input,
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index dd39cc228..e409d4a11 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -230,7 +230,7 @@ def save_trained_model(
     # Post training module hooks
     for name, module in model.named_modules():
         if hasattr(module, "_post_training"):
-            module._post_training(model, name)  # pylint: disable=protected-access
+            module._post_training(model, name)
 
     # handle QAT
     if cfg.qat:
@@ -253,9 +253,7 @@ def save_trained_model(
         # final model weights have already been saved by `ReLoRACallback.on_train_end`
         return
 
-    if (  # pylint: disable=too-many-nested-blocks
-        trainer.is_fsdp_enabled or cfg.fsdp_config
-    ):
+    if trainer.is_fsdp_enabled or cfg.fsdp_config:
         if cfg.fsdp_config or cfg.fsdp:
             if cfg.fsdp_config.final_state_dict_type:
                 state_dict_type = cfg.fsdp_config.final_state_dict_type
@@ -438,7 +436,7 @@ def setup_model_card(cfg: DictDefault):
     badge_markdown = """[Built with Axolotl](https://github.com/axolotl-ai-cloud/axolotl)"""
     transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"
 
-    if getattr(cfg, "axolotl_config_path"):
+    if cfg.axolotl_config_path:
         raw_axolotl_cfg = Path(cfg.axolotl_config_path)
         version = importlib.metadata.version("axolotl")
         if raw_axolotl_cfg.is_file():
@@ -489,7 +487,9 @@ def handle_untrained_tokens_fix(
     )
 
 
-def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
+def setup_model_and_trainer(
+    cfg: DictDefault, dataset_meta: TrainDatasetMeta
+) -> tuple[
     "HFRLTrainerBuilder" | "HFCausalTrainerBuilder",
     PeftModel | PreTrainedModel,
     PreTrainedTokenizer,
diff --git a/src/axolotl/utils/__init__.py b/src/axolotl/utils/__init__.py
index e669413f8..e5050116a 100644
--- a/src/axolotl/utils/__init__.py
+++ b/src/axolotl/utils/__init__.py
@@ -17,7 +17,6 @@ def is_comet_available():
     return importlib.util.find_spec("comet_ml") is not None
 
 
-# pylint: disable=duplicate-code
 def get_pytorch_version() -> tuple[int, int, int]:
     """
     Get Pytorch version as a tuple of (major, minor, patch).
diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py
index d3f3126b5..6c5512223 100644
--- a/src/axolotl/utils/callbacks/__init__.py
+++ b/src/axolotl/utils/callbacks/__init__.py
@@ -56,9 +56,7 @@ IGNORE_INDEX = -100
 LOG = get_logger(__name__)
 
 
-class SaveBetterTransformerModelCallback(
-    TrainerCallback
-):  # pylint: disable=too-few-public-methods
+class SaveBetterTransformerModelCallback(TrainerCallback):
     """Callback to save the BetterTransformer wrapped model"""
 
     def on_step_end(
@@ -103,7 +101,7 @@ class LossWatchDogCallback(TrainerCallback):
 
     def on_step_end(
         self,
-        args: TrainingArguments,  # pylint: disable=unused-argument
+        args: TrainingArguments,
         state: TrainerState,
         control: TrainerControl,
         **_kwargs,
@@ -126,7 +124,7 @@ class SaveModelOnFirstStepCallback(TrainerCallback):
 
     def on_step_end(
         self,
-        args: TrainingArguments,  # pylint: disable=unused-argument
+        args: TrainingArguments,
         state: TrainerState,
         control: TrainerControl,
         **_kwargs,
@@ -239,10 +237,10 @@ def bench_eval_callback_factory(trainer, tokenizer):
         def on_evaluate(
             self,
             args: AxolotlTrainingArguments,
-            state: TrainerState,  # pylint: disable=unused-argument
-            control: TrainerControl,  # pylint: disable=unused-argument
-            metrics: Dict[str, float],  # pylint: disable=unused-argument
-            **kwargs,  # pylint: disable=unused-argument
+            state: TrainerState,
+            control: TrainerControl,
+            metrics: Dict[str, float],
+            **kwargs,
         ):
             data_loader = trainer.get_bench_dataloader(
                 bench_dataset.remove_columns(["input", "subject", "output", "name"])
@@ -272,7 +270,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
             # Extract results by subject.
             bench_name = bench_dataset["name"]
             bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)}
-            for s, p, r in zip(bench_name, preds, refs):  # pylint: disable=invalid-name
+            for s, p, r in zip(bench_name, preds, refs, strict=False):
                 bench_names[s]["preds"].append(p)
                 bench_names[s]["refs"].append(r)
             barrier()
@@ -310,9 +308,7 @@ def bench_eval_callback_factory(trainer, tokenizer):
                 bench_scores = []
                 bench_refs = []
                 bench_preds = []
-                for (
-                    bench_name
-                ) in combined_bench_names:  # pylint: disable=consider-using-dict-items
+                for bench_name in combined_bench_names:
                     bench_score = accuracy.compute(
                         references=combined_bench_names[bench_name]["refs"],
                         predictions=combined_bench_names[bench_name]["preds"],
@@ -361,18 +357,18 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
             else:
                 try:
                     metrics[metric] = evaluate.load(metric)
-                except Exception as exc:  # pylint: disable=broad-exception-caught
+                except Exception as exc:
                     LOG.warning(f"{metric}: {exc.args}")
             return metrics
 
         def on_evaluate(
             self,
-            args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
+            args: AxolotlTrainingArguments,
             state: TrainerState,
             control: TrainerControl,
-            train_dataloader,  # pylint: disable=unused-argument
+            train_dataloader,
             eval_dataloader,
-            **kwargs,  # pylint: disable=unused-argument
+            **kwargs,
         ):
 
             trainer.model_wrapped.eval()
@@ -380,7 +376,6 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
                 self.cfg.device
             )  # Use this instead of trainer.model_wrapped.device as it may return cpu if fsdp offloaded
 
-            # pylint: disable=duplicate-code
             generation_config = GenerationConfig(
                 max_new_tokens=self.cfg.eval_max_new_tokens,
                 bos_token_id=tokenizer.bos_token_id,
@@ -411,9 +406,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
                     try:
                         # Only pass the kwargs that are in the metric's feature list
                         metric_kwargs = {
-                            k: kwargs[k]
-                            for k in metric._feature_names()  # pylint: disable=protected-access
-                            if k in kwargs
+                            k: kwargs[k] for k in metric._feature_names() if k in kwargs
                         }
 
                         if isinstance(metric, Perplexity):
@@ -425,7 +418,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
                             if "score" in metric_score
                             else metric_score["mean_score"]
                         )
-                    except Exception:  # pylint: disable=broad-exception-caught
+                    except Exception:
                         traceback.print_exc()
                         LOG.debug(
                             f"Failed to compute metric {metric.name} with kwargs {kwargs.keys()}"
@@ -473,6 +466,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
                 batch_input_ids,
                 batch_labels,
                 batch_pos_ids,
+                strict=False,
             ):
                 if pos_ids is None:
                     pos_ranges = [(0, len(input_ids_all) - 1)]
@@ -523,7 +517,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
             prediction_all_tokens = predictions["sequences"].cpu().tolist()
             prediction_without_prompt_tokens_list = []
             for prompt_token_ids, prediction_tokens in zip(
-                prompt_token_ids_list, prediction_all_tokens
+                prompt_token_ids_list, prediction_all_tokens, strict=False
             ):
                 prediction_without_prompt_tokens = prediction_tokens[
                     len(prompt_token_ids) :
@@ -561,12 +555,12 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
 
         def on_evaluate(
             self,
-            args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
+            args: AxolotlTrainingArguments,
             state: TrainerState,
             control: TrainerControl,
-            train_dataloader,  # pylint: disable=unused-argument
+            train_dataloader,
             eval_dataloader,
-            **kwargs,  # pylint: disable=unused-argument
+            **kwargs,
         ):
             eval_table_size = self.cfg.eval_table_size
 
@@ -576,7 +570,6 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
             trainer.model.eval()
             device = torch.device(self.cfg.device)
 
-            # pylint: disable=duplicate-code
             generation_config = GenerationConfig(
                 max_new_tokens=self.cfg.eval_max_new_tokens,
                 bos_token_id=tokenizer.bos_token_id,
@@ -644,6 +637,7 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
                     batch_labels,
                     batch_pos_ids,
                     batch_logits,
+                    strict=False,
                 ):
                     if pos_ids is None:
                         pos_ranges = [(0, len(input_ids_all) - 1)]
@@ -697,7 +691,7 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
                 prediction_all_tokens = predictions["sequences"].cpu().tolist()
                 prediction_without_prompt_tokens_list = []
                 for prompt_token_ids, prediction_tokens in zip(
-                    prompt_token_ids_list, prediction_all_tokens
+                    prompt_token_ids_list, prediction_all_tokens, strict=False
                 ):
                     prediction_without_prompt_tokens = prediction_tokens[
                         len(prompt_token_ids) :
@@ -716,7 +710,11 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
                     prediction_text,
                     pred_step_text,
                 ) in zip(
-                    prompt_texts, completion_texts, predicted_texts, pred_step_texts
+                    prompt_texts,
+                    completion_texts,
+                    predicted_texts,
+                    pred_step_texts,
+                    strict=False,
                 ):
                     table_data["id"].append(row_index)
                     table_data["Prompt"].append(prompt_text)
@@ -774,10 +772,10 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
 
     def on_train_begin(
         self,
-        args: AxolotlTrainingArguments,  # pylint: disable=unused-argument
-        state: TrainerState,  # pylint: disable=unused-argument
+        args: AxolotlTrainingArguments,
+        state: TrainerState,
         control: TrainerControl,
-        **kwargs,  # pylint: disable=unused-argument
+        **kwargs,
     ):
         if state.is_world_process_zero:
             try:
@@ -845,19 +843,30 @@ class GCCallback(TrainerCallback):
         gc.collect()
 
     def on_train_begin(
-        self, args, state, control, **kwargs  # pylint: disable=unused-argument
+        self,
+        args,
+        state,
+        control,
+        **kwargs,
     ):
         self._gc()
 
     def on_step_begin(
-        self, args, state, control, **kwargs  # pylint: disable=unused-argument
+        self,
+        args,
+        state,
+        control,
+        **kwargs,
     ):
-        # pylint: disable=consider-using-in
         if self.next_gc_on_begin_step == state.global_step or state.global_step == 0:
             self._gc()
 
     def on_step_end(
-        self, args, state, control, **kwargs  # pylint: disable=unused-argument
+        self,
+        args,
+        state,
+        control,
+        **kwargs,
     ):
         if control.should_evaluate:
             # automatically GC before evals so the eval memory spike from the CEL doesn't OOM the trainer
@@ -879,7 +888,11 @@ class GCCallback(TrainerCallback):
         self._gc()
 
     def on_epoch_end(
-        self, args, state, control, **kwargs  # pylint: disable=unused-argument
+        self,
+        args,
+        state,
+        control,
+        **kwargs,
     ):
         self._gc()
 
@@ -892,16 +905,12 @@ def colab_inference_post_train_callback(trainer: Trainer):
             self.gpu_name = torch.cuda.get_device_name(0)
             self.cfg = cfg
 
-        def on_train_end(
-            self, args, state, control, **kwargs
-        ):  # pylint: disable=unused-argument
+        def on_train_end(self, args, state, control, **kwargs):
            """
            handle T4 gpu, we need to convert attention to eager for inference
            """
            if "Tesla T4" in self.gpu_name and self.cfg.xformers_attention:
-                trainer.model.config._attn_implementation = (  # pylint: disable=protected-access
-                    "eager"
-                )
+                trainer.model.config._attn_implementation = "eager"
                 trainer.model.gradient_checkpointing_disable()
                 trainer.model.config.use_cache = True
                 trainer.model.eval()
diff --git a/src/axolotl/utils/callbacks/comet_.py b/src/axolotl/utils/callbacks/comet_.py
index 7dce95145..cd3bcf70e 100644
--- a/src/axolotl/utils/callbacks/comet_.py
+++ b/src/axolotl/utils/callbacks/comet_.py
@@ -22,10 +22,10 @@ class SaveAxolotlConfigtoCometCallback(TrainerCallback):
 
     def on_train_begin(
         self,
-        args: "AxolotlTrainingArguments",  # pylint: disable=unused-argument
-        state: TrainerState,  # pylint: disable=unused-argument
+        args: "AxolotlTrainingArguments",
+        state: TrainerState,
         control: TrainerControl,
-        **kwargs,  # pylint: disable=unused-argument
+        **kwargs,
     ):
         if is_main_process():
             try:
diff --git a/src/axolotl/utils/callbacks/lisa.py b/src/axolotl/utils/callbacks/lisa.py
index 348cdf2da..03f189d80 100644
--- a/src/axolotl/utils/callbacks/lisa.py
+++ b/src/axolotl/utils/callbacks/lisa.py
@@ -55,9 +55,7 @@ def lisa_callback_factory(trainer: "AxolotlTrainer"):
                 for param in layer.parameters():
                     param.requires_grad = False
 
-        def on_step_begin(
-            self, args, state, control, **kwargs
-        ):  # pylint: disable=unused-argument
+        def on_step_begin(self, args, state, control, **kwargs):
             # Check if it's time to switch active layers, including at step 0
             if state.global_step % self.step_interval == 0 or state.global_step == 1:
                 self.switch_active_layers()
diff --git a/src/axolotl/utils/callbacks/mlflow_.py b/src/axolotl/utils/callbacks/mlflow_.py
index ac72f5e6d..30120a87d 100644
--- a/src/axolotl/utils/callbacks/mlflow_.py
+++ b/src/axolotl/utils/callbacks/mlflow_.py
@@ -23,7 +23,6 @@ def should_log_artifacts() -> bool:
 
 
 class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
-    # pylint: disable=duplicate-code
     """Callback to save axolotl config to mlflow"""
 
     def __init__(self, axolotl_config_path):
@@ -31,10 +30,10 @@ class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
 
     def on_train_begin(
         self,
-        args: "AxolotlTrainingArguments",  # pylint: disable=unused-argument
-        state: TrainerState,  # pylint: disable=unused-argument
+        args: "AxolotlTrainingArguments",
+        state: TrainerState,
         control: TrainerControl,
-        **kwargs,  # pylint: disable=unused-argument
+        **kwargs,
     ):
         if is_main_process():
             try:
diff --git a/src/axolotl/utils/callbacks/profiler.py b/src/axolotl/utils/callbacks/profiler.py
index d26b7f9dd..2cf5e0f4f 100644
--- a/src/axolotl/utils/callbacks/profiler.py
+++ b/src/axolotl/utils/callbacks/profiler.py
@@ -26,58 +26,50 @@ class PytorchProfilerCallback(TrainerCallback):
         if profiler_steps_start == 0:
             # start recording memory allocations before everything is allocated, because if we start
             # at the beginning of step 0, we won't have any memory allocations in the traces
-            torch.cuda.memory._record_memory_history(  # pylint: disable=protected-access
-                enabled="all"
-            )
+            torch.cuda.memory._record_memory_history(enabled="all")
             profiler_steps_start = -1
         self.profiler_steps_start = profiler_steps_start
 
-    def on_step_begin(  # pylint: disable=unused-argument
+    def on_step_begin(
         self,
-        args: TrainingArguments,  # pylint: disable=unused-argument
+        args: TrainingArguments,
         state: TrainerState,
-        control: TrainerControl,  # pylint: disable=unused-argument
-        **kwargs,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,
     ):
         if state.global_step == self.profiler_steps_start:
-            torch.cuda.memory._record_memory_history(  # pylint: disable=protected-access
-                enabled="all"
-            )
+            torch.cuda.memory._record_memory_history(enabled="all")
 
-    def on_step_end(  # pylint: disable=unused-argument
+    def on_step_end(
         self,
-        args: TrainingArguments,  # pylint: disable=unused-argument
+        args: TrainingArguments,
         state: TrainerState,
-        control: TrainerControl,  # pylint: disable=unused-argument
-        **kwargs,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,
     ):
         if state.global_step == self.profiler_steps_end:
-            snapshot = torch.cuda.memory._snapshot()  # pylint: disable=protected-access
+            snapshot = torch.cuda.memory._snapshot()
             with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout:
                 dump(snapshot, fout)
 
             # tell CUDA to stop recording memory allocations now
-            torch.cuda.memory._record_memory_history(  # pylint: disable=protected-access
-                enabled=None
-            )
+            torch.cuda.memory._record_memory_history(enabled=None)
 
-    def on_train_end(  # pylint: disable=unused-argument
+    def on_train_end(
         self,
-        args: TrainingArguments,  # pylint: disable=unused-argument
+        args: TrainingArguments,
         state: TrainerState,
-        control: TrainerControl,  # pylint: disable=unused-argument
-        **kwargs,  # pylint: disable=unused-argument
+        control: TrainerControl,
+        **kwargs,
     ):
         # make sure to record if we happen to have more steps than steps to profile
         if (
             state.global_step >= self.profiler_steps_start
             and state.global_step < self.profiler_steps_end
         ):
-            snapshot = torch.cuda.memory._snapshot()  # pylint: disable=protected-access
+            snapshot = torch.cuda.memory._snapshot()
             with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout:
                 dump(snapshot, fout)
 
             # tell CUDA to stop recording memory allocations now
-            torch.cuda.memory._record_memory_history(  # pylint: disable=protected-access
-                enabled=None
-            )
+            torch.cuda.memory._record_memory_history(enabled=None)
diff --git a/src/axolotl/utils/callbacks/qat.py b/src/axolotl/utils/callbacks/qat.py
index cf4d9a937..70746d6be 100644
--- a/src/axolotl/utils/callbacks/qat.py
+++ b/src/axolotl/utils/callbacks/qat.py
@@ -38,9 +38,7 @@ class QATCallback(TrainerCallback):
     def __init__(self, cfg: QATConfig):
         self.cfg = cfg
 
-    def on_step_begin(
-        self, args, state, control, model, **kwargs
-    ):  # pylint: disable=unused-argument
+    def on_step_begin(self, args, state, control, model, **kwargs):
         if self.cfg.fake_quant_after_n_steps is not None:
             if state.global_step == 0:
                 LOG.info(f"Disabling fake quantization at step {state.global_step}")
diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py
index c9613c39b..534d7c4a4 100644
--- a/src/axolotl/utils/config/__init__.py
+++ b/src/axolotl/utils/config/__init__.py
@@ -37,7 +37,7 @@ def choose_device(cfg):
                 return f"npu:{cfg.local_rank}"
             raise SystemError("No CUDA/mps/npu device found")
-        except Exception:  # pylint: disable=broad-exception-caught
+        except Exception:
             return "cpu"
 
     cfg.device = get_device()
 
@@ -266,8 +266,8 @@ def validate_config(
 
     if cfg.plugins:
         (
-            AxolotlConfigWCapabilities,  # pylint: disable=invalid-name
-            AxolotlInputConfig,  # pylint: disable=invalid-name
+            AxolotlConfigWCapabilities,
+            AxolotlInputConfig,
         ) = merge_input_args()
 
     # Convert datasets to proper format if needed
diff --git a/src/axolotl/utils/ctx_managers/__init__.py b/src/axolotl/utils/ctx_managers/__init__.py
index e544621b5..6ffda9e55 100644
--- a/src/axolotl/utils/ctx_managers/__init__.py
+++ b/src/axolotl/utils/ctx_managers/__init__.py
@@ -1,6 +1,5 @@
 """Init for context manager submodule"""
 
-# pylint: disable=unused-import
 # flake8: noqa
 
 from .sequence_parallel import SequenceParallelContextManager
diff --git a/src/axolotl/utils/ctx_managers/sequence_parallel.py b/src/axolotl/utils/ctx_managers/sequence_parallel.py
index 029d991dd..1ec91ae2a 100644
--- a/src/axolotl/utils/ctx_managers/sequence_parallel.py
+++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py
@@ -26,7 +26,7 @@ def apply_sequence_parallelism(
     local_rank: int,
     local_world_size: int,
     gradient_accumulation_steps: int,
-    ring_attn_func: RingAttnFunc,  # pylint: disable=unused-argument
+    ring_attn_func: RingAttnFunc,
 ) -> tuple[dict[str, torch.Tensor], int, int]:
     """
     Apply sequence parallelism slicing to a batch.
diff --git a/src/axolotl/utils/data/pretraining.py b/src/axolotl/utils/data/pretraining.py
index f3422f990..72c5536e9 100644
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -67,7 +67,7 @@ def encode_pretraining(
     buffer_labels = torch.tensor([], dtype=torch.long)
     buffer_attention_mask = torch.tensor([], dtype=torch.long)
 
-    for ids, labels, mask in zip(input_ids, targets, attention_mask):
+    for ids, labels, mask in zip(input_ids, targets, attention_mask, strict=False):
         if buffer_input_ids.numel() == max_tokens:
             new_input_ids.append(buffer_input_ids)
             new_labels.append(buffer_labels)
@@ -247,7 +247,6 @@ def encode_packed_pretraining(
     batch_size: int = 4,
     multipack_attn: Optional[bool] = True,
 ) -> Dict[str, List]:
-    # pylint: disable=duplicate-code
     # tokenize all the examples
     # rows get split with stride (overlap)
     train_dataset = ds_wrapper(dataset=Dataset.from_dict(examples))[0]
diff --git a/src/axolotl/utils/data/rl.py b/src/axolotl/utils/data/rl.py
index 6fd539758..d371c9acb 100644
--- a/src/axolotl/utils/data/rl.py
+++ b/src/axolotl/utils/data/rl.py
@@ -255,7 +255,6 @@ def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
     return dataset
 
 
-# pylint: disable=duplicate-code
 def _load_or_create_dataset_split(
     cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"]
 ) -> Dataset:
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index 21c8e472b..1d7d37f15 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -337,7 +337,7 @@ def generate_split_fingerprints(
     dataset: Dataset, val_set_size: int | float, seed: int
 ) -> tuple[str, str]:
     """Generate consistent fingerprints for train/test splits."""
-    fingerprint = dataset._fingerprint  # pylint: disable=protected-access
+    fingerprint = dataset._fingerprint
 
     train_hash_input = f"{fingerprint}|{val_set_size}|train|{seed}"
     test_hash_input = f"{fingerprint}|{val_set_size}|test|{seed}"
@@ -497,7 +497,7 @@ def try_load_from_hub(
             token=cfg.hf_use_auth_token,
         )
         return dataset[split]
-    except Exception:  # pylint: disable=broad-except  # nosec
+    except Exception:
         LOG.info("Unable to find prepared dataset in HuggingFace Hub")
         return None
 
diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py
index 856a609c7..4868576a0 100644
--- a/src/axolotl/utils/data/utils.py
+++ b/src/axolotl/utils/data/utils.py
@@ -44,7 +44,7 @@ def retry_on_request_exceptions(
 
     def decorator(func):
         @functools.wraps(func)
-        def wrapper(*args, **kwargs):  # pylint: disable=inconsistent-return-statements
+        def wrapper(*args, **kwargs):
             for attempt in range(max_retries):
                 try:
                     return func(*args, **kwargs)
diff --git a/src/axolotl/utils/data/wrappers.py b/src/axolotl/utils/data/wrappers.py
index b6dc42c71..cb9e2c6b4 100644
--- a/src/axolotl/utils/data/wrappers.py
+++ b/src/axolotl/utils/data/wrappers.py
@@ -54,7 +54,6 @@ def handle_unknown_dataset_strategy(dataset_config: DictDefault) -> NoReturn:
     raise ValueError(error_message)
 
 
-# pylint: disable=too-many-return-statements
 def get_dataset_wrapper(
     dataset_config: DictDefault,
     tokenizer: PreTrainedTokenizer,
@@ -62,7 +61,7 @@ def get_dataset_wrapper(
     dataset_base_type: str | None,
     dataset: Dataset | IterableDataset,
     dataset_prompt_style: str | None = None,
-    processor: ProcessorMixin | None = None,  # pylint: disable=unused-argument
+    processor: ProcessorMixin | None = None,
 ) -> tuple[Dataset | IterableDataset, Prompter | None]:
     """Create an appropriate dataset wrapper and prompter based on dataset configuration.
diff --git a/src/axolotl/utils/dict.py b/src/axolotl/utils/dict.py
index c2670dfeb..7d146c7a9 100644
--- a/src/axolotl/utils/dict.py
+++ b/src/axolotl/utils/dict.py
@@ -17,15 +17,15 @@ class DictDefault(Dict):
 
     def __setitem__(self, name, value):
         # workaround for pickle/unpickle issues and __frozen not being available
         try:
-            isFrozen = hasattr(  # pylint: disable=invalid-name
-                self, "__frozen"
-            ) and object.__getattribute__(self, "__frozen")
+            isFrozen = hasattr(self, "__frozen") and object.__getattribute__(
                 self, "__frozen"
+            )
         except AttributeError:
-            isFrozen = False  # pylint: disable=invalid-name
+            isFrozen = False
         if isFrozen and name not in super().keys():
             raise KeyError(name)
-        super(Dict, self).__setitem__(name, value)  # pylint: disable=bad-super-call
+        super(Dict, self).__setitem__(name, value)
         try:
             p = object.__getattribute__(self, "__parent")
             key = object.__getattribute__(self, "__key")
diff --git a/src/axolotl/utils/distributed.py b/src/axolotl/utils/distributed.py
index 48771fd97..840772d91 100644
--- a/src/axolotl/utils/distributed.py
+++ b/src/axolotl/utils/distributed.py
@@ -15,7 +15,7 @@ from transformers.utils.import_utils import (
     is_torch_npu_available,
 )
 
-distributed_state = None  # pylint: disable=invalid-name
+distributed_state = None
 
 
 def get_device_type() -> torch.device:
@@ -48,7 +48,7 @@ def get_current_device() -> int:
 
 
 def init_distributed_state():
-    global distributed_state  # pylint: disable=global-statement
+    global distributed_state
     if distributed_state is None:
         timeout = int(os.environ.get("AXOLOTL_NCCL_TIMEOUT", 1800))
         try:
@@ -137,7 +137,7 @@ def zero_first(is_main: bool):
         barrier()
 
 
-def gather_scalar_from_all_ranks(fn, world_size=1):  # pylint: disable=invalid-name
+def gather_scalar_from_all_ranks(fn, world_size=1):
     """
     Run a callable 'fn' on all ranks and gather the results on the specified rank.
 
@@ -201,7 +201,7 @@ def broadcast_dict(vals: dict):
     return vals
 
 
-def compute_and_broadcast(fn):  # pylint: disable=invalid-name
+def compute_and_broadcast(fn):
     """
     Compute a value using the function 'fn' only on the specified rank (default is 0).
     The value is then broadcasted to all other ranks.
@@ -234,7 +234,7 @@ def compute_and_broadcast(fn):
     return float(value_tensor.item())
 
 
-def gather_from_all_ranks(fn, world_size=1):  # pylint: disable=invalid-name
+def gather_from_all_ranks(fn, world_size=1):
     """
     Run a callable 'fn' on all ranks and gather the results on the specified rank.
 
diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py
index 3c83c87cb..751f7e253 100644
--- a/src/axolotl/utils/environment.py
+++ b/src/axolotl/utils/environment.py
@@ -26,7 +26,7 @@ def check_cuda_p2p_ib_support():
             for unsupported_device in unsupported_devices
         ):
             return False
-    except Exception:  # pylint: disable=broad-except  # nosec
+    except Exception:  # nosec B110
         pass
     return True
 
diff --git a/src/axolotl/utils/lora.py b/src/axolotl/utils/lora.py
index 759c17ac2..6ae481b6b 100644
--- a/src/axolotl/utils/lora.py
+++ b/src/axolotl/utils/lora.py
@@ -15,6 +15,7 @@
 """
 module to get the state dict of a merged lora model
 """
+
 import torch
 from peft.tuners.tuners_utils import onload_layer
 from peft.utils import ModulesToSaveWrapper, _get_submodules
diff --git a/src/axolotl/utils/mistral/mistral_tokenizer.py b/src/axolotl/utils/mistral/mistral_tokenizer.py
index 61cbdc5b0..0414ece78 100644
--- a/src/axolotl/utils/mistral/mistral_tokenizer.py
+++ b/src/axolotl/utils/mistral/mistral_tokenizer.py
@@ -53,7 +53,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
         """
         # Check if MistralRequestValidator has a _mode attribute.
         # This is a private API and may change in the future.
-        # pylint: disable=protected-access
+
         from mistral_common.protocol.instruct.validator import MistralRequestValidator
 
         if not (
@@ -74,7 +74,7 @@ class HFMistralTokenizer(MistralCommonTokenizer):
     def apply_chat_template(  # type: ignore
         self,
         conversation: list[dict] | list[list[dict]],
-        chat_template: str | None = None,  # pylint: disable=unused-argument
+        chat_template: str | None = None,
         add_generation_prompt: bool = False,
         **kwargs,
     ) -> str | list[int]:
diff --git a/src/axolotl/utils/model_shard_quant.py b/src/axolotl/utils/model_shard_quant.py
index 5c5006eda..f20a9625e 100644
--- a/src/axolotl/utils/model_shard_quant.py
+++ b/src/axolotl/utils/model_shard_quant.py
@@ -46,13 +46,11 @@ def _replace_linear(
 
         if isinstance(module, torch.nn.Linear) and name not in skip_modules:
             if issubclass(linear_replacement, Linear4bit):
-                model._modules[name] = (  # pylint: disable=protected-access
-                    linear_replacement(
-                        module.in_features,
-                        module.out_features,
-                        module.bias is not None,
-                        **kwargs,
-                    )
+                model._modules[name] = linear_replacement(
+                    module.in_features,
+                    module.out_features,
+                    module.bias is not None,
+                    **kwargs,
                 )
             else:
                 raise ValueError(
@@ -151,7 +149,7 @@ def load_sharded_model(
         model_name,
         use_cache=False,
         torch_dtype=torch.float32,
-        _attn_implementation=model_config._attn_implementation,  # pylint: disable=protected-access
+        _attn_implementation=model_config._attn_implementation,
         trust_remote_code=cfg.trust_remote_code,
     )
     dtype = torch_dtype if not cfg.float32 else None
diff --git a/src/axolotl/utils/optimizers/adopt.py b/src/axolotl/utils/optimizers/adopt.py
index 6f064abbf..20ddfa7ec 100644
--- a/src/axolotl/utils/optimizers/adopt.py
+++ b/src/axolotl/utils/optimizers/adopt.py
@@ -6,7 +6,6 @@ Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeo
 """
 
 # mypy: ignore-errors
-# pylint: skip-file
 # flake8: noqa
 # mypy: allow-untyped-decorators
 # mypy: allow-untyped-defs
@@ -288,7 +287,9 @@ def _single_tensor_adopt(
 
             assert (
                 param.device.type == step_t.device.type
                 and param.device.type in capturable_supported_devices
-            ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+            ), (
+                f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+            )
 
         step = step_t if capturable or differentiable else _get_value(step_t)
 
@@ -365,7 +366,9 @@ def _multi_tensor_adopt(
             p.device.type == step.device.type
             and p.device.type in capturable_supported_devices
             for p, step in zip(params, state_steps)
-        ), f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+        ), (
+            f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
+        )
 
         assert grad_scale is None and found_inf is None
 
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index af62c0a4f..d07988613 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -268,7 +268,7 @@ class MultipackBatchSampler(BatchSampler):
         num_processes: int | None = None,  # Number of processes for parallel packing
         safe_mode: bool = True,  # Conservative packing to prevent training instability
         mp_start_method: str = "fork",
-        **kwargs,  # pylint: disable=unused-argument
+        **kwargs,
     ):
         super().__init__(sampler, batch_size, drop_last)
         self.batch_size = batch_size
@@ -317,9 +317,7 @@ class MultipackBatchSampler(BatchSampler):
             return self._batches
 
         # Get indices from the sampler
-        indices = [  # pylint: disable=unnecessary-comprehension
-            idx for idx in self.sampler
-        ]
+        indices = [idx for idx in self.sampler]
 
         # Get lengths of the selected sequences
         lengths = self.lengths[indices]
@@ -417,7 +415,7 @@ class MultipackBatchSampler(BatchSampler):
 
         # Gather efficiency from all ranks and apply the calculation function
         sample_packing_actual_eff_all = reduce_and_broadcast(
-            lambda: float(self.efficiency()),  # pylint: disable=unnecessary-lambda
+            lambda: float(self.efficiency()),
             calc_sample_packing_eff_est,
         )
 
diff --git a/src/axolotl/utils/schedulers.py b/src/axolotl/utils/schedulers.py
index cdaf92271..83a993089 100644
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -107,9 +107,7 @@ class InterpolatingLogScheduler(LRScheduler):
         self.num_steps = num_steps
         self.min_lr = min_lr
         self.max_lr = max_lr
-        self.q = (max_lr / min_lr) ** (  # pylint: disable=invalid-name
-            1 / (num_steps - 1)
-        )
+        self.q = (max_lr / min_lr) ** (1 / (num_steps - 1))
         super().__init__(optimizer, last_epoch)
 
     def get_lr(self):
@@ -310,7 +308,6 @@ class JaggedLRRestartScheduler(LRScheduler):
         jagged_restart_anneal_steps: int = 1,
         min_lr_scale: float = 0.001,
     ) -> None:
-        # pylint: disable=duplicate-code
         self.inner_schedule = inner_schedule
         self.restarts_steps = jagged_restart_steps
         self.warmup_steps = jagged_restart_warmup_steps
diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py
index a607b3dca..4d660d4b7 100644
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -1,7 +1,5 @@
 """Module with Pydantic models for configuration."""
 
-# pylint: disable=too-many-lines
-
 from typing import Annotated, Any, Literal
 
 from annotated_types import MinLen
@@ -51,7 +49,6 @@ from axolotl.utils.schemas.vllm import VllmConfig
 LOG = get_logger(__name__)
 
 
-# pylint: disable=too-many-ancestors
 class AxolotlInputConfig(
     ModelInputConfig,
     ModelOutputConfig,
@@ -124,10 +121,10 @@ class AxolotlInputConfig(
         },
     )
     trl: TRLConfig | None = Field(
-        default_factory=lambda: TRLConfig(),  # pylint: disable=unnecessary-lambda
+        default_factory=lambda: TRLConfig(),
     )
     vllm: VllmConfig | None = Field(
-        default_factory=lambda: VllmConfig(),  # pylint: disable=unnecessary-lambda
+        default_factory=lambda: VllmConfig(),
     )
     qat: QATConfig | None = None
     quantization: PTQConfig | None = None
@@ -1035,7 +1032,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
 
         return data
 
-    # pylint: disable=duplicate-code
     @model_validator(mode="before")
     @classmethod
     def check_multigpu_unsloth(cls, data):
@@ -1051,7 +1047,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
             )
         return data
 
-    # pylint: disable=duplicate-code
    @model_validator(mode="before")
    @classmethod
    def check_multigpu_lora_kernels(cls, data):
diff --git a/src/axolotl/utils/schemas/datasets.py b/src/axolotl/utils/schemas/datasets.py
index d9c8042d4..e32468706 100644
--- a/src/axolotl/utils/schemas/datasets.py
+++ b/src/axolotl/utils/schemas/datasets.py
@@ -203,7 +203,6 @@ class SFTDataset(BaseModel):
 
     @model_validator(mode="before")
     @classmethod
-    # pylint: disable=duplicate-code
     def check_chat_template_config(cls, data):
         if isinstance(data, BaseModel):
             data = data.model_dump()
diff --git a/src/axolotl/utils/schemas/enums.py b/src/axolotl/utils/schemas/enums.py
index cf2a8b484..8f4718aa9 100644
--- a/src/axolotl/utils/schemas/enums.py
+++ b/src/axolotl/utils/schemas/enums.py
@@ -1,7 +1,5 @@
 """Enums for Axolotl input config"""
 
-# pylint: disable=invalid-name
-
 from enum import Enum
 
 import torch
diff --git a/src/axolotl/utils/schemas/training.py b/src/axolotl/utils/schemas/training.py
index b1788dcaa..8e06e82cb 100644
--- a/src/axolotl/utils/schemas/training.py
+++ b/src/axolotl/utils/schemas/training.py
@@ -96,9 +96,9 @@ class HyperparametersConfig(BaseModel):
             "description": "Path to torch distx for optim 'adamw_anyprecision'"
         },
     )
-    lr_scheduler: (SchedulerType | Literal["one_cycle"] | Literal["rex"]) | None = (
-        SchedulerType.COSINE
-    )
+    lr_scheduler: (
+        SchedulerType | Literal["one_cycle"] | Literal["rex"]
+    ) | None = SchedulerType.COSINE
     lr_scheduler_kwargs: dict[str, Any] | None = Field(
         default=None,
         json_schema_extra={
diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py
index 217244b01..791894990 100644
--- a/src/axolotl/utils/schemas/validation.py
+++ b/src/axolotl/utils/schemas/validation.py
@@ -1,7 +1,5 @@
 """Module with validation methods for config pydantic model."""
 
-# pylint: disable=too-many-boolean-expressions
-
 import json
 import sys
 import tempfile
@@ -16,7 +14,6 @@ from transformers.utils.import_utils import is_torch_npu_available
 from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
 
-# pylint: disable=too-many-lines
 LOG = get_logger(__name__)
 
 
@@ -346,7 +343,6 @@ class TrainingValidationMixin:
     @model_validator(mode="after")
     def check_fft_possible_bad_config(self):
         if (
-            # pylint: disable=too-many-boolean-expressions
             not (self.bf16 or self.bfloat16)
             and (self.fp16 or self.float16)
             and not self.adapter
@@ -460,12 +456,12 @@ class TrainingValidationMixin:
 
     @classmethod
     def check_mistral_common_import(cls, tokenizer_use_mistral_common):
         if tokenizer_use_mistral_common:
-            try:
-                import mistral_common  # noqa: F401 # pylint:disable=unused-import
-            except ImportError as exception:
+            import importlib.util
+
+            if importlib.util.find_spec("mistral_common") is None:
                 raise ImportError(
                     "mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`."
-                ) from exception
+                )
         return tokenizer_use_mistral_common
 
@@ -685,7 +681,7 @@ class RLValidationMixin:
         # TODO: SalmanMohammadi
         # Distributed RL with QLoRA + gradient checkpointing
         # and use_reentrant = True is broken upstream in TRL
-        # pylint: disable=too-many-boolean-expressions
+
         if (
             data.get("rl")
             and data.get("gradient_checkpointing")
@@ -1252,26 +1248,19 @@ class ComplexValidationMixin:
                 import transformers.modeling_flash_attention_utils
                 from transformers.utils import is_flash_attn_greater_or_equal
 
-                # pylint: disable=protected-access
                 transformers.modeling_flash_attention_utils._flash_supports_window = (
                     True
                 )
-                setattr(
-                    sys.modules["transformers.modeling_flash_attention_utils"],
-                    "_flash_supports_window",
-                    True,
-                )
-                setattr(
-                    sys.modules["transformers.modeling_flash_attention_utils"],
-                    "_flash_supports_window_size",
-                    True,
-                )
-                setattr(
-                    sys.modules["transformers.modeling_flash_attention_utils"],
-                    "is_flash_attn_greater_or_equal",
-                    is_flash_attn_greater_or_equal,
-                )
-                import ring_flash_attn  # noqa: F401 # pylint:disable=unused-import
+                sys.modules[
+                    "transformers.modeling_flash_attention_utils"
+                ]._flash_supports_window = True
+                sys.modules[
+                    "transformers.modeling_flash_attention_utils"
+                ]._flash_supports_window_size = True
+                sys.modules[
+                    "transformers.modeling_flash_attention_utils"
+                ].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
+                import ring_flash_attn  # noqa: F401  # Required after monkey-patching
             except ImportError as exception:
                 raise ImportError(
                     "context_parallel_size > 1 but ring_flash_attn is not installed. "
@@ -1336,7 +1325,6 @@ class GRPOVllmValidationMixin:
         return self
 
 
-# pylint: disable=too-many-ancestors
 class ValidationMixin(
     DatasetValidationMixin,
     AttentionValidationMixin,
diff --git a/src/axolotl/utils/tokenization.py b/src/axolotl/utils/tokenization.py
index 3526bd5b5..3f44a3429 100644
--- a/src/axolotl/utils/tokenization.py
+++ b/src/axolotl/utils/tokenization.py
@@ -31,7 +31,7 @@ def check_example_labels(example, tokenizer, text_only=False):
     # You can compare the input_ids and labels element-wise
     # Remember to ignore positions with IGNORE_TOKEN_ID (if you use it) or attention_mask equal to 0
     colored_tokens = []
-    for _, (input_id, label_id) in enumerate(zip(input_ids, labels)):
+    for _, (input_id, label_id) in enumerate(zip(input_ids, labels, strict=False)):
         decoded_input_token = tokenizer.decode(input_id)
         # Choose the color based on whether the label has the ignore value or not
         color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index e424cb55a..08038cb18 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -496,7 +496,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
                 return max(estimates)
 
             sample_packing_actual_eff_all = reduce_and_broadcast(
-                lambda: sampler.efficiency(),  # pylint: disable=unnecessary-lambda
+                lambda: sampler.efficiency(),
                 calc_sample_packing_eff_est,
             )
             sample_packing_eff_est = (
diff --git a/src/setuptools_axolotl_dynamic_dependencies.py b/src/setuptools_axolotl_dynamic_dependencies.py
index 02a5b8083..ccd7c72d7 100644
--- a/src/setuptools_axolotl_dynamic_dependencies.py
+++ b/src/setuptools_axolotl_dynamic_dependencies.py
@@ -9,7 +9,6 @@ from importlib.metadata import PackageNotFoundError, version
 
 from setuptools.command.build_py import build_py as _build_py
 
-# pylint: disable=duplicate-code
 
 def parse_requirements():
     _install_requires = []
     _dependency_links = []
diff --git a/tests/cli/test_cli_evaluate.py b/tests/cli/test_cli_evaluate.py
index a191bf957..e8b88625a 100644
--- a/tests/cli/test_cli_evaluate.py
+++ b/tests/cli/test_cli_evaluate.py
@@ -1,7 +1,5 @@
 """Tests for evaluate CLI command."""
 
-# pylint: disable=duplicate-code
-
 from unittest.mock import patch
 
 from axolotl.cli.main import cli
@@ -31,7 +29,6 @@ class TestEvaluateCommand(BaseCliTest):
         config_path = tmp_path / "config.yml"
         config_path.write_text(valid_test_config)
 
-        # pylint: disable=duplicate-code
         with patch("axolotl.cli.evaluate.do_evaluate") as mock_evaluate:
             result = cli_runner.invoke(
                 cli,
diff --git a/tests/cli/test_cli_inference.py b/tests/cli/test_cli_inference.py
index 3394c189d..807dc7fa3 100644
--- a/tests/cli/test_cli_inference.py
+++ b/tests/cli/test_cli_inference.py
@@ -1,7 +1,5 @@
 """pytest tests for axolotl CLI inference command."""
 
-# pylint: disable=duplicate-code
-
 from unittest.mock import patch
 
 from axolotl.cli.main import cli
diff --git a/tests/cli/test_cli_merge_sharded_fsdp_weights.py b/tests/cli/test_cli_merge_sharded_fsdp_weights.py
index 4f6a973ea..de13b28ed 100644
--- a/tests/cli/test_cli_merge_sharded_fsdp_weights.py
+++ b/tests/cli/test_cli_merge_sharded_fsdp_weights.py
@@ -1,7 +1,5 @@
 """pytest tests for axolotl CLI merge_sharded_fsdp_weights command."""
 
-# pylint: disable=duplicate-code
-
 from unittest.mock import patch
 
 from axolotl.cli.main import cli
diff --git a/tests/cli/test_cli_train.py b/tests/cli/test_cli_train.py
index d4d90f57f..1251ab3c0 100644
--- a/tests/cli/test_cli_train.py
+++ b/tests/cli/test_cli_train.py
@@ -1,7 +1,5 @@
 """Tests for train CLI command."""
 
-# pylint: disable=duplicate-code
-
 from unittest.mock import MagicMock, patch
 
 from axolotl.cli.main import cli
diff --git a/tests/cli/test_utils.py b/tests/cli/test_utils.py
index a3e4e9887..431c35c3c 100644
--- a/tests/cli/test_utils.py
+++ b/tests/cli/test_utils.py
@@ -1,7 +1,5 @@
 """pytest tests for axolotl CLI utils."""
 
-# pylint: disable=redefined-outer-name
-
 import json
 from unittest.mock import Mock, patch
 
@@ -25,7 +23,7 @@ MOCK_TREE_RESPONSE = {
 def mock_responses():
     """Mock responses for API and file downloads"""
 
-    def mock_get(url, timeout=None):  # pylint: disable=unused-argument
+    def mock_get(url, timeout=None):
         response = Mock()
         if "api.github.com" in url:
             response.text = json.dumps(MOCK_TREE_RESPONSE)
@@ -93,21 +91,21 @@ def assert_launcher_args_in_command(
     called_cmd = mock_subprocess_call.call_args.args[0]
 
     # Verify launcher
-    assert (
-        called_cmd[0] == launcher
-    ), f"Expected launcher {launcher}, got {called_cmd[0]}"
+    assert called_cmd[0] == launcher, (
+        f"Expected launcher {launcher}, got {called_cmd[0]}"
+    )
 
     # Verify launcher args are present
     for arg in expected_launcher_args:
-        assert (
-            arg in called_cmd
-        ), f"Expected launcher arg '{arg}' not found in command: {called_cmd}"
+        assert arg in called_cmd, (
+            f"Expected launcher arg '{arg}' not found in command: {called_cmd}"
+        )
 
     # Verify module is present
     assert "-m" in called_cmd, "Expected -m flag for module execution"
-    assert (
-        command_module in called_cmd
-    ), f"Expected module {command_module} not found in command: {called_cmd}"
+    assert command_module in called_cmd, (
+        f"Expected module {command_module} not found in command: {called_cmd}"
+    )
 
 
 def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str):
@@ -126,17 +124,17 @@ def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str):
         launch_idx = called_cmd.index("launch")
         m_idx = called_cmd.index("-m")
         launcher_section = called_cmd[launch_idx + 1 : m_idx]
-        assert (
-            len(launcher_section) == 0
-        ), f"Unexpected launcher args found: {launcher_section}"
+        assert len(launcher_section) == 0, (
+            f"Unexpected launcher args found: {launcher_section}"
+        )
     elif launcher == "torchrun":
         # For torchrun, launcher args should be between 'torchrun' and '-m'
         torchrun_idx = called_cmd.index("torchrun")
         m_idx = called_cmd.index("-m")
         launcher_section = called_cmd[torchrun_idx + 1 : m_idx]
-        assert (
-            len(launcher_section) == 0
-        ), f"Unexpected launcher args found: {launcher_section}"
+        assert len(launcher_section) == 0, (
+            f"Unexpected launcher args found: {launcher_section}"
+        )
 
 
 @pytest.fixture
diff --git a/tests/conftest.py b/tests/conftest.py
index 9e1af318d..98847ebad 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -33,10 +33,9 @@ logging.getLogger("filelock").setLevel(logging.CRITICAL)
 
 
 def retry_on_request_exceptions(max_retries=3, delay=1):
-    # pylint: disable=duplicate-code
     def decorator(func):
         @functools.wraps(func)
-        def wrapper(*args, **kwargs):  # pylint: disable=inconsistent-return-statements
+        def wrapper(*args, **kwargs):
             for attempt in range(max_retries):
                 try:
                     return func(*args, **kwargs)
@@ -171,7 +170,7 @@ def download_argilla_distilabel_intel_orca_dpo_dataset():
 # @disable_hf_offline
 # def dataset_fozzie_alpaca_dpo_dataset(
 #     download_fozzie_alpaca_dpo_dataset,
-# ):  # pylint: disable=unused-argument,redefined-outer-name
+# ):
 #     return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train")
 #
 #
@@ -179,7 +178,7 @@ def download_argilla_distilabel_intel_orca_dpo_dataset():
 # @disable_hf_offline
 # def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff(
 #     download_fozzie_alpaca_dpo_dataset,
-# ):  # pylint: disable=unused-argument,redefined-outer-name
+# ):
 #     return load_dataset(
 #         "fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff"
 #     )
@@ -359,7 +358,7 @@ def download_llama32_1b_model_fixture():
 @enable_hf_offline
 def tokenizer_huggyllama(
     download_huggyllama_model_fixture,
-):  # pylint: disable=unused-argument,redefined-outer-name
+):
     tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
     tokenizer.pad_token = "</s>"
 
@@ -370,7 +369,7 @@ def tokenizer_huggyllama(
 @enable_hf_offline
 def tokenizer_huggyllama_w_special_tokens(
     tokenizer_huggyllama,
-):  # pylint: disable=redefined-outer-name
+):
     tokenizer_huggyllama.add_special_tokens(
         {
             "bos_token": "<s>",
@@ -386,7 +385,7 @@ def tokenizer_llama2_7b(
     download_llama2_model_fixture,
-):  # pylint: disable=unused-argument,redefined-outer-name
+):
     tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")
 
     return tokenizer
 
@@ -396,7 +395,7 @@
 @enable_hf_offline
 def tokenizer_mistral_7b_instruct(
     download_mlx_mistral_7b_model_fixture,
-):  # pylint: disable=unused-argument,redefined-outer-name
+):
     return AutoTokenizer.from_pretrained("casperhansen/mistral-7b-instruct-v0.1-awq")
 
@@ -442,9 +441,7 @@ def cleanup_monkeypatches():
     # original_fa2_forward = LlamaFlashAttention2.forward
     original_llama_attn_forward = LlamaAttention.forward
     original_llama_forward = LlamaForCausalLM.forward
-    original_trainer_inner_training_loop = (
-        Trainer._inner_training_loop  # pylint: disable=protected-access
-    )
+    original_trainer_inner_training_loop = Trainer._inner_training_loop
     original_trainer_training_step = Trainer.training_step
     # monkey patches can happen inside the tests
     yield
@@ -452,9 +449,7 @@ def cleanup_monkeypatches():
     # LlamaFlashAttention2.forward = original_fa2_forward
     LlamaAttention.forward = original_llama_attn_forward
     LlamaForCausalLM.forward = original_llama_forward
-    Trainer._inner_training_loop = (  # pylint: disable=protected-access
-        original_trainer_inner_training_loop
-    )
+    Trainer._inner_training_loop = original_trainer_inner_training_loop
     Trainer.training_step = original_trainer_training_step
 
     # Reset other known monkeypatches
@@ -490,7 +485,7 @@ def cleanup_monkeypatches():
 @pytest.fixture
 def dataset_winglian_tiny_shakespeare(
     download_ds_fixture_bundle: Path,
-):  # pylint: disable=redefined-outer-name
+):
     ds_path = download_ds_fixture_bundle / "winglian__tiny-shakespeare"
     return datasets.load_from_disk(ds_path)
 
@@ -498,7 +493,7 @@ def dataset_winglian_tiny_shakespeare(
 @pytest.fixture
 def dataset_tatsu_lab_alpaca(
     download_ds_fixture_bundle: Path,
-):  # pylint: disable=redefined-outer-name
+):
     ds_path = download_ds_fixture_bundle / "tatsu-lab__alpaca"
     return datasets.load_from_disk(ds_path)["train"]
 
@@ -506,7 +501,7 @@ def dataset_tatsu_lab_alpaca(
 @pytest.fixture
 def dataset_mhenrichsen_alpaca_2k_test(
     download_ds_fixture_bundle: Path,
-):  # pylint: disable=redefined-outer-name
+):
     ds_path = download_ds_fixture_bundle / "mhenrichsen__alpaca_2k_test"
     return datasets.load_from_disk(ds_path)["train"]
 
@@ -514,7 +509,7 @@ def dataset_mhenrichsen_alpaca_2k_test(
 @pytest.fixture
 def dataset_argilla_ultrafeedback_binarized_preferences_cleaned(
     download_ds_fixture_bundle: Path,
-):  # pylint: disable=redefined-outer-name
+):
     ds_path = (
         download_ds_fixture_bundle
         / "argilla__ultrafeedback-binarized-preferences-cleaned"
@@ -525,7 +520,7 @@
 @pytest.fixture
 def dataset_fozziethebeat_alpaca_messages_2k_dpo_test(
     download_ds_fixture_bundle: Path,
-):  # pylint: disable=redefined-outer-name
+):
     ds_path = download_ds_fixture_bundle / "fozziethebeat__alpaca_messages_2k_dpo_test"
     return datasets.load_from_disk(ds_path)["train"]
 
@@ -533,7 +528,7 @@
 @pytest.fixture
 def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(
     download_ds_fixture_bundle: Path,
-):  # pylint: disable=redefined-outer-name
+):
     ds_path = (
         download_ds_fixture_bundle
         / "fozziethebeat__alpaca_messages_2k_dpo_test__rev_ea82cff"
@@ -557,7 +552,7 @@ def fixture_min_base_cfg():
     )
 
 
-# # pylint: disable=redefined-outer-name,unused-argument
+#
 @pytest.mark.skipif(
     os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
     reason="Not running in CI cache preload",
diff --git a/tests/constants.py b/tests/constants.py
index e024e6920..cd75bd339 100644
--- a/tests/constants.py
+++ b/tests/constants.py
@@ -3,6 +3,7 @@
 This module contains constants and configuration dictionaries used for datasets
 and other utilities in the Axolotl project, specifically for testing.
 """
+
 # Configuration for Alpaca Messages Dataset
 ALPACA_MESSAGES_CONFIG_OG = {
     "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py
index fab01a644..6428aa977 100644
--- a/tests/core/test_builders.py
+++ b/tests/core/test_builders.py
@@ -1,7 +1,5 @@
 """Unit tests for axolotl.core.builders"""
 
-# pylint: disable=protected-access
-
 import sys
 from pathlib import Path
 from unittest.mock import patch
@@ -330,7 +328,6 @@
     )
     def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path):
-
         rewards_dir = tmp_path / "rewards_test"
         self._write_rewards_file(rewards_dir)
 
@@ -477,7 +474,7 @@
 
         assert trainer.optimizer_cls_and_kwargs is not None
 
-        from axolotl.contribs.mit.muon import (  # pylint: disable=no-name-in-module
+        from axolotl.contribs.mit.muon import (
            Muon,
            MuonOptimizerFactory,
        )
@@ -559,7 +556,7 @@ class TestHFCausalTrainerBuilder:
 
         assert trainer.optimizer_cls_and_kwargs is not None
 
-        from axolotl.contribs.mit.muon import (  # pylint: disable=no-name-in-module
+        from axolotl.contribs.mit.muon import (
             Muon,
             MuonOptimizerFactory,
         )
@@ -599,6 +596,6 @@ class TestTrainerClsPlugin:
         except TypeError as e:
             # Error raised if trainer_cls is None
             assert "'tuple' object has no attribute 'config'" not in str(e)
-        except Exception:  # pylint: disable=broad-exception-caught
+        except Exception:
             # Another error happens, so we passed trainer_cls to builder
             pass
diff --git a/tests/e2e/integrations/test_cut_cross_entropy.py b/tests/e2e/integrations/test_cut_cross_entropy.py
index 34e6c9644..1ba05077c 100644
--- a/tests/e2e/integrations/test_cut_cross_entropy.py
+++ b/tests/e2e/integrations/test_cut_cross_entropy.py
@@ -12,8 +12,6 @@ from axolotl.utils.dict import DictDefault
 
 from ..utils import check_model_output_exists
 
-# pylint: disable=duplicate-code
-
 
 @pytest.fixture()
 def min_cfg(temp_dir):
@@ -53,7 +51,6 @@ class TestCutCrossEntropyIntegration:
     e2e tests for cut_cross_entropy integration with Axolotl
     """
 
-    # pylint: disable=redefined-outer-name
     def test_llama_w_cce(self, min_cfg, temp_dir):
         cfg = DictDefault(min_cfg)
         cfg = validate_config(cfg)
@@ -69,7 +66,6 @@ class TestCutCrossEntropyIntegration:
         train(cfg=cfg, dataset_meta=dataset_meta)
         check_model_output_exists(temp_dir, cfg)
 
-    # pylint: disable=redefined-outer-name
     def test_qwen2_w_cce(self, temp_dir):
         cfg = DictDefault(
             {
diff --git a/tests/e2e/integrations/test_fp8.py b/tests/e2e/integrations/test_fp8.py
index 0302b7e35..7db63cc4d 100644
--- a/tests/e2e/integrations/test_fp8.py
+++ b/tests/e2e/integrations/test_fp8.py
@@ -18,7 +18,7 @@ class FP8IntegrationTestCase:
     @require_torch_2_7_0
     def test_fp8_single_gpu_smoke(self, temp_dir):
         """Smoke test for single GPU FP8 + torch.compile training"""
-        # pylint: disable=duplicate-code
+
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -53,7 +53,6 @@ class FP8IntegrationTestCase:
             }
         )
 
-        # pylint: disable=duplicate-code
         cfg = validate_config(cfg)
         normalize_config(cfg)
         dataset_meta = load_datasets(cfg=cfg)
diff --git a/tests/e2e/integrations/test_hooks.py b/tests/e2e/integrations/test_hooks.py
index 8743efb98..b85505caa 100644
--- a/tests/e2e/integrations/test_hooks.py
+++ b/tests/e2e/integrations/test_hooks.py
@@ -28,85 +28,81 @@ class LogHooksPlugin(BasePlugin):
         except FileNotFoundError:
             pass
 
-    def post_trainer_create(self, cfg, trainer):  # pylint: disable=unused-argument
+    def 
post_trainer_create(self, cfg, trainer): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_trainer_create\n") - def pre_model_load(self, cfg): # pylint: disable=unused-argument + def pre_model_load(self, cfg): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("pre_model_load\n") - def post_model_build(self, cfg, model): # pylint: disable=unused-argument + def post_model_build(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_model_build\n") - def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument + def pre_lora_load(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("pre_lora_load\n") - def post_lora_load(self, cfg, model): # pylint: disable=unused-argument + def post_lora_load(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_lora_load\n") - def post_model_load(self, cfg, model): # pylint: disable=unused-argument + def post_model_load(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_model_load\n") - def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument + def create_optimizer(self, cfg, trainer): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("create_optimizer\n") - def get_trainer_cls(self, cfg): # pylint: disable=unused-argument + def get_trainer_cls(self, cfg): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("get_trainer_cls\n") - def create_lr_scheduler( - self, cfg, trainer, optimizer, num_training_steps - ): # pylint: disable=unused-argument + def create_lr_scheduler(self, cfg, trainer, optimizer, num_training_steps): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("create_lr_scheduler\n") - def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument + def add_callbacks_pre_trainer(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("add_callbacks_pre_trainer\n") return [] - def add_callbacks_post_trainer( - self, cfg, trainer - ): # pylint: disable=unused-argument + def add_callbacks_post_trainer(self, cfg, trainer): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("add_callbacks_post_trainer\n") return [] - def post_train(self, cfg, model): # pylint: disable=unused-argument + def post_train(self, cfg, model): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: f.write("post_train\n") - def post_train_unload(self, cfg): # pylint: disable=unused-argument + def post_train_unload(self, cfg): with open( self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8" ) as f: @@ -119,7 +115,6 @@ class TestPluginHooks: """ def test_plugin_hooks(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py index 1ac3b537e..98383614b 100644 --- a/tests/e2e/integrations/test_kd.py +++ b/tests/e2e/integrations/test_kd.py @@ -81,7 +81,7 @@ class TestKnowledgeDistillation: @require_torch_2_5_1 def test_llama_kd(self, temp_dir, kd_min_cfg): cfg = DictDefault(kd_min_cfg) - # 
pylint: disable=duplicate-code + # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: @@ -123,7 +123,7 @@ class TestKnowledgeDistillation: } | kd_min_cfg ) - # pylint: disable=duplicate-code + # write cfg to yaml file Path(temp_dir).mkdir(parents=True, exist_ok=True) with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout: diff --git a/tests/e2e/integrations/test_liger.py b/tests/e2e/integrations/test_liger.py index b1f5befdd..285969963 100644 --- a/tests/e2e/integrations/test_liger.py +++ b/tests/e2e/integrations/test_liger.py @@ -17,7 +17,6 @@ class LigerIntegrationTestCase: @require_torch_2_4_1 def test_llama_wo_flce(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -53,7 +52,7 @@ class LigerIntegrationTestCase: "save_first_step": False, } ) - # pylint: disable=duplicate-code + cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) @@ -64,7 +63,6 @@ class LigerIntegrationTestCase: @require_torch_2_4_1 def test_llama_w_flce(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -100,7 +98,7 @@ class LigerIntegrationTestCase: "save_first_step": False, } ) - # pylint: disable=duplicate-code + cfg = validate_config(cfg) prepare_plugins(cfg) normalize_config(cfg) diff --git a/tests/e2e/kernels/test_geglu.py b/tests/e2e/kernels/test_geglu.py index 4094a8ce7..78ba74c0e 100644 --- a/tests/e2e/kernels/test_geglu.py +++ b/tests/e2e/kernels/test_geglu.py @@ -85,6 +85,6 @@ def test_geglu_inplace_preservation(): assert not torch.equal(gate, gate_copy), "Gate should be modified in-place" assert not torch.equal(up, up_copy), "Up should be modified in-place" - assert not torch.equal( - grad_output, grad_copy - ), "Grad output should be modified in-place" + assert not torch.equal(grad_output, grad_copy), ( + "Grad output should be modified in-place" + ) diff --git a/tests/e2e/kernels/test_lora.py b/tests/e2e/kernels/test_lora.py index cd6131ff1..9baceb668 100644 --- a/tests/e2e/kernels/test_lora.py +++ b/tests/e2e/kernels/test_lora.py @@ -1,7 +1,5 @@ """Tests for LoRA custom autograd.""" -# pylint: disable=invalid-name,redefined-outer-name - import pytest import torch from bitsandbytes.functional import QuantState @@ -333,7 +331,7 @@ def test_lora_qkv(sample_tensors): X.requires_grad = True # Test without LoRA adapters - # pylint: disable=duplicate-code + Q1, K1, V1 = LoRA_QKV.apply( X, q_weight, diff --git a/tests/e2e/kernels/test_quantize.py b/tests/e2e/kernels/test_quantize.py index ea91407ef..60396584c 100644 --- a/tests/e2e/kernels/test_quantize.py +++ b/tests/e2e/kernels/test_quantize.py @@ -1,7 +1,5 @@ """Tests for quantization utility functions.""" -# pylint: disable=invalid-name - import torch from bitsandbytes.functional import QuantState diff --git a/tests/e2e/kernels/test_swiglu.py b/tests/e2e/kernels/test_swiglu.py index 60fdafb79..58d5e04a7 100644 --- a/tests/e2e/kernels/test_swiglu.py +++ b/tests/e2e/kernels/test_swiglu.py @@ -1,7 +1,5 @@ """Tests for SwiGLU activation function Triton kernels.""" -# pylint: disable=duplicate-code - import torch import torch.nn.functional as F @@ -74,6 +72,6 @@ def test_swiglu_inplace_preservation(): assert not torch.equal(gate, gate_copy), "Gate should be modified in-place" assert not torch.equal(up, up_copy), "Up should be modified in-place" - assert not torch.equal( - grad_output, grad_copy - ), 
"Grad output should be modified in-place" + assert not torch.equal(grad_output, grad_copy), ( + "Grad output should be modified in-place" + ) diff --git a/tests/e2e/multigpu/solo/test_flex.py b/tests/e2e/multigpu/solo/test_flex.py index cbdf8de96..881d75c25 100644 --- a/tests/e2e/multigpu/solo/test_flex.py +++ b/tests/e2e/multigpu/solo/test_flex.py @@ -31,7 +31,6 @@ class TestPackedFlex: @require_torch_2_6_0 def test_loss_llama(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/multigpu/solo/test_grpo.py b/tests/e2e/multigpu/solo/test_grpo.py index 92e0f7040..b48eb30e1 100644 --- a/tests/e2e/multigpu/solo/test_grpo.py +++ b/tests/e2e/multigpu/solo/test_grpo.py @@ -80,7 +80,7 @@ def start_vllm( cmd_env = env.copy() cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json}) # start `trl vllm-serve` command in the background and capture the process id - process = subprocess.Popen( # pylint: disable=consider-using-with + process = subprocess.Popen( cmd, env=cmd_env, stdout=subprocess.DEVNULL if quiet else subprocess.PIPE, diff --git a/tests/e2e/multigpu/test_eval.py b/tests/e2e/multigpu/test_eval.py index 4f86278ff..504659a3a 100644 --- a/tests/e2e/multigpu/test_eval.py +++ b/tests/e2e/multigpu/test_eval.py @@ -21,7 +21,6 @@ class TestMultiGPUEval: """ def test_eval_sample_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -93,7 +92,6 @@ class TestMultiGPUEval: check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high") def test_eval(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/multigpu/test_fp8_fsdp2.py b/tests/e2e/multigpu/test_fp8_fsdp2.py index f7fa29a31..dc369f3de 100644 --- a/tests/e2e/multigpu/test_fp8_fsdp2.py +++ b/tests/e2e/multigpu/test_fp8_fsdp2.py @@ -1,7 +1,5 @@ """Test module for FP8 mixed precision with FSDP2 multi-GPU functionality.""" -# pylint: disable=duplicate-code - import os from pathlib import Path @@ -28,9 +26,9 @@ def verify_fp8_training_success(temp_dir): assert len(model_files) > 0, "No model files found - training may have failed" checkpoint_files = list(output_path.glob("checkpoint-*")) - assert ( - len(checkpoint_files) > 0 - ), "No checkpoint files found - training may have failed" + assert len(checkpoint_files) > 0, ( + "No checkpoint files found - training may have failed" + ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: @@ -42,9 +40,9 @@ def verify_fp8_training_success(temp_dir): train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] - assert not torch.isnan( - torch.tensor(final_loss) - ), f"Training loss is NaN: {final_loss}" + assert not torch.isnan(torch.tensor(final_loss)), ( + f"Training loss is NaN: {final_loss}" + ) class TestFP8FSDP2: diff --git a/tests/e2e/multigpu/test_fsdp1.py b/tests/e2e/multigpu/test_fsdp1.py index fe0badbe2..cb92c80b5 100644 --- a/tests/e2e/multigpu/test_fsdp1.py +++ b/tests/e2e/multigpu/test_fsdp1.py @@ -1,7 +1,5 @@ """Test module for FSDP1 multi-GPU functionality.""" -# pylint: disable=duplicate-code - import os from pathlib import Path @@ -29,9 +27,9 @@ def verify_training_success(temp_dir): assert len(model_files) > 0, "No model files found - training may have failed" checkpoint_files = list(output_path.glob("checkpoint-*")) - assert ( - 
len(checkpoint_files) > 0 - ), "No checkpoint files found - training may have failed" + assert len(checkpoint_files) > 0, ( + "No checkpoint files found - training may have failed" + ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: @@ -43,9 +41,9 @@ def verify_training_success(temp_dir): train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] - assert not torch.isnan( - torch.tensor(final_loss) - ), f"Training loss is NaN: {final_loss}" + assert not torch.isnan(torch.tensor(final_loss)), ( + f"Training loss is NaN: {final_loss}" + ) class TestFSDP1: diff --git a/tests/e2e/multigpu/test_fsdp2.py b/tests/e2e/multigpu/test_fsdp2.py index 0bb255266..8b7ee710e 100644 --- a/tests/e2e/multigpu/test_fsdp2.py +++ b/tests/e2e/multigpu/test_fsdp2.py @@ -1,7 +1,5 @@ """Test module for FSDP2 multi-GPU functionality.""" -# pylint: disable=duplicate-code - import os from pathlib import Path @@ -29,9 +27,9 @@ def verify_training_success(temp_dir): assert len(model_files) > 0, "No model files found - training may have failed" checkpoint_files = list(output_path.glob("checkpoint-*")) - assert ( - len(checkpoint_files) > 0 - ), "No checkpoint files found - training may have failed" + assert len(checkpoint_files) > 0, ( + "No checkpoint files found - training may have failed" + ) tb_log_path = most_recent_subdir(temp_dir + "/runs") if tb_log_path: @@ -43,9 +41,9 @@ def verify_training_success(temp_dir): train_loss_df = df[df.tag == "train/train_loss"] if len(train_loss_df) > 0: final_loss = train_loss_df.value.values[-1] - assert not torch.isnan( - torch.tensor(final_loss) - ), f"Training loss is NaN: {final_loss}" + assert not torch.isnan(torch.tensor(final_loss)), ( + f"Training loss is NaN: {final_loss}" + ) class TestFSDP2: diff --git a/tests/e2e/multigpu/test_gemma3.py b/tests/e2e/multigpu/test_gemma3.py index 4a7b101a8..51ec68b11 100644 --- a/tests/e2e/multigpu/test_gemma3.py +++ b/tests/e2e/multigpu/test_gemma3.py @@ -29,7 +29,6 @@ class TestMultiGPUGemma3: """ def test_lora_ddp_packed(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-mirrors/gemma-3-4b-pt", diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index aab14dcc4..ad15d628b 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -35,7 +35,6 @@ class TestMultiGPULlama: """ def test_lora_ddp(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -99,7 +98,6 @@ class TestMultiGPULlama: [1, 2], ) def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -162,7 +160,6 @@ class TestMultiGPULlama: ) def test_dpo_lora_ddp(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -242,7 +239,6 @@ class TestMultiGPULlama: ) def test_dpo_qlora_ddp(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -326,7 +322,6 @@ class TestMultiGPULlama: [1, 2], ) def test_fsdp(self, temp_dir, gradient_accumulation_steps): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -402,7 +397,6 @@ class TestMultiGPULlama: ], ) def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type): - # pylint: disable=duplicate-code cfg = 
DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -484,7 +478,6 @@ class TestMultiGPULlama: def test_fsdp2_packed( self, temp_dir, attention_backend, fsdp_reshard_after_forward ): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -556,7 +549,6 @@ class TestMultiGPULlama: ) def test_fsdp_qlora_prequant_packed(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16", @@ -656,7 +648,6 @@ class TestMultiGPULlama: def test_ds_zero3_packed( self, temp_dir, gradient_accumulation_steps, deepspeed, qlora ): - # pylint: disable=duplicate-code if qlora: adapter = { "adapter": "qlora", @@ -732,7 +723,6 @@ class TestMultiGPULlama: [True, False], ) def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora): - # pylint: disable=duplicate-code if qlora: adapter = { "adapter": "qlora", @@ -809,7 +799,6 @@ class TestMultiGPULlama: [True, False], ) def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora): - # pylint: disable=duplicate-code if qlora: adapter = { "adapter": "qlora", @@ -880,7 +869,6 @@ class TestMultiGPULlama: reason="fix untrained tokens brittle with lots of edge cases in latest transformers" ) def test_fix_untrained_tokens(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index 7f1278abf..7c6ea8a1f 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -26,7 +26,6 @@ class TestMultiGPURay: @require_torch_lt_2_6_0 def test_lora_ddp(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -90,7 +89,6 @@ class TestMultiGPURay: [1, 2], ) def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -150,7 +148,6 @@ class TestMultiGPURay: [1, 2], ) def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/multigpu/test_tp.py b/tests/e2e/multigpu/test_tp.py index 87a1c6339..9891a0906 100644 --- a/tests/e2e/multigpu/test_tp.py +++ b/tests/e2e/multigpu/test_tp.py @@ -19,7 +19,6 @@ class TestTensorParallel: ) @require_torch_2_7_0 def test_fft_sft(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "Qwen/Qwen2.5-0.5B", diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py index b4dc5de54..2180eb99d 100644 --- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py +++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py @@ -1,7 +1,5 @@ """Integration tests for LoRA activation and attention kernels.""" -# pylint: disable=redefined-outer-name - from pathlib import Path import pytest @@ -88,7 +86,7 @@ def test_attention_patching_integration(model_name, attention_cls): cfg = DictDefault({"base_model": model_name}) # Store the original implementation - original_forward = getattr(attention_cls, "forward") + original_forward = attention_cls.forward # Apply patch patch_self_attn_lora(cfg) @@ -104,7 +102,7 @@ def test_attention_patching_integration(model_name, attention_cls): assert hasattr(attention_cls, "_original_forward") # Clean 
up - setattr(attention_cls, "forward", original_forward) + attention_cls.forward = original_forward delattr(attention_cls, "_original_forward") @@ -379,9 +377,9 @@ def test_model_architecture(model_config): # Verify correct activation function layer = patched_model.model.model.layers[0] - assert ( - layer.mlp.forward.__func__ is model_config["expected_activation"] - ), f"Wrong activation for {model_config['name']}" + assert layer.mlp.forward.__func__ is model_config["expected_activation"], ( + f"Wrong activation for {model_config['name']}" + ) # Test forward pass inputs = get_test_inputs(model) @@ -390,12 +388,11 @@ def test_model_architecture(model_config): patched_output = patched_model(inputs).logits # Check outputs match - assert torch.allclose( - original_output, patched_output, rtol=1e-4 - ), f"Outputs don't match for {model_config['name']}" + assert torch.allclose(original_output, patched_output, rtol=1e-4), ( + f"Outputs don't match for {model_config['name']}" + ) -# pylint: disable=duplicate-code def test_kernel_training_integration(temp_dir): """Test model loading with kernel patches enabled.""" from axolotl.cli.utils import load_model_and_tokenizer @@ -563,15 +560,13 @@ def test_kernel_training_integration_dropout_non_zero(temp_dir): model_loader = ModelLoader(cfg, tokenizer) # Apply patch - model_loader.patch_manager._apply_self_attention_lora_patch() # pylint: disable=protected-access + model_loader.patch_manager._apply_self_attention_lora_patch() # Verify patch was not applied assert attention_cls.forward == original_forward_method # Apply apply_lora_kernel_patches - model_loader.patch_manager._apply_lora_kernel_patch( # pylint: disable=protected-access - model - ) + model_loader.patch_manager._apply_lora_kernel_patch(model) # Verify patch was not applied layers = get_layers(model) diff --git a/tests/e2e/patched/test_4d_multipack_llama.py b/tests/e2e/patched/test_4d_multipack_llama.py index 1824443e7..ef28cc406 100644 --- a/tests/e2e/patched/test_4d_multipack_llama.py +++ b/tests/e2e/patched/test_4d_multipack_llama.py @@ -19,7 +19,6 @@ class Test4dMultipackLlama(unittest.TestCase): @with_temp_dir def test_sdp_lora_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -67,7 +66,6 @@ class Test4dMultipackLlama(unittest.TestCase): @with_temp_dir def test_torch_lora_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_activation_checkpointing.py b/tests/e2e/patched/test_activation_checkpointing.py index 06e3de274..ddace8ef1 100644 --- a/tests/e2e/patched/test_activation_checkpointing.py +++ b/tests/e2e/patched/test_activation_checkpointing.py @@ -32,10 +32,9 @@ class TestActivationCheckpointing: def test_activation_checkpointing_offload( self, temp_dir, - fix_checkpoint_after_test, # pylint: disable=unused-argument,redefined-outer-name + fix_checkpoint_after_test, gradient_checkpointing, ): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_cli_integrations.py b/tests/e2e/patched/test_cli_integrations.py index 6c908faf1..6eba92689 100644 --- a/tests/e2e/patched/test_cli_integrations.py +++ b/tests/e2e/patched/test_cli_integrations.py @@ -10,7 +10,6 @@ from axolotl.cli.config import load_cfg from axolotl.utils.dict import DictDefault -# pylint: disable=duplicate-code class TestPluginArgs: """ test class for plugin args 
loaded from the config file diff --git a/tests/e2e/patched/test_fa_xentropy.py b/tests/e2e/patched/test_fa_xentropy.py index 38099b220..9f4699854 100644 --- a/tests/e2e/patched/test_fa_xentropy.py +++ b/tests/e2e/patched/test_fa_xentropy.py @@ -23,7 +23,6 @@ class TestFAXentropyLlama: [1, 4], ) def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_steps): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_falcon_samplepack.py b/tests/e2e/patched/test_falcon_samplepack.py index ef31b11c7..cc5091403 100644 --- a/tests/e2e/patched/test_falcon_samplepack.py +++ b/tests/e2e/patched/test_falcon_samplepack.py @@ -22,7 +22,6 @@ class TestFalconPatched(unittest.TestCase): @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_qlora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "illuin/tiny-random-FalconForCausalLM", @@ -71,7 +70,6 @@ class TestFalconPatched(unittest.TestCase): @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_ft(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "illuin/tiny-random-FalconForCausalLM", diff --git a/tests/e2e/patched/test_flattening.py b/tests/e2e/patched/test_flattening.py index fdaab558d..2c247d406 100644 --- a/tests/e2e/patched/test_flattening.py +++ b/tests/e2e/patched/test_flattening.py @@ -23,7 +23,6 @@ class TestFAFlattening: [1, 4], ) def test_lora_packing_flattening(self, temp_dir, gradient_accumulation_steps): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_fsdp2_qlora.py b/tests/e2e/patched/test_fsdp2_qlora.py index ca17b81d1..de9c929e1 100644 --- a/tests/e2e/patched/test_fsdp2_qlora.py +++ b/tests/e2e/patched/test_fsdp2_qlora.py @@ -15,7 +15,6 @@ class TestFSDPPatchIntegration: apply_init_unsharded_param_patch, ) - # pylint: disable=protected-access original_init_sharded = FSDPParam._init_sharded_param original_init_unsharded = FSDPParam.init_unsharded_param @@ -23,11 +22,9 @@ class TestFSDPPatchIntegration: apply_init_sharded_param_patch() apply_init_unsharded_param_patch() - assert ( - # pylint: disable=protected-access - FSDPParam._init_sharded_param - != original_init_sharded - ), "_init_sharded_param was not patched" - assert ( - FSDPParam.init_unsharded_param != original_init_unsharded - ), "init_unsharded_param was not patched" + assert FSDPParam._init_sharded_param != original_init_sharded, ( + "_init_sharded_param was not patched" + ) + assert FSDPParam.init_unsharded_param != original_init_unsharded, ( + "init_unsharded_param was not patched" + ) diff --git a/tests/e2e/patched/test_fused_llama.py b/tests/e2e/patched/test_fused_llama.py index f0c4f155f..f0c5df18a 100644 --- a/tests/e2e/patched/test_fused_llama.py +++ b/tests/e2e/patched/test_fused_llama.py @@ -23,7 +23,6 @@ class TestFusedLlama(unittest.TestCase): @with_temp_dir def test_fft_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_llama_s2_attention.py b/tests/e2e/patched/test_llama_s2_attention.py index ba5556a59..0dd748945 100644 --- a/tests/e2e/patched/test_llama_s2_attention.py +++ b/tests/e2e/patched/test_llama_s2_attention.py @@ -22,7 +22,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase): @with_temp_dir def 
test_lora_s2_attn(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -71,7 +70,6 @@ class TestLlamaShiftedSparseAttention(unittest.TestCase): @with_temp_dir def test_fft_s2_attn(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_lora_llama_multipack.py b/tests/e2e/patched/test_lora_llama_multipack.py index fdf6adbc6..1833c750b 100644 --- a/tests/e2e/patched/test_lora_llama_multipack.py +++ b/tests/e2e/patched/test_lora_llama_multipack.py @@ -22,7 +22,6 @@ class TestLoraLlama(unittest.TestCase): @with_temp_dir def test_lora_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -73,7 +72,6 @@ class TestLoraLlama(unittest.TestCase): @pytest.mark.skipif(not is_auto_gptq_available(), reason="auto-gptq not available") @with_temp_dir def test_lora_gptq_packed(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "lilmeaty/SmolLM2-135M-Instruct-GPTQ", diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py index bea0f9c68..e03941b07 100644 --- a/tests/e2e/patched/test_mistral_samplepack.py +++ b/tests/e2e/patched/test_mistral_samplepack.py @@ -20,7 +20,6 @@ class TestMistral(unittest.TestCase): @require_torch_2_6_0 @with_temp_dir def test_lora_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", @@ -68,7 +67,6 @@ class TestMistral(unittest.TestCase): @with_temp_dir def test_ft_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py index 09e427abd..3517ff3db 100644 --- a/tests/e2e/patched/test_mixtral_samplepack.py +++ b/tests/e2e/patched/test_mixtral_samplepack.py @@ -19,7 +19,6 @@ class TestMixtral(unittest.TestCase): @with_temp_dir def test_qlora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "hf-internal-testing/Mixtral-tiny", @@ -64,7 +63,6 @@ class TestMixtral(unittest.TestCase): @with_temp_dir def test_ft(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "hf-internal-testing/Mixtral-tiny", diff --git a/tests/e2e/patched/test_model_patches.py b/tests/e2e/patched/test_model_patches.py index b90be23e4..aaaaf5fe2 100644 --- a/tests/e2e/patched/test_model_patches.py +++ b/tests/e2e/patched/test_model_patches.py @@ -89,5 +89,5 @@ class TestModelPatches(unittest.TestCase): assert ( "torch.jit" - in transformers.modeling_flash_attention_utils._get_unpad_data.__module__ # pylint: disable=protected-access + in transformers.modeling_flash_attention_utils._get_unpad_data.__module__ ) diff --git a/tests/e2e/patched/test_peft_embeddings.py b/tests/e2e/patched/test_peft_embeddings.py index 4769319ae..374ef97d8 100644 --- a/tests/e2e/patched/test_peft_embeddings.py +++ b/tests/e2e/patched/test_peft_embeddings.py @@ -15,7 +15,6 @@ class TestLlamaPeftEmbeddings: """ def test_peft_embeddings_upcast(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py index 
1f0ddd630..77b2d99e5 100644 --- a/tests/e2e/patched/test_phi_multipack.py +++ b/tests/e2e/patched/test_phi_multipack.py @@ -19,7 +19,6 @@ class TestPhiMultipack(unittest.TestCase): @with_temp_dir def test_ft_packed(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "microsoft/phi-1_5", @@ -67,7 +66,6 @@ class TestPhiMultipack(unittest.TestCase): @with_temp_dir def test_qlora_packed(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "microsoft/phi-1_5", diff --git a/tests/e2e/patched/test_resume.py b/tests/e2e/patched/test_resume.py index 54b8245ee..747b79dc7 100644 --- a/tests/e2e/patched/test_resume.py +++ b/tests/e2e/patched/test_resume.py @@ -22,7 +22,6 @@ class TestResumeLlama: @require_torch_2_6_0 def test_resume_lora_packed(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/patched/test_unsloth_qlora.py b/tests/e2e/patched/test_unsloth_qlora.py index 2c8ee4eb0..bf00e8a5f 100644 --- a/tests/e2e/patched/test_unsloth_qlora.py +++ b/tests/e2e/patched/test_unsloth_qlora.py @@ -12,7 +12,6 @@ from axolotl.utils.dict import DictDefault from ..utils import check_model_output_exists, check_tensorboard -# pylint: disable=duplicate-code @pytest.mark.skip( reason="Unsloth integration will be broken going into latest transformers" ) diff --git a/tests/e2e/solo/test_flex.py b/tests/e2e/solo/test_flex.py index 76364fc0e..abe8fb69a 100644 --- a/tests/e2e/solo/test_flex.py +++ b/tests/e2e/solo/test_flex.py @@ -22,7 +22,6 @@ class TestPackedFlex(unittest.TestCase): @require_torch_2_6_0 @with_temp_dir def test_loss_llama(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/solo/test_relora_llama.py b/tests/e2e/solo/test_relora_llama.py index b399b4680..be77684ba 100644 --- a/tests/e2e/solo/test_relora_llama.py +++ b/tests/e2e/solo/test_relora_llama.py @@ -20,7 +20,6 @@ class TestReLoraLlama(unittest.TestCase): @with_temp_dir def test_relora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -76,9 +75,9 @@ class TestReLoraLlama(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_model_output_exists(Path(temp_dir) / "checkpoint-100/adapter", cfg) - assert ( - Path(temp_dir) / "checkpoint-100/relora/model.safetensors" - ).exists(), "Relora model checkpoint not found" + assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists(), ( + "Relora model checkpoint not found" + ) check_tensorboard( temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high" diff --git a/tests/e2e/test_activation_offloading.py b/tests/e2e/test_activation_offloading.py index 06c5c0656..9df85ab31 100644 --- a/tests/e2e/test_activation_offloading.py +++ b/tests/e2e/test_activation_offloading.py @@ -11,8 +11,6 @@ from axolotl.utils.dict import DictDefault from .utils import check_model_output_exists -# pylint: disable=duplicate-code - class TestActivationOffloading: """ @@ -28,7 +26,6 @@ class TestActivationOffloading: temp_dir, adapter, ): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_deepseekv3.py b/tests/e2e/test_deepseekv3.py index e4a47fb0a..e11be8265 100644 --- a/tests/e2e/test_deepseekv3.py +++ b/tests/e2e/test_deepseekv3.py @@ -25,7 +25,6 @@ class TestDeepseekV3: [True, False], ) def 
test_lora_deepseekv3(self, temp_dir, sample_packing): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/DeepSeek-V3-11M", @@ -83,7 +82,6 @@ class TestDeepseekV3: [True, False], ) def test_fft_deepseekv3(self, temp_dir, sample_packing): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/DeepSeek-V3-11M", diff --git a/tests/e2e/test_dpo.py b/tests/e2e/test_dpo.py index a1df69535..8f577ef47 100644 --- a/tests/e2e/test_dpo.py +++ b/tests/e2e/test_dpo.py @@ -21,7 +21,6 @@ class TestDPOLlamaLora(unittest.TestCase): @with_temp_dir def test_dpo_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -70,7 +69,6 @@ class TestDPOLlamaLora(unittest.TestCase): @with_temp_dir def test_dpo_nll_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -120,7 +118,6 @@ class TestDPOLlamaLora(unittest.TestCase): @with_temp_dir def test_dpo_use_weighting(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -171,7 +168,6 @@ class TestDPOLlamaLora(unittest.TestCase): @pytest.mark.skip("kto_pair no longer supported in trl") @with_temp_dir def test_kto_pair_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -220,7 +216,6 @@ class TestDPOLlamaLora(unittest.TestCase): @with_temp_dir def test_ipo_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -269,7 +264,6 @@ class TestDPOLlamaLora(unittest.TestCase): @with_temp_dir def test_orpo_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -322,7 +316,6 @@ class TestDPOLlamaLora(unittest.TestCase): @pytest.mark.skip(reason="Fix the implementation") @with_temp_dir def test_kto_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_embeddings_lr.py b/tests/e2e/test_embeddings_lr.py index e4a06ad14..633e449ef 100644 --- a/tests/e2e/test_embeddings_lr.py +++ b/tests/e2e/test_embeddings_lr.py @@ -19,7 +19,6 @@ class TestEmbeddingsLrScale(unittest.TestCase): @with_temp_dir def test_train_w_embedding_lr_scale(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -65,7 +64,6 @@ class TestEmbeddingsLrScale(unittest.TestCase): @with_temp_dir def test_train_w_embedding_lr(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_evaluate.py b/tests/e2e/test_evaluate.py index 977497e5e..3b0ab1450 100644 --- a/tests/e2e/test_evaluate.py +++ b/tests/e2e/test_evaluate.py @@ -13,7 +13,6 @@ class TestE2eEvaluate: """Test cases for evaluate CLI""" def test_evaluate(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_falcon.py b/tests/e2e/test_falcon.py index 5be6efcf6..1a363fe6a 100644 --- a/tests/e2e/test_falcon.py +++ b/tests/e2e/test_falcon.py @@ -22,7 +22,6 @@ class TestFalcon(unittest.TestCase): @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( 
{ "base_model": "illuin/tiny-random-FalconForCausalLM", @@ -74,7 +73,6 @@ class TestFalcon(unittest.TestCase): @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_lora_added_vocab(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "illuin/tiny-random-FalconForCausalLM", @@ -130,7 +128,6 @@ class TestFalcon(unittest.TestCase): @pytest.mark.skip(reason="no tiny models for testing with safetensors") @with_temp_dir def test_ft(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "illuin/tiny-random-FalconForCausalLM", diff --git a/tests/e2e/test_gemma2.py b/tests/e2e/test_gemma2.py index c0eba72a7..9e9f1a9cc 100644 --- a/tests/e2e/test_gemma2.py +++ b/tests/e2e/test_gemma2.py @@ -22,7 +22,6 @@ class TestGemma2: [True, False], ) def test_lora_gemma2(self, temp_dir, sample_packing): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/gemma-2-33M", @@ -78,7 +77,6 @@ class TestGemma2: [True, False], ) def test_fft_gemma2(self, temp_dir, sample_packing): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/gemma-2-33M", diff --git a/tests/e2e/test_gemma3_text.py b/tests/e2e/test_gemma3_text.py index ef38d028d..6cd999242 100644 --- a/tests/e2e/test_gemma3_text.py +++ b/tests/e2e/test_gemma3_text.py @@ -22,7 +22,6 @@ class TestGemma3Text: [True, False], ) def test_lora_gemma3_text(self, temp_dir, sample_packing): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/gemma-3-34M", @@ -78,7 +77,6 @@ class TestGemma3Text: [True, False], ) def test_fft_gemma3_text(self, temp_dir, sample_packing): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/gemma-3-34M", diff --git a/tests/e2e/test_imports.py b/tests/e2e/test_imports.py index 050e4dfb3..4c01e50be 100644 --- a/tests/e2e/test_imports.py +++ b/tests/e2e/test_imports.py @@ -11,11 +11,7 @@ class TestImports(unittest.TestCase): """ def test_import_causal_trainer(self): - from axolotl.core.builders import ( # pylint: disable=unused-import # noqa: F401 - HFCausalTrainerBuilder, - ) + pass def test_import_rl_trainer(self): - from axolotl.core.builders import ( # pylint: disable=unused-import # noqa: F401 - HFRLTrainerBuilder, - ) + pass diff --git a/tests/e2e/test_llama.py b/tests/e2e/test_llama.py index 1e6df0be9..de085cbe2 100644 --- a/tests/e2e/test_llama.py +++ b/tests/e2e/test_llama.py @@ -16,7 +16,6 @@ class TestLlama: """ def test_fft_trust_remote_code(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -57,7 +56,6 @@ class TestLlama: check_model_output_exists(temp_dir, cfg) def test_fix_untrained_tokens(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -105,7 +103,6 @@ class TestLlama: check_model_output_exists(temp_dir, cfg) def test_fix_untrained_tokens_already_trained(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -150,7 +147,6 @@ class TestLlama: check_model_output_exists(temp_dir, cfg) def test_batch_flattening(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py index bd5502300..a041244e7 100644 --- a/tests/e2e/test_llama_pretrain.py +++ 
b/tests/e2e/test_llama_pretrain.py @@ -22,7 +22,6 @@ class TestPretrainLlama: ], ) def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_llama_vision.py b/tests/e2e/test_llama_vision.py index 760759bca..0cc927f76 100644 --- a/tests/e2e/test_llama_vision.py +++ b/tests/e2e/test_llama_vision.py @@ -19,7 +19,6 @@ class TestLlamaVision(unittest.TestCase): @with_temp_dir def test_lora_llama_vision_text_only_dataset(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/Llama-3.2-39M-Vision", @@ -67,7 +66,6 @@ class TestLlamaVision(unittest.TestCase): @with_temp_dir def test_lora_llama_vision_multimodal_dataset(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "axolotl-ai-co/Llama-3.2-39M-Vision", diff --git a/tests/e2e/test_load_model.py b/tests/e2e/test_load_model.py index 8fcffeb11..7c5389a58 100644 --- a/tests/e2e/test_load_model.py +++ b/tests/e2e/test_load_model.py @@ -56,13 +56,11 @@ class TestLoadModelUtils: "context_parallel_size": 1, } ) - self.model_loader = ( # pylint: disable=attribute-defined-outside-init - ModelLoader( - cfg=self.cfg, - tokenizer="", - inference=False, - reference_model=True, - ) + self.model_loader = ModelLoader( + cfg=self.cfg, + tokenizer="", + inference=False, + reference_model=True, ) @pytest.mark.parametrize("embedding_modules", ["embed_tokens", "lm_head"]) @@ -74,7 +72,7 @@ class TestLoadModelUtils: self, temp_dir, embedding_modules, dist_dtype, before_kbit_train_or_finetune ): self.cfg.output_dir = temp_dir - self.model_loader.tokenizer = load_tokenizer(self.cfg) # pylint: disable=all + self.model_loader.tokenizer = load_tokenizer(self.cfg) self.model_loader.load() self.model_loader._convert_embedding_modules_dtype( embedding_modules, dist_dtype, before_kbit_train_or_finetune diff --git a/tests/e2e/test_lora_llama.py b/tests/e2e/test_lora_llama.py index 7e0ff46cf..b6ee393df 100644 --- a/tests/e2e/test_lora_llama.py +++ b/tests/e2e/test_lora_llama.py @@ -19,7 +19,6 @@ class TestLoraLlama(unittest.TestCase): @with_temp_dir def test_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/e2e/test_mamba.py b/tests/e2e/test_mamba.py index 73d3bdc26..67935377d 100644 --- a/tests/e2e/test_mamba.py +++ b/tests/e2e/test_mamba.py @@ -22,7 +22,6 @@ class TestMamba(unittest.TestCase): @with_temp_dir def test_fft(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "state-spaces/mamba-130m", diff --git a/tests/e2e/test_mistral.py b/tests/e2e/test_mistral.py index f47f794e0..08b3b05af 100644 --- a/tests/e2e/test_mistral.py +++ b/tests/e2e/test_mistral.py @@ -21,7 +21,6 @@ class TestMistral(unittest.TestCase): @with_temp_dir def test_lora(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", @@ -68,7 +67,6 @@ class TestMistral(unittest.TestCase): @with_temp_dir def test_ft(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2", diff --git a/tests/e2e/test_mixtral.py b/tests/e2e/test_mixtral.py index 3fe2bf70f..c46cf906d 100644 --- a/tests/e2e/test_mixtral.py +++ b/tests/e2e/test_mixtral.py @@ -22,7 +22,6 @@ class TestMixtral(unittest.TestCase): @with_temp_dir 
     def test_qlora_w_fa2(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "hf-internal-testing/Mixtral-tiny",
@@ -78,7 +77,6 @@ class TestMixtral(unittest.TestCase):

     @with_temp_dir
     def test_qlora_wo_fa2(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "hf-internal-testing/Mixtral-tiny",
@@ -134,7 +132,6 @@ class TestMixtral(unittest.TestCase):

     @with_temp_dir
     def test_16bit_lora_w_fa2(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "hf-internal-testing/Mixtral-tiny",
@@ -193,7 +190,6 @@ class TestMixtral(unittest.TestCase):

     @with_temp_dir
     def test_16bit_lora_wo_fa2(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "hf-internal-testing/Mixtral-tiny",
@@ -252,7 +248,6 @@ class TestMixtral(unittest.TestCase):

     @with_temp_dir
     def test_ft(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "hf-internal-testing/Mixtral-tiny",
diff --git a/tests/e2e/test_optimizers.py b/tests/e2e/test_optimizers.py
index 987d86041..dbea92a5b 100644
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -25,7 +25,6 @@ class TestCustomOptimizers(unittest.TestCase):

     @with_temp_dir
     def test_optimi_adamw(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -71,7 +70,6 @@ class TestCustomOptimizers(unittest.TestCase):

     @with_temp_dir
     @require_torch_2_5_1
     def test_adopt_adamw(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -117,7 +115,6 @@ class TestCustomOptimizers(unittest.TestCase):

     @with_temp_dir
     @require_torch_2_5_1
     def test_muon(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -164,7 +161,6 @@ class TestCustomOptimizers(unittest.TestCase):

     @with_temp_dir
     @require_torch_2_7_0
     def test_dion(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -206,7 +202,6 @@ class TestCustomOptimizers(unittest.TestCase):

     @with_temp_dir
     def test_fft_schedule_free_adamw(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -234,7 +229,6 @@
                 "save_first_step": False,
             }
         )
-        # pylint: disable=duplicate-code
         cfg = validate_config(cfg)
         normalize_config(cfg)
@@ -246,7 +240,6 @@ class TestCustomOptimizers(unittest.TestCase):

     @with_temp_dir
     @require_torch_2_6_0
     def test_came_pytorch(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "JackFram/llama-68m",
diff --git a/tests/e2e/test_packing_loss.py b/tests/e2e/test_packing_loss.py
index aec9d95f8..7cb979ce6 100644
--- a/tests/e2e/test_packing_loss.py
+++ b/tests/e2e/test_packing_loss.py
@@ -21,7 +21,6 @@ class TestPackedLlama(unittest.TestCase):

     @with_temp_dir
     def test_loss_packed(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_phi.py b/tests/e2e/test_phi.py
index ab3a63674..ae2210249 100644
--- a/tests/e2e/test_phi.py
+++ b/tests/e2e/test_phi.py
@@ -19,7 +19,6 @@ class TestPhi(unittest.TestCase):

     @with_temp_dir
     def test_phi_ft(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "microsoft/phi-1_5",
@@ -65,7 +64,6 @@ class TestPhi(unittest.TestCase):

     @with_temp_dir
     def test_phi_qlora(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "microsoft/phi-1_5",
diff --git a/tests/e2e/test_preprocess.py b/tests/e2e/test_preprocess.py
index 25f42e832..4aa4cb6c2 100644
--- a/tests/e2e/test_preprocess.py
+++ b/tests/e2e/test_preprocess.py
@@ -15,7 +15,7 @@ class TestPreprocess:

     def test_w_deepspeed(self, temp_dir):
         """make sure preproces doesn't choke when using deepspeed in the config"""
-        # pylint: disable=duplicate-code
+
         cfg = DictDefault(
             {
                 "base_model": "Qwen/Qwen2.5-0.5B",
diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py
index bd9eec48b..9d83aabbc 100644
--- a/tests/e2e/test_process_reward_model_smollm2.py
+++ b/tests/e2e/test_process_reward_model_smollm2.py
@@ -19,7 +19,6 @@ class TestProcessRewardSmolLM2(unittest.TestCase):

     @with_temp_dir
     def test_prm(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py
index 139ae155a..7d41dfb50 100644
--- a/tests/e2e/test_qat.py
+++ b/tests/e2e/test_qat.py
@@ -18,7 +18,6 @@ class TestQATLlama:
     """

     def test_qat(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -68,7 +67,6 @@ class TestQATLlama:
         check_model_output_exists(Path(temp_dir) / "checkpoint-5", cfg)

     def test_qat_dpo(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py
index 500b7e556..cfbdfec38 100644
--- a/tests/e2e/test_quantization.py
+++ b/tests/e2e/test_quantization.py
@@ -131,7 +131,7 @@ class TestQuantization:
     @require_torch_2_6_0
     def test_prepare_model_for_qat(
         self, model, weight_dtype, activation_dtype, group_size, quantize_embedding
-    ):  # pylint: disable=redefined-outer-name
+    ):
         prepare_model_for_qat(
             model, weight_dtype, group_size, activation_dtype, quantize_embedding
         )
@@ -175,7 +175,7 @@ class TestQuantization:
         group_size,
         quantize_embedding,
         expected_exception,
-    ):  # pylint: disable=redefined-outer-name
+    ):
         if expected_exception:
             with pytest.raises(expected_exception):
                 quantize_model_for_ptq(
@@ -198,11 +198,13 @@ class TestQuantization:
                 if activation_dtype:
                     assert isinstance(
                         child.weight, LinearActivationQuantizedTensor
-                    ), "Linear weight should be quantized with activation quantization"
+                    ), (
+                        "Linear weight should be quantized with activation quantization"
+                    )
                 else:
-                    assert isinstance(
-                        child.weight, AffineQuantizedTensor
-                    ), "Linear weight should be quantized without activation quantization"
+                    assert isinstance(child.weight, AffineQuantizedTensor), (
+                        "Linear weight should be quantized without activation quantization"
+                    )


 class TestQuantizationCallback:
@@ -217,9 +219,7 @@ class TestQuantizationCallback:
     )

     @require_torch_2_6_0
-    def test_qat_callback_fake_quant_after_n_steps(
-        self, model, trainer_state
-    ):  # pylint: disable=redefined-outer-name
+    def test_qat_callback_fake_quant_after_n_steps(self, model, trainer_state):
         cfg = QATConfig(
             weight_dtype="int8",
             activation_dtype="int8",
@@ -269,9 +269,7 @@ class TestQuantizationCallback:
         assert model.lm_head.weight_fake_quantizer.enabled

     @require_torch_2_6_0
-    def test_qat_callback_fake_quant_after_n_steps_is_none(
-        self, model, trainer_state
-    ):  # pylint: disable=redefined-outer-name
+    def test_qat_callback_fake_quant_after_n_steps_is_none(self, model, trainer_state):
         cfg = QATConfig(
             weight_dtype="int8",
             activation_dtype="int8",
@@ -314,9 +312,7 @@ class TestConvertQATModelForPTQ:
     """

     @require_torch_2_6_0
-    def test_convert_qat_model_for_ptq(
-        self, model
-    ):  # pylint: disable=redefined-outer-name
+    def test_convert_qat_model_for_ptq(self, model):
         config = QATConfig(
             weight_dtype="int8",
             activation_dtype="int8",
diff --git a/tests/e2e/test_qwen.py b/tests/e2e/test_qwen.py
index 59267d14d..1c75d817b 100644
--- a/tests/e2e/test_qwen.py
+++ b/tests/e2e/test_qwen.py
@@ -19,7 +19,6 @@ class TestE2eQwen:

     @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
     def test_dpo(self, base_model, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": base_model,
diff --git a/tests/e2e/test_reward_model_smollm2.py b/tests/e2e/test_reward_model_smollm2.py
index 82513f99f..cc768b173 100644
--- a/tests/e2e/test_reward_model_smollm2.py
+++ b/tests/e2e/test_reward_model_smollm2.py
@@ -19,7 +19,6 @@ class TestRewardModelLoraSmolLM2(unittest.TestCase):

     @with_temp_dir
     def test_rm_lora(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_save_first_step.py b/tests/e2e/test_save_first_step.py
index 5bbd2302b..ce2d3f145 100644
--- a/tests/e2e/test_save_first_step.py
+++ b/tests/e2e/test_save_first_step.py
@@ -20,7 +20,6 @@ class TestSaveFirstStepCallback(unittest.TestCase):

     @with_temp_dir
     def test_save_first_step(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
@@ -61,7 +60,6 @@ class TestSaveFirstStepCallback(unittest.TestCase):

     @with_temp_dir
     def test_no_save_first_step(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/test_schedulers.py b/tests/e2e/test_schedulers.py
index 8f7a13aee..5b9c56288 100644
--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -19,7 +19,6 @@ class TestCustomSchedulers(unittest.TestCase):

     @with_temp_dir
     def test_rex_scheduler(self, temp_dir):
-        # pylint: disable=duplicate-code
         cfg = DictDefault(
             {
                 "base_model": "HuggingFaceTB/SmolLM2-135M",
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index 939ed5c1c..7db6cf74e 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -2,6 +2,7 @@
 helper utils for tests
 """

+import importlib.util
 import os
 import shutil
 import tempfile
@@ -107,12 +108,7 @@ def require_vllm(test_case):
     """

     def is_vllm_installed():
-        try:
-            import vllm  # pylint: disable=unused-import  # noqa: F401
-
-            return True
-        except ImportError:
-            return False
+        return importlib.util.find_spec("vllm") is not None

     return unittest.skipUnless(
         is_vllm_installed(), "test requires vllm to be installed"
@@ -125,12 +121,7 @@ def require_llmcompressor(test_case):
     """

     def is_llmcompressor_installed():
-        try:
-            import llmcompressor  # pylint: disable=unused-import  # noqa: F401
-
-            return True
-        except ImportError:
-            return False
+        return importlib.util.find_spec("llmcompressor") is not None

     return unittest.skipUnless(
         is_llmcompressor_installed(), "test requires llmcompressor to be installed"
@@ -159,8 +150,8 @@ def check_tensorboard(
     tb_log_path = most_recent_subdir(temp_run_dir)
     event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
     reader = SummaryReader(event_file)
-    df = reader.scalars  # pylint: disable=invalid-name
-    df = df[(df.tag == tag)]  # pylint: disable=invalid-name
+    df = reader.scalars
+    df = df[(df.tag == tag)]
     lt_val = (1 + rtol) * lt_val
     if "%s" in assertion_err:
         assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1]
diff --git a/tests/hf_offline_utils.py b/tests/hf_offline_utils.py
index 385e61f18..0e4a2f067 100644
--- a/tests/hf_offline_utils.py
+++ b/tests/hf_offline_utils.py
@@ -20,7 +20,7 @@ def reload_modules(hf_hub_offline):
     importlib.reload(huggingface_hub.constants)
     huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
     importlib.reload(datasets.config)
-    setattr(datasets.config, "HF_HUB_OFFLINE", hf_hub_offline)
+    datasets.config.HF_HUB_OFFLINE = hf_hub_offline

     reset_sessions()
diff --git a/tests/integrations/test_liger.py b/tests/integrations/test_liger.py
index 5c4bd1028..d7b171ec2 100644
--- a/tests/integrations/test_liger.py
+++ b/tests/integrations/test_liger.py
@@ -10,7 +10,6 @@ from axolotl.utils.config import prepare_plugins, validate_config
 from axolotl.utils.dict import DictDefault


-# pylint: disable=duplicate-code
 @pytest.fixture(name="minimal_liger_cfg")
 def fixture_cfg():
     return DictDefault(
@@ -30,7 +29,6 @@ def fixture_cfg():
     )


-# pylint: disable=too-many-public-methods
 class TestValidation:
     """
     Test the validation module for liger
diff --git a/tests/patched/test_validation.py b/tests/patched/test_validation.py
index 677512d3d..21299ed98 100644
--- a/tests/patched/test_validation.py
+++ b/tests/patched/test_validation.py
@@ -1,4 +1,3 @@
-# pylint: disable=too-many-lines
 """Module for testing the validation module"""

 import os
@@ -49,7 +48,6 @@ class BaseValidation:
         self._caplog = caplog


-# pylint: disable=too-many-public-methods
 class TestValidation(BaseValidation):
     """
     Test the validation module
@@ -241,7 +239,7 @@ class TestValidation(BaseValidation):

     def test_lr_as_float(self, minimal_cfg):
         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "learning_rate": "5e-5",
                 }
@@ -303,7 +301,7 @@ class TestValidation(BaseValidation):
         )

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "load_in_8bit": True,
                 }
@@ -315,7 +313,7 @@ class TestValidation(BaseValidation):
             validate_config(cfg)

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "gptq": True,
                 }
@@ -327,7 +325,7 @@ class TestValidation(BaseValidation):
             validate_config(cfg)

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "load_in_4bit": False,
                 }
@@ -339,7 +337,7 @@ class TestValidation(BaseValidation):
             validate_config(cfg)

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "load_in_4bit": True,
                 }
@@ -361,7 +359,7 @@ class TestValidation(BaseValidation):
         )

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "load_in_8bit": True,
                 }
@@ -373,7 +371,7 @@ class TestValidation(BaseValidation):
             validate_config(cfg)

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "gptq": True,
                 }
@@ -385,7 +383,7 @@ class TestValidation(BaseValidation):
             validate_config(cfg)

         cfg = (
-            DictDefault(  # pylint: disable=unsupported-binary-operation
+            DictDefault(
                 {
                     "load_in_4bit": True,
                 }
diff --git a/tests/prompt_strategies/conftest.py b/tests/prompt_strategies/conftest.py
index 7f942e0ef..12c4bcd93 100644
--- a/tests/prompt_strategies/conftest.py
+++ b/tests/prompt_strategies/conftest.py
@@ -30,7 +30,6 @@ def fixture_assistant_dataset():

 @pytest.fixture(name="sharegpt_dataset")
def fixture_sharegpt_dataset(): - # pylint: disable=duplicate-code return Dataset.from_list( [ { @@ -47,7 +46,6 @@ def fixture_sharegpt_dataset(): @pytest.fixture(name="basic_dataset") def fixture_basic_dataset(): - # pylint: disable=duplicate-code return Dataset.from_list( [ { @@ -65,7 +63,6 @@ def fixture_basic_dataset(): @pytest.fixture(name="toolcalling_dataset") def fixture_toolcalling_dataset(): - # pylint: disable=duplicate-code return Dataset.from_list( [ { @@ -112,7 +109,7 @@ def fixture_toolcalling_dataset(): @enable_hf_offline def fixture_llama3_tokenizer( download_llama3_8b_instruct_model_fixture, -): # pylint: disable=unused-argument,redefined-outer-name +): tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct") return tokenizer @@ -129,7 +126,7 @@ def fixture_smollm2_tokenizer(): @enable_hf_offline def fixture_mistralv03_tokenizer( download_mlx_mistral_7b_model_fixture, -): # pylint: disable=unused-argument,redefined-outer-name +): tokenizer = AutoTokenizer.from_pretrained( "mlx-community/Mistral-7B-Instruct-v0.3-4bit" ) diff --git a/tests/prompt_strategies/messages/test_chat.py b/tests/prompt_strategies/messages/test_chat.py index a4c2ae67f..f083232a8 100644 --- a/tests/prompt_strategies/messages/test_chat.py +++ b/tests/prompt_strategies/messages/test_chat.py @@ -2,7 +2,6 @@ tests for chat_template prompt strategy """ -# pylint: disable=duplicate-code import unittest from axolotl.prompt_strategies.messages.chat import load @@ -53,9 +52,9 @@ class TestMessagesChatLlama3: # fmt: on LOG.debug(f"Expected input_ids: {expected_input_ids}") LOG.debug(f"Actual input_ids: {input_ids}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) if __name__ == "__main__": diff --git a/tests/prompt_strategies/test_alpaca.py b/tests/prompt_strategies/test_alpaca.py index 78f783747..b96ebce19 100644 --- a/tests/prompt_strategies/test_alpaca.py +++ b/tests/prompt_strategies/test_alpaca.py @@ -30,7 +30,6 @@ def fixture_alpaca_dataset(): @pytest.fixture(name="tokenizer") @enable_hf_offline def fixture_tokenizer(): - # pylint: disable=all tokenizer = AutoTokenizer.from_pretrained( "casperhansen/mistral-7b-instruct-v0.1-awq" ) diff --git a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py index 502efae4b..e8d35e974 100644 --- a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py +++ b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py @@ -18,9 +18,7 @@ def fixture_messages_w_tools(): {"messages":[{"role":"user","content":"move to (0, 1)"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"move","arguments":{"x":0,"y":1}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in 
degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} {"messages":[{"role":"user","content":"turn 270 degree"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"turn","arguments":{"theta": 270}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} {"messages":[{"role":"user","content":"jump high"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"invalid_prompt","arguments":{"message": "jump is not a valid action"}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false} - """.strip().split( - "\n" - ) + """.strip().split("\n") rows = [json.loads(row) for row in jsons] return Dataset.from_list(rows) @@ -28,7 +26,7 @@ def fixture_messages_w_tools(): @pytest.fixture(name="qwen3_tokenizer") def qwen3_tokenizer_fixture( download_qwen3_half_billion_model, -): # pylint: disable=unused-argument +): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") return tokenizer diff --git a/tests/prompt_strategies/test_chat_templates.py b/tests/prompt_strategies/test_chat_templates.py index 371ccf616..90e0e274b 100644 --- a/tests/prompt_strategies/test_chat_templates.py +++ b/tests/prompt_strategies/test_chat_templates.py @@ -67,9 +67,9 @@ class TestAssistantChatTemplateLlama3: # fmt: 
on LOG.debug(f"Expected input_ids: {expected_input_ids}") LOG.debug(f"Actual input_ids: {input_ids}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) def test_llama3(self, llama3_tokenizer, assistant_dataset): LOG.info("Testing llama-3 with assistant dataset") @@ -109,9 +109,9 @@ class TestAssistantChatTemplateLlama3: # fmt: on LOG.debug(f"Expected input_ids: {expected_input_ids}") LOG.debug(f"Actual input_ids: {input_ids}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) def test_phi35(self, phi35_tokenizer, assistant_dataset): LOG.info("Testing phi-3.5 with assistant dataset") @@ -161,15 +161,15 @@ class TestAssistantChatTemplateLlama3: # fmt: on LOG.debug(f"Expected input_ids: {expected_input_ids}") LOG.debug(f"Actual input_ids: {input_ids}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) LOG.debug(f"Expected labels : {expected_labels}") LOG.debug(f"Actual labels : {labels}") - assert ( - labels == expected_labels - ), f"Input IDs mismatch: {labels} != {expected_labels}" + assert labels == expected_labels, ( + f"Input IDs mismatch: {labels} != {expected_labels}" + ) def test_llama3_with_training_data(self, llama3_tokenizer, assistant_dataset): LOG.info("Testing llama-3 with assistant dataset including training data") @@ -234,7 +234,7 @@ class TestSharegptChatTemplateLlama3: def test_llama3_assistant(self, llama3_tokenizer, sharegpt_dataset): LOG.info("Testing ShareGPT style datasets with llama-3 assistant prompts") - # pylint: disable=duplicate-code + strategy = ChatTemplateStrategy( ChatTemplatePrompter( llama3_tokenizer, @@ -285,16 +285,16 @@ class TestSharegptChatTemplateLlama3: LOG.debug(f"Expected labels: {expected_labels}") LOG.debug(f"Actual labels: {labels}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" - assert ( - labels == expected_labels - ), f"Labels mismatch: {labels} != {expected_labels}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) + assert labels == expected_labels, ( + f"Labels mismatch: {labels} != {expected_labels}" + ) def test_llama3_human(self, llama3_tokenizer, sharegpt_dataset): LOG.info("Testing ShareGPT style datasets with llama-3 human prompts") - # pylint: disable=duplicate-code + strategy = ChatTemplateStrategy( ChatTemplatePrompter( llama3_tokenizer, @@ -345,16 +345,16 @@ class TestSharegptChatTemplateLlama3: LOG.debug(f"Expected labels: {expected_labels}") LOG.debug(f"Actual labels: {labels}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" - assert ( - labels == expected_labels - ), f"Labels mismatch: {labels} != {expected_labels}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) + assert labels == expected_labels, ( + f"Labels mismatch: {labels} != {expected_labels}" + ) def test_llama3_system_human(self, llama3_tokenizer, basic_dataset): LOG.info("Testing ShareGPT style datasets with llama-3 system/human 
prompts") - # pylint: disable=duplicate-code + strategy = ChatTemplateStrategy( ChatTemplatePrompter( llama3_tokenizer, @@ -409,12 +409,12 @@ class TestSharegptChatTemplateLlama3: LOG.debug(f"Expected labels: {expected_labels}") LOG.debug(f"Actual labels: {labels}") - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" - assert ( - labels == expected_labels - ), f"Labels mismatch: {labels} != {expected_labels}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) + assert labels == expected_labels, ( + f"Labels mismatch: {labels} != {expected_labels}" + ) class TestAssistantToolCallingChatTemplateLlama32Vision: @@ -481,13 +481,13 @@ class TestAssistantToolCallingChatTemplateLlama32Vision: ] # fmt: on - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) - assert ( - labels == expected_labels - ), f"Labels mismatch: {labels} != {expected_labels}" + assert labels == expected_labels, ( + f"Labels mismatch: {labels} != {expected_labels}" + ) def test_llama32vision_train_on_tools( self, llama3_tokenizer, toolcalling_dataset, llama3_2_vision_chat_template_jinja @@ -495,7 +495,6 @@ class TestAssistantToolCallingChatTemplateLlama32Vision: LOG.info( "Testing assistant style datasets with tool_calling with llama-32 chat template, training on tools" ) - # pylint: disable=duplicate-code strategy = ChatTemplateStrategy( ChatTemplatePrompter( @@ -549,13 +548,13 @@ class TestAssistantToolCallingChatTemplateLlama32Vision: ] # fmt: on - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) - assert ( - labels == expected_labels - ), f"Labels mismatch: {labels} != {expected_labels}" + assert labels == expected_labels, ( + f"Labels mismatch: {labels} != {expected_labels}" + ) if __name__ == "__main__": diff --git a/tests/prompt_strategies/test_chat_templates_advanced.py b/tests/prompt_strategies/test_chat_templates_advanced.py index f847cab4a..fd39a4305 100644 --- a/tests/prompt_strategies/test_chat_templates_advanced.py +++ b/tests/prompt_strategies/test_chat_templates_advanced.py @@ -2,8 +2,6 @@ tests for chat_template prompt strategy """ -# pylint: disable=too-many-lines - from copy import deepcopy import pytest @@ -96,9 +94,9 @@ class TestChatTemplateConfigurations: and turn.get("from") in ["system", "context"] and ("mistral" in tokenizer.name_or_path.lower()) ): - assert ( - start_idx == -1 and end_idx == -1 - ), "Expected system message to be skipped" + assert start_idx == -1 and end_idx == -1, ( + "Expected system message to be skipped" + ) return True return False @@ -155,7 +153,9 @@ class TestChatTemplateConfigurations: assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for input '{response}' to be ignored, but got {labels[start_idx:end_idx]}" + ), ( + f"Expected labels for input '{response}' to be ignored, but got {labels[start_idx:end_idx]}" + ) LOG.debug("Full labels: %s", labels) LOG.debug("Full input_ids: %s", input_ids) @@ -215,11 +215,15 @@ class TestChatTemplateConfigurations: if is_assistant: assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for assistant response 
'{response}' to be set, but got {labels[start_idx:end_idx]}" + ), ( + f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}" + ) else: assert all( label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}" + ), ( + f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}" + ) def test_roles_to_train_human_assistant_only( self, @@ -276,11 +280,15 @@ class TestChatTemplateConfigurations: if should_be_labelled: assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}" + ), ( + f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}" + ) else: assert all( label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}" + ), ( + f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}" + ) def test_roles_to_train_all( self, @@ -327,13 +335,15 @@ class TestChatTemplateConfigurations: continue decoded_response = tokenizer.decode(input_ids[start_idx:end_idx]) - assert ( - response in decoded_response - ), f"Response {response} not found in index {start_idx}:{end_idx} decoded:{decoded_response}" + assert response in decoded_response, ( + f"Response {response} not found in index {start_idx}:{end_idx} decoded:{decoded_response}" + ) assert all( label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for response '{response}' to be set, but got {labels[start_idx:end_idx]}" + ), ( + f"Expected labels for response '{response}' to be set, but got {labels[start_idx:end_idx]}" + ) def test_empty_roles_to_train( self, @@ -371,9 +381,9 @@ class TestChatTemplateConfigurations: # Verify that no labels are set when roles_to_train is empty LOG.debug("Full labels: %s", labels) - assert all( - label == IGNORE_TOKEN_ID for label in labels - ), "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty" + assert all(label == IGNORE_TOKEN_ID for label in labels), ( + "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty" + ) def test_train_on_eos_all( self, @@ -417,9 +427,9 @@ class TestChatTemplateConfigurations: assert len(eos_indices) > 0, "Expected at least one EOS token in the input" for eos_idx in eos_indices: - assert ( - labels[eos_idx] != IGNORE_TOKEN_ID - ), f"Expected EOS token at index {eos_idx} to be labeled" + assert labels[eos_idx] != IGNORE_TOKEN_ID, ( + f"Expected EOS token at index {eos_idx} to be labeled" + ) def test_train_on_eos_turn( self, @@ -477,9 +487,9 @@ class TestChatTemplateConfigurations: while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id: eos_idx += 1 - assert eos_idx < len( - input_ids - ), f"Could not find EOS token after '{response}'" + assert eos_idx < len(input_ids), ( + f"Could not find EOS token after '{response}'" + ) LOG.debug( f"Turn {i}: role={turn['from']}, content='{turn['value']}', start_idx={start_idx}, end_idx={end_idx}, eos_idx={eos_idx}" @@ -492,13 +502,13 @@ class TestChatTemplateConfigurations: # Verify EOS token labeling based on role is_assistant = turn["from"] == "assistant" if is_assistant: - assert ( - labels[eos_idx] != IGNORE_TOKEN_ID - ), 
f"Expected EOS token after assistant response '{response}' to be labeled" + assert labels[eos_idx] != IGNORE_TOKEN_ID, ( + f"Expected EOS token after assistant response '{response}' to be labeled" + ) else: - assert ( - labels[eos_idx] == IGNORE_TOKEN_ID - ), f"Expected EOS token after non-assistant input '{response}' to not be labeled" + assert labels[eos_idx] == IGNORE_TOKEN_ID, ( + f"Expected EOS token after non-assistant input '{response}' to not be labeled" + ) def test_train_on_eos_last( self, @@ -545,12 +555,12 @@ class TestChatTemplateConfigurations: # Check that only the last EOS token is labeled for idx in eos_indices[:-1]: - assert ( - labels[idx] == IGNORE_TOKEN_ID - ), f"Expected EOS token at index {idx} to not be labeled" - assert ( - labels[last_eos_idx] != IGNORE_TOKEN_ID - ), f"Expected last EOS token at index {last_eos_idx} to be labeled" + assert labels[idx] == IGNORE_TOKEN_ID, ( + f"Expected EOS token at index {idx} to not be labeled" + ) + assert labels[last_eos_idx] != IGNORE_TOKEN_ID, ( + f"Expected last EOS token at index {last_eos_idx} to be labeled" + ) def test_train_on_eos_none( self, @@ -594,9 +604,9 @@ class TestChatTemplateConfigurations: assert len(eos_indices) > 0, "Expected at least one EOS token in the input" for eos_idx in eos_indices: - assert ( - labels[eos_idx] == IGNORE_TOKEN_ID - ), f"Expected EOS token at index {eos_idx} to not be labeled" + assert labels[eos_idx] == IGNORE_TOKEN_ID, ( + f"Expected EOS token at index {eos_idx} to not be labeled" + ) def test_drop_system_message( self, @@ -634,9 +644,9 @@ class TestChatTemplateConfigurations: # Check if system message is not present in input_ids system_message = "You are an AI assistant." decoded_message = tokenizer.decode(input_ids) - assert ( - system_message not in decoded_message - ), "Expected system message to be dropped" + assert system_message not in decoded_message, ( + "Expected system message to be dropped" + ) def test_custom_roles( self, @@ -711,7 +721,9 @@ class TestChatTemplateConfigurations: else: assert all( label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx] - ), f"Expected labels for non-AI message '{response}' to be IGNORE_TOKEN_ID" + ), ( + f"Expected labels for non-AI message '{response}' to be IGNORE_TOKEN_ID" + ) def test_message_field_training( self, @@ -776,13 +788,13 @@ class TestChatTemplateConfigurations: def verify_labels(labels_span, should_train, context_message): """Helper to verify if a span of labels matches expected training state""" if should_train: - assert all( - label != IGNORE_TOKEN_ID for label in labels_span - ), f"Expected all labels for {context_message} to be set, but got {labels_span}" + assert all(label != IGNORE_TOKEN_ID for label in labels_span), ( + f"Expected all labels for {context_message} to be set, but got {labels_span}" + ) else: - assert all( - label == IGNORE_TOKEN_ID for label in labels_span - ), f"Expected all labels for {context_message} to be {IGNORE_TOKEN_ID}, but got {labels_span}" + assert all(label == IGNORE_TOKEN_ID for label in labels_span), ( + f"Expected all labels for {context_message} to be {IGNORE_TOKEN_ID}, but got {labels_span}" + ) # Process all turns and verify labeling for i, turn in enumerate(modified_dataset[0]["messages"]): @@ -861,9 +873,9 @@ class TestChatTemplateConfigurations: actual_labels = labels[ start_idx : start_idx + len(token_offsets_masked) ] - assert ( - actual_labels == expected_labels - ), f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}" + 
assert actual_labels == expected_labels, ( + f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}" + ) # Verify each detail section for detail in adjusted_train_details: @@ -958,7 +970,7 @@ class TestChatTemplateConfigurations: chat_template, chat_template_jinja, eos_token, - basic_dataset, # pylint: disable=unused-argument + basic_dataset, request, ): """Test that an error is raised when eot_tokens contains eos_token and train_on_eot/train_on_eos conflict""" @@ -1005,7 +1017,7 @@ class TestChatTemplateConfigurations: chat_template, chat_template_jinja, eos_token, - basic_dataset, # pylint: disable=unused-argument + basic_dataset, request, ): """Test that eot_tokens inherits from eos_token when not specified""" @@ -1032,12 +1044,12 @@ class TestChatTemplateConfigurations: ) # In backward compatibility mode, eot_tokens should be derived from eos_token - assert strategy.eot_tokens == [ - tokenizer.eos_token - ], f"Expected eot_tokens to inherit from eos_token, got {strategy.eot_tokens}" - assert ( - strategy.train_on_eot == "turn" - ), f"Expected train_on_eot to inherit from train_on_eos, got {strategy.train_on_eot}" + assert strategy.eot_tokens == [tokenizer.eos_token], ( + f"Expected eot_tokens to inherit from eos_token, got {strategy.eot_tokens}" + ) + assert strategy.train_on_eot == "turn", ( + f"Expected train_on_eot to inherit from train_on_eos, got {strategy.train_on_eot}" + ) def test_token_not_in_template( self, @@ -1091,7 +1103,7 @@ class TestChatTemplateConfigurations: tokenizer, chat_template, chat_template_jinja, - eos_token, # pylint: disable=unused-argument + eos_token, basic_dataset, request, ): @@ -1157,13 +1169,13 @@ class TestChatTemplateConfigurations: ) if is_after_assistant: - assert ( - labels[eot_idx] != IGNORE_TOKEN_ID - ), f"Expected EOT token after assistant turn at index {eot_idx} to be labeled" + assert labels[eot_idx] != IGNORE_TOKEN_ID, ( + f"Expected EOT token after assistant turn at index {eot_idx} to be labeled" + ) else: - assert ( - labels[eot_idx] == IGNORE_TOKEN_ID - ), f"Expected EOT token not after assistant turn at index {eot_idx} to not be labeled" + assert labels[eot_idx] == IGNORE_TOKEN_ID, ( + f"Expected EOT token not after assistant turn at index {eot_idx} to not be labeled" + ) def test_multiple_train_on_eot_settings( self, @@ -1224,9 +1236,9 @@ class TestChatTemplateConfigurations: i for i, token_id in enumerate(input_ids) if token_id == eos_token_id ] - assert ( - len(eos_indices) > 0 - ), "Expected at least one EOS/EOT token in the input" + assert len(eos_indices) > 0, ( + "Expected at least one EOS/EOT token in the input" + ) # Check labeling for each EOS/EOT token for idx, eos_idx in enumerate(eos_indices): @@ -1252,13 +1264,13 @@ class TestChatTemplateConfigurations: ) if expected_label: - assert ( - labels[eos_idx] == IGNORE_TOKEN_ID - ), f"Expected EOT token at index {eos_idx} to not be labeled with train_on_eot='{setting}'" + assert labels[eos_idx] == IGNORE_TOKEN_ID, ( + f"Expected EOT token at index {eos_idx} to not be labeled with train_on_eot='{setting}'" + ) else: - assert ( - labels[eos_idx] != IGNORE_TOKEN_ID - ), f"Expected EOT token at index {eos_idx} to be labeled with train_on_eot='{setting}'" + assert labels[eos_idx] != IGNORE_TOKEN_ID, ( + f"Expected EOT token at index {eos_idx} to be labeled with train_on_eot='{setting}'" + ) class TestChatTemplateToolCalling: @@ -1378,29 +1390,27 @@ class TestChatTemplateToolCalling: decoded_conversation = tokenizer.decode(input_ids) # Verify 
tool calling structure is present in the decoded conversation - assert ( - '"type": "function",' in decoded_conversation - ), "Tool type function should be in conversation" - assert ( - '"name": "multiples",' in decoded_conversation - ), "Tool function name should be in conversation" + assert '"type": "function",' in decoded_conversation, ( + "Tool type function should be in conversation" + ) + assert '"name": "multiples",' in decoded_conversation, ( + "Tool function name should be in conversation" + ) assert ( '<|python_start|><|python_end|>{"name": "multiples", "parameters": {"number": 5, "limit": 20}}<|eot|>' in decoded_conversation ), "Assistant tool call should be in conversation" - assert ( - "<|header_start|>ipython<|header_end|>" in decoded_conversation - ), "IPython header should be in conversation" - assert ( - '"5,10,15"' in decoded_conversation - ), "Tool response should be in conversation" + assert "<|header_start|>ipython<|header_end|>" in decoded_conversation, ( + "IPython header should be in conversation" + ) + assert '"5,10,15"' in decoded_conversation, ( + "Tool response should be in conversation" + ) # Get conversation turns to verify labeling turns = strategy.get_conversation_thread(tool_calling_dataset[0]) - tools = strategy._get_tools( # pylint: disable=protected-access - tool_calling_dataset[0] - ) + tools = strategy._get_tools(tool_calling_dataset[0]) # Check that assistant responses are properly labeled for i, turn in enumerate(tool_calling_dataset[0]["messages"]): @@ -1409,12 +1419,12 @@ class TestChatTemplateToolCalling: turns=turns, turn_idx=i, tools=tools ) - assert ( - start_idx != -1 and end_idx != -1 - ), f"Assistant turn {i} should be found" + assert start_idx != -1 and end_idx != -1, ( + f"Assistant turn {i} should be found" + ) # Verify that assistant responses have proper labels turn_labels = labels[start_idx:end_idx] - assert all( - label != IGNORE_TOKEN_ID for label in turn_labels - ), f"Assistant turn {i} should be unmasked" + assert all(label != IGNORE_TOKEN_ID for label in turn_labels), ( + f"Assistant turn {i} should be unmasked" + ) diff --git a/tests/prompt_strategies/test_chat_templates_mistral.py b/tests/prompt_strategies/test_chat_templates_mistral.py index a5b31a771..85aa72111 100644 --- a/tests/prompt_strategies/test_chat_templates_mistral.py +++ b/tests/prompt_strategies/test_chat_templates_mistral.py @@ -28,7 +28,7 @@ def test_mistral_chat_template( request: pytest.FixtureRequest, ): """Test chat template with the Magistral/Devstral tokenizer""" - # pylint: disable=duplicate-code + from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy tokenizer: HFMistralTokenizer = request.getfixturevalue(tokenizer_str) diff --git a/tests/prompt_strategies/test_chat_templates_thinking.py b/tests/prompt_strategies/test_chat_templates_thinking.py index e807111aa..5475666a5 100644 --- a/tests/prompt_strategies/test_chat_templates_thinking.py +++ b/tests/prompt_strategies/test_chat_templates_thinking.py @@ -59,7 +59,7 @@ def messages_w_reasoning_fixture(): @pytest.fixture(name="qwen3_tokenizer") def qwen3_tokenizer_fixture( download_qwen3_half_billion_model, -): # pylint: disable=unused-argument +): tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") return tokenizer @@ -71,7 +71,6 @@ class TestSplitThinking: """ def test_splits_think(self, messages_w_reasoning, qwen3_tokenizer): - # pylint: disable=duplicate-code strategy = load( qwen3_tokenizer, DictDefault( @@ -130,6 +129,6 @@ class TestSplitThinking: 198, # \n ] # 
fmt: on - assert ( - input_ids == expected_input_ids - ), f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + assert input_ids == expected_input_ids, ( + f"Input IDs mismatch: {input_ids} != {expected_input_ids}" + ) diff --git a/tests/prompt_strategies/test_dpo_chat_templates.py b/tests/prompt_strategies/test_dpo_chat_templates.py index e5f30a6c4..e570cfc9d 100644 --- a/tests/prompt_strategies/test_dpo_chat_templates.py +++ b/tests/prompt_strategies/test_dpo_chat_templates.py @@ -16,7 +16,6 @@ from tests.hf_offline_utils import enable_hf_offline @pytest.fixture(name="assistant_dataset") def fixture_assistant_dataset(): - # pylint: disable=duplicate-code return Dataset.from_list( [ { @@ -49,7 +48,6 @@ def fixture_assistant_dataset(): @pytest.fixture(name="custom_assistant_dataset") def fixture_custom_assistant_dataset(): - # pylint: disable=duplicate-code return Dataset.from_list( [ { @@ -102,7 +100,6 @@ class TestAssistantDPOChatTemplateLlama3: """ def test_llama3_defaults(self, llama3_tokenizer, assistant_dataset): - # pylint: disable=duplicate-code transform_fn, _ = default( DictDefault( { @@ -127,7 +124,6 @@ class TestAssistantDPOChatTemplateLlama3: assert result["rejected"] == "party on<|eot_id|>" def test_llama3_configured(self, llama3_tokenizer, custom_assistant_dataset): - # pylint: disable=duplicate-code transform_fn, _ = default( DictDefault( { @@ -168,7 +164,6 @@ class TestAssistantDPOChatTemplatePhi3: """ def test_phi3_defaults(self, phi3_tokenizer, assistant_dataset): - # pylint: disable=duplicate-code transform_fn, _ = default( DictDefault( { @@ -198,7 +193,6 @@ class TestAssistantDPOChatTemplateGemma: """ def test_gemma_defaults(self, gemma_tokenizer, assistant_dataset): - # pylint: disable=duplicate-code transform_fn, _ = default( DictDefault( { diff --git a/tests/prompt_strategies/test_stepwise.py b/tests/prompt_strategies/test_stepwise.py index 2abe4ae18..ad3f7531f 100644 --- a/tests/prompt_strategies/test_stepwise.py +++ b/tests/prompt_strategies/test_stepwise.py @@ -20,7 +20,6 @@ class TestStepWiseSupervisedPromptTokenizingStrategy: @pytest.fixture() def stepwise_supervised_dataset(self): - # pylint: disable=duplicate-code return Dataset.from_list( [ { diff --git a/tests/test_chunked_xentropy.py b/tests/test_chunked_xentropy.py index 3e439f0a3..56ac1b168 100644 --- a/tests/test_chunked_xentropy.py +++ b/tests/test_chunked_xentropy.py @@ -22,7 +22,7 @@ def chunked_fixtures(): return lm_head, hidden_state, labels, vocab_size -def test_chunked_forward(chunked_fixtures): # pylint: disable=redefined-outer-name +def test_chunked_forward(chunked_fixtures): lm_head, hidden_state, labels, vocab_size = chunked_fixtures lm_loss = get_causal_lm_loss() diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 719dfdc19..ea5ee368d 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -374,7 +374,6 @@ class TestDatasetPreparation: } ) - # pylint: disable=duplicate-code with patch( "axolotl.utils.data.rl.load_dataset_with_config" ) as mock_load_dataset: diff --git a/tests/test_dict.py b/tests/test_dict.py index 0bcf8ca7b..19a370199 100644 --- a/tests/test_dict.py +++ b/tests/test_dict.py @@ -21,26 +21,26 @@ class DictDefaultTest(unittest.TestCase): } ) - assert ( - cfg.key_a.key_b == "value_a" - ), "DictDefault should return value for existing nested keys" + assert cfg.key_a.key_b == "value_a", ( + "DictDefault should return value for existing nested keys" + ) - assert ( - cfg.key_c == "value_c" - ), "DictDefault should return value for existing keys" + 
assert cfg.key_c == "value_c", ( + "DictDefault should return value for existing keys" + ) - assert ( - cfg.key_d[0] == "value_d" - ), "DictDefault should return value for existing keys in list" + assert cfg.key_d[0] == "value_d", ( + "DictDefault should return value for existing keys in list" + ) - assert ( - "value_e" in cfg.key_d - ), "DictDefault should support in operator for existing keys in list" + assert "value_e" in cfg.key_d, ( + "DictDefault should support in operator for existing keys in list" + ) def test_dict_or_operator(self): cfg = DictDefault({"key_a": {"key_b": "value_b"}, "key_f": "value_g"}) - cfg = cfg | DictDefault( # pylint: disable=unsupported-binary-operation + cfg = cfg | DictDefault( { "key_a": {"key_b": "value_a"}, "key_c": "value_c", @@ -49,9 +49,9 @@ class DictDefaultTest(unittest.TestCase): } ) - assert ( - cfg.key_a.key_b == "value_b" - ), "DictDefault should support OR operator for existing nested keys" + assert cfg.key_a.key_b == "value_b", ( + "DictDefault should support OR operator for existing nested keys" + ) assert cfg.key_c == "value_c", "DictDefault should not delete existing key" @@ -60,9 +60,9 @@ class DictDefaultTest(unittest.TestCase): "value_e", ], "DictDefault should not overwrite existing keys in list" - assert ( - cfg.key_f == "value_g" - ), "DictDefault should support OR operator for existing key" + assert cfg.key_f == "value_g", ( + "DictDefault should support OR operator for existing key" + ) def test_dict_missingkey(self): cfg = DictDefault({}) @@ -72,9 +72,9 @@ class DictDefaultTest(unittest.TestCase): def test_dict_or(self): cfg = DictDefault({}) | DictDefault({}) - assert ( - cfg.random_key is None - ), "DictDefault should return None for missing keys after | operation" + assert cfg.random_key is None, ( + "DictDefault should return None for missing keys after | operation" + ) def test_dict_nested_missingparentkey(self): """ diff --git a/tests/test_exact_deduplication.py b/tests/test_exact_deduplication.py index d97aad8ea..65deb5209 100644 --- a/tests/test_exact_deduplication.py +++ b/tests/test_exact_deduplication.py @@ -41,9 +41,9 @@ def verify_deduplication(actual_dataset, expected_dataset, dataset_name): assert actual_rows == expected_rows, f"Mismatch in {dataset_name} dataset" # Verify size consistency - assert len(actual_rows) == len( - actual_dataset - ), f"Size mismatch in {dataset_name} dataset after deduplication" + assert len(actual_rows) == len(actual_dataset), ( + f"Size mismatch in {dataset_name} dataset after deduplication" + ) class TestDeduplicateIndividualFunctions(unittest.TestCase): @@ -224,7 +224,6 @@ class TestDeduplicateRLDataset: ): """Verify that loading with deduplication removes duplicates.""" - # pylint: disable=duplicate-code with ( patch( "axolotl.utils.data.rl.load_dataset_with_config" @@ -251,7 +250,6 @@ class TestDeduplicateRLDataset: dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff, tokenizer_huggyllama, ): - # pylint: disable=duplicate-code with ( patch( "axolotl.utils.data.rl.load_dataset_with_config" @@ -271,9 +269,9 @@ class TestDeduplicateRLDataset: train_dataset, _ = prepare_preference_datasets(cfg, tokenizer) # Verify that the dataset retains duplicates - assert ( - len(train_dataset) == 1800 * 2 - ), "Dataset deduplication occurred when it should not have" + assert len(train_dataset) == 1800 * 2, ( + "Dataset deduplication occurred when it should not have" + ) class TestDeduplicateNonRL(unittest.TestCase): diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 
d45f41998..f516d0ca4 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -17,7 +17,7 @@ class TestModelsUtils: def setup_method(self) -> None: # load config - self.cfg = DictDefault( # pylint: disable=attribute-defined-outside-init + self.cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", "model_type": "AutoModelForCausalLM", @@ -30,20 +30,16 @@ class TestModelsUtils: "device_map": "auto", } ) - self.tokenizer = MagicMock( # pylint: disable=attribute-defined-outside-init - spec=PreTrainedTokenizerBase - ) - self.inference = False # pylint: disable=attribute-defined-outside-init - self.reference_model = True # pylint: disable=attribute-defined-outside-init + self.tokenizer = MagicMock(spec=PreTrainedTokenizerBase) + self.inference = False + self.reference_model = True # init ModelLoader - self.model_loader = ( # pylint: disable=attribute-defined-outside-init - ModelLoader( - cfg=self.cfg, - tokenizer=self.tokenizer, - inference=self.inference, - reference_model=self.reference_model, - ) + self.model_loader = ModelLoader( + cfg=self.cfg, + tokenizer=self.tokenizer, + inference=self.inference, + reference_model=self.reference_model, ) def test_set_device_map_config(self): @@ -51,7 +47,7 @@ class TestModelsUtils: device_map = self.cfg.device_map if is_torch_mps_available(): device_map = "mps" - # pylint: disable=protected-access + self.model_loader._set_device_map_config() if is_deepspeed_zero3_enabled(): assert "device_map" not in self.model_loader.model_kwargs @@ -78,7 +74,6 @@ class TestModelsUtils: self.cfg.gptq = gptq self.cfg.adapter = adapter - # pylint: disable=protected-access self.model_loader._set_quantization_config() if "quantization_config" in self.model_loader.model_kwargs or self.cfg.gptq: assert not ( @@ -194,7 +189,7 @@ class TestModelsUtils: is_fsdp, expected, ): - res = _get_parallel_config_kwargs( # pylint: disable=protected-access + res = _get_parallel_config_kwargs( world_size, tensor_parallel_size, context_parallel_size, diff --git a/tests/test_lora.py b/tests/test_lora.py index 6edcdd88e..50cbea9bc 100644 --- a/tests/test_lora.py +++ b/tests/test_lora.py @@ -6,7 +6,6 @@ from axolotl.loaders import ModelLoader, load_tokenizer from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault -# pylint: disable=duplicate-code minimal_config = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", diff --git a/tests/test_packed_batch_sampler.py b/tests/test_packed_batch_sampler.py index d839c6ea3..a5db7cbe0 100644 --- a/tests/test_packed_batch_sampler.py +++ b/tests/test_packed_batch_sampler.py @@ -93,7 +93,7 @@ class TestBatchedSamplerPacking: loader = DataLoader( train_dataset, batch_sampler=batch_sampler, - collate_fn=V2BatchSamplerDataCollatorForSeq2Seq( # pylint: disable=unexpected-keyword-arg + collate_fn=V2BatchSamplerDataCollatorForSeq2Seq( tokenizer=tokenizer, padding=True, pad_to_multiple_of=max_seq_length, diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py index 699d5e6cc..43e4f3d39 100644 --- a/tests/test_packed_dataset.py +++ b/tests/test_packed_dataset.py @@ -26,7 +26,6 @@ class TestPacking(unittest.TestCase): @enable_hf_offline def setUp(self) -> None: - # pylint: disable=duplicate-code self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b") self.tokenizer.add_special_tokens( { @@ -75,7 +74,6 @@ class TestPacking(unittest.TestCase): @with_temp_dir def test_lora_packing(self, temp_dir): - # pylint: disable=duplicate-code cfg = DictDefault( { 
"base_model": "HuggingFaceTB/SmolLM2-135M", @@ -127,9 +125,7 @@ class TestPacking(unittest.TestCase): _, ) = setup_model_and_trainer(cfg, dataset_meta) - sampler = trainer._get_eval_sampler( # pylint: disable=protected-access - trainer.eval_dataset - ) + sampler = trainer._get_eval_sampler(trainer.eval_dataset) assert "MultipackBatchSampler" in sampler.__class__.__name__ assert ( "V2BatchSamplerDataCollatorForSeq2Seq" @@ -140,9 +136,7 @@ class TestPacking(unittest.TestCase): batch = next(dataloader_iter) assert batch["input_ids"].shape == (1, 8192) - sampler = trainer._get_train_sampler( # pylint: disable=protected-access - trainer.train_dataset - ) + sampler = trainer._get_train_sampler(trainer.train_dataset) assert "MultipackBatchSampler" in sampler.__class__.__name__ assert ( "V2BatchSamplerDataCollatorForSeq2Seq" diff --git a/tests/test_packed_pretraining.py b/tests/test_packed_pretraining.py index 115813df2..117bc0dbd 100644 --- a/tests/test_packed_pretraining.py +++ b/tests/test_packed_pretraining.py @@ -76,7 +76,6 @@ class TestPretrainingPacking: cfg.pretraining_dataset[0]["type"] or "pretrain", ) - # pylint: disable=duplicate-code original_bsz = cfg.micro_batch_size train_dataset = wrap_pretraining_dataset( dataset, diff --git a/tests/test_perplexity.py b/tests/test_perplexity.py index 9a1c9b223..8f4306994 100644 --- a/tests/test_perplexity.py +++ b/tests/test_perplexity.py @@ -1,7 +1,5 @@ """unit tests for perplexity eval callback""" -# pylint: disable=redefined-outer-name - from pytest import fixture from transformers.models.auto.modeling_auto import AutoModelForCausalLM from transformers.models.auto.tokenization_auto import AutoTokenizer diff --git a/tests/test_prompt_tokenizers.py b/tests/test_prompt_tokenizers.py index 5e5de4ff8..672643a92 100644 --- a/tests/test_prompt_tokenizers.py +++ b/tests/test_prompt_tokenizers.py @@ -64,7 +64,7 @@ class TestPromptTokenizationStrategies: tests the interface between the user and assistant parts """ prompter = NoSystemPrompter() - # pylint: disable=duplicate-code + strat = AlpacaPromptTokenizingStrategy( prompter, tokenizer_huggyllama_w_special_tokens, @@ -85,7 +85,7 @@ class TestPromptTokenizationStrategies: """ tests the interface between the user and assistant parts """ - # pylint: disable=duplicate-code + prompter = AlpacaPrompter() strat = AlpacaPromptTokenizingStrategy( prompter, @@ -171,7 +171,7 @@ class Llama2ChatTokenizationTest: # from transformers.models.llama.tokenization_llama import DEFAULT_SYSTEM_PROMPT # broken as of 23/7/20 # see https://github.com/huggingface/transformers/pull/24935 - # pylint: disable=C0103 + DEFAULT_SYSTEM_PROMPT = """\ You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. 
@@ -201,7 +201,7 @@ If a question does not make any sense, or is not factually coherent, explain why
             + user_input[1:-1],
             generated_responses=answers,
         )
-        # pylint: disable=W0212
+
         hf_tokens = tokenizer_llama2_7b._build_conversation_input_ids(hf_conf)
 
         assert hf_tokens == tokenized_conversation["input_ids"][: len(hf_tokens)]
diff --git a/tests/test_schedulers.py b/tests/test_schedulers.py
index 92664cca8..c783a68db 100644
--- a/tests/test_schedulers.py
+++ b/tests/test_schedulers.py
@@ -22,7 +22,7 @@ class TestCosineConstantLr(unittest.TestCase):
         self.constant_lr_ratio = 0.8
         self._lr = 0.01
         self.optimizer = SGD([torch.tensor(1)], lr=self._lr)
-        self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(  # pylint: disable=attribute-defined-outside-init
+        self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
             self.optimizer,
             num_warmup_steps=self.warmup_steps,
             num_training_steps=self.train_steps,
diff --git a/tests/test_validation_dataset.py b/tests/test_validation_dataset.py
index 1a4c97314..3d3b5db96 100644
--- a/tests/test_validation_dataset.py
+++ b/tests/test_validation_dataset.py
@@ -24,7 +24,6 @@ def fixture_cfg():
     )
 
 
-# pylint: disable=too-many-public-methods (duplicate-code)
 class BaseValidation:
     """
     Base validation module to setup the log capture
diff --git a/tests/utils/schemas/validation/test_fsdp.py b/tests/utils/schemas/validation/test_fsdp.py
index 5b461a113..08fc50c61 100644
--- a/tests/utils/schemas/validation/test_fsdp.py
+++ b/tests/utils/schemas/validation/test_fsdp.py
@@ -2,7 +2,6 @@
 tests for pydantic fsdp validation
 """
 
-# pylint: disable=too-many-boolean-expressions
 import pytest
 
 from axolotl.utils.config import validate_config

From 0de254a0d043c864f97269175e0a04e00b78707b Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 26 Aug 2025 16:47:26 +0700
Subject: [PATCH 019/115] feat: add gemma3_text attention handling for lora kernels (#3103)

---
 src/axolotl/monkeypatch/lora_kernels.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py
index ef5174ba2..e845dc6ce 100644
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -149,6 +149,11 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
 
         return MistralAttention
 
+    if model_type == "gemma3_text":
+        from transformers.models.gemma3.modeling_gemma3 import Gemma3Attention
+
+        return Gemma3Attention
+
     try:
         # Dynamically import the module and attention class
         module_path = f"transformers.models.{model_type}.modeling_{model_type}"
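The dedicated gemma3_text branch is needed because text-only Gemma 3 checkpoints reuse the multimodal gemma3 modeling module, so the generic fallback that derives a module path from model_type would look in the wrong place. A minimal sketch of the lookup that would otherwise fail (the model_type literal is the only input; everything else is standard library):

    import importlib

    model_type = "gemma3_text"
    # the generic fallback path; transformers ships no gemma3_text module,
    # so this import raises ModuleNotFoundError
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    try:
        importlib.import_module(module_path)
    except ModuleNotFoundError:
        # the new branch instead returns Gemma3Attention from
        # transformers.models.gemma3.modeling_gemma3
        pass
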
From 0e9945e3b91e853b36e97c0dbd29bfd778382511 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 26 Aug 2025 09:29:50 -0400
Subject: [PATCH 020/115] deploy training jobs to baseten w truss in axolotl cli (#3086) [skip ci]

* deploy training jobs to baseten w truss in axolotl cli

* cleanup

---
 examples/cloud/baseten.yaml                   | 10 +++
 src/axolotl/cli/cloud/__init__.py             | 13 +++-
 src/axolotl/cli/cloud/baseten/__init__.py     | 48 +++++++++++++
 src/axolotl/cli/cloud/baseten/template/run.sh |  9 +++
 .../cli/cloud/baseten/template/train_sft.py   | 71 +++++++++++++++++++
 5 files changed, 149 insertions(+), 2 deletions(-)
 create mode 100644 src/axolotl/cli/cloud/baseten/__init__.py
 create mode 100644 src/axolotl/cli/cloud/baseten/template/run.sh
 create mode 100644 src/axolotl/cli/cloud/baseten/template/train_sft.py

diff --git a/examples/cloud/baseten.yaml b/examples/cloud/baseten.yaml
new file mode 100644
index 000000000..23c4b52d6
--- /dev/null
+++ b/examples/cloud/baseten.yaml
@@ -0,0 +1,10 @@
+provider: baseten
+project_name:
+
+secrets:
+  - HF_TOKEN
+  - WANDB_API_KEY
+
+gpu: h100
+gpu_count: 8
+node_count: 1
diff --git a/src/axolotl/cli/cloud/__init__.py b/src/axolotl/cli/cloud/__init__.py
index bf12ab8cb..60f6a51ce 100644
--- a/src/axolotl/cli/cloud/__init__.py
+++ b/src/axolotl/cli/cloud/__init__.py
@@ -7,6 +7,8 @@ from typing import Literal
 
 import yaml
 
+from axolotl.cli.cloud.base import Cloud
+from axolotl.cli.cloud.baseten import BasetenCloud
 from axolotl.cli.cloud.modal_ import ModalCloud
 from axolotl.utils.dict import DictDefault
 
@@ -38,8 +40,15 @@ def do_cli_train(
     cwd=None,
     **kwargs,
 ) -> None:
-    cloud_cfg = load_cloud_cfg(cloud_config)
-    cloud = ModalCloud(cloud_cfg)
+    cloud_cfg: DictDefault = load_cloud_cfg(cloud_config)
+    provider = cloud_cfg.provider or "modal"
+    cloud: Cloud | None
+    if provider == "modal":
+        cloud = ModalCloud(cloud_cfg)
+    elif provider == "baseten":
+        cloud = BasetenCloud(cloud_cfg.to_dict())
+    else:
+        raise ValueError(f"Unsupported cloud provider: {provider}")
     with open(config, "r", encoding="utf-8") as file:
         config_yaml = file.read()
     local_dirs = {}
diff --git a/src/axolotl/cli/cloud/baseten/__init__.py b/src/axolotl/cli/cloud/baseten/__init__.py
new file mode 100644
index 000000000..914504de3
--- /dev/null
+++ b/src/axolotl/cli/cloud/baseten/__init__.py
@@ -0,0 +1,48 @@
+"""Baseten Cloud CLI"""
+
+import shutil
+import subprocess  # nosec B404
+import tempfile
+from os.path import dirname
+from typing import Literal
+
+import yaml
+
+from axolotl.cli.cloud.base import Cloud
+
+
+class BasetenCloud(Cloud):
+    """Baseten Cloud Axolotl CLI"""
+
+    def __init__(self, config: dict):
+        self.config = config
+
+    def preprocess(self, config_yaml: str, *args, **kwargs) -> None:
+        raise NotImplementedError(
+            "Separate preprocess function for Baseten is not "
+            "implemented and will happen during the train step."
+        )
+
+    def train(
+        self,
+        config_yaml: str,
+        launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
+        launcher_args: list[str] | None = None,
+        local_dirs: dict[str, str] | None = None,  # pylint: disable=unused-argument
+        **kwargs,
+    ):
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            config = self.config.copy()
+            config["launcher"] = launcher
+            config["launcher_args"] = launcher_args
+            with open(tmp_dir + "/cloud.yaml", "w", encoding="utf-8") as cloud_fout:
+                yaml.dump(config, cloud_fout)
+            with open(tmp_dir + "/train.yaml", "w", encoding="utf-8") as config_fout:
+                config_fout.write(config_yaml)
+            shutil.copyfile(dirname(__file__) + "/template/run.sh", tmp_dir + "/run.sh")
+            shutil.copyfile(
+                dirname(__file__) + "/template/train_sft.py", tmp_dir + "/train_sft.py"
+            )
+            subprocess.run(  # nosec B603 B607
+                ["truss", "train", "push", "train_sft.py"], cwd=tmp_dir, check=False
+            )
diff --git a/src/axolotl/cli/cloud/baseten/template/run.sh b/src/axolotl/cli/cloud/baseten/template/run.sh
new file mode 100644
index 000000000..37dc9688f
--- /dev/null
+++ b/src/axolotl/cli/cloud/baseten/template/run.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+set -eux
+
+export NCCL_SOCKET_IFNAME="^docker0,lo"
+export NCCL_IB_DISABLE=0
+export NCCL_TIMEOUT=1800000
+
+axolotl preprocess train.yaml
+axolotl train train.yaml --launcher ${AXOLOTL_LAUNCHER} ${AXOLOTL_LAUNCHER_ARGS}
diff --git a/src/axolotl/cli/cloud/baseten/template/train_sft.py b/src/axolotl/cli/cloud/baseten/template/train_sft.py
new file mode 100644
index 000000000..137fb9171
--- /dev/null
+++ b/src/axolotl/cli/cloud/baseten/template/train_sft.py
@@ -0,0 +1,71 @@
+"""
+Baseten Training Script for Axolotl
+"""
+
+# pylint: skip-file
+import yaml
+from truss.base import truss_config
+
+# Import necessary classes from the Baseten Training SDK
+from truss_train import definitions
+
+cloud_config = yaml.safe_load(open("cloud.yaml", "r"))
+gpu = cloud_config.get("gpu", "h100")
+gpu_count = int(cloud_config.get("gpu_count", 1))
+node_count = int(cloud_config.get("node_count", 1))
+project_name = cloud_config.get("project_name", "axolotl-project") or "axolotl-project"
+secrets = cloud_config.get("secrets", [])
+launcher = cloud_config.get("launcher", "accelerate")
+launcher_args = cloud_config.get("launcher_args", [])
+script_name = "run.sh"
+
+launcher_args_str = ""
+if launcher_args:
+    launcher_args_str = "-- " + " ".join(launcher_args)
+
+# 1. Define a base image for your training job
+# must use torch 2.7.0 for vllm
+BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu126-2.7.1"
+
+# 2. Define the Runtime Environment for the Training Job
+# This includes start commands and environment variables.
+# Secrets from the baseten workspace like API keys are referenced using
+# `SecretReference`.
+
+env_vars = {
+    "AXOLOTL_LAUNCHER": launcher,
+    "AXOLOTL_LAUNCHER_ARGS": launcher_args_str,
+}
+for secret_name in secrets:
+    env_vars[secret_name] = definitions.SecretReference(name=secret_name)
+
+training_runtime = definitions.Runtime(
+    start_commands=[  # Example: list of commands to run your training script
+        f"/bin/sh -c 'chmod +x ./{script_name} && ./{script_name}'"
+    ],
+    environment_variables=env_vars,
+)
+
+# 3. Define the Compute Resources for the Training Job
+training_compute = definitions.Compute(
+    node_count=node_count,
+    accelerator=truss_config.AcceleratorSpec(
+        accelerator=truss_config.Accelerator.H100,
+        count=gpu_count,
+    ),
+)
+
+# 4. Define the Training Job
+# This brings together the image, compute, and runtime configurations.
+my_training_job = definitions.TrainingJob(
+    image=definitions.Image(base_image=BASE_IMAGE),
+    compute=training_compute,
+    runtime=training_runtime,
+)
+
+
+# This config will be pushed using the Truss CLI.
+# The association of the job to the project happens at the time of push.
+first_project_with_job = definitions.TrainingProject(
+    name=project_name, job=my_training_job
+)
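Taken together, the three generated files mirror the example cloud config at the top of this patch: cloud.yaml carries the provider settings, run.sh is the container entrypoint, and train_sft.py is the Truss job definition. A minimal sketch of the programmatic path the provider dispatch above takes for provider: baseten (the config values and file name are illustrative):

    from axolotl.cli.cloud.baseten import BasetenCloud

    cloud = BasetenCloud({"project_name": "axolotl-project", "gpu": "h100", "gpu_count": 8})
    with open("train.yaml", "r", encoding="utf-8") as f:
        # writes cloud.yaml, run.sh, and train_sft.py into a temp dir,
        # then shells out to `truss train push train_sft.py`
        cloud.train(f.read(), launcher="accelerate")
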
From c4c4b906382fed7ec3b3dfb7ef2d6f4734962c60 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 26 Aug 2025 09:30:04 -0400
Subject: [PATCH 021/115] add tokenizer_save_jinja_files to keep legacy behavior of including chat template in tokenizer_config.json (#3093)

* add tokenizer_save_jinja_files to keep legacy behavior of including chat template in tokenizer_config.json

* fix test import

---
 src/axolotl/cli/merge_lora.py        |  5 ++-
 src/axolotl/cli/quantize.py          |  1 +
 src/axolotl/core/builders/causal.py  |  3 ++
 src/axolotl/core/trainers/base.py    | 17 +++++++-
 src/axolotl/train.py                 |  7 +++-
 src/axolotl/utils/config/__init__.py |  2 +-
 src/axolotl/utils/schemas/model.py   |  6 +++
 tests/e2e/test_tokenizer.py          | 63 ++++++++++++++++++++++++++++
 8 files changed, 100 insertions(+), 4 deletions(-)
 create mode 100644 tests/e2e/test_tokenizer.py

diff --git a/src/axolotl/cli/merge_lora.py b/src/axolotl/cli/merge_lora.py
index 31fad1b29..657ddcfe4 100644
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -43,7 +43,10 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
         safe_serialization=safe_serialization,
         progressbar=True,
     )
-    tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
+    tokenizer.save_pretrained(
+        str(Path(cfg.output_dir) / "merged"),
+        save_jinja_files=cfg.tokenizer_save_jinja_files,
+    )
     if processor:
         processor.save_pretrained(str(Path(cfg.output_dir) / "merged"))
 
diff --git a/src/axolotl/cli/quantize.py b/src/axolotl/cli/quantize.py
index 0782976fe..b8a8de781 100644
--- a/src/axolotl/cli/quantize.py
+++ b/src/axolotl/cli/quantize.py
@@ -84,5 +84,6 @@ def do_quantize(
         str(Path(output_dir) / "quantized"),
         safe_serialization=False,
         progressbar=True,
+        save_jinja_files=cfg.tokenizer_save_jinja_files,
     )
     LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}...")
diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py
index 94b0db851..e5bc68c39 100644
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -404,6 +404,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             **trainer_kwargs,
         )
         trainer = self.hook_post_create_trainer(trainer)
+        # if the trainer has the `axolotl_cfg` property, set it
+        if hasattr(trainer, "axolotl_cfg"):
+            trainer.axolotl_cfg = self.cfg
         for callback in self.get_post_trainer_create_callbacks(trainer):
             trainer.add_callback(callback)
 
diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 4b8861790..f707d4b5a 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -42,6 +42,7 @@ from axolotl.core.trainers.utils import (
 )
 from axolotl.utils import get_not_null
 from axolotl.utils.bench import get_gpu_memory_usage
+from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
 from axolotl.utils.logging import get_logger
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
@@ -63,6 +64,15 @@ class AxolotlTrainer(
 
     args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
     tag_names = ["axolotl"]
+    _axolotl_cfg: DictDefault | None = None
+
+    @property
def axolotl_cfg(self): + return self._axolotl_cfg + + @axolotl_cfg.setter + def axolotl_cfg(self, cfg): + self._axolotl_cfg = cfg def __init__( self, @@ -657,6 +667,11 @@ class AxolotlTrainer( LOG.info( "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`" ) - self.data_collator.tokenizer.save_pretrained(output_dir) + save_jinja_files = True + if self.axolotl_cfg: + save_jinja_files = self.axolotl_cfg.tokenizer_save_jinja_files + self.data_collator.tokenizer.save_pretrained( + output_dir, save_jinja_files=save_jinja_files + ) # Good practice: save your training arguments together with the trained model torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME)) diff --git a/src/axolotl/train.py b/src/axolotl/train.py index e409d4a11..e8e314579 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -416,7 +416,9 @@ def save_initial_configs( # Pre-save the tokenizer and model configs LOG.info(f"Pre-saving tokenizer to {cfg.output_dir}...") - tokenizer.save_pretrained(str(output_dir)) + tokenizer.save_pretrained( + str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files + ) if hasattr(model, "config"): LOG.info(f"Pre-saving model config to {cfg.output_dir}...") model.config.save_pretrained(str(output_dir)) @@ -592,6 +594,9 @@ def train( # Save the trained model and cleanup save_trained_model(cfg, trainer, model, safe_serialization) + tokenizer.save_pretrained( + str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files + ) create_model_card(cfg, trainer) if not cfg.use_ray: cleanup_distributed() diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py index 534d7c4a4..2b6ef8d98 100644 --- a/src/axolotl/utils/config/__init__.py +++ b/src/axolotl/utils/config/__init__.py @@ -77,7 +77,7 @@ def resolve_dtype(cfg): if cfg.device == "mps": cfg.load_in_8bit = False cfg.tf32 = False - if cfg.bf16: + if cfg.bf16 and cfg.fp16 is not False: cfg.fp16 = True cfg.bf16 = False else: diff --git a/src/axolotl/utils/schemas/model.py b/src/axolotl/utils/schemas/model.py index eb751bfcc..56b206b51 100644 --- a/src/axolotl/utils/schemas/model.py +++ b/src/axolotl/utils/schemas/model.py @@ -59,6 +59,12 @@ class ModelInputConfig(BaseModel): processor_type: str | None = Field( default=None, json_schema_extra={"description": "transformers processor class"} ) + tokenizer_save_jinja_files: bool | None = Field( + default=True, # match the default behavior from transformers + json_schema_extra={ + "description": "Whether to save jinja files for tokenizer, transformers default is True" + }, + ) trust_remote_code: bool | None = Field( default=None, json_schema_extra={"description": "Trust remote code for untrusted source"}, diff --git a/tests/e2e/test_tokenizer.py b/tests/e2e/test_tokenizer.py new file mode 100644 index 000000000..a65c17ac3 --- /dev/null +++ b/tests/e2e/test_tokenizer.py @@ -0,0 +1,63 @@ +""" +e2e test for saving the tokenizer +""" + +from unittest.mock import patch + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from tests.e2e.utils import check_model_output_exists + + +def test_tokenizer_no_save_jinja_files(temp_dir): + # pylint: disable=duplicate-code + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "sequence_len": 1024, + "load_in_8bit": True, + "adapter": "lora", + 
"lora_r": 8, + "lora_alpha": 16, + "lora_dropout": 0.05, + "lora_target_linear": True, + "val_set_size": 0.02, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "chat_template": "chatml", + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "micro_batch_size": 2, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "max_steps": 5, + "save_first_step": False, + "fp16": False, + "tokenizer_save_jinja_files": False, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + with patch("axolotl.train.execute_training"): + train(cfg=cfg, dataset_meta=dataset_meta) + + check_model_output_exists(temp_dir, cfg) + with open(f"{temp_dir}/tokenizer_config.json", "r", encoding="utf-8") as f: + tokenizer_config = f.read() + assert "chat_template" in tokenizer_config From e1131e9619f9c86cdd8f2fec1774e41354972238 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 26 Aug 2025 09:30:22 -0400 Subject: [PATCH 022/115] make always skip_move_to_device default as true (#3084) --- src/axolotl/utils/schemas/model.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/axolotl/utils/schemas/model.py b/src/axolotl/utils/schemas/model.py index 56b206b51..04312eedd 100644 --- a/src/axolotl/utils/schemas/model.py +++ b/src/axolotl/utils/schemas/model.py @@ -71,10 +71,9 @@ class ModelInputConfig(BaseModel): ) experimental_skip_move_to_device: bool | None = Field( - default=None, + default=True, json_schema_extra={ - "description": "Don't move the model to the device before sharding. " - "This is an experimental feature that may be included in the future as the default." + "description": "Don't move the model to the device before sharding. Set to `false` to revert to legacy behavior." 
}, ) From d0d2fc56069fae1f87c9370338f6730fa7976c49 Mon Sep 17 00:00:00 2001 From: salman Date: Wed, 27 Aug 2025 09:10:14 +0100 Subject: [PATCH 023/115] Tokens per second logging [skip-e2e] (#3072) --- src/axolotl/core/builders/base.py | 12 +++- src/axolotl/core/trainers/base.py | 28 ++++++--- src/axolotl/core/training_args_base.py | 6 ++ src/axolotl/utils/bench.py | 7 ++- .../utils/callbacks/tokens_per_second.py | 62 +++++++++++++++++++ src/axolotl/utils/schemas/config.py | 9 ++- 6 files changed, 109 insertions(+), 15 deletions(-) create mode 100644 src/axolotl/utils/callbacks/tokens_per_second.py diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py index 44699e6ac..bee291fa2 100644 --- a/src/axolotl/core/builders/base.py +++ b/src/axolotl/core/builders/base.py @@ -24,9 +24,7 @@ from pathlib import Path from typing import Any import torch -from transformers import ( - TrainerCallback, -) +from transformers import TrainerCallback from transformers.trainer_pt_utils import AcceleratorConfig from axolotl.integrations.base import PluginManager @@ -38,6 +36,7 @@ from axolotl.utils.callbacks import ( SaveModelOnFirstStepCallback, ) from axolotl.utils.callbacks.profiler import PytorchProfilerCallback +from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback from axolotl.utils.distributed import build_parallelism_config from axolotl.utils.schemas.enums import CustomSupportedOptimizers @@ -146,6 +145,12 @@ class TrainerBuilderBase(abc.ABC): profiler_steps_start=self.cfg.profiler_steps_start, ) ) + if self.cfg.include_tkps: + callbacks.append( + TokensPerSecondCallback( + self.cfg.tensor_parallel_size, self.cfg.context_parallel_size + ) + ) return callbacks @@ -512,6 +517,7 @@ class TrainerBuilderBase(abc.ABC): self.cfg.eval_batch_size ) + training_args_kwargs["include_tkps"] = self.cfg.include_tkps training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1 training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index f707d4b5a..06eef445b 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -88,7 +88,6 @@ class AxolotlTrainer( self._signature_columns = None # workaround for pylint super().__init__(*_args, **kwargs) - self.train_data_collator = self.data_collator self._stored_metrics = defaultdict(lambda: defaultdict(list)) if self.args.orpo_alpha: @@ -337,6 +336,17 @@ class AxolotlTrainer( # outputs = model(**inputs) # loss = trainer_weighted_loss(outputs, labels, shift_labels=True) # return (loss, outputs) if return_outputs else loss + + # track number of tokens for tokens per second calculation + if self.args.include_tkps: + inputs_key = "labels" if "labels" in inputs else "input_ids" + if hasattr(self.state, "num_tokens"): + self.state.num_tokens = ( + self.state.num_tokens + (inputs[inputs_key] != -100).sum() + ) + else: + self.state.num_tokens = (inputs[inputs_key] != -100).sum() + if self.args.orpo_alpha: return self.orpo_compute_loss( model, @@ -536,9 +546,6 @@ class AxolotlTrainer( super().create_accelerator_and_postprocess() - # now we need to put parallelism_config back on the PartialState since we rely on that info in other places - # PartialState().parallelism_config = self.accelerator.state.parallelism_config - if self.is_fsdp_enabled: if ( "limit_all_gathers" in self.args.fsdp_config @@ -586,12 +593,19 @@ class AxolotlTrainer( # Add memory usage try: active, allocated, reserved = 
get_gpu_memory_usage() - logs["memory/max_mem_active(gib)"] = round(active, 2) - logs["memory/max_mem_allocated(gib)"] = round(allocated, 2) - logs["memory/device_mem_reserved(gib)"] = round(reserved, 2) + logs["memory/max_active (GiB)"] = round(active, 2) + logs["memory/max_allocated (GiB)"] = round(allocated, 2) + logs["memory/device_reserved (GiB)"] = round(reserved, 2) except (ValueError, TypeError, FileNotFoundError): pass + if self.args.include_tkps and train_eval == "train": + # each rank will log its own tokens per second + # for logging_steps > 1 we obtain a moving average of this metric + logs["tokens_per_second_per_gpu"] = round( + self.state.last_tokens_per_second.item() / self.args.logging_steps, 2 + ) + del self._stored_metrics[train_eval] return super().log(logs, start_time) diff --git a/src/axolotl/core/training_args_base.py b/src/axolotl/core/training_args_base.py index a9cc7d224..41ee8e91e 100644 --- a/src/axolotl/core/training_args_base.py +++ b/src/axolotl/core/training_args_base.py @@ -49,6 +49,12 @@ class AxolotlTrainingMixins: default=False, metadata={"help": "Use real batches for efficient training."}, ) + include_tkps: bool = field( + default=True, + metadata={ + "help": "Whether to include tokens per second in the training metrics." + }, + ) eval_sample_packing: Optional[bool] = field( default=None, metadata={"help": "Use sample packing for efficient evals."}, diff --git a/src/axolotl/utils/bench.py b/src/axolotl/utils/bench.py index dd3a85b8c..0a4594991 100644 --- a/src/axolotl/utils/bench.py +++ b/src/axolotl/utils/bench.py @@ -60,13 +60,14 @@ def gpu_memory_usage_all(device=0): active = torch.cuda.memory_stats().get("active_bytes.all.peak", 0) / 1024.0**3 allocated = torch.cuda.max_memory_allocated(device) / 1024.0**3 reserved = torch.cuda.max_memory_reserved(device) / 1024.0**3 + torch.cuda.reset_peak_memory_stats(device) return active, allocated, reserved def mps_memory_usage_all(): - usage = torch.mps.current_allocated_memory() / 1024.0**3 - reserved = torch.mps.driver_allocated_memory() / 1024.0**3 - return usage, reserved - usage, 0 + active = torch.mps.current_allocated_memory() / 1024.0**3 + allocated = torch.mps.driver_allocated_memory() / 1024.0**3 + return active, allocated, 0 def npu_memory_usage_all(device=0): diff --git a/src/axolotl/utils/callbacks/tokens_per_second.py b/src/axolotl/utils/callbacks/tokens_per_second.py new file mode 100644 index 000000000..85bcd5041 --- /dev/null +++ b/src/axolotl/utils/callbacks/tokens_per_second.py @@ -0,0 +1,62 @@ +"""A callback for calculating tokens per second during training.""" + +import time + +import torch +from transformers import ( + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) + + +class TokensPerSecondCallback(TrainerCallback): + """ + A callback to measure and log tokens per second during training. 
+ """ + + def __init__(self, tensor_parallel_size, context_parallel_size): + super().__init__() + self.step_time = 0.0 + self.start_time = 0.0 + self.non_data_parallel_size = 1 + if tensor_parallel_size is not None: + self.non_data_parallel_size *= tensor_parallel_size + if context_parallel_size is not None: + self.non_data_parallel_size *= context_parallel_size + + def on_step_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): # pylint: disable=unused-argument + self.start_time = time.perf_counter() + state.last_tokens_per_second = torch.zeros(1) + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): # pylint: disable=unused-argument + step_time = time.perf_counter() - self.start_time + num_tokens_per_device = state.num_tokens.clone() + # non data parallel groups have duplicated tokens, so we avoid double-counting + num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size + state.last_tokens_per_second = num_tokens_per_device / step_time + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs=None, + **kwargs, + ): # pylint: disable=unused-argument + # after logging, clear the running metrics + state.last_tokens_per_second.zero_() + state.num_tokens = 0 diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 4d660d4b7..4b5f571dc 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -830,10 +830,15 @@ class AxolotlInputConfig( include_tokens_per_second: bool | None = Field( default=None, json_schema_extra={ - "description": "bool of whether to include tokens trainer per second in the training metrics. This iterates over the entire dataset once, so it takes some time." + "description": "bool of whether to report tokens per second at the end of training. This is not supported with pre-training datasets." + }, + ) + include_tkps: bool | None = Field( + default=None, + json_schema_extra={ + "description": "bool of whether to report tokens per second during training by measuring throughput of non-padding tokens." }, ) - neftune_noise_alpha: float | None = Field( default=None, json_schema_extra={ From dc338c3b0eccdfd12b2fdd83eeff1e48c7965bc1 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Wed, 27 Aug 2025 09:50:52 -0400 Subject: [PATCH 024/115] Update .coderabbit.yaml (#3109) [skip ci] Oops, should be false. 
--- .coderabbit.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.coderabbit.yaml b/.coderabbit.yaml index b7cf7d969..821d6bd5b 100644 --- a/.coderabbit.yaml +++ b/.coderabbit.yaml @@ -12,6 +12,6 @@ reviews: auto_review: enabled: true drafts: false - auto_incremental_review: true + auto_incremental_review: false chat: auto_reply: true From 6afba3871d3f6748372afc6289722ce82c5c00c7 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 28 Aug 2025 09:10:40 -0400 Subject: [PATCH 025/115] Add support for PyTorch 2.8.0 (#3106) * Add support for PyTorch 2.8.0 * loosen triton requirements * handle torch 2.8.0 in setup.py * fix versions * no vllm for torch 2.8.0 * remove comment Co-authored-by: NanoCode012 --------- Co-authored-by: NanoCode012 --- .github/workflows/main.yml | 16 ++++++++++++++++ .github/workflows/multi-gpu-e2e.yml | 14 +++++++------- .github/workflows/tests.yml | 18 ++++++++++++------ .../colab-axolotl-example.ipynb | 2 +- requirements.txt | 3 +-- scripts/cutcrossentropy_install.py | 2 +- setup.py | 4 +++- .../integrations/cut_cross_entropy/README.md | 2 +- .../integrations/cut_cross_entropy/__init__.py | 2 +- 9 files changed, 43 insertions(+), 20 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3daf39e43..3f98dd2b4 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -36,6 +36,11 @@ jobs: python_version: "3.11" pytorch: 2.7.1 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -110,6 +115,11 @@ jobs: python_version: "3.11" pytorch: 2.7.1 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -169,6 +179,12 @@ jobs: pytorch: 2.7.1 axolotl_extras: vllm is_latest: true + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + axolotl_extras: + is_latest: runs-on: axolotl-gpu-runner steps: - name: Checkout diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml index 308526151..6492e5d3e 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -33,13 +33,6 @@ jobs: axolotl_extras: num_gpus: 2 nightly_build: "true" - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.0 - axolotl_extras: - num_gpus: 2 - nightly_build: "true" - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" @@ -47,6 +40,13 @@ jobs: axolotl_extras: vllm num_gpus: 2 nightly_build: "true" + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + axolotl_extras: + num_gpus: 2 + nightly_build: "true" runs-on: [self-hosted, modal] timeout-minutes: 120 steps: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index fe63aa313..59011ee77 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -55,7 +55,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.6.0", "2.7.0", "2.7.1"] + pytorch_version: ["2.6.0", "2.7.1", "2.8.0"] timeout-minutes: 20 steps: @@ -130,7 +130,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.6.0", "2.7.0", "2.7.1"] + pytorch_version: ["2.6.0", "2.7.1", "2.8.0"] timeout-minutes: 20 steps: @@ -240,7 +240,7 @@ jobs: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.6.0 + pytorch: 2.7.1 num_gpus: 1 axolotl_extras: dockerfile: 
"Dockerfile-uv.jinja" @@ -298,6 +298,12 @@ jobs: pytorch: 2.7.1 num_gpus: 1 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + num_gpus: 1 + axolotl_extras: steps: - name: Checkout uses: actions/checkout@v4 @@ -334,10 +340,10 @@ jobs: fail-fast: false matrix: include: - - cuda: 124 - cuda_version: 12.4.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.6.0 + pytorch: 2.7.1 num_gpus: 1 axolotl_extras: steps: diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index 30ef1c3de..b780a1c48 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -40,7 +40,7 @@ "%%capture\n", "# This step can take ~5-10 minutes to install dependencies\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", - "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8\"" + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5\"" ] }, { diff --git a/requirements.txt b/requirements.txt index c51c9d1fe..5accd13ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,7 @@ # START section of dependencies that don't install on Darwin/MacOS bitsandbytes==0.47.0 -# triton 3.4.0 is not compatible with CCE -triton>=3.0.0,<3.4.0 +triton>=3.0.0 mamba-ssm==1.2.0.post1 xformers>=0.0.23.post1 autoawq==0.2.7.post3 diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index b2bb0fcf8..5b49e7427 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else "" print( UNINSTALL_PREFIX - + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5"' ) diff --git a/setup.py b/setup.py index 5aab9d7c0..5bf9ae840 100644 --- a/setup.py +++ b/setup.py @@ -64,7 +64,9 @@ def parse_requirements(extras_require_map): else: raise ValueError("Invalid version format") - if (major, minor) >= (2, 7): + if (major, minor) >= (2, 8): + pass + elif (major, minor) >= (2, 7): _install_requires.pop(_install_requires.index(xformers_version)) if patch == 0: _install_requires.append("xformers==0.0.30") diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index 02e4e6686..a64bdd054 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5" ``` ## Usage diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index 6dd7c97e1..d0eb1ebdb 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -35,7 +35,7 @@ LOG = 
get_logger(__name__) _CCE_INSTALL_MESSAGE = ( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@0ee9ee8"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5"`' ) From 5b6ec2820f26ce4b50c624b41453d93b2a9c6063 Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Fri, 29 Aug 2025 21:42:09 +0530 Subject: [PATCH 026/115] patch for ds_grads_remaining in deepspeed (#3102) [skip ci] * patch deepspeed * deepspeed patch for ds_grads_remaining * patch in Patchmanager * chore: lint * deepseed utils * chore2 * patch ds_grads_remaining chore * chore lint * chore lint * remove torch.nn patch * lint * Update src/axolotl/monkeypatch/utils.py Co-authored-by: NanoCode012 * patched with checkpointwarapper * lint * only apply deepspeed patch when using activation offloading --------- Co-authored-by: NanoCode012 Co-authored-by: Wing Lian --- src/axolotl/loaders/patch_manager.py | 15 +++++ src/axolotl/monkeypatch/deepspeed_utils.py | 66 ++++++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 src/axolotl/monkeypatch/deepspeed_utils.py diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 4959bd6ba..eafe89d29 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -3,6 +3,7 @@ Applies pre- and post-model load patches for various fixes and optimizations. """ +import os import importlib.util from functools import cached_property @@ -66,6 +67,7 @@ class PatchManager: self._apply_mistral_cross_entropy_patch() self._apply_self_attention_lora_patch() self._apply_fsdp2_bnb_patches() + self._apply_patch_deepspeed_zero3() def apply_post_plugin_pre_model_load_patches(self): """Apply post plugin-pre_model_load load patches based on config.""" @@ -471,3 +473,16 @@ class PatchManager: from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches apply_lora_kernel_patches(model=model, cfg=self.cfg) + + def _apply_patch_deepspeed_zero3(self): + try: + from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches + from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled + + if self.cfg.activation_offloading is True and ( + is_deepspeed_zero3_enabled() + or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3" + ): + apply_deepspeed_patches() + except ImportError as e: + LOG.warning(f"DeepSpeed patches not applied: {e}") diff --git a/src/axolotl/monkeypatch/deepspeed_utils.py b/src/axolotl/monkeypatch/deepspeed_utils.py new file mode 100644 index 000000000..6740f556b --- /dev/null +++ b/src/axolotl/monkeypatch/deepspeed_utils.py @@ -0,0 +1,66 @@ +import importlib +import importlib.util +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def patch_checkpoint_wrapper_setattr(): + """ + Patch CheckpointWrapper to properly forward DeepSpeed attributes to wrapped modules. + + This fixes the issue where CheckpointWrapper doesn't forward ds_* attributes + (like ds_grads_remaining) to the actual wrapped module, causing DeepSpeed + ZeRO-3 to fail when gradient checkpointing is enabled. 
+
+    This issue occurs specifically with:
+    - QLoRA + DeepSpeed ZeRO-3
+    - gradient_checkpointing: true
+    - activation_offloading: true
+
+    References:
+    - https://github.com/deepspeedai/DeepSpeed/issues/7203
+    - https://github.com/deepspeedai/DeepSpeed/blob/38d1a9eb64c9e01e32eccc50b25ba18925287441/deepspeed/runtime/zero/parameter_offload.py#L424-L458
+    - https://github.com/axolotl-ai-cloud/axolotl/pull/3102
+    """
+
+    try:
+        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
+            CheckpointWrapper,
+        )
+
+        # Check if already patched
+        if hasattr(CheckpointWrapper, "_axolotl_setattr_patched"):
+            LOG.debug("CheckpointWrapper already patched")
+            return
+
+        original_setattr = CheckpointWrapper.__setattr__
+
+        def new_setattr(self, name: str, value) -> None:
+            if name.startswith("ds_") and hasattr(self, "_checkpoint_wrapped_module"):
+                setattr(self._checkpoint_wrapped_module, name, value)
+                LOG.debug(
+                    f"Forwarded {name} to wrapped module {type(self._checkpoint_wrapped_module).__name__}"
+                )
+            else:
+                original_setattr(self, name, value)
+
+        CheckpointWrapper.__setattr__ = new_setattr
+        CheckpointWrapper._axolotl_setattr_patched = True
+
+        LOG.info("CheckpointWrapper patched to forward DeepSpeed attributes")
+
+    except ImportError as e:
+        LOG.debug(f"CheckpointWrapper not available: {e}")
+    except Exception as e:
+        LOG.warning(f"Failed to patch CheckpointWrapper: {e}")
+
+
+def apply_deepspeed_patches():
+    """
+    Apply DeepSpeed-related patches
+    """
+    if importlib.util.find_spec("deepspeed") is not None:
+        patch_checkpoint_wrapper_setattr()
+    else:
+        LOG.debug("DeepSpeed not available, skipping patches")

From 7ed40f1d70957f83447f4d93f72b8d0015dab34d Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 29 Aug 2025 13:36:47 -0400
Subject: [PATCH 027/115] automatically set env vars for single gpu deepspeed
 zero3 (#3118) [skip ci]

* automatically set env vars for single gpu deepspeed zero3

* use setdefault
---
 docs/multi-gpu.qmd           | 9 ---------
 src/axolotl/utils/trainer.py | 7 +++++++
 2 files changed, 7 insertions(+), 9 deletions(-)

diff --git a/docs/multi-gpu.qmd b/docs/multi-gpu.qmd
index 71676bc84..fb91f81e5 100644
--- a/docs/multi-gpu.qmd
+++ b/docs/multi-gpu.qmd
@@ -63,15 +63,6 @@ Start from Stage 1 -> Stage 2 -> Stage 3.

 :::

-::: {.callout-tip}
-
-Using ZeRO Stage 3 with Single-GPU training
-
-ZeRO Stage 3 can be used for training on a single GPU by manually setting the environment variables:
-`WORLD_SIZE=1 LOCAL_RANK=0 MASTER_ADDR=0.0.0.0 MASTER_PORT=29500`
-
-:::
-
 ## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}

 ::: {.callout-note}
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index 08038cb18..43f76c0cd 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -547,6 +547,13 @@ def setup_deepspeed_env(cfg, stage=None):
     if stage == 3:
         os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"

+    device_count = torch.cuda.device_count()
+    if device_count == 1:
+        os.environ.setdefault("WORLD_SIZE", "1")
+        os.environ.setdefault("LOCAL_RANK", "0")
+        os.environ.setdefault("MASTER_ADDR", "0.0.0.0")  # nosec B104
+        os.environ.setdefault("MASTER_PORT", "29500")
+
     # NOTE(djsaunde): The distributed state cannot be initialized prior to the
     # ACCELERATE_USE_DEEPSPEED assignment, but it must be initialized some time prior
     # to model load.
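A note on the `os.environ.setdefault` pattern used in `setup_deepspeed_env` above: unlike direct assignment, `setdefault` only writes a key when it is absent, so values already exported by the user (or by a launcher such as `torchrun`) take precedence, and the single-GPU ZeRO-3 defaults merely fill in the gaps. Below is a minimal sketch of that behavior (not part of the patch; the pre-set `MASTER_PORT` value is hypothetical):

```python
import os

os.environ["MASTER_PORT"] = "12345"  # pretend the user exported this already
os.environ.pop("WORLD_SIZE", None)  # and that WORLD_SIZE is unset

# mirror the single-GPU defaults applied by setup_deepspeed_env
for key, value in {
    "WORLD_SIZE": "1",
    "LOCAL_RANK": "0",
    "MASTER_ADDR": "0.0.0.0",
    "MASTER_PORT": "29500",
}.items():
    os.environ.setdefault(key, value)

assert os.environ["MASTER_PORT"] == "12345"  # the user's value is preserved
assert os.environ["WORLD_SIZE"] == "1"  # the missing key gets the default
```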
From 0094a2d744553fb89e4874e2d79ac309f26cae77 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 29 Aug 2025 13:52:49 -0400 Subject: [PATCH 028/115] support for tiledmlp for GPT-OSS (#3116) * fix use of flex attn kwargs and add support for tiledmlp for GPT-OSS * add logging back * update deps --- requirements.txt | 2 +- setup.py | 2 +- src/axolotl/loaders/patch_manager.py | 14 +- .../monkeypatch/attention/flex_attn.py | 174 +++--------------- src/axolotl/monkeypatch/tiled_mlp/base.py | 107 ++++++++++- src/axolotl/monkeypatch/tiled_mlp/patch.py | 7 +- 6 files changed, 144 insertions(+), 162 deletions(-) diff --git a/requirements.txt b/requirements.txt index 5accd13ed..9e3dbbca4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ packaging==23.2 huggingface_hub>=0.33.0 peft>=0.17.0 -transformers==4.55.3 +transformers==4.55.4 tokenizers>=0.21.1 accelerate==1.10.0 datasets==4.0.0 diff --git a/setup.py b/setup.py index 5bf9ae840..4cbc562e0 100644 --- a/setup.py +++ b/setup.py @@ -127,7 +127,7 @@ extras_require = { "yunchang==0.6.0", ], "deepspeed": [ - "deepspeed==0.17.2", + "deepspeed==0.17.5", "deepspeed-kernels", ], "mamba-ssm": [ diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index eafe89d29..94b307a62 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -149,14 +149,12 @@ class PatchManager: def _apply_flex_attention_patches(self): """Apply patches for flexible attention.""" if self.cfg.flex_attention: - # from axolotl.monkeypatch.attention.flex_attn import ( - # patch_flex_make_mask, - # patch_flex_wrapper, - # ) - # - # flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {} - # patch_flex_wrapper(**flex_attn_compile_kwargs) - # patch_flex_make_mask() + from axolotl.monkeypatch.attention.flex_attn import ( + patch_flex_wrapper, + ) + + flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {} + patch_flex_wrapper(**flex_attn_compile_kwargs) if self.cfg.sample_packing: from axolotl.core.attention.flex_block_mask import ( patch_create_causal_mask, diff --git a/src/axolotl/monkeypatch/attention/flex_attn.py b/src/axolotl/monkeypatch/attention/flex_attn.py index f59b8abe2..65ccad533 100644 --- a/src/axolotl/monkeypatch/attention/flex_attn.py +++ b/src/axolotl/monkeypatch/attention/flex_attn.py @@ -1,11 +1,11 @@ """Flex attention monkey patch""" import sys -from typing import Optional, Tuple, Union +from packaging import version import torch import transformers - +from transformers.utils.import_utils import _torch_version, is_torch_less_or_equal from axolotl.utils.logging import get_logger LOG = get_logger(__name__) @@ -46,19 +46,33 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs): """ self.training = None if not self._is_flex_compiled or training != self.training: + self.training = training + if is_torch_less_or_equal("2.5.1"): + self._compiled_flex_attention = torch.compile( + flex_attention, dynamic=False + ) # In PyTorch 2.6.0, there's a known issue with flex attention compilation which may # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs" # see https://github.com/pytorch/pytorch/issues/146260 for training - self.training = training - LOG.info( - "Compiling flex attention with kwargs: %s. 
This may take a while...", - flex_attn_compile_kwargs, - ) - self._compiled_flex_attention = torch.compile( - flex_attention, - **flex_attn_compile_kwargs, - ) - LOG.info("Flex attention compiled successfully.") + elif version.parse(_torch_version).base_version == "2.6.0" and training: + self._compiled_flex_attention = torch.compile( + flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs" + ) + # Fallback, usually the most recent torch 2.7.x+ versions + else: + LOG.info( + "Compiling flex attention with kwargs: %s. This may take a while...", + flex_attn_compile_kwargs, + main_process_only=True, + ) + self._compiled_flex_attention = torch.compile( + flex_attention, + **flex_attn_compile_kwargs, + ) + LOG.info( + "Flex attention compiled successfully.", main_process_only=True + ) + self._is_flex_compiled = True def __call__(self): @@ -68,139 +82,3 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs): sys.modules[ "transformers.integrations.flex_attention" ].WrappedFlexAttention = WrappedFlexAttention - - -def patch_flex_make_mask(): - is_torch_2_6 = torch.__version__.startswith("2.6") - - if not is_torch_2_6: - return - - from torch.nn.attention.flex_attention import ( - _DEFAULT_SPARSE_BLOCK_SIZE as flex_default_block_size, - ) - from torch.nn.attention.flex_attention import ( - BlockMask, - ) - from torch.nn.attention.flex_attention import ( - create_block_mask as create_block_causal_mask_flex, - ) - - Offset = Union[torch.Tensor, int] - - def patched_make_flex_block_causal_mask( - attention_mask_2d: torch.Tensor, - attention_chunk_size: Optional[int] = None, - query_length=None, - key_length=None, - offsets: Optional[Tuple[Offset, Offset]] = None, - ) -> "BlockMask": - """ - Create a block causal document mask for a batch of sequences, both packed and unpacked. - Create Block causal logic and passing it into :func:`torch.nn.attention.flex_attention.create_block_mask`. - The resultant BlockMask is a compressed representation of the full block causal - mask. BlockMask is essential for performant computation of flex attention. - See: https://pytorch.org/blog/flexattention/ - - Args: - attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences - of shape (batch_size, total_seq_len). e.g. - - For unpacked sequence: - [[1, 1, 1, 1, 0, 0, 0], - [1, 1, 1, 1, 1, 0, 0]] - - For packed sequence: - [[1, 1, 1, 2, 2, 2, 0], - [1, 1, 2, 2, 2, 3, 3]] - - Returns: - BlockMask - """ - - batch_size, total_seq_len = attention_mask_2d.shape - if not key_length: - key_length = total_seq_len - if not query_length: - query_length = total_seq_len - attention_mask_2d = torch.nn.functional.pad( - attention_mask_2d, - value=0, - pad=(0, abs(total_seq_len - max(key_length, flex_default_block_size))), - ) - device = attention_mask_2d.device - document_ids = attention_mask_2d.clone() - - if attention_chunk_size is not None: - # we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3] - chunk_idxs = (document_ids.clone().fill_(1).cumsum(-1) - 1) // ( - attention_chunk_size - ) - - # Instead of passing a tensor mask, flex attention requires a mask_mod function - # that determines which elements of QK^T should be included in the attention - # computation prior to the softmax. For sample packing, we need both the - # logic for both causal mask and document mask. 
See PyTorch's official - # blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods - def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx): - """ - Defines the logic of a block causal mask by combining both a standard causal mask - and a block diagonal document mask. - - See :func:`~torchtune.modules.attention_utils.create_block_causal_mask` - for an illustration. - """ - causal_mask = q_idx >= kv_idx # not valid when decoding - document_mask = ( - document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx] - ) - padding_mask = attention_mask_2d[batch_idx, q_idx] > 0 - final_mask = causal_mask & padding_mask & document_mask - return final_mask - - def chunk_causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx): - """ - Combines the chunk mask with the causal mask for chunked attention. - """ - chunk_mask = chunk_idxs[batch_idx, q_idx] == chunk_idxs[batch_idx, kv_idx] - causal_doc_mask = causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx) - return chunk_mask & causal_doc_mask - - mask_mod_maybe_combined = ( - causal_mask_mod if attention_chunk_size is None else chunk_causal_mask_mod - ) - - if offsets is not None: - q_offset = offsets[0] - kv_offset = offsets[1] - - def mask_mod(batch_idx, head_idx, q_idx, kv_idx): - offset_q = q_idx + q_offset - offset_kv = kv_idx + kv_offset - return mask_mod_maybe_combined(batch_idx, head_idx, offset_q, offset_kv) - - else: - mask_mod = mask_mod_maybe_combined - return create_block_causal_mask_flex( - mask_mod=mask_mod, - B=batch_size, - H=None, # attention head - Q_LEN=query_length, - KV_LEN=key_length, - device=device, - _compile=True, - ) - - for n in tuple(sys.modules): - if ".modeling_" in n: - if hasattr(sys.modules[n], "make_flex_block_causal_mask"): - sys.modules[ - n - ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask - sys.modules[ - n - ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask - - transformers.integrations.flex_attention.make_flex_block_causal_mask = ( - patched_make_flex_block_causal_mask - ) diff --git a/src/axolotl/monkeypatch/tiled_mlp/base.py b/src/axolotl/monkeypatch/tiled_mlp/base.py index 3b7326bdb..2c9dc8e4c 100644 --- a/src/axolotl/monkeypatch/tiled_mlp/base.py +++ b/src/axolotl/monkeypatch/tiled_mlp/base.py @@ -8,6 +8,94 @@ from typing import List import torch +class DeepSpeedTiledMLPMoE(torch.autograd.Function): + @staticmethod + def forward( + ctx, + fn, + self, + x, + shards, + compute_params, + ) -> torch.Tensor: + ctx.fn = fn + ctx.self = self + ctx.shards = shards + ctx.compute_params = [p for p in compute_params if p.requires_grad] + ctx.save_for_backward(x) + + x_shards = list(torch.chunk(x, chunks=shards, dim=1)) + with torch.no_grad(): + output_shards = [fn(self, x_shard) for x_shard in x_shards] + + ctx.is_tuple_output = isinstance(output_shards[0], tuple) + if isinstance(output_shards[0], tuple): + tuple_dim_idx = [1, 0] + output_unsharded = tuple( + torch.cat( + [output_shard[i] for output_shard in output_shards], + dim=tuple_dim_idx[i], + ) + for i in range(len(output_shards[0])) + ) + else: + output_unsharded = torch.cat(output_shards, dim=1) + + return output_unsharded + + @staticmethod + def backward(ctx, *grads) -> torch.Tensor: + fn = ctx.fn + (x,) = ctx.saved_tensors + self = ctx.self + shards = ctx.shards + compute_params = ctx.compute_params + is_tuple_output = ctx.is_tuple_output + + x_requires_grad = x.requires_grad + x = x.detach() + # detach() unsets `x.requires_grad`, so restore it + x.requires_grad_(x_requires_grad) + + incoming_grad 
= grads[0] + x_grad = torch.zeros_like(x) + x_shards = list(torch.chunk(x, chunks=shards, dim=1)) + + shard_step = x_shards[0].numel() + for i, x_shard in enumerate(x_shards): + # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run + if compute_params is not None: + if i + 1 < shards: + for param in compute_params: + param.ds_grad_is_ready = False + else: + # last shard, can add the grad + for param in compute_params: + param.ds_grad_is_ready = True + + x_shard.requires_grad_(x_requires_grad) + + shard_offset = i * shard_step + x_shard.grad = ( + x_grad.view(-1) + .narrow(0, shard_offset, x_shard.numel()) + .view_as(x_shard) + ) + incoming_grad_shard = ( + incoming_grad.view(-1) + .narrow(0, shard_offset, x_shard.numel()) + .view_as(x_shard) + ) + with torch.enable_grad(): + output = fn(self, x_shard) + if is_tuple_output: + torch.autograd.backward(output[0], incoming_grad_shard) + else: + torch.autograd.backward(output, incoming_grad_shard) + + return (None, None, x_grad, None, None) + + class TiledMLP(torch.autograd.Function): """ TiledMLP implementation using gradient hooks @@ -31,7 +119,18 @@ class TiledMLP(torch.autograd.Function): x_shards = list(torch.chunk(x, chunks=shards, dim=1)) with torch.no_grad(): output_shards = [fn(self, x_shard) for x_shard in x_shards] - output_unsharded = torch.cat(output_shards, dim=1) + ctx.is_tuple_output = isinstance(output_shards[0], tuple) + if isinstance(output_shards[0], tuple): + tuple_dim_idx = [1, 0] + output_unsharded = tuple( + torch.cat( + [output_shard[i] for output_shard in output_shards], + dim=tuple_dim_idx[i], + ) + for i in range(len(output_shards[0])) + ) + else: + output_unsharded = torch.cat(output_shards, dim=1) return output_unsharded @@ -42,6 +141,7 @@ class TiledMLP(torch.autograd.Function): self = ctx.self shards = ctx.shards compute_params = ctx.compute_params + is_tuple_output = ctx.is_tuple_output x_requires_grad = x.requires_grad x = x.detach() @@ -76,7 +176,10 @@ class TiledMLP(torch.autograd.Function): with torch.enable_grad(): output = fn(self, x_shard) - torch.autograd.backward(output, incoming_grad_shard) + if is_tuple_output: + torch.autograd.backward(output[0], incoming_grad_shard) + else: + torch.autograd.backward(output, incoming_grad_shard) # Clean up hooks grad_accumulator.cleanup() diff --git a/src/axolotl/monkeypatch/tiled_mlp/patch.py b/src/axolotl/monkeypatch/tiled_mlp/patch.py index 7cdc6d3a3..c0f89236b 100644 --- a/src/axolotl/monkeypatch/tiled_mlp/patch.py +++ b/src/axolotl/monkeypatch/tiled_mlp/patch.py @@ -17,7 +17,7 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None): TiledMLP as DeepSpeedTiledMLP, ) - from axolotl.monkeypatch.tiled_mlp.base import TiledMLP + from axolotl.monkeypatch.tiled_mlp.base import DeepSpeedTiledMLPMoE, TiledMLP try: # Dynamically import the module and MLP class @@ -64,7 +64,10 @@ def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None): for p in self._compute_params ) ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true": - self._tiled_mlp_dist_impl = DeepSpeedTiledMLP + if model_type == "gpt_oss": + self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE + else: + self._tiled_mlp_dist_impl = DeepSpeedTiledMLP else: self._tiled_mlp_dist_impl = TiledMLP From 231a67e70bbfc095fc94e057537412cf57a472cf Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Tue, 2 Sep 2025 12:08:44 -0400 Subject: [PATCH 029/115] Streaming SFT support (#3101) * working * fixes * deprecate --iterable; cleanup * 
pretrain_multipack_buffer_size -> streaming_multipack_buffer_size * improvements * tests * remove unused * docs, examples * nit * nit * add val_set_size validation * val * nit * min * coderabbito * cleanup * nit * add depr warning, cleanup * nit * fix test, fix quarto * fix * review comments * review comments * fix --- _quarto.yml | 3 +- docs/streaming.qmd | 120 +++++++++ examples/streaming/README.md | 50 ++++ examples/streaming/pretrain.yaml | 57 +++++ examples/streaming/sft.yaml | 55 ++++ src/axolotl/cli/args.py | 8 +- src/axolotl/cli/preprocess.py | 12 +- src/axolotl/common/datasets.py | 2 - src/axolotl/datasets.py | 145 +---------- src/axolotl/prompt_tokenizers.py | 2 +- src/axolotl/utils/collators/__init__.py | 16 +- src/axolotl/utils/data/__init__.py | 10 +- src/axolotl/utils/data/sft.py | 92 +++---- src/axolotl/utils/data/shared.py | 2 - .../data/{pretraining.py => streaming.py} | 59 +++-- src/axolotl/utils/data/utils.py | 11 +- src/axolotl/utils/schemas/config.py | 40 ++- src/axolotl/utils/schemas/validation.py | 82 ++++++ tests/e2e/integrations/test_kd.py | 2 +- tests/e2e/test_streaming.py | 73 ++++++ tests/test_data.py | 4 +- tests/test_packed_dataset.py | 42 ---- tests/test_packed_pretraining.py | 7 +- tests/test_streaming.py | 238 ++++++++++++++++++ 24 files changed, 849 insertions(+), 283 deletions(-) create mode 100644 docs/streaming.qmd create mode 100644 examples/streaming/README.md create mode 100644 examples/streaming/pretrain.yaml create mode 100644 examples/streaming/sft.yaml rename src/axolotl/utils/data/{pretraining.py => streaming.py} (86%) create mode 100644 tests/e2e/test_streaming.py create mode 100644 tests/test_streaming.py diff --git a/_quarto.yml b/_quarto.yml index 934d393cb..3ffb0e627 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -153,7 +153,7 @@ quartodoc: - utils.distributed - utils.dict - utils.optimizers.adopt - - utils.data.pretraining + - utils.data.streaming - utils.data.sft - utils.quantization - title: Schemas @@ -272,6 +272,7 @@ website: contents: - docs/batch_vs_grad.qmd - docs/dataset_preprocessing.qmd + - docs/streaming.qmd - docs/multipack.qmd - docs/mixed_precision.qmd - docs/optimizers.qmd diff --git a/docs/streaming.qmd b/docs/streaming.qmd new file mode 100644 index 000000000..2a233a4fc --- /dev/null +++ b/docs/streaming.qmd @@ -0,0 +1,120 @@ +--- +title: Streaming Datasets +description: How to use streaming mode for large-scale datasets and memory-efficient training +order: 10 +--- + +Streaming enables memory-efficient training with large datasets by loading data +incrementally rather than loading the entire dataset into memory at once. + +Use streaming when: + +- Your dataset is too large to fit in memory (e.g. when you're doing pretraining with massive text corpora) +- You want to start training immediately without preprocessing the entire dataset + +Streaming works with both remote and locally stored datasets! + +::: {.callout-note} +Streaming currently only supports a single dataset. Multi-dataset support will be added soon. 
+::: + + +## Configuration + +### Basic Streaming + +Enable streaming mode by setting the `streaming` flag: + +```yaml +streaming: true +``` + +### Pretraining with Streaming + +For pretraining tasks, streaming is automatically enabled when using `pretraining_dataset`: + +```yaml +pretraining_dataset: + - path: HuggingFaceFW/fineweb-edu + type: pretrain + text_column: text + split: train + +# Optionally, enable sample packing +streaming_multipack_buffer_size: 10000 +sample_packing: true +``` + +### SFT with Streaming + +For supervised fine-tuning with streaming: + +```yaml +streaming: true +datasets: + - path: tatsu-lab/alpaca + type: alpaca + split: train + +# Optionally, enable sample packing +streaming_multipack_buffer_size: 10000 +sample_packing: true +``` + +## Configuration Options + +### `streaming_multipack_buffer_size` + +Controls the buffer size for multipack streaming (default: 10,000). This determines how +many samples are buffered before packing. Larger buffers can improve packing efficiency +but use more memory. + +### `shuffle_merged_datasets` + +When enabled, shuffles the streaming dataset using the buffer. This requires additional +memory for the shuffle buffer. + +## Sample Packing with Streaming + +Sample packing is supported for streaming datasets. When enabled, multiple samples are +packed into a single sequence to maximize GPU utilization: + +```yaml +sample_packing: true +streaming_multipack_buffer_size: 10000 + +# For SFT: attention is automatically isolated between packed samples +# For pretraining: control with pretrain_multipack_attn +pretrain_multipack_attn: true # prevent cross-attention between packed samples +``` + +For more information, see our [documentation](multipack.qmd) on multipacking. + +## Important Considerations + +### Memory Usage + +While streaming reduces memory usage compared to loading entire datasets, you still need +to consider: + +- You can control the memory usage by adjusting `streaming_multipack_buffer_size` +- Sample packing requires buffering multiple samples +- Shuffling requires additional memory for the shuffle buffer + +### Performance + +- Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on-the-fly +- Network speed and disk read speed are important when streaming from remote sources or a local dataset, respectively +- Consider using `axolotl preprocess` for smaller or more frequently used datasets + +### Evaluation Datasets + +Evaluation datasets are not streamed to ensure consistent evaluation metrics. They're +loaded normally even when training uses streaming. + +## Examples + +See the `examples/streaming/` directory for complete configuration examples: + +- `pretrain.yaml`: Pretraining with streaming dataset +- `sft.yaml`: Supervised fine-tuning with streaming diff --git a/examples/streaming/README.md b/examples/streaming/README.md new file mode 100644 index 000000000..cdbb5baea --- /dev/null +++ b/examples/streaming/README.md @@ -0,0 +1,50 @@ +# Streaming Dataset Examples + +This directory contains example configurations for using Axolotl's streaming dataset +functionality, which enables memory-efficient training with large datasets. + +## Examples + +Run the following examples with e.g. `axolotl train examples/streaming/sft.yaml`; no +`axolotl preprocess` required! + +### Pretraining (`pretrain.yaml`) + +Demonstrates streaming configuration for pretraining tasks using the fineweb-edu dataset +with SmolLM2-135M. 
+ +- Uses `pretraining_dataset` configuration for automatic streaming +- Multipack attention control to prevent cross-attention between packed sequences +- Buffer size configuration for memory management + +### SFT (`sft.yaml`) + +Shows how to use streaming for supervised fine-tuning with the Alpaca dataset. + +- Explicit `streaming: true` flag for SFT datasets +- Memory-efficient training on instruction datasets +- Evaluation datasets are currently not streamed + +## Key Configuration Options + +### `streaming` +- Enables streaming mode for standard datasets +- Automatically enabled for `pretraining_dataset` + +### `streaming_multipack_buffer_size` +- Controls buffer size for sample packing (default: 10,000) +- Larger values improve packing efficiency but use more memory +- Adjust based on available memory + +### `shuffle_merged_datasets` +- Enables shuffling of streaming datasets +- Requires additional memory for shuffle buffer + +### `sample_packing` +- Packs multiple samples into single sequences +- Minimize per-step padding tokens + +## Performance Tips + +- Download small / frequently-used datasets locally for better performance +- Larger buffer sizes improve packing efficiency diff --git a/examples/streaming/pretrain.yaml b/examples/streaming/pretrain.yaml new file mode 100644 index 000000000..bc8edefd6 --- /dev/null +++ b/examples/streaming/pretrain.yaml @@ -0,0 +1,57 @@ +base_model: HuggingFaceTB/SmolLM2-135M + +# Streaming pretraining configuration +pretraining_dataset: + - path: HuggingFaceFW/fineweb-edu + name: sample-10BT + type: pretrain + text_column: text + split: train + +# Streaming-specific settings +streaming_multipack_buffer_size: 10000 +shuffle_merged_datasets: true + +# Training configuration +max_steps: 1000 +output_dir: ./outputs/smollm2-135m-pretrain-streaming + +# Sequence and packing settings +sequence_len: 1024 +sample_packing: true +pretrain_multipack_attn: true # Prevent cross-attention between packed sequences +flash_attention: true + +# Batch size settings +gradient_accumulation_steps: 8 +micro_batch_size: 1 + +# Optimizer and scheduler +optimizer: adamw_torch +lr_scheduler: cosine +learning_rate: 5e-4 +warmup_ratio: 0.1 +weight_decay: 0.01 + +# Precision and performance +bf16: auto +tf32: true + +# Logging and checkpointing +logging_steps: 10 +save_strategy: steps +save_steps: 250 +save_total_limit: 3 + +# Weights & Biases (optional) +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +# Special tokens +special_tokens: + pad_token: "<|endoftext|>" + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/streaming/sft.yaml b/examples/streaming/sft.yaml new file mode 100644 index 000000000..47b9f493f --- /dev/null +++ b/examples/streaming/sft.yaml @@ -0,0 +1,55 @@ +base_model: HuggingFaceTB/SmolLM2-135M + +# Dataset configuration +datasets: + - path: tatsu-lab/alpaca + type: alpaca + split: train + +# Streaming-specific settings +streaming: true +streaming_multipack_buffer_size: 10000 +shuffle_merged_datasets: true + +# Training configuration +max_steps: 1000 +output_dir: ./outputs/smollm2-135m-sft-streaming + +# Sequence and packing settings +sequence_len: 1024 +sample_packing: true +flash_attention: true + +# Batch size settings +gradient_accumulation_steps: 4 +micro_batch_size: 1 + +# Optimizer and scheduler +optimizer: adamw_torch +lr_scheduler: cosine +learning_rate: 2e-4 +warmup_ratio: 0.1 +weight_decay: 0.0 + +# Precision and performance +bf16: auto +tf32: true + 
+# Logging and checkpointing
+logging_steps: 10
+save_strategy: steps
+save_steps: 100
+save_total_limit: 3
+
+# Weights & Biases (optional)
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+# Special tokens
+special_tokens:
+  pad_token: "<|endoftext|>"
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/src/axolotl/cli/args.py b/src/axolotl/cli/args.py
index 9bb544aff..396e9a8af 100644
--- a/src/axolotl/cli/args.py
+++ b/src/axolotl/cli/args.py
@@ -14,9 +14,13 @@ class PreprocessCliArgs:
     prompter: Optional[str] = field(default=None)
     download: Optional[bool] = field(default=True)
     iterable: Optional[bool] = field(
-        default=None,
+        default=False,
         metadata={
-            "help": "Use IterableDataset for streaming processing of large datasets"
+            "help": (
+                "Deprecated in v0.13.0, will be removed in v0.14.0. For streaming "
+                "datasets, use 'axolotl train' and set 'streaming: true' in your YAML "
+                "config, or pass --streaming instead in the CLI."
+            )
         },
     )
diff --git a/src/axolotl/cli/preprocess.py b/src/axolotl/cli/preprocess.py
index ff4551c64..6c05a55f1 100644
--- a/src/axolotl/cli/preprocess.py
+++ b/src/axolotl/cli/preprocess.py
@@ -35,10 +35,20 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
     check_accelerate_default_config()
     check_user_token()

+    if cli_args.iterable:
+        LOG.error(
+            "The --iterable CLI argument for 'axolotl preprocess' is no longer "
+            "supported. For training, set 'streaming: true' in your YAML config or "
+            "pass '--streaming' in your 'axolotl train' command for on-the-fly "
+            "preprocessing."
+        )
+        return
+
     for key in ["skip_prepare_dataset", "pretraining_dataset"]:
         if cfg.get(key):
             LOG.error(
-                f"You have set `{key}:`. `preprocess` is not needed. Run the `axolotl train` CLI directly instead."
+                f"You have set `{key}:`. `preprocess` is not needed. Run the 'axolotl "
+                "train' CLI directly instead."
             )
             return
diff --git a/src/axolotl/common/datasets.py b/src/axolotl/common/datasets.py
index e7433e3c2..8d7758e66 100644
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -55,13 +55,11 @@ def load_datasets(
     """
     tokenizer = load_tokenizer(cfg)
     processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
-    preprocess_iterable = getattr(cli_args, "iterable", False)

     train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
         cfg,
         tokenizer,
         processor=processor,
-        preprocess_iterable=preprocess_iterable,
     )

     if (
diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py
index b8f9484bc..20acb8521 100644
--- a/src/axolotl/datasets.py
+++ b/src/axolotl/datasets.py
@@ -1,18 +1,17 @@
-"""Module containing Dataset functionality"""
+"""
+Module containing dataset functionality.
+
+We want this to be a wrapper for an existing dataset that we have loaded. Let's use the
+concept of middlewares to wrap each dataset. We'll use the collators later on to pad the
+datasets. 
+""" -import torch from datasets import Dataset, IterableDataset from axolotl.utils.logging import get_logger from .prompt_tokenizers import PromptTokenizingStrategy -# We want this to be a wrapper for an existing dataset that we have loaded -# lets use the concept of middlewares to wrap each dataset, for example -# ConstantLengthDataset(ShuffledDataset([TokenizedPromptDataset(alpaca_dataset)])) -# let's check to ensure we don't truncate an item in the middle, we'll use -# the collators later on to pad the datasets - LOG = get_logger(__name__) @@ -86,133 +85,3 @@ def wrap_dataset_for_tokenized_prompt( **map_kwargs, ) return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs) - - -# TODO this isn't the best since it can't interleave datasets -class ConstantLengthDataset(IterableDataset): - """Iterable dataset that returns constant length chunks of tokens from stream of - text files. - - Args: - tokenizer: The processor used for processing the data. - dataset: Dataset with text files. - seq_length: Length of token sequences to return. - """ - - def __init__( - self, - tokenizer, - datasets, - seq_length=2048, - ): - self.tokenizer = tokenizer - self.concat_token_id = tokenizer.eos_token_id - self.datasets: list[IterableDataset] = datasets - self.seq_length = seq_length - - vocab_size = len(tokenizer.get_vocab()) - - if vocab_size <= torch.iinfo(torch.int16).max: - self.tokens_dtype = torch.int16 - elif vocab_size <= torch.iinfo(torch.int32).max: - self.tokens_dtype = torch.int32 - else: - self.tokens_dtype = torch.int64 - - def __iter__(self): - buffer = { - "input_ids": [], - "attention_mask": [], - "labels": [], - "position_ids": [], - } - buffer_len = 0 - for dataset in self.datasets: - idx = 0 - iterator = iter(dataset) - more_examples = True - while more_examples: - try: - example = next(iterator) - idx += 1 - except StopIteration: - more_examples = False - example = None - - add_concat_token = False - if example: - example_len = len(example["input_ids"]) - add_concat_token = example["input_ids"][-1] != self.concat_token_id - else: - example_len = 0 - - if not example_len or ( - buffer_len + int(add_concat_token) + example_len > self.seq_length - ): - if buffer["input_ids"]: - input_ids = torch.cat(buffer["input_ids"], dim=-1)[ - : self.seq_length - ] - attention_mask = torch.cat(buffer["attention_mask"], dim=-1)[ - : self.seq_length - ] - position_ids = torch.cat(buffer["position_ids"], dim=-1)[ - : self.seq_length - ] - labels = torch.cat(buffer["labels"], dim=-1)[: self.seq_length] - if labels.size() == input_ids.size() and ( - attention_mask.size() == input_ids.size() - ): - yield { - "input_ids": input_ids, - "labels": labels, - "attention_mask": attention_mask, - "position_ids": position_ids, - } - else: - LOG.warning( - "Dropping batch due to tensor size mismatch " - f"input_ids: {input_ids.size()}, " - f"labels: {labels.size()}, " - f"attention_mask: {attention_mask.size()}" - ) - buffer = { - "input_ids": [], - "attention_mask": [], - "labels": [], - "position_ids": [], - } - buffer_len = 0 - idx = 1 - - if example: - # FIXME - # just going to drop data points that are too long - if len(example["input_ids"]) <= self.seq_length: - input_ids = example["input_ids"] - attention_mask = example["attention_mask"] - labels = example["labels"] - - if add_concat_token: - input_ids.append(self.concat_token_id) - attention_mask.append(1) - labels.append(self.concat_token_id) - - input_ids_with_concat = torch.tensor( - input_ids, dtype=self.tokens_dtype - ) - 
attention_mask_with_concat = torch.tensor( - [idx * m for m in attention_mask], dtype=torch.int16 - ) - labels_with_concat = torch.tensor( - labels, dtype=self.tokens_dtype - ) - position_ids = torch.arange( - len(input_ids), dtype=self.tokens_dtype - ) - - buffer["input_ids"].append(input_ids_with_concat) - buffer["attention_mask"].append(attention_mask_with_concat) - buffer["labels"].append(labels_with_concat) - buffer["position_ids"].append(position_ids) - buffer_len += len(input_ids) diff --git a/src/axolotl/prompt_tokenizers.py b/src/axolotl/prompt_tokenizers.py index 2bf9ec763..a7bd963f8 100644 --- a/src/axolotl/prompt_tokenizers.py +++ b/src/axolotl/prompt_tokenizers.py @@ -75,7 +75,7 @@ class PromptTokenizingStrategy(abc.ABC): ) -> BatchEncoding: empty = BatchEncoding(data={"input_ids": [], "attention_mask": []}) if not prompt: - LOG.warning("Empty text requested for tokenization.") + LOG.warning_once("Empty text requested for tokenization.") return empty result = self.tokenizer( diff --git a/src/axolotl/utils/collators/__init__.py b/src/axolotl/utils/collators/__init__.py index 8c60f223c..d5e6ad17d 100644 --- a/src/axolotl/utils/collators/__init__.py +++ b/src/axolotl/utils/collators/__init__.py @@ -1,11 +1,17 @@ -""" -shared axolotl collators for multipack, mamba, multimodal -""" +"""Shared axolotl collators for multipacking, mamba, multimodal.""" -from .batching import ( # noqa: F401 +from .batching import ( BatchSamplerDataCollatorForSeq2Seq, DataCollatorForSeq2Seq, PretrainingBatchSamplerDataCollatorForSeq2Seq, V2BatchSamplerDataCollatorForSeq2Seq, ) -from .mamba import MambaDataCollator # noqa: F401 +from .mamba import MambaDataCollator + +__all__ = [ + "DataCollatorForSeq2Seq", + "BatchSamplerDataCollatorForSeq2Seq", + "V2BatchSamplerDataCollatorForSeq2Seq", + "PretrainingBatchSamplerDataCollatorForSeq2Seq", + "MambaDataCollator", +] diff --git a/src/axolotl/utils/data/__init__.py b/src/axolotl/utils/data/__init__.py index d162a7d0b..788f13638 100644 --- a/src/axolotl/utils/data/__init__.py +++ b/src/axolotl/utils/data/__init__.py @@ -1,8 +1,8 @@ """Init for `axolotl.utils.data` module.""" -from axolotl.utils.data.pretraining import ( - encode_pretraining, - wrap_pretraining_dataset, +from axolotl.utils.data.streaming import ( + encode_streaming, + wrap_streaming_dataset, ) from axolotl.utils.data.rl import prepare_preference_datasets from axolotl.utils.data.sft import ( @@ -12,8 +12,8 @@ from axolotl.utils.data.sft import ( from axolotl.utils.data.utils import md5 __all__ = [ - "encode_pretraining", - "wrap_pretraining_dataset", + "encode_streaming", + "wrap_streaming_dataset", "prepare_preference_datasets", "get_dataset_wrapper", "prepare_datasets", diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index 2ae7d9052..28732e01d 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -9,13 +9,14 @@ from datasets import ( Dataset, DatasetDict, IterableDataset, + IterableDatasetDict, load_dataset, ) from transformers import PreTrainedTokenizer, ProcessorMixin from axolotl.prompters import Prompter from axolotl.utils.data.lock import FileLockLoader -from axolotl.utils.data.pretraining import wrap_pretraining_dataset +from axolotl.utils.data.streaming import wrap_streaming_dataset from axolotl.utils.data.shared import ( create_train_validation_split, datasets_with_name_generator, @@ -48,7 +49,6 @@ def prepare_datasets( cfg: DictDefault, tokenizer: PreTrainedTokenizer, processor: ProcessorMixin | None = None, - preprocess_iterable: 
bool = False, ) -> tuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]: """Prepare training and evaluation datasets based on configuration. @@ -56,23 +56,19 @@ def prepare_datasets( cfg: Dictionary mapping `axolotl` config keys to values. tokenizer: Tokenizer to use for processing text. processor: Optional processor for multimodal datasets. - preprocess_iterable: Whether to use iterable preprocessing. Returns: Tuple of (train_dataset, eval_dataset, total_steps, prompters). """ - if cfg.pretraining_dataset: - return _prepare_pretraining_dataset( - cfg, tokenizer, processor, preprocess_iterable - ) - return _prepare_standard_dataset(cfg, tokenizer, processor, preprocess_iterable) + if cfg.streaming or cfg.pretraining_dataset: + return _prepare_streaming_dataset(cfg, tokenizer, processor) + return _prepare_standard_dataset(cfg, tokenizer, processor) def _prepare_standard_dataset( cfg: DictDefault, tokenizer: PreTrainedTokenizer, processor: ProcessorMixin | None, - preprocess_iterable: bool, ) -> tuple[Dataset, Dataset | None, int, list[Prompter | None]]: """Prepare standard (non-pretraining) datasets.""" @@ -83,7 +79,6 @@ def _prepare_standard_dataset( cfg, split="train", processor=processor, - preprocess_iterable=preprocess_iterable, ) # Overwrite eval_dataset if test data exists @@ -93,7 +88,6 @@ def _prepare_standard_dataset( cfg, split="test", processor=processor, - preprocess_iterable=preprocess_iterable, ) return train_dataset, eval_dataset, prompters @@ -128,22 +122,40 @@ def _prepare_standard_dataset( return train_dataset, eval_dataset, total_num_steps, prompters -def _prepare_pretraining_dataset( +def _prepare_streaming_dataset( cfg: DictDefault, tokenizer: PreTrainedTokenizer, processor: ProcessorMixin | None, - preprocess_iterable: bool, ) -> tuple[IterableDataset, Dataset | None, int, list[Prompter | None]]: """ - Prepare dataset for pretraining mode. + Prepare dataset for streaming mode. - Note: Pre-training datasets are streamed from the HuggingFace Hub. + Note: Streaming datasets are loaded incrementally from the source. 
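+
+    Three cases are handled below: `pretraining_dataset` configs, packed streaming
+    SFT datasets, and non-packed streaming SFT datasets, which fall back to the
+    legacy loading path and return early.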
""" - # Extract pretraining dataset configuration - pretraining_config = _extract_pretraining_config(cfg) + if cfg.pretraining_dataset: + dataset_config = _extract_pretraining_config(cfg) + train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer) + elif cfg.sample_packing: + # TODO(djsaunde): Implement for multiple datasets + dataset_config = DictDefault(cfg.datasets[0]) - # Load streaming dataset for training - train_dataset = _load_pretraining_dataset(pretraining_config, cfg, tokenizer) + # Ensure we have a split set - default to 'train' if not specified + if not hasattr(dataset_config, "split") or not dataset_config.split: + dataset_config.split = "train" + train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer) + else: + # Use legacy loading function for non-packed streaming datasets + train_dataset, eval_dataset, prompters = _load_and_prepare_datasets( + tokenizer, + cfg, + split="train", + processor=processor, + streaming=True, + ) + + # Return early for non-packed streaming datasets + total_num_steps = cfg.max_steps if cfg.max_steps else -1 + return train_dataset, eval_dataset, total_num_steps, prompters # Load evaluation dataset if specified eval_dataset = None @@ -153,14 +165,12 @@ def _prepare_pretraining_dataset( cfg, split="test", processor=processor, - preprocess_iterable=preprocess_iterable, + streaming=False, ) - if cfg.dataset_exact_deduplication: - LOG.info("Deduplication not available for pretrained datasets") - - # For pretraining, we return max_steps directly from config - return train_dataset, eval_dataset, cfg.max_steps, [] + # For streaming, we return max_steps directly from config or -1 if not set + total_num_steps = cfg.max_steps if cfg.max_steps else -1 + return train_dataset, eval_dataset, total_num_steps, [] def _extract_pretraining_config(cfg: DictDefault) -> DictDefault: @@ -192,7 +202,7 @@ def _extract_pretraining_config(cfg: DictDefault) -> DictDefault: ) -def _load_pretraining_dataset( +def _load_streaming_dataset( pretraining_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer ) -> IterableDataset: """Load and prepare a streaming dataset for pretraining.""" @@ -227,15 +237,11 @@ def _load_pretraining_dataset( iter_dataset = iter_dataset.skip(pretraining_config["skip"]) # Wrap the dataset for pretraining - train_dataset = wrap_pretraining_dataset( + train_dataset = wrap_streaming_dataset( iter_dataset, tokenizer, cfg, dataset_wrapper_partial, - max_tokens=cfg.sequence_len, - batch_size=cfg.micro_batch_size, - seed=cfg.seed, - buffer_size=cfg.pretrain_multipack_buffer_size or 10_000, ) # Format for PyTorch @@ -256,7 +262,7 @@ def _load_tokenized_prepared_datasets( cfg: DictDefault, split: Literal["train", "test"] = "train", processor: ProcessorMixin | None = None, - preprocess_iterable: bool = False, + streaming: bool = False, ) -> tuple[Dataset | DatasetDict, list[Prompter | None]]: """Load or create tokenized and prepared datasets for training or testing. @@ -265,7 +271,7 @@ def _load_tokenized_prepared_datasets( cfg: Configuration object. split: Dataset split to load ('train' or 'test'). processor: Optional processor for multimodal datasets. - preprocess_iterable: Whether to use iterable preprocessing. + streaming: Whether to use iterable preprocessing. Returns: Tuple of (dataset, prompters list). 
@@ -296,7 +302,7 @@ def _load_tokenized_prepared_datasets( tokenizer, split, processor, - preprocess_iterable, + streaming, ) return dataset, prompters @@ -308,7 +314,7 @@ def _load_raw_datasets( tokenizer: PreTrainedTokenizer, split: str, processor: ProcessorMixin | None = None, - preprocess_iterable: bool = False, + streaming: bool = False, ) -> tuple[Dataset, list[Prompter | None]]: """Load, process, merge, and save raw datasets.""" LOG.info("Loading raw datasets...", main_process_only=False) @@ -329,7 +335,7 @@ def _load_raw_datasets( split=split, seed=cfg.seed, processor=processor, - preprocess_iterable=preprocess_iterable, + streaming=streaming, ) datasets.append(dataset_wrapper) prompters.append(dataset_prompter) @@ -337,7 +343,7 @@ def _load_raw_datasets( # Merge datasets dataset = merge_datasets(datasets, cfg) - if not cfg.skip_prepare_dataset: + if not cfg.skip_prepare_dataset and not streaming: if split == "test" and cfg.eval_sequence_len: dataset = handle_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg) else: @@ -361,19 +367,19 @@ def _load_and_process_single_dataset( split: str, seed: int, processor: ProcessorMixin | None = None, - preprocess_iterable: bool = False, + streaming: bool = False, ) -> tuple[Dataset | IterableDataset, Prompter | None]: """Load and process a single dataset based on the passed config.""" # Load the dataset dataset = load_dataset_with_config( - dataset_config, cfg.hf_use_auth_token, streaming=preprocess_iterable + dataset_config, cfg.hf_use_auth_token, streaming=streaming ) # Parse dataset type d_base_type, d_prompt_style = _parse_dataset_type(dataset_config.type) # Select the appropriate split - if isinstance(dataset, DatasetDict): + if isinstance(dataset, (DatasetDict, IterableDatasetDict)): if dataset_config.split and dataset_config.split in dataset: dataset = dataset[dataset_config.split] elif split in dataset: @@ -479,7 +485,7 @@ def _load_and_prepare_datasets( cfg: DictDefault, split: Literal["train", "test"] = "train", processor: ProcessorMixin | None = None, - preprocess_iterable: bool = False, + streaming: bool = False, ) -> tuple[Dataset | None, Dataset | None, list[Prompter | None]]: """Load and prepare datasets with optional validation split and sharding. @@ -488,7 +494,7 @@ def _load_and_prepare_datasets( cfg: Configuration object. split: Dataset split to load ('train' or 'test'). processor: Optional processor for multimodal datasets. - preprocess_iterable: Whether to use iterable preprocessing. + streaming: Whether to use iterable preprocessing. Returns: Tuple of (train_dataset, eval_dataset, prompters). 
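+
+    With `streaming=True`, the dataset is loaded as an `IterableDataset` and the
+    tokenize-and-cache prepare step is skipped in favor of lazy processing.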
@@ -499,7 +505,7 @@ def _load_and_prepare_datasets(
         cfg,
         split=split,
         processor=processor,
-        preprocess_iterable=preprocess_iterable,
+        streaming=streaming,
     )
 
     # Apply dataset sharding if configured using shared function
diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py
index 1d7d37f15..6b6e0e281 100644
--- a/src/axolotl/utils/data/shared.py
+++ b/src/axolotl/utils/data/shared.py
@@ -236,11 +236,9 @@ def _load_from_local_path(
         try:
             return load_from_disk(dataset_config.path)
         except FileNotFoundError:
-            load_dataset_kwargs["streaming"] = False
             return load_dataset(dataset_config.path, **load_dataset_kwargs)
     elif local_path.is_file():
         dataset_type = get_dataset_type(dataset_config)
-        load_dataset_kwargs["streaming"] = False
         return load_dataset(
             dataset_type,
             data_files=dataset_config.path,
diff --git a/src/axolotl/utils/data/pretraining.py b/src/axolotl/utils/data/streaming.py
similarity index 86%
rename from src/axolotl/utils/data/pretraining.py
rename to src/axolotl/utils/data/streaming.py
index 72c5536e9..2cb35ee7c 100644
--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/streaming.py
@@ -1,4 +1,4 @@
-"""data handling specific to pretraining"""
+"""Data handling specific to streaming datasets."""
 
 import functools
 from collections import defaultdict
@@ -17,10 +17,10 @@ from axolotl.utils.trainer import process_pretraining_datasets_for_packing
 LOG = get_logger(__name__)
 
 
-def encode_pretraining(
+def encode_streaming(
+    examples: Dict[str, List],
     tokenizer: PreTrainedTokenizerBase,
     max_tokens: int,
-    examples: Dict[str, List],
     text_column: str = "text",
     concatenate: bool = True,
 ) -> Dict[str, List]:
@@ -176,45 +176,57 @@ def encode_pretraining(
     return ret
 
 
-def wrap_pretraining_dataset(
+def wrap_streaming_dataset(
     dataset,
     tokenizer,
     cfg,
     ds_wrapper_fn,
-    max_tokens=2048,
-    batch_size=1,
-    seed=42,
-    buffer_size=10_000,
 ):
     if cfg.sample_packing:
+        # For SFT (non-pretraining) datasets, always use multipack_attn=True to ensure
+        # attention isolation between packed sequences
+        multipack_attn = (
+            True if not cfg.pretraining_dataset else cfg.pretrain_multipack_attn
+        )
+
         collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
             tokenizer,
             return_tensors="pt",
             padding=True,
-            pad_to_multiple_of=max_tokens,
-            multipack_attn=cfg.pretrain_multipack_attn,
+            pad_to_multiple_of=cfg.sequence_len,
+            multipack_attn=multipack_attn,
         )
         encode = functools.partial(
-            encode_packed_pretraining,
+            encode_packed_streaming,
             collate_fn,
             ds_wrapper_fn,
-            max_seq_length=max_tokens,
-            batch_size=batch_size,
-            multipack_attn=cfg.pretrain_multipack_attn,
+            max_seq_length=cfg.sequence_len,
+            batch_size=cfg.micro_batch_size,
+            multipack_attn=multipack_attn,
        )
-        # set this to 1 so downstream data_loader doesn't try to increase the batch again
+
+        # Set this to 1 so downstream data_loader doesn't try to increase the batch size
+        # again
        cfg.micro_batch_size = 1
     else:
+        # NOTE: This is not reachable for SFT datasets since we use the pre-existing
+        # loading function for non-packed streaming datasets. Refer to
+        # _prepare_streaming_dataset in sft.py for that code path.
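+        #
+        # An illustrative config that reaches this branch (values are examples
+        # only, not defaults):
+        #
+        #   pretraining_dataset:
+        #     - path: allenai/c4
+        #       text_column: text
+        #   sample_packing: false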
+ text_column = ( + getattr(cfg.pretraining_dataset[0], "text_column", "text") or "text" + ) encode = functools.partial( - encode_pretraining, - tokenizer, - max_tokens, - text_column=cfg.pretraining_dataset[0].text_column or "text", + encode_streaming, + tokenizer=tokenizer, + max_tokens=cfg.sequence_len, + text_column=text_column, concatenate=cfg.pretraining_sample_concatenation is True, ) if cfg.shuffle_merged_datasets: - dataset = dataset.shuffle(seed=seed, buffer_size=buffer_size) + dataset = dataset.shuffle( + seed=cfg.seed, buffer_size=cfg.streaming_multipack_buffer_size + ) else: LOG.debug("NOT shuffling merged pretraining datasets") @@ -232,14 +244,13 @@ def wrap_pretraining_dataset( dataset = dataset.map( encode, batched=True, - batch_size=buffer_size, - # input_columns="text", + batch_size=cfg.streaming_multipack_buffer_size, remove_columns=remove_columns, ) return dataset -def encode_packed_pretraining( +def encode_packed_streaming( collate_fn, ds_wrapper: Callable, examples: Dict[str, List], @@ -274,8 +285,6 @@ def encode_packed_pretraining( for batch in sampler: for data in batch: features = train_dataset[data] - if "num_truncated_tokens" in features: - del features["num_truncated_tokens"] if "num_truncated_tokens" in features: del features["num_truncated_tokens"] if "overflow_to_sample_mapping" in features: diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py index 4868576a0..445a65d6c 100644 --- a/src/axolotl/utils/data/utils.py +++ b/src/axolotl/utils/data/utils.py @@ -190,12 +190,21 @@ def handle_long_seq_in_dataset( Returns: Filtered dataset with long sequences removed. """ - if "input_ids" not in dataset.column_names: + if ( + hasattr(dataset, "column_names") + and dataset.column_names + and "input_ids" not in dataset.column_names + ): LOG.warning( "Dataset does not contain 'input_ids' column. Skip drop long seq. This is " "expected for reward modeling." ) return dataset + elif not hasattr(dataset, "column_names") or dataset.column_names is None: + LOG.info( + "Dataset is streaming (IterableDataset), skipping long sequence handling" + ) + return dataset drop_long = functools.partial( drop_long_seq, diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 4b5f571dc..d43c346cd 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -475,12 +475,6 @@ class AxolotlInputConfig( }, ) multipack_real_batches: bool | None = None - pretraining_sample_concatenation: bool | None = Field( - default=None, - json_schema_extra={ - "description": "whether to concatenate samples during pretraining", - }, - ) batch_flattening: Literal["auto"] | bool | None = Field( default=None, @@ -495,13 +489,34 @@ class AxolotlInputConfig( pose_max_context_len: int | None = None pose_num_chunks: int | None = None - pretrain_multipack_buffer_size: int | None = 10_000 + # Deprecated: Use streaming_multipack_buffer_size instead + pretrain_multipack_buffer_size: int | None = Field( + default=None, + deprecated="Deprecated in v0.13.0, will be removed in v0.14.0. 
Use streaming_multipack_buffer_size instead", + ) pretrain_multipack_attn: bool | None = Field( default=True, json_schema_extra={ "description": "whether to prevent cross attention for packed sequences during pretraining", }, ) + pretraining_sample_concatenation: bool | None = Field( + default=None, + json_schema_extra={ + "description": "whether to concatenate samples during pretraining", + }, + ) + + streaming: bool | None = Field( + default=None, + json_schema_extra={"description": "Use streaming mode for loading datasets"}, + ) + streaming_multipack_buffer_size: int | None = Field( + default=10_000, + json_schema_extra={ + "description": "Buffer size for multipack streaming datasets" + }, + ) xformers_attention: bool | None = Field( default=None, @@ -1264,3 +1279,14 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): data["dataset_processes"] = get_default_process_count() return data + + @model_validator(mode="before") + @classmethod + def check_deduplication_with_streaming(cls, data): + if data.get("dataset_exact_deduplication") and ( + data.get("streaming") or data.get("pretraining_dataset") + ): + raise NotImplementedError( + "dataset_exact_deduplication is not available for streaming datasets. " + ) + return data diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 791894990..49add8081 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -60,6 +60,20 @@ class DatasetValidationMixin: raise ValueError("either datasets or pretraining_dataset is required") return data + @model_validator(mode="before") + @classmethod + def check_pretraining_streaming_deprecation(cls, data): + # TODO(djsaunde): remove this check + implement change for 0.13.0 release + if data.get("pretraining_dataset") and not data.get("streaming"): + LOG.warning( + "Setting `pretraining_dataset` without explicitly setting `streaming: " + "true` is deprecated. In a future release, streaming will not be " + "automatically enabled when using pretraining_dataset. Please " + "explicitly set `streaming: true` in your configuration to maintain " + "current behavior." + ) + return data + @model_validator(mode="before") @classmethod def check_push_ds_auth(cls, data): @@ -340,6 +354,30 @@ class TrainingValidationMixin: ) return data + @model_validator(mode="before") + @classmethod + def check_multipack_buffer_size(cls, data): + if data.get("pretrain_multipack_buffer_size") and not data.get( + "streaming_multipack_buffer_size" + ): + LOG.warning( + "`pretrain_multipack_buffer_size` is deprecated in v0.13.0, will be " + "removed in v0.14.0. Use `streaming_multipack_buffer_size` instead." 
+ ) + data["streaming_multipack_buffer_size"] = data[ + "pretrain_multipack_buffer_size" + ] + del data["pretrain_multipack_buffer_size"] + elif data.get("pretrain_multipack_buffer_size") and data.get( + "streaming_multipack_buffer_size" + ): + raise ValueError( + "pretrain_multipack_buffer_size is deprecated, use " + "streaming_multipack_buffer_size; both are set, please remove the " + "deprecated pretrain_multipack_buffer_size setting" + ) + return data + @model_validator(mode="after") def check_fft_possible_bad_config(self): if ( @@ -1074,6 +1112,50 @@ class PretrainingValidationMixin: data["accelerator_config"]["dispatch_batches"] = False return data + @model_validator(mode="before") + @classmethod + def check_pretraining_w_val_set_size(cls, data): + if data.get("pretraining_dataset") and data.get("val_set_size"): + raise ValueError( + "val_set_size is not supported with pretraining_dataset. " + "Use test_datasets to specify evaluation datasets for pretraining." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_streaming_w_val_set_size(cls, data): + if data.get("streaming") and data.get("val_set_size"): + raise ValueError( + "val_set_size is not supported with streaming datasets. " + "Use test_datasets to specify evaluation datasets when streaming is enabled." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_streaming_w_max_steps(cls, data): + if data.get("streaming") and not data.get("max_steps"): + raise ValueError( + "max_steps must be set when using streaming datasets. " + "Trainer cannot infer dataset length for iterable datasets." + ) + return data + + @model_validator(mode="before") + @classmethod + def check_streaming_w_multiple_datasets(cls, data): + if ( + data.get("streaming") + and data.get("sample_packing") + and data.get("datasets") + and len(data.get("datasets")) > 1 + ): + raise NotImplementedError( + "Sample packing with multiple streaming datasets is not yet supported" + ) + return data + class ModelCompatibilityValidationMixin: """Validation methods for specific model compatibility.""" diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py index 98383614b..ff47b9427 100644 --- a/tests/e2e/integrations/test_kd.py +++ b/tests/e2e/integrations/test_kd.py @@ -25,7 +25,7 @@ def min_cfg(temp_dir): "liger_rms_norm": True, "liger_glu_activation": True, "torch_compile": True, - "chat_template": "llama3", + "chat_template": "qwen3", "kd_trainer": True, "kd_ce_alpha": 0.1, "kd_alpha": 0.9, diff --git a/tests/e2e/test_streaming.py b/tests/e2e/test_streaming.py new file mode 100644 index 000000000..5dccf00dd --- /dev/null +++ b/tests/e2e/test_streaming.py @@ -0,0 +1,73 @@ +"""E2E tests for streaming dataset functionality""" + +# pylint: disable=duplicate-code + +import pytest + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from .utils import check_model_output_exists, check_tensorboard + + +class TestStreamingDatasets: + """Test case for streaming datasets""" + + @pytest.mark.parametrize( + "sample_packing", + [True, False], + ) + def test_streaming_dataset(self, temp_dir, sample_packing): + """Test streaming datasets""" + + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "flash_attention": True, + "sequence_len": 1024, + "sample_packing": sample_packing, + "pretrain_multipack_attn": sample_packing, + 
"streaming_multipack_buffer_size": 10000, + "dataset_processes": 1, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + # Streaming config + "streaming": True, + "max_steps": 3, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "val_set_size": 0.0, + "output_dir": temp_dir, + "learning_rate": 0.00001, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "save_safetensors": True, + "bf16": "auto", + "use_tensorboard": True, + "save_first_step": False, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) + + # Verify training actually happened by checking loss decrease + check_tensorboard( + temp_dir + "/runs", + "train/train_loss", + 3.0, + "Train Loss (%s) is too high", + ) diff --git a/tests/test_data.py b/tests/test_data.py index 6d583cfd3..99ed06336 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -6,7 +6,7 @@ import unittest from transformers import LlamaTokenizer -from axolotl.utils.data import encode_pretraining, md5 +from axolotl.utils.data import encode_streaming, md5 from tests.hf_offline_utils import enable_hf_offline @@ -39,7 +39,7 @@ class TestEncodePretraining(unittest.TestCase): "hello, hello", ] } - result = encode_pretraining(self.tokenizer, self.max_tokens, examples) + result = encode_streaming(examples, self.tokenizer, self.max_tokens) self.assertEqual(len(result["input_ids"]), 3) diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py index 43e4f3d39..64f314e2e 100644 --- a/tests/test_packed_dataset.py +++ b/tests/test_packed_dataset.py @@ -1,16 +1,11 @@ """Module for testing dataset sequence packing""" import unittest -from pathlib import Path -from datasets import Dataset, load_dataset from transformers import AutoTokenizer from axolotl.cli.args import TrainerCliArgs from axolotl.common.datasets import load_datasets -from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset -from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy -from axolotl.prompters import AlpacaPrompter from axolotl.train import setup_model_and_trainer from axolotl.utils.config import normalize_config, validate_config from axolotl.utils.dict import DictDefault @@ -35,43 +30,6 @@ class TestPacking(unittest.TestCase): } ) - def test_increments_attention(self): - prompter = AlpacaPrompter("chat") - strat = AlpacaPromptTokenizingStrategy( - prompter, - self.tokenizer, - False, - 2048, - ) - dateset = load_dataset( - "json", - data_files=str(Path(__file__).parent / "fixtures/alpaca/alpaca.json"), - )["train"] - dataset = Dataset.from_list(list(TokenizedPromptDataset(strat, dateset))) - - constant_len_dataset = ConstantLengthDataset( - self.tokenizer, - [dataset], - seq_length=2048, - ) - packed_dataset = Dataset.from_list(list(constant_len_dataset)) - example = packed_dataset[0] - next_bos_index = ( - example["input_ids"][1:].index(self.tokenizer.bos_token_id) + 1 - ) # add one since we sliced - - # first example doesn't have mask reset - assert example["input_ids"][0] == self.tokenizer.bos_token_id - assert example["attention_mask"][0] == 1 - assert example["position_ids"][0] == 0 - assert example["position_ids"][1] == 1 - - # but subsequent one does - assert example["input_ids"][next_bos_index] == self.tokenizer.bos_token_id - assert example["attention_mask"][next_bos_index] == 2 - 
assert example["position_ids"][next_bos_index] == 0 - assert example["position_ids"][next_bos_index + 1] == 1 - @with_temp_dir def test_lora_packing(self, temp_dir): cfg = DictDefault( diff --git a/tests/test_packed_pretraining.py b/tests/test_packed_pretraining.py index 117bc0dbd..0458f7ba2 100644 --- a/tests/test_packed_pretraining.py +++ b/tests/test_packed_pretraining.py @@ -9,7 +9,7 @@ import torch from datasets import IterableDataset from torch.utils.data import DataLoader -from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset +from axolotl.utils.data import get_dataset_wrapper, wrap_streaming_dataset from axolotl.utils.dict import DictDefault @@ -77,14 +77,11 @@ class TestPretrainingPacking: ) original_bsz = cfg.micro_batch_size - train_dataset = wrap_pretraining_dataset( + train_dataset = wrap_streaming_dataset( dataset, tokenizer_huggyllama, cfg, ds_wrapper_partial, - max_tokens=cfg.sequence_len, - batch_size=cfg.micro_batch_size, - seed=cfg.seed or 42, ) trainer_loader = DataLoader( diff --git a/tests/test_streaming.py b/tests/test_streaming.py new file mode 100644 index 000000000..54acbb5e4 --- /dev/null +++ b/tests/test_streaming.py @@ -0,0 +1,238 @@ +"""Test streaming configuration and data loading functionality.""" + +import unittest +from unittest.mock import Mock, patch + +from datasets import IterableDataset + +from axolotl.utils.dict import DictDefault +from axolotl.utils.data.sft import ( + _prepare_streaming_dataset, + prepare_datasets, +) +from axolotl.utils.config import validate_config + + +class TestStreamingConfig(unittest.TestCase): + """Test streaming configuration and deprecation handling.""" + + def test_streaming_multipack_buffer_size_deprecation(self): + """Test that pretrain_multipack_buffer_size is properly deprecated.""" + # Test with old config name + cfg_old = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "pretrain_multipack_buffer_size": 5000, + "datasets": [{"path": "test/dataset", "type": "alpaca"}], + "sequence_len": 256, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 0.0001, + } + ) + + with self.assertLogs("axolotl.utils.schemas.validation", level="WARNING") as cm: + validated_cfg = validate_config(cfg_old) + self.assertIn("pretrain_multipack_buffer_size` is deprecated", cm.output[0]) + + self.assertEqual(validated_cfg.streaming_multipack_buffer_size, 5000) + self.assertIsNone( + getattr(validated_cfg, "pretrain_multipack_buffer_size", None) + ) + + def test_streaming_multipack_buffer_size_new(self): + """Test that new streaming_multipack_buffer_size works correctly.""" + cfg_new = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "streaming_multipack_buffer_size": 7000, + "datasets": [{"path": "test/dataset", "type": "alpaca"}], + "sequence_len": 256, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 0.0001, + } + ) + + validated_cfg = validate_config(cfg_new) + self.assertEqual(validated_cfg.streaming_multipack_buffer_size, 7000) + + def test_both_buffer_sizes_raises_error(self): + """Test that having both old and new buffer size configs raises an error.""" + cfg_both = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "pretrain_multipack_buffer_size": 5000, + "streaming_multipack_buffer_size": 7000, + "datasets": [{"path": "test/dataset", "type": "alpaca"}], + "sequence_len": 256, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 0.0001, + } + ) + + with self.assertRaises(ValueError) 
as cm: + validate_config(cfg_both) + self.assertIn("both are set", str(cm.exception)) + + +class TestStreamingDatasetPreparation(unittest.TestCase): + """Test dataset preparation with streaming configuration.""" + + def setUp(self): + self.tokenizer = Mock() + self.tokenizer.pad_token_id = 0 + self.tokenizer.eos_token_id = 1 + + @patch("axolotl.utils.data.sft._prepare_streaming_dataset") + def test_prepare_datasets_with_streaming_true(self, mock_prepare_streaming): + """Test that streaming=True triggers streaming dataset preparation.""" + cfg = DictDefault( + { + "streaming": True, + "datasets": [{"path": "test/dataset", "type": "alpaca"}], + } + ) + + mock_prepare_streaming.return_value = (Mock(), None, 100, []) + + prepare_datasets(cfg, self.tokenizer) + + mock_prepare_streaming.assert_called_once_with(cfg, self.tokenizer, None) + + @patch("axolotl.utils.data.sft._prepare_streaming_dataset") + def test_prepare_datasets_with_pretraining_dataset(self, mock_prepare_streaming): + """Test that pretraining_dataset triggers streaming dataset preparation.""" + cfg = DictDefault( + { + "pretraining_dataset": "test/dataset", + } + ) + + mock_prepare_streaming.return_value = (Mock(), None, 100, []) + + prepare_datasets(cfg, self.tokenizer) + + mock_prepare_streaming.assert_called_once_with(cfg, self.tokenizer, None) + + @patch("axolotl.utils.data.sft._prepare_standard_dataset") + def test_prepare_datasets_without_streaming(self, mock_prepare_standard): + """Test that without streaming, standard dataset preparation is used.""" + cfg = DictDefault( + { + "datasets": [{"path": "test/dataset", "type": "alpaca"}], + } + ) + + mock_prepare_standard.return_value = (Mock(), None, 100, []) + + prepare_datasets(cfg, self.tokenizer) + + mock_prepare_standard.assert_called_once_with(cfg, self.tokenizer, None) + + +class TestStreamingWithSamplePacking(unittest.TestCase): + """Test streaming dataset preparation with sample packing.""" + + def setUp(self): + self.tokenizer = Mock() + self.tokenizer.pad_token_id = 0 + self.tokenizer.eos_token_id = 1 + + @patch("axolotl.utils.data.sft._load_streaming_dataset") + def test_streaming_sft_with_sample_packing_sets_split(self, mock_load_streaming): + """Test that streaming SFT with sample_packing sets default split.""" + cfg = DictDefault( + { + "streaming": True, + "sample_packing": True, + "datasets": [{"path": "test/dataset", "type": "alpaca"}], + "sequence_len": 256, + "micro_batch_size": 1, + } + ) + + mock_load_streaming.return_value = Mock(spec=IterableDataset) + + with patch("axolotl.utils.data.sft._load_and_prepare_datasets"): + _prepare_streaming_dataset(cfg, self.tokenizer, None) + + # Check that the dataset config has split set to 'train' + call_args = mock_load_streaming.call_args + dataset_config = call_args[0][0] + self.assertEqual(dataset_config.split, "train") + + def test_multipack_attn_forced_true_for_sft(self): + """Test that multipack_attn is forced to True for SFT with sample packing.""" + from axolotl.utils.data.streaming import wrap_streaming_dataset + + cfg = DictDefault( + { + "sample_packing": True, + "pretrain_multipack_attn": False, # Should be overridden for SFT + "pretraining_dataset": None, # This makes it SFT + "sequence_len": 256, + "micro_batch_size": 1, + "streaming_multipack_buffer_size": 1000, + "seed": 42, + } + ) + + mock_dataset = Mock() + mock_dataset.features = None # For streaming datasets + mock_dataset.__iter__ = Mock(return_value=iter([])) # Empty iterator + mock_dataset.map = Mock(return_value=mock_dataset) + 
mock_ds_wrapper = Mock() + + with patch( + "axolotl.utils.data.streaming.PretrainingBatchSamplerDataCollatorForSeq2Seq" + ) as mock_collator: + with patch("axolotl.utils.data.streaming.encode_packed_streaming"): + wrap_streaming_dataset( + mock_dataset, self.tokenizer, cfg, mock_ds_wrapper + ) + + # Check that multipack_attn=True was used in the collator + mock_collator.assert_called_once() + call_kwargs = mock_collator.call_args[1] + self.assertTrue(call_kwargs["multipack_attn"]) + + def test_multipack_attn_respects_config_for_pretraining(self): + """Test that multipack_attn respects config for pretraining datasets.""" + from axolotl.utils.data.streaming import wrap_streaming_dataset + + cfg = DictDefault( + { + "sample_packing": True, + "pretrain_multipack_attn": False, # Should be respected for pretraining + "pretraining_dataset": "test/dataset", # This makes it pretraining + "sequence_len": 256, + "micro_batch_size": 1, + "streaming_multipack_buffer_size": 1000, + "seed": 42, + } + ) + + mock_dataset = Mock() + mock_dataset.features = None # For streaming datasets + mock_dataset.__iter__ = Mock(return_value=iter([])) # Empty iterator + mock_dataset.map = Mock(return_value=mock_dataset) + mock_ds_wrapper = Mock() + + with patch( + "axolotl.utils.data.streaming.PretrainingBatchSamplerDataCollatorForSeq2Seq" + ) as mock_collator: + with patch("axolotl.utils.data.streaming.encode_packed_streaming"): + wrap_streaming_dataset( + mock_dataset, self.tokenizer, cfg, mock_ds_wrapper + ) + + # Check that multipack_attn=False was used (respecting config) + mock_collator.assert_called_once() + call_kwargs = mock_collator.call_args[1] + self.assertFalse(call_kwargs["multipack_attn"]) + + +if __name__ == "__main__": + unittest.main() From 06bebcb65f2b2826d94f47ca0c2b36ea0ea80c67 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 2 Sep 2025 13:13:23 -0400 Subject: [PATCH 030/115] run cu128-2.8.0 e2e tests on B200 (#3126) * run cu128-2.8.0 e2e tests on B200 * not an int :facepalm: * fix yaml --- .github/workflows/tests.yml | 2 ++ cicd/single_gpu.py | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 59011ee77..337230d4a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -303,6 +303,7 @@ jobs: python_version: "3.11" pytorch: 2.8.0 num_gpus: 1 + gpu_type: "B200" axolotl_extras: steps: - name: Checkout @@ -324,6 +325,7 @@ jobs: echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV + echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV - name: Run tests job on Modal diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index 0e2922e90..5a06a34f0 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -57,7 +57,8 @@ VOLUME_CONFIG = { } N_GPUS = int(os.environ.get("N_GPUS", 1)) -GPU_CONFIG = f"L40S:{N_GPUS}" +GPU_TYPE = os.environ.get("GPU_TYPE", "L40S") +GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}" def run_cmd(cmd: str, run_folder: str): From 24aba5cacaf22c137882ae5d5b64f4e2c42ee23e Mon Sep 17 00:00:00 2001 From: xuyifann <159863565+xuyifann@users.noreply.github.com> Date: Tue, 2 Sep 2025 22:40:27 -0700 Subject: [PATCH 031/115] Clamping the len of dataloader to minimum of 1 (#3100) [skip ci] * Clamping the len of dataloader to minimum of 1 * linter 
reformat --- src/axolotl/utils/trainer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 43f76c0cd..a0f4fd567 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -475,7 +475,9 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): train_dataset.remove_columns(["length"]), batch_sampler=sampler, ) - data_loader_len = len(data_loader) * cfg.micro_batch_size // cfg.batch_size + data_loader_len = max( + 1, len(data_loader) * cfg.micro_batch_size // cfg.batch_size + ) LOG.debug(f"data_loader_len: {data_loader_len}") # FIXME: is there a bug here somewhere? the total num steps depends # on the agreed on value for sample_packing_eff_est From e48aa8a5b1d7b6e2fd4da18768e64fab74642259 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 3 Sep 2025 12:40:53 +0700 Subject: [PATCH 032/115] feat(doc): improve visibility for colab notebooks (#3110) [skip ci] * feat: improve visibility for colab notebooks * fix: link to GH colab * feat: change to badge and move higher --- README.md | 5 +++++ docs/installation.qmd | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 117eb9b12..d4794124a 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@
discord twitter + google-colab
tests-nightly multigpu-semi-weekly tests @@ -70,6 +71,10 @@ Features: - Python 3.11 - PyTorch ≥2.6.0 +### Google Colab + +[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa) + ### Installation #### Using pip diff --git a/docs/installation.qmd b/docs/installation.qmd index 763539278..265ff238c 100644 --- a/docs/installation.qmd +++ b/docs/installation.qmd @@ -134,7 +134,7 @@ For providers supporting Docker: ### Google Colab {#sec-colab} -Use our [example notebook](../examples/colab-notebooks/colab-axolotl-example.ipynb). +[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa) ## Platform-Specific Instructions {#sec-platform-specific} From 4cc6038d52b2b66794be150b0caab82ede436872 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 3 Sep 2025 01:41:34 -0400 Subject: [PATCH 033/115] chore: update pre-commit hooks (#3122) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 4c2861346..53e49d747 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.9 + rev: v0.12.11 hooks: - id: ruff args: [--fix] From 53a0c1f39c3f043135d93723ff46fa27cc9360fc Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 3 Sep 2025 12:48:01 +0700 Subject: [PATCH 034/115] feat: add peft_trainable_token_indices (#3062) * feat: add peft_trainable_token_indices * feat: add warning compat with fix_untrained_tokens --- src/axolotl/loaders/adapter.py | 2 ++ src/axolotl/utils/schemas/config.py | 30 ++++++++++++++++++++++++++++- src/axolotl/utils/schemas/peft.py | 10 ++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/axolotl/loaders/adapter.py b/src/axolotl/loaders/adapter.py index 867e6901c..989b34aee 100644 --- a/src/axolotl/loaders/adapter.py +++ b/src/axolotl/loaders/adapter.py @@ -98,6 +98,8 @@ def load_lora( lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora if cfg.peft_layer_replication: lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication + if cfg.peft_trainable_token_indices: + lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices lora_config = LoraConfig( r=cfg.lora_r, diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index d43c346cd..1d2ddf4ae 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -947,7 +947,15 @@ class AxolotlInputConfig( }, ) - fix_untrained_tokens: int | list[int] | None = None + fix_untrained_tokens: int | list[int] | None = Field( + default=None, + json_schema_extra={ + "description": ( + "Token index or indices to adjust embedding weights to the mean of the other tokens. " + "This is useful when the model has untrained embeddings." 
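+                # e.g. `fix_untrained_tokens: [128002, 128003]` (illustrative ids)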
+            )
+        },
+    )
 
     # INTERNALS - document for now, generally not set externally
     is_preprocess: bool | None = None
@@ -1006,6 +1014,26 @@ class AxolotlInputConfig(
             return [ds_config.model_dump(exclude_none=True) for ds_config in ds_configs]
         return None
 
+    @model_validator(mode="before")
+    @classmethod
+    def warn_peft_trainable_token_to_fix_untrained(cls, data):
+        if (
+            peft_trainable_token_indices := data.get("peft_trainable_token_indices")
+        ) and (fix_untrained_tokens := data.get("fix_untrained_tokens")):
+            if isinstance(fix_untrained_tokens, int):
+                fix_untrained_tokens = (fix_untrained_tokens,)
+
+            if isinstance(peft_trainable_token_indices, int):
+                peft_trainable_token_indices = (peft_trainable_token_indices,)
+
+            for untrained_token_id in fix_untrained_tokens:
+                if untrained_token_id not in peft_trainable_token_indices:
+                    LOG.warning_once(
+                        f"Token {untrained_token_id} is fixed via `fix_untrained_tokens`, yet not in `peft_trainable_token_indices: ` list. "
+                        "Please add it, otherwise the token won't be trained on."
+                    )
+        return data
+
 
 class AxolotlConfigWCapabilities(AxolotlInputConfig):
     """wrapper to validate GPU capabilities with the configured options"""
diff --git a/src/axolotl/utils/schemas/peft.py b/src/axolotl/utils/schemas/peft.py
index de29521cb..af22913fd 100644
--- a/src/axolotl/utils/schemas/peft.py
+++ b/src/axolotl/utils/schemas/peft.py
@@ -90,6 +90,16 @@ class LoraConfig(BaseModel):
             "description": "How to initialize LoRA weights. Default to True which is MS original implementation."
         },
     )
+    peft_trainable_token_indices: list[int] | dict[str, list[int]] | None = Field(
+        default=None,
+        json_schema_extra={
+            "description": (
+                "A list of token indices to fine-tune on the `embed_tokens` layer.\n"
+                "Otherwise, a dict mapping an embedding layer name to its trainable token indices.\n"
+                "See https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-tokens-alongside-lora"
+            )
+        },
+    )
 
     qlora_sharded_model_loading: bool | None = Field(
         default=False,
From 48db520d92e541055bde7e41e0269b6a80ca2301 Mon Sep 17 00:00:00 2001
From: mhenrichsen
Date: Wed, 3 Sep 2025 22:20:32 +0200
Subject: [PATCH 035/115] Create 270m-qlora.yml (#3075) [skip ci]

Adds 270m gemma3 qlora
---
 examples/gemma3/270m-qlora.yml | 68 ++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)
 create mode 100644 examples/gemma3/270m-qlora.yml

diff --git a/examples/gemma3/270m-qlora.yml b/examples/gemma3/270m-qlora.yml
new file mode 100644
index 000000000..8744fad26
--- /dev/null
+++ b/examples/gemma3/270m-qlora.yml
@@ -0,0 +1,68 @@
+base_model: google/gemma-3-270m-it
+# optionally might have model_type or tokenizer_type
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+# gemma3 doesn't seem to play nice with ddp
+ddp_find_unused_parameters: true
+
+load_in_8bit: false
+load_in_4bit: true
+
+# huggingface repo
+chat_template: gemma3
+eot_tokens:
+  - <end_of_turn>
+datasets:
+  - path: cgato/SlimOrcaDedupCleaned
+    type: chat_template
+    field_messages: conversations
+    message_property_mappings:
+      role: from
+      content: value
+
+val_set_size: 0.0
+output_dir: ./outputs/out
+
+adapter: qlora
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: false
+
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+
+gradient_accumulation_steps: 4
+micro_batch_size: 1
+num_epochs: 1
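+# adamw_bnb_8bit stores optimizer state in 8 bits via bitsandbytes to save memory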
+optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: From efa1da52d500ae0dfa4cc192523a0a587611f020 Mon Sep 17 00:00:00 2001 From: yardenhoch <137788890+yardenhoch@users.noreply.github.com> Date: Wed, 3 Sep 2025 23:22:37 +0300 Subject: [PATCH 036/115] Center rewards coefficient (#3124) * feat: add center_rewards_coefficient for reward modeling - Add center_rewards_coefficient parameter to Pydantic schema with paper reference - Pass parameter through base builder and causal builder to training args - Add documentation section with usage examples and theoretical background - Enable parameter in reward modeling example configs with recommended value - Enables reward centering for improved training stability in RLHF workflows Implements auxiliary loss from Eisenstein et al. 2023 (https://huggingface.co/papers/2312.09244) to incentivize mean-zero reward outputs without post-training normalization. * Update description * test: add unit tests for center_rewards_coefficient integration * Update src/axolotl/core/builders/base.py Co-authored-by: NanoCode012 * Update docs/reward_modelling.qmd Co-authored-by: NanoCode012 * Update docs/reward_modelling.qmd Co-authored-by: NanoCode012 * reference to TRL documentation. * add new reward model configuration for qwen3 with comprehensive parameters * Verified center_rewards_coefficient is correctly passed through the trainer builder to training arguments. * Refactor reward modeling documentation to consolidate information on center_rewards_coefficient * Remove unit tests for center_rewards_coefficient integration as part of codebase cleanup. * linting * nit * Apply suggestions from code review Co-authored-by: NanoCode012 * lint --------- Co-authored-by: NanoCode012 Co-authored-by: Salman Mohammadi --- docs/reward_modelling.qmd | 1 + examples/qwen3/reward-model.yaml | 44 +++++++++++++++++++++++++++++ src/axolotl/core/builders/causal.py | 13 +++++---- src/axolotl/utils/schemas/config.py | 6 ++++ 4 files changed, 58 insertions(+), 6 deletions(-) create mode 100644 examples/qwen3/reward-model.yaml diff --git a/docs/reward_modelling.qmd b/docs/reward_modelling.qmd index 386dc1f57..b5cf3010d 100644 --- a/docs/reward_modelling.qmd +++ b/docs/reward_modelling.qmd @@ -11,6 +11,7 @@ We support the reward modelling techniques supported by `trl`. ### (Outcome) Reward Models Outcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step). +For improved training stability, you can use the `center_rewards_coefficient` parameter to encourage mean-zero reward outputs ([see TRL docs](https://huggingface.co/docs/trl/v0.10.1/en/reward_trainer#centering-rewards)). 
```yaml base_model: google/gemma-2-2b diff --git a/examples/qwen3/reward-model.yaml b/examples/qwen3/reward-model.yaml new file mode 100644 index 000000000..43c62ecc4 --- /dev/null +++ b/examples/qwen3/reward-model.yaml @@ -0,0 +1,44 @@ +base_model: Skywork/Skywork-Reward-V2-Qwen3-8B +model_type: AutoModelForSequenceClassification +num_labels: 1 + +reward_model: true +center_rewards_coefficient: 0.01 # Incentivize mean-zero rewards for improved stability +chat_template: qwen3 +datasets: + - path: argilla/distilabel-intel-orca-dpo-pairs + type: bradley_terry.chat_template + +val_set_size: 0.0 +output_dir: ./outputs/out + +sequence_len: 8192 +sample_packing: false +eval_sample_packing: false +pad_to_sequence_len: true + +deepspeed: deepspeed_configs/zero1.json + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 1 +eval_batch_size: 1 +num_epochs: 3 +optimizer: adamw_bnb_8bit +lr_scheduler: linear +learning_rate: 0.00002 + +bf16: true +tf32: true + +gradient_checkpointing: true +gradient_checkpointing_kwargs: + use_reentrant: false +warmup_ratio: 0.1 +logging_steps: 1 +weight_decay: 0.01 diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py index e5bc68c39..057d0ab5c 100644 --- a/src/axolotl/core/builders/causal.py +++ b/src/axolotl/core/builders/causal.py @@ -7,10 +7,7 @@ from pathlib import Path from typing import Type, Union import transformers -from transformers import ( - DataCollatorWithFlattening, - EarlyStoppingCallback, -) +from transformers import DataCollatorWithFlattening, EarlyStoppingCallback from trl.trainer.utils import RewardDataCollatorWithPadding from axolotl.core.builders.base import TrainerBuilderBase @@ -26,12 +23,12 @@ from axolotl.monkeypatch.relora import ReLoRACallback from axolotl.processing_strategies import get_processing_strategy from axolotl.utils import is_comet_available, is_mlflow_available from axolotl.utils.callbacks import ( - LossWatchDogCallback, - SaveBetterTransformerModelCallback, bench_eval_callback_factory, causal_lm_bench_eval_callback_factory, colab_inference_post_train_callback, log_prediction_callback_factory, + LossWatchDogCallback, + SaveBetterTransformerModelCallback, ) from axolotl.utils.callbacks.lisa import lisa_callback_factory from axolotl.utils.callbacks.qat import QATCallback @@ -340,6 +337,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase): if self.cfg.reward_model: training_args_cls = AxolotlRewardConfig + if self.cfg.center_rewards_coefficient is not None: + training_arguments_kwargs["center_rewards_coefficient"] = ( + self.cfg.center_rewards_coefficient + ) elif self.cfg.process_reward_model: training_args_cls = AxolotlPRMConfig else: diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 1d2ddf4ae..32d7b68e7 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -138,6 +138,12 @@ class AxolotlInputConfig( "description": "Process reward modelling: `True` or `False`" }, ) + center_rewards_coefficient: float | None = Field( + default=None, + json_schema_extra={ + "description": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`." + }, + ) num_labels: int | None = None # Whether to use weighting in DPO trainer. # If `None`, default is `False` in the trainer. 
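As a sketch of what the centering term adds to reward-model training (following Eisenstein et al. 2023, Eq. 2): the pairwise loss gains a penalty on the squared sum of each pair's rewards. The function below is illustrative only, not the TRL implementation; the name and reduction are assumptions.

```python
import torch
import torch.nn.functional as F


def pairwise_reward_loss(
    rewards_chosen: torch.Tensor,
    rewards_rejected: torch.Tensor,
    center_rewards_coefficient: float | None = None,
) -> torch.Tensor:
    # Bradley-Terry pairwise loss on the reward margin
    loss = -F.logsigmoid(rewards_chosen - rewards_rejected).mean()
    if center_rewards_coefficient is not None:
        # Auxiliary centering term: penalizing (r_chosen + r_rejected)^2
        # incentivizes rewards that are mean-zero on average
        loss = loss + center_rewards_coefficient * torch.mean(
            (rewards_chosen + rewards_rejected) ** 2
        )
    return loss
```

With the recommended `center_rewards_coefficient: 0.01`, the penalty stays small relative to the ranking loss, so it stabilizes the reward scale without distorting preferences.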
From c6ae5c43cbd900e72222f1515adfb081a3500b6a Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 4 Sep 2025 03:25:09 +0700 Subject: [PATCH 037/115] fix: chat template jinja file not being loaded during inference (#3112) * fix: chat template jinja file not being loaded during inference * fix: bot comment --- src/axolotl/cli/inference.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/src/axolotl/cli/inference.py b/src/axolotl/cli/inference.py index 06b64292f..debe57167 100644 --- a/src/axolotl/cli/inference.py +++ b/src/axolotl/cli/inference.py @@ -14,10 +14,7 @@ from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer from axolotl.cli.args import InferenceCliArgs from axolotl.cli.config import load_cfg from axolotl.cli.utils import load_model_and_tokenizer -from axolotl.utils.chat_templates import ( - get_chat_template, - get_chat_template_from_config, -) +from axolotl.utils.chat_templates import get_chat_template_from_config from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger @@ -64,7 +61,9 @@ def do_inference( importlib.import_module("axolotl.prompters"), prompter ) elif cfg.chat_template: - chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer) + chat_template_str = get_chat_template_from_config( + cfg, ds_cfg=None, tokenizer=tokenizer + ) elif cfg.datasets[0].type == "chat_template": chat_template_str = get_chat_template_from_config( cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer @@ -159,7 +158,13 @@ def do_inference_gradio( importlib.import_module("axolotl.prompters"), prompter ) elif cfg.chat_template: - chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer) + chat_template_str = get_chat_template_from_config( + cfg, ds_cfg=None, tokenizer=tokenizer + ) + elif cfg.datasets[0].type == "chat_template": + chat_template_str = get_chat_template_from_config( + cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer + ) model = model.to(cfg.device, dtype=cfg.torch_dtype) From 1d32278755108c47eafbb6dca95c62e807771351 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 5 Sep 2025 22:00:54 +0700 Subject: [PATCH 038/115] feat: upgrade transformers to v4.56.1 (#3127) * feat: upgrade transformers to v4.56 * fix handling of CP/SP now that position_ids are default even for unpacked sequences * feat: monkeypatch list_repo_templates * fix: apply patch for tests only * see if updated main works at least * fix: update to patch release and remove monkeypatch * remove fsdp2 eval patch --------- Co-authored-by: Wing Lian --- requirements.txt | 2 +- src/axolotl/loaders/patch_manager.py | 8 +------ .../transformers/trainer_loss_calc.py | 24 +------------------ .../utils/ctx_managers/sequence_parallel.py | 4 ++-- tests/monkeypatch/test_trainer_loss_calc.py | 2 -- 5 files changed, 5 insertions(+), 35 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9e3dbbca4..1292a179a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,7 +13,7 @@ packaging==23.2 huggingface_hub>=0.33.0 peft>=0.17.0 -transformers==4.55.4 +transformers==4.56.1 tokenizers>=0.21.1 accelerate==1.10.0 datasets==4.0.0 diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 94b307a62..044c278a3 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -80,13 +80,7 @@ class PatchManager: patch_maybe_log_save_evaluate, ) - patch_fsdp2 = ( - self.cfg.torch_compile - and self.cfg.fsdp_config - and 
self.cfg.fsdp_version == 2 - ) - - patch_evaluation_loop(patch_fsdp2) + patch_evaluation_loop() patch_maybe_log_save_evaluate() def apply_post_model_load_patches(self, model: PreTrainedModel): diff --git a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py index 012c699fa..c9b968d71 100644 --- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py +++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py @@ -28,15 +28,6 @@ PATCHED_EVAL_CODE = { "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()', } -ORIGINAL_FSDP2_CODE = """ - model.eval() -""" - -PATCHED_FSDP2_CODE = """ - if hasattr(model, "eval") and callable(model.eval): - self.model.eval() -""" - ORIGINAL_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).mean().item()" PATCHED_MAYBE_CODE = "tr_loss_scalar = self._nested_gather(tr_loss).nanmean().item()" @@ -46,13 +37,7 @@ def check_evaluation_loop_is_patchable() -> bool: return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values()) -def check_evaluation_loop_is_fsdp2_patchable() -> bool: - evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop) - evaluation_loop_source, _ = detab_code(evaluation_loop_source) - return ORIGINAL_FSDP2_CODE in evaluation_loop_source - - -def patch_evaluation_loop(patch_fsdp2: bool): +def patch_evaluation_loop(): """Patch the evaluation_loop method.""" # Check if already patched if hasattr(Trainer, "_original_evaluation_loop"): @@ -75,13 +60,6 @@ def patch_evaluation_loop(patch_fsdp2: bool): ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"] ) - # Apply FSDP2 eval guard patch if needed - if patch_fsdp2 and ORIGINAL_FSDP2_CODE in evaluation_loop_source: - evaluation_loop_source = evaluation_loop_source.replace( - ORIGINAL_FSDP2_CODE, PATCHED_FSDP2_CODE - ) - LOG.info("Applied FSDP2 eval guard patch to evaluation_loop") - # Rename the function to avoid conflicts evaluation_loop_source = evaluation_loop_source.replace( "def evaluation_loop(", diff --git a/src/axolotl/utils/ctx_managers/sequence_parallel.py b/src/axolotl/utils/ctx_managers/sequence_parallel.py index 1ec91ae2a..78b3d1cae 100644 --- a/src/axolotl/utils/ctx_managers/sequence_parallel.py +++ b/src/axolotl/utils/ctx_managers/sequence_parallel.py @@ -48,10 +48,10 @@ def apply_sequence_parallelism( - The original sequence length before padding. - The number of padding tokens added. """ - original_seq_len = batch["input_ids"].size(1) + batch_size, original_seq_len = batch["input_ids"].shape # Update ring attention params if needed - if batch.get("position_ids") is not None: + if batch.get("position_ids") is not None and batch_size == 1: update_ring_attn_params(position_ids=batch["position_ids"]) else: # If position_ids aren't already in the batch, create them diff --git a/tests/monkeypatch/test_trainer_loss_calc.py b/tests/monkeypatch/test_trainer_loss_calc.py index de3e92621..c72cb621b 100644 --- a/tests/monkeypatch/test_trainer_loss_calc.py +++ b/tests/monkeypatch/test_trainer_loss_calc.py @@ -3,7 +3,6 @@ import unittest from axolotl.monkeypatch.transformers.trainer_loss_calc import ( - check_evaluation_loop_is_fsdp2_patchable, check_evaluation_loop_is_patchable, check_maybe_log_save_evaluate_is_patchable, ) @@ -20,7 +19,6 @@ class TestTrainerLossCalc(unittest.TestCase): the patched code changes upstream. 
""" assert check_evaluation_loop_is_patchable() - assert check_evaluation_loop_is_fsdp2_patchable() assert check_maybe_log_save_evaluate_is_patchable() From bf00f29f3a51b66221d3321c7de1c981c137db12 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Sun, 7 Sep 2025 10:33:20 -0400 Subject: [PATCH 039/115] chore: update pre-commit hooks (#3137) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 53e49d747..92ddc7f41 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.11 + rev: v0.12.12 hooks: - id: ruff args: [--fix] From 8fd9221f134da88773d62a8eb1e6f5f068ad5d8c Mon Sep 17 00:00:00 2001 From: Seungduk Kim Date: Sun, 7 Sep 2025 23:49:10 +0900 Subject: [PATCH 040/115] Add `ipo` as an `rl` type that shares DPODataset config (#3128) * Add `ipo` as an `rl` type that shares DPODataset config * chore: lint --------- Co-authored-by: Wing Lian --- src/axolotl/utils/config/__init__.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py index 2b6ef8d98..f40fe6687 100644 --- a/src/axolotl/utils/config/__init__.py +++ b/src/axolotl/utils/config/__init__.py @@ -273,7 +273,9 @@ def validate_config( # Convert datasets to proper format if needed if cfg.get("datasets"): for idx, ds_cfg in enumerate(cfg["datasets"]): - if cfg.get("rl") in ["dpo", "simpo"] and not isinstance(ds_cfg, DPODataset): + if cfg.get("rl") in ["dpo", "ipo", "simpo"] and not isinstance( + ds_cfg, DPODataset + ): cfg["datasets"][idx] = DPODataset(**ds_cfg) elif cfg.get("rl") == "kto" and not isinstance(ds_cfg, KTODataset): cfg["datasets"][idx] = KTODataset(**dict(ds_cfg)) From b5d4c7ff542d2eda635f9230e96f64373ecd5418 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Sun, 7 Sep 2025 11:01:03 -0400 Subject: [PATCH 041/115] allow 1% deviation for codecov (#3138) [skip ci] --- codecov.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/codecov.yml b/codecov.yml index 28921f9be..fa3ad3073 100644 --- a/codecov.yml +++ b/codecov.yml @@ -12,7 +12,7 @@ coverage: default: # basic target: auto - threshold: 0% + threshold: 1% base: auto # advanced branches: null @@ -27,7 +27,7 @@ coverage: default: # basic target: auto - threshold: 0% + threshold: 1% base: auto # advanced branches: null From 9640338d37d0398cd3c0c0ab6e629b6dd9dcd5d3 Mon Sep 17 00:00:00 2001 From: salman Date: Tue, 9 Sep 2025 15:50:21 +0100 Subject: [PATCH 042/115] Default `include_tkps` to true (#3134) * default true * force e2e * causal trainer only * fix eval loggin [skip-ci] * revert setup.py * force tests * guarding * guarding * fix test case * use evaluate [skip-e2e] * use evaluate [skip-e2e] * kick off ci * fixing * reverting --- src/axolotl/core/builders/base.py | 7 ------- src/axolotl/core/builders/causal.py | 7 +++++++ src/axolotl/core/trainers/base.py | 4 ++-- src/axolotl/utils/callbacks/tokens_per_second.py | 16 +++++++++------- src/axolotl/utils/schemas/config.py | 4 ++-- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py index bee291fa2..1ec818004 100644 --- 
a/src/axolotl/core/builders/base.py +++ b/src/axolotl/core/builders/base.py @@ -36,7 +36,6 @@ from axolotl.utils.callbacks import ( SaveModelOnFirstStepCallback, ) from axolotl.utils.callbacks.profiler import PytorchProfilerCallback -from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback from axolotl.utils.distributed import build_parallelism_config from axolotl.utils.schemas.enums import CustomSupportedOptimizers @@ -145,12 +144,6 @@ class TrainerBuilderBase(abc.ABC): profiler_steps_start=self.cfg.profiler_steps_start, ) ) - if self.cfg.include_tkps: - callbacks.append( - TokensPerSecondCallback( - self.cfg.tensor_parallel_size, self.cfg.context_parallel_size - ) - ) return callbacks diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py index 057d0ab5c..ee6383d47 100644 --- a/src/axolotl/core/builders/causal.py +++ b/src/axolotl/core/builders/causal.py @@ -39,6 +39,7 @@ from axolotl.utils.collators import ( MambaDataCollator, V2BatchSamplerDataCollatorForSeq2Seq, ) +from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator from axolotl.utils.import_helper import get_cls_from_module_str from axolotl.utils.logging import get_logger @@ -71,6 +72,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase): if self.cfg.qat: callbacks.append(QATCallback(self.cfg.qat)) + if self.cfg.include_tkps: + callbacks.append( + TokensPerSecondCallback( + self.cfg.tensor_parallel_size, self.cfg.context_parallel_size + ) + ) return callbacks def get_post_trainer_create_callbacks(self, trainer): diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index 06eef445b..d7555261f 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -342,10 +342,10 @@ class AxolotlTrainer( inputs_key = "labels" if "labels" in inputs else "input_ids" if hasattr(self.state, "num_tokens"): self.state.num_tokens = ( - self.state.num_tokens + (inputs[inputs_key] != -100).sum() + self.state.num_tokens + (inputs[inputs_key] != -100).sum().cpu() ) else: - self.state.num_tokens = (inputs[inputs_key] != -100).sum() + self.state.num_tokens = (inputs[inputs_key] != -100).sum().cpu() if self.args.orpo_alpha: return self.orpo_compute_loss( diff --git a/src/axolotl/utils/callbacks/tokens_per_second.py b/src/axolotl/utils/callbacks/tokens_per_second.py index 85bcd5041..ead129240 100644 --- a/src/axolotl/utils/callbacks/tokens_per_second.py +++ b/src/axolotl/utils/callbacks/tokens_per_second.py @@ -43,11 +43,12 @@ class TokensPerSecondCallback(TrainerCallback): control: TrainerControl, **kwargs, ): # pylint: disable=unused-argument - step_time = time.perf_counter() - self.start_time - num_tokens_per_device = state.num_tokens.clone() - # non data parallel groups have duplicated tokens, so we avoid double-counting - num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size - state.last_tokens_per_second = num_tokens_per_device / step_time + if hasattr(state, "num_tokens"): + step_time = time.perf_counter() - self.start_time + num_tokens_per_device = state.num_tokens.clone() + # non data parallel groups have duplicated tokens, so we avoid double-counting + num_tokens_per_device = num_tokens_per_device / self.non_data_parallel_size + state.last_tokens_per_second = num_tokens_per_device / step_time def on_log( self, @@ -58,5 +59,6 @@ class TokensPerSecondCallback(TrainerCallback): **kwargs, ): # pylint: disable=unused-argument # 
after logging, clear the running metrics - state.last_tokens_per_second.zero_() - state.num_tokens = 0 + if hasattr(state, "last_tokens_per_second"): + state.last_tokens_per_second.zero_() + state.num_tokens = torch.zeros(1) diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 32d7b68e7..e4c1fdf29 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -855,9 +855,9 @@ class AxolotlInputConfig( }, ) include_tkps: bool | None = Field( - default=None, + default=True, json_schema_extra={ - "description": "bool of whether to report tokens per second during training by measuring throughput of non-padding tokens." + "description": "bool of whether to report tokens per second per-gpu during training by measuring throughput of non-padding tokens." }, ) neftune_noise_alpha: float | None = Field( From 79103b01ca1c914103d888d88fdb903e29840d4f Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 10 Sep 2025 09:01:02 +0700 Subject: [PATCH 043/115] Feat: add seedoss (#3104) [skip ci] * feat: add seedoss cce * feat: add seedoss config and docs * fix: shouldn't have target modules with target linear * feat: add vram numbers * fix: hf link * fix: name * fix: support multipack seedoss * fix: merge error * feat: update seedoss instructions for transformers release --- examples/seed-oss/README.md | 54 ++++++++++++++++++ examples/seed-oss/seed-oss-36b-qlora.yaml | 56 +++++++++++++++++++ .../integrations/cut_cross_entropy/README.md | 3 + src/axolotl/monkeypatch/multipack.py | 1 + 4 files changed, 114 insertions(+) create mode 100644 examples/seed-oss/README.md create mode 100644 examples/seed-oss/seed-oss-36b-qlora.yaml diff --git a/examples/seed-oss/README.md b/examples/seed-oss/README.md new file mode 100644 index 000000000..5610c1316 --- /dev/null +++ b/examples/seed-oss/README.md @@ -0,0 +1,54 @@ +# Finetune ByteDance's Seed-OSS with Axolotl + +[Seed-OSS](https://huggingface.co/collections/ByteDance-Seed/seed-oss-68a609f4201e788db05b5dcd) are a series of 36B parameter open source models trained by ByteDance's Seed Team. + +This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. + +## Getting started + +1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Seed-OSS is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html). + + Here is an example of how to install from main for pip: + +```bash +# Ensure you have Pytorch installed (Pytorch 2.6.0 min) +git clone https://github.com/axolotl-ai-cloud/axolotl.git +cd axolotl + +pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja +pip3 install --no-build-isolation -e '.[flash-attn]' + +# Install Cut Cross Entropy +python scripts/cutcrossentropy_install.py | sh +``` + +2. Run the finetuning example: + +```bash +axolotl train examples/seed-oss/seed-oss-36b-qlora.yaml +``` + +This config uses about 27.7 GiB VRAM. + +Let us know how it goes. Happy finetuning! 🚀 + +### TIPS + +- For inference, the official Seed Team recommends `top_p=0.95` and `temperature=1.1`. +- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. +- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). +- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). 
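+
+For instance, a single record in this format looks like the following (an illustrative sketch; your own fields and content will differ):
+
+```json
+{"messages": [{"role": "user", "content": "What color is the sun?"}, {"role": "assistant", "content": "The sun is yellow."}]}
+```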
+ +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Related Resources + +- [ByteDance Seed Website](https://seed.bytedance.com/) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/seed-oss/seed-oss-36b-qlora.yaml b/examples/seed-oss/seed-oss-36b-qlora.yaml new file mode 100644 index 000000000..00e7cf3eb --- /dev/null +++ b/examples/seed-oss/seed-oss-36b-qlora.yaml @@ -0,0 +1,56 @@ +base_model: ByteDance-Seed/Seed-OSS-36B-Instruct + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index a64bdd054..393412f64 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -34,6 +34,7 @@ plugins: - arcee - cohere - cohere2 +- deepseek_v3 - gemma - gemma2 - gemma3 @@ -42,6 +43,7 @@ plugins: - gemma3n_text - glm - glm4 +- glm4_moe - gpt_oss - granite - granitemoe @@ -64,6 +66,7 @@ plugins: - qwen3 - qwen3_moe - smollm3 +- seed_oss - voxtral ## Citation diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index e4f9ca2be..cbc546877 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -38,6 +38,7 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [ "smollm3", "gpt_oss", "arcee", + "seed_oss", ] From b71482cec5beb118efdd2bc466589e9a6eb64e77 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 10 Sep 2025 09:03:30 +0700 Subject: [PATCH 044/115] Feat: add hunyuan v1 (#3016) * feat: add hunyuan cce support * feat: update cce docs * feat: add multipack support for granite and hunyuan * feat: add hunyuan docs and example config * feat: update readme instructions to include CCE installation * fix: chat template log appearing despite tokenizer already having template * feat: add vram usage * fix: remove duplicate cce install * fix: use latest commit of PR in case rebased/pushed * Revert "fix: use latest commit of PR in case rebased/pushed" This reverts commit 8b60aa00de5511c09a6cad64ae1cf476e6a5eddc. 
* feat: update doc as upstream merged

---
 examples/devstral/README.md                  |  8 +-
 examples/hunyuan/README.md                   | 85 ++++++++++++++++++++
 examples/hunyuan/hunyuan-v1-dense-qlora.yaml | 64 +++++++++++++++
 examples/magistral/README.md                 |  8 +-
 examples/voxtral/README.md                   |  3 +
 src/axolotl/loaders/tokenizer.py             |  2 +-
 src/axolotl/monkeypatch/multipack.py         |  4 +
 7 files changed, 171 insertions(+), 3 deletions(-)
 create mode 100644 examples/hunyuan/README.md
 create mode 100644 examples/hunyuan/hunyuan-v1-dense-qlora.yaml

diff --git a/examples/devstral/README.md b/examples/devstral/README.md
index b53635a8f..ae0860662 100644
--- a/examples/devstral/README.md
+++ b/examples/devstral/README.md
@@ -20,7 +20,13 @@ pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

-2. Run the finetuning example:
+2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
+
+```bash
+python scripts/cutcrossentropy_install.py | sh
+```
+
+3. Run the finetuning example:

 ```bash
 axolotl train examples/devstral/devstral-small-qlora.yml
diff --git a/examples/hunyuan/README.md b/examples/hunyuan/README.md
new file mode 100644
index 000000000..96c6bbcfa
--- /dev/null
+++ b/examples/hunyuan/README.md
@@ -0,0 +1,85 @@
+# Finetune HunYuan with Axolotl
+
+Tencent released HunYuan, a family of open-source models at 0.5B, 1.8B, 4B, and 7B parameter scales, in both Pre-trained and Instruct variants. The models can be found on [HuggingFace](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7). This guide shows how to fine-tune them with Axolotl with multi-turn conversations and proper masking.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as HunYuan is only on nightly, or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+
+    Here is an example of how to install from main for pip:
+
+```bash
+# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'
+
+# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+python scripts/cutcrossentropy_install.py | sh
+```
+
+2. Run the finetuning example:
+
+```bash
+axolotl train examples/hunyuan/hunyuan-v1-dense-qlora.yaml
+```
+
+This config uses about 4.7 GB VRAM.
+
+Let us know how it goes. Happy finetuning! 🚀
+
+### Dataset
+
+HunYuan Instruct models can choose to enter a slow think or fast think pattern. For best performance when fine-tuning their Instruct models, your dataset should be adjusted to match these patterns.
+
+```python
+# fast think pattern (the `/no_think` prefix disables thinking)
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "/no_think What color is the sun?" },
+    {"role": "assistant", "content": "<think>\n\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}
+]
+
+# slow think pattern
+messages = [
+    {"role": "system", "content": "You are a helpful assistant."},
+    {"role": "user", "content": "What color is the sun?" },
+    {"role": "assistant", "content": "<think>\nThe user is asking about the color of the sun. I need to ...\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}
+]
+```
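+
+For example, with the `chat_template` dataset type used in the config below, a single JSONL record following this pattern might look like (an illustrative sketch; your data will differ):
+
+```json
+{"messages": [{"role": "user", "content": "/no_think What color is the sun?"}, {"role": "assistant", "content": "<think>\n\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}]}
+```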
+
+### TIPS
+
+- For inference, the official Tencent team recommends the following generation settings:
+
+```json
+{
+    "do_sample": true,
+    "top_k": 20,
+    "top_p": 0.8,
+    "repetition_penalty": 1.05,
+    "temperature": 0.7
+}
+```
+
+- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
+- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+
+## Optimization Guides
+
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
+
+## Related Resources
+
+- [Tencent HunYuan Blog](https://hunyuan.tencent.com/)
+- [Axolotl Docs](https://docs.axolotl.ai)
+- [Axolotl Website](https://axolotl.ai)
+- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
+- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
diff --git a/examples/hunyuan/hunyuan-v1-dense-qlora.yaml b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml
new file mode 100644
index 000000000..a94345a61
--- /dev/null
+++ b/examples/hunyuan/hunyuan-v1-dense-qlora.yaml
@@ -0,0 +1,64 @@
+base_model: tencent/Hunyuan-0.5B-Instruct
+
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+load_in_8bit: false
+load_in_4bit: true
+
+datasets:
+  - path: fozziethebeat/alpaca_messages_2k_test
+    type: chat_template
+
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/lora-out
+
+adapter: qlora
+lora_model_dir:
+
+sequence_len: 2048
+sample_packing: true
+
+lora_r: 32
+lora_alpha: 16
+lora_dropout: 0.05
+lora_target_linear: true
+lora_target_modules:
+  - gate_proj
+  - down_proj
+  - up_proj
+  - q_proj
+  - v_proj
+  - k_proj
+  - o_proj
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 1
+optimizer: adamw_bnb_8bit
+lr_scheduler: cosine
+learning_rate: 0.0002
+
+bf16: auto
+tf32: false
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+flash_attention: true
+
+warmup_ratio: 0.1
+evals_per_epoch: 1
+saves_per_epoch: 1
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/magistral/README.md b/examples/magistral/README.md
index 48ce712da..f4f278208 100644
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -18,7 +18,13 @@ pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
 pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
 ```

-2. Run the finetuning example:
+2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage
+
+```bash
+python scripts/cutcrossentropy_install.py | sh
+```
+
+3. 
Run the finetuning example: ```bash axolotl train examples/magistral/magistral-small-qlora.yaml diff --git a/examples/voxtral/README.md b/examples/voxtral/README.md index f31e9cfd0..984af4ddb 100644 --- a/examples/voxtral/README.md +++ b/examples/voxtral/README.md @@ -22,6 +22,9 @@ pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0' # audio pip3 install librosa==0.11.0 pip3 install 'mistral_common[audio]==1.8.3' + +# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy +python scripts/cutcrossentropy_install.py | sh ``` 3. Run the finetuning example: diff --git a/src/axolotl/loaders/tokenizer.py b/src/axolotl/loaders/tokenizer.py index dcc255938..37b66ac83 100644 --- a/src/axolotl/loaders/tokenizer.py +++ b/src/axolotl/loaders/tokenizer.py @@ -296,7 +296,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer: ) tokenizer.chat_template = chat_template_string - else: + elif getattr(tokenizer, "chat_template", None) is None: LOG.info( "No Chat template selected. Consider adding a chat template for easier inference." ) diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index cbc546877..a32430d9f 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -36,6 +36,10 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [ "glm", "glm4", "smollm3", + "granite", + "granitemoe", + "hunyuan_v1_dense", + "hunyuan_v1_moe", "gpt_oss", "arcee", "seed_oss", From 1b53c49e1a8408ff209ae72a480681d18f7f8c81 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Wed, 10 Sep 2025 20:27:00 -0400 Subject: [PATCH 045/115] text diffusion training plugin (#3067) * diffusion training plugin * cleanup * nits * fixes + improvements * add back in reinit_weights (clobbered?); masking / pretrain fixes * nits * cleanup; tests draft * sample generation, tests fixes * fixes * nits * add inference support; add auto-mask token support * nits * nits * progress * simplify logging * lint * prefix args with diffusion_ * coderabbito * tests fix * nit * nits * cleanup + nits * nits * fix SFT sample gen * fixes * fix * comments * comments * lint * reward model lora fix * cleanup; fix pretraining_dataset case * gradio inference * update cfgs * update cfgs * train, generation parity, cleanup * fix * simplify * test * test fix --- .pre-commit-config.yaml | 2 +- .../colab-axolotl-example.ipynb | 2 +- examples/llama-3/diffusion/pretrain-1b.yaml | 56 +++ examples/llama-3/diffusion/sft-1b.yaml | 59 +++ src/axolotl/cli/inference.py | 63 ++- src/axolotl/cli/utils/diffusion.py | 375 ++++++++++++++++ src/axolotl/core/builders/causal.py | 15 +- src/axolotl/core/trainers/base.py | 46 +- src/axolotl/integrations/base.py | 2 +- src/axolotl/integrations/config.py | 2 +- src/axolotl/integrations/diffusion/README.md | 154 +++++++ .../integrations/diffusion/__init__.py | 19 + src/axolotl/integrations/diffusion/args.py | 95 ++++ .../integrations/diffusion/callbacks.py | 174 ++++++++ .../integrations/diffusion/generation.py | 409 ++++++++++++++++++ src/axolotl/integrations/diffusion/plugin.py | 41 ++ src/axolotl/integrations/diffusion/trainer.py | 301 +++++++++++++ src/axolotl/integrations/diffusion/utils.py | 159 +++++++ src/axolotl/loaders/adapter.py | 12 +- src/axolotl/loaders/model.py | 118 +++-- src/axolotl/loaders/patch_manager.py | 5 +- src/axolotl/monkeypatch/accelerate/fsdp2.py | 8 +- .../monkeypatch/attention/flex_attn.py | 3 +- src/axolotl/monkeypatch/deepspeed_utils.py | 1 + src/axolotl/utils/config/__init__.py | 2 +- 
src/axolotl/utils/data/__init__.py | 8 +- src/axolotl/utils/data/sft.py | 2 +- src/axolotl/utils/environment.py | 2 - src/axolotl/utils/schemas/config.py | 6 + src/axolotl/utils/schemas/validation.py | 1 - tests/e2e/test_diffusion.py | 139 ++++++ tests/integrations/test_diffusion.py | 274 ++++++++++++ tests/integrations/test_diffusion_callback.py | 92 ++++ tests/test_streaming.py | 4 +- 34 files changed, 2550 insertions(+), 101 deletions(-) create mode 100644 examples/llama-3/diffusion/pretrain-1b.yaml create mode 100644 examples/llama-3/diffusion/sft-1b.yaml create mode 100644 src/axolotl/cli/utils/diffusion.py create mode 100644 src/axolotl/integrations/diffusion/README.md create mode 100644 src/axolotl/integrations/diffusion/__init__.py create mode 100644 src/axolotl/integrations/diffusion/args.py create mode 100644 src/axolotl/integrations/diffusion/callbacks.py create mode 100644 src/axolotl/integrations/diffusion/generation.py create mode 100644 src/axolotl/integrations/diffusion/plugin.py create mode 100644 src/axolotl/integrations/diffusion/trainer.py create mode 100644 src/axolotl/integrations/diffusion/utils.py create mode 100644 tests/e2e/test_diffusion.py create mode 100644 tests/integrations/test_diffusion.py create mode 100644 tests/integrations/test_diffusion_callback.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92ddc7f41..9c80898ff 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ repos: rev: v0.12.12 hooks: - id: ruff - args: [--fix] + args: [--fix, --select, I] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.17.1 diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index b780a1c48..0e6ba984e 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -176,8 +176,8 @@ } ], "source": [ - "from axolotl.utils.dict import DictDefault\n", "from axolotl.cli.config import load_cfg\n", + "from axolotl.utils.dict import DictDefault\n", "\n", "# Axolotl provides full control and transparency over model and training configuration\n", "config = DictDefault(\n", diff --git a/examples/llama-3/diffusion/pretrain-1b.yaml b/examples/llama-3/diffusion/pretrain-1b.yaml new file mode 100644 index 000000000..8d05e4c60 --- /dev/null +++ b/examples/llama-3/diffusion/pretrain-1b.yaml @@ -0,0 +1,56 @@ +base_model: meta-llama/Llama-3.2-1B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +pretraining_dataset: + - path: wikitext + name: wikitext-103-raw-v1 + type: completion + field: text + +plugins: + - axolotl.integrations.diffusion.DiffusionPlugin + +diffusion: + noise_schedule: cosine + min_mask_ratio: 0.15 + max_mask_ratio: 0.85 + num_diffusion_steps: 128 + eps: 5e-4 + importance_weighting: true + mask_token_id: 128002 + generate_samples: true + generation_interval: 250 + +output_dir: ./outputs/model-out + +sequence_len: 512 +sample_packing: true + +gradient_accumulation_steps: 8 +micro_batch_size: 4 +max_steps: 10000 +warmup_ratio: 0.1 + +optimizer: adamw_8bit +lr_scheduler: cosine +learning_rate: 3e-4 +sdp_attention: true + +bf16: auto +tf32: true + +logging_steps: 1 +save_strategy: steps +save_steps: 1000 + +special_tokens: + pad_token: "<|end_of_text|>" + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +# save_first_step: true # uncomment this to validate checkpoint saving works with 
your config diff --git a/examples/llama-3/diffusion/sft-1b.yaml b/examples/llama-3/diffusion/sft-1b.yaml new file mode 100644 index 000000000..f3b29a809 --- /dev/null +++ b/examples/llama-3/diffusion/sft-1b.yaml @@ -0,0 +1,59 @@ +base_model: meta-llama/Llama-3.2-1B +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +datasets: + - path: teknium/GPT4-LLM-Cleaned + type: alpaca +val_set_size: 0.05 + +plugins: + - axolotl.integrations.diffusion.DiffusionPlugin + +diffusion: + noise_schedule: cosine + min_mask_ratio: 0.1 + max_mask_ratio: 0.9 + num_diffusion_steps: 128 + eps: 1e-3 + importance_weighting: true + mask_token_id: 128002 + generate_samples: true + generation_interval: 250 + +output_dir: ./outputs/model-out + +sequence_len: 512 +sample_packing: true +eval_sample_packing: true + +gradient_accumulation_steps: 4 +micro_batch_size: 4 +num_epochs: 1 +warmup_steps: 0.1 + +optimizer: adamw_8bit +lr_scheduler: cosine +learning_rate: 1e-5 + +bf16: auto +tf32: true + +gradient_checkpointing: true +resume_from_checkpoint: +sdp_attention: true + +logging_steps: 1 +save_strategy: best +eval_strategy: epoch + +special_tokens: + pad_token: "<|end_of_text|>" + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/src/axolotl/cli/inference.py b/src/axolotl/cli/inference.py index debe57167..30d407713 100644 --- a/src/axolotl/cli/inference.py +++ b/src/axolotl/cli/inference.py @@ -14,6 +14,13 @@ from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer from axolotl.cli.args import InferenceCliArgs from axolotl.cli.config import load_cfg from axolotl.cli.utils import load_model_and_tokenizer +from axolotl.cli.utils.diffusion import ( + diffusion_inference, + launch_diffusion_gradio_ui, + render_html, + run_diffusion, +) +from axolotl.integrations.base import PluginManager from axolotl.utils.chat_templates import get_chat_template_from_config from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger @@ -29,6 +36,7 @@ def get_multi_line_input() -> str: Possibly multi-line, possibly empty stdin input as a string. """ print("Give me an instruction (Ctrl + D to submit): ") + print("=" * 80) instruction = "" for line in sys.stdin: @@ -43,9 +51,9 @@ def do_inference( cli_args: InferenceCliArgs, ): """ - Runs inference on the command line in a loop. User input is accepted, a chat template - is (optionally) applied, and the model specified in the `axolotl` config is used to - generate completions according to a default generation config. + Runs inference on the command line in a loop. User input is accepted, a chat + template is (optionally) applied, and the model specified in the `axolotl` config is + used to generate completions according to a default generation config. Args: cfg: Dictionary mapping `axolotl` config keys to values. 
@@ -64,16 +72,28 @@ def do_inference( chat_template_str = get_chat_template_from_config( cfg, ds_cfg=None, tokenizer=tokenizer ) - elif cfg.datasets[0].type == "chat_template": + elif cfg.datasets and cfg.datasets[0].type == "chat_template": chat_template_str = get_chat_template_from_config( cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer ) model = model.to(cfg.device, dtype=cfg.torch_dtype) + # Detect diffusion mode + plugin_manager = PluginManager.get_instance() + is_diffusion = any( + plugin.__class__.__name__ == "DiffusionPlugin" + for plugin in plugin_manager.plugins.values() + ) + + if is_diffusion: + print("=" * 80) + print("Commands:") + print(":complete N -> completion mode with N tokens (default 64)") + print(":mask R -> random masking with ratio R (0.0–1.0)") + while True: print("=" * 80) - # support for multiline inputs instruction = get_multi_line_input() if not instruction: return @@ -103,9 +123,19 @@ def do_inference( else: batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) - print("=" * 40) + print("=" * 80) model.eval() with torch.no_grad(): + if is_diffusion: + diffusion_inference( + model=model, + tokenizer=tokenizer, + cfg=cfg, + prompt=prompt, + chat_template_str=chat_template_str, + ) + continue + generation_config = GenerationConfig( repetition_penalty=1.1, max_new_tokens=1024, @@ -128,7 +158,7 @@ def do_inference( generation_config=generation_config, streamer=streamer, ) - print("=" * 40) + print("=" * 80) print(tokenizer.decode(generated["sequences"].cpu().tolist()[0])) @@ -161,13 +191,30 @@ def do_inference_gradio( chat_template_str = get_chat_template_from_config( cfg, ds_cfg=None, tokenizer=tokenizer ) - elif cfg.datasets[0].type == "chat_template": + elif cfg.datasets and cfg.datasets[0].type == "chat_template": chat_template_str = get_chat_template_from_config( cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer ) model = model.to(cfg.device, dtype=cfg.torch_dtype) + # Detect diffusion mode + plugin_manager = PluginManager.get_instance() + is_diffusion = any( + plugin.__class__.__name__ == "DiffusionPlugin" + for plugin in plugin_manager.plugins.values() + ) + + if is_diffusion: + launch_diffusion_gradio_ui( + model=model, + tokenizer=tokenizer, + cfg=cfg, + prompter_module=prompter_module, + chat_template_str=chat_template_str, + ) + return + def generate(instruction): if not instruction: return diff --git a/src/axolotl/cli/utils/diffusion.py b/src/axolotl/cli/utils/diffusion.py new file mode 100644 index 000000000..f83d9077b --- /dev/null +++ b/src/axolotl/cli/utils/diffusion.py @@ -0,0 +1,375 @@ +"""Helpers for diffusion-mode inference in CLI and Gradio.""" + +from __future__ import annotations + +import gradio as gr +import torch +from colorama import Fore, Style + +from axolotl.integrations.diffusion import generate, resolve_mask_token_id +from axolotl.utils.dict import DictDefault + + +def diffusion_inference( + model, + tokenizer, + cfg, + prompt: str, + chat_template_str: str | None = None, +): + """Diffusion inference helper method.""" + mode = "random" + completion_tokens = 0 + target_mask_ratio = None + mode, completion_tokens, target_mask_ratio, cleaned = _parse_commands(prompt) + + if cleaned: + prompt = cleaned + + info = run_diffusion( + model=model, + tokenizer=tokenizer, + cfg=cfg, + prompt=prompt, + chat_template_str=chat_template_str, + mode=mode, + target_mask_ratio=target_mask_ratio, + completion_tokens=completion_tokens, + ) + masked_text = info["masked_text"] + mask_ratio = info["mask_ratio"] + generated_ids = 
info["generated_ids"] + masked_positions = info["masked_positions"] + orig_ids = info["orig_ids"] + + # Display with masked preview and colored diff + if masked_text is not None and mask_ratio is not None: + print(f"Masked ({mask_ratio:.1%}):\n{masked_text}\n") + if generated_ids is not None: + # Compute per-token style + styles: list[str] = [] + for i, tid in enumerate(generated_ids): + if i in masked_positions: + if i < len(orig_ids) and tid == orig_ids[i]: + styles.append("green") # correct fill + elif i < len(orig_ids): + styles.append("red") # incorrect fill + else: + styles.append("normal") # appended + else: + same = i < len(orig_ids) and tid == orig_ids[i] + styles.append("dim" if same else "normal") + + # Group contiguous spans by style + styled_spans: list[tuple[str, int, int]] = [] + if generated_ids: + current_style = styles[0] + start = 0 + for i in range(1, len(generated_ids)): + s = styles[i] + if s != current_style: + styled_spans.append((current_style, start, i)) + current_style, start = s, i + styled_spans.append((current_style, start, len(generated_ids))) + + out_parts = [] + for style_name, a, b in styled_spans: + chunk_text = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False) + if style_name == "green": + out_parts.append(Fore.GREEN + chunk_text + Style.RESET_ALL) + elif style_name == "red": + out_parts.append(Fore.RED + chunk_text + Style.RESET_ALL) + else: + if style_name == "dim": + out_parts.append(Style.DIM + chunk_text + Style.RESET_ALL) + else: + out_parts.append(chunk_text) + print("Generated:\n" + "".join(out_parts)) + else: + print("Generated:\n(no output)") + + +def _parse_commands(text: str): + """ + Parse leading diffusion commands. + + Supported at start of input (can be chained): + :complete N -> completion mode with N tokens (default 64) + :mask R -> random masking with ratio R in [0, 1] + """ + tokens = text.strip().split() + i = 0 + mode = "random" + completion_tokens = 0 + target_mask_ratio = None + consumed = 0 + while i < len(tokens) and tokens[i].startswith(":"): + cmd = tokens[i] + i += 1 + consumed = i + if cmd == ":complete": + mode = "completion" + if i < len(tokens): + try: + completion_tokens = int(tokens[i]) + i += 1 + consumed = i + except Exception: + completion_tokens = 64 + else: + completion_tokens = 64 + elif cmd == ":mask": + mode = "random" + if i < len(tokens): + try: + target_mask_ratio = float(tokens[i]) + i += 1 + consumed = i + except Exception: + target_mask_ratio = None + else: + i -= 1 + consumed = i + break + + cleaned = " ".join(tokens[consumed:]) + + return mode, completion_tokens, target_mask_ratio, cleaned + + +def run_diffusion( + *, + model, + tokenizer, + cfg: DictDefault, + prompt: str, + chat_template_str: str | None, + mode: str = "random", + target_mask_ratio: float | None = None, + completion_tokens: int = 0, +): + """Run a single diffusion generation and return a structured result dict.""" + if chat_template_str: + batch = tokenizer.apply_chat_template( + [{"role": "user", "content": prompt}], + return_tensors="pt", + add_special_tokens=True, + add_generation_prompt=True, + chat_template=chat_template_str, + tokenize=True, + return_dict=True, + ) + else: + batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True) + + mask_token_id = resolve_mask_token_id(tokenizer, cfg, allow_add=False) + + seq = batch["input_ids"].to(cfg.device) + gen_mode = "completion" if mode == "completion" else "random" + comp_tokens = int(completion_tokens) if gen_mode == "completion" else 0 + + result = 
generate(
+        model,
+        tokenizer,
+        original_sequence=seq[:1],
+        num_diffusion_steps=cfg.diffusion.num_diffusion_steps,
+        temperature=cfg.diffusion.generation_temperature,
+        mask_token_id=int(mask_token_id),
+        mode=gen_mode,  # type: ignore[arg-type]
+        completion_tokens=comp_tokens,
+        target_mask_ratio=target_mask_ratio,
+    )
+
+    masked_text = result.get("masked") if isinstance(result, dict) else None
+    mask_ratio = result.get("mask_ratio") if isinstance(result, dict) else None
+    generated_ids = result.get("generated_ids") if isinstance(result, dict) else None
+    masked_positions = (
+        set(result.get("masked_positions") or []) if isinstance(result, dict) else set()
+    )
+    orig_ids = seq[0].detach().cpu().tolist()
+
+    return {
+        "masked_text": masked_text,
+        "mask_ratio": mask_ratio,
+        "generated_ids": generated_ids,
+        "masked_positions": masked_positions,
+        "orig_ids": orig_ids,
+    }
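+
+
+# Illustrative usage of the helper above (assumes a loaded `model`/`tokenizer`
+# and a `cfg` with a populated `diffusion` block; see the call sites in
+# axolotl.cli.inference):
+#
+#     info = run_diffusion(
+#         model=model, tokenizer=tokenizer, cfg=cfg,
+#         prompt="The quick brown fox", chat_template_str=None,
+#         mode="random", target_mask_ratio=0.4,
+#     )
+#     html = render_html(
+#         generated_ids=info["generated_ids"], orig_ids=info["orig_ids"],
+#         masked_positions=info["masked_positions"], tokenizer=tokenizer,
+#     )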
+
+
+def render_html(
+    *,
+    generated_ids: list[int] | None,
+    orig_ids: list[int],
+    masked_positions: set[int],
+    tokenizer,
+) -> str:
+    """Render HTML visualizing diffusion outputs."""
+    if not generated_ids:
+        return "<pre>Generated:\n(no output)</pre>"
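+
+    # Color code the diff (mirrors the terminal coloring in diffusion_inference):
+    # green marks a masked position filled with the original token, red marks an
+    # incorrect fill, and dimmed styling marks unmasked tokens left unchanged.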
" + + def _style_for(i: int, tid: int) -> str: + if i in masked_positions: + if i < len(orig_ids) and tid == orig_ids[i]: + return "green" + if i < len(orig_ids): + return "red" + return "normal" + same = i < len(orig_ids) and tid == orig_ids[i] + return "dim" if same else "normal" + + # Group contiguous spans by style to reduce HTML size + spans: list[tuple[str, int, int]] = [] + if generated_ids: + cur = _style_for(0, generated_ids[0]) + start = 0 + for i in range(1, len(generated_ids)): + s = _style_for(i, generated_ids[i]) + if s != cur: + spans.append((cur, start, i)) + cur, start = s, i + spans.append((cur, start, len(generated_ids))) + + html_parts = [] + for style_name, a, b in spans: + txt = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False) + if style_name == "green": + html_parts.append(f'{txt}') + elif style_name == "red": + html_parts.append(f'{txt}') + elif style_name == "dim": + html_parts.append(f'{txt}') + else: + html_parts.append(txt) + + legend = ( + '
' + 'correct, ' + 'incorrect, ' + 'unchanged' + "
" + ) + + return ( + legend + + '
+
+
+def launch_diffusion_gradio_ui(
+    *,
+    model,
+    tokenizer,
+    cfg: DictDefault,
+    prompter_module=None,
+    chat_template_str: str | None = None,
+):
+    """Build and launch a simple Gradio UI for diffusion inference."""
+    with gr.Blocks(
+        title=cfg.get("gradio_title", "Axolotl Diffusion Interface")
+    ) as demo:
+        gr.Markdown(
+            """
+            ## Axolotl Diffusion Inference
+            - Mode "Random" masks tokens at a target ratio and fills them.
+            - Mode "Completion" appends N masked tokens at the end and fills them.
+            """
+        )
+
+        with gr.Row():
+            mode = gr.Radio(
+                choices=["random", "completion"],
+                value="random",
+                label="Mode",
+            )
+            mask_ratio = gr.Slider(
+                minimum=0.0,
+                maximum=1.0,
+                step=0.05,
+                value=0.4,
+                label="Mask ratio (random mode)",
+                interactive=True,
+            )
+            completion_tokens = gr.Number(
+                value=64,
+                precision=0,
+                label="Completion tokens (completion mode)",
+                interactive=True,
+                visible=False,
+            )
+
+        instruction = gr.Textbox(label="Instruction", lines=6)
+        run_btn = gr.Button("Generate")
+
+        masked_preview = gr.Textbox(label="Masked preview", lines=6)
+        html_out = gr.HTML(label="Generated")
+
+        def _toggle_controls(selected_mode: str):
+            return (
+                gr.update(visible=(selected_mode == "random")),
+                gr.update(visible=(selected_mode == "completion")),
+            )
+
+        mode.change(
+            _toggle_controls,
+            inputs=[mode],
+            outputs=[mask_ratio, completion_tokens],
+        )
+
+        def _gen(instruction_text: str, selected_mode: str, mratio: float, ctoks: int):
+            if not instruction_text:
+                return "", "<pre>Generated:\n(no output)</pre>"
+            if prompter_module:
+                prompt: str = next(
+                    prompter_module().build_prompt(
+                        instruction=instruction_text.strip("\n")
+                    )
+                )
+            else:
+                prompt = instruction_text.strip()
+
+            info = run_diffusion(
+                model=model,
+                tokenizer=tokenizer,
+                cfg=cfg,
+                prompt=prompt,
+                chat_template_str=chat_template_str,
+                mode=selected_mode,
+                target_mask_ratio=mratio if selected_mode == "random" else None,
+                completion_tokens=int(ctoks) if selected_mode == "completion" else 0,
+            )
+
+            masked_text = info.get("masked_text")
+            mask_ratio_val = info.get("mask_ratio")
+            generated_ids = info.get("generated_ids")
+            masked_positions = info.get("masked_positions") or set()
+            orig_ids = info.get("orig_ids") or []
+
+            preview = (
+                f"Masked ({mask_ratio_val:.1%}):\n{masked_text}"
+                if masked_text is not None and mask_ratio_val is not None
+                else ""
+            )
+            html = render_html(
+                generated_ids=generated_ids,
+                orig_ids=orig_ids,
+                masked_positions=masked_positions,
+                tokenizer=tokenizer,
+            )
+            return preview, html
+
+        run_btn.click(
+            _gen,
+            inputs=[instruction, mode, mask_ratio, completion_tokens],
+            outputs=[masked_preview, html_out],
+        )
+
+    demo.queue().launch(
+        show_api=False,
+        share=cfg.get("gradio_share", True),
+        server_name=cfg.get("gradio_server_name", "127.0.0.1"),
+        server_port=cfg.get("gradio_server_port", None),
+    )
diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py
index ee6383d47..f7f350e1a 100644
--- a/src/axolotl/core/builders/causal.py
+++ b/src/axolotl/core/builders/causal.py
@@ -7,7 +7,11 @@ from pathlib import Path
 from typing import Type, Union
 
 import transformers
-from transformers import DataCollatorWithFlattening, EarlyStoppingCallback
+from transformers import (
+    DataCollatorWithFlattening,
+    EarlyStoppingCallback,
+    Trainer,
+)
 from trl.trainer.utils import RewardDataCollatorWithPadding
 
 from axolotl.core.builders.base import TrainerBuilderBase
@@ -23,15 +27,16 @@ from axolotl.monkeypatch.relora import ReLoRACallback
 from axolotl.processing_strategies import get_processing_strategy
 from axolotl.utils import is_comet_available, is_mlflow_available
 from axolotl.utils.callbacks import (
+    LossWatchDogCallback,
+    SaveBetterTransformerModelCallback,
     bench_eval_callback_factory,
     causal_lm_bench_eval_callback_factory,
     colab_inference_post_train_callback,
     log_prediction_callback_factory,
-    LossWatchDogCallback,
-    SaveBetterTransformerModelCallback,
 )
 from axolotl.utils.callbacks.lisa import lisa_callback_factory
 from axolotl.utils.callbacks.qat import QATCallback
+from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
 from axolotl.utils.chat_templates import get_chat_template_from_config
 from axolotl.utils.collators import (
     BatchSamplerDataCollatorForSeq2Seq,
@@ -39,7 +44,6 @@ from axolotl.utils.collators import (
     MambaDataCollator,
     V2BatchSamplerDataCollatorForSeq2Seq,
 )
-from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
 from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
 from axolotl.utils.import_helper import get_cls_from_module_str
 from axolotl.utils.logging import get_logger
@@ -391,10 +395,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             **data_collator_kwargs,
         )
         sig = inspect.signature(trainer_cls)
-        if "processing_class" in sig.parameters:
+        if "processing_class" in sig.parameters or issubclass(trainer_cls, Trainer):
             trainer_kwargs["processing_class"] = self.tokenizer
         elif "tokenizer" in sig.parameters:
             trainer_kwargs["tokenizer"] = self.tokenizer
+
         if (
             trainer_cls not in 
[AxolotlRewardTrainer, AxolotlPRMTrainer] and self.cfg.datasets is not None diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index d7555261f..3427a0b86 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -49,6 +49,13 @@ from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths LOG = get_logger(__name__) +REDUCTION_FNS = { + "mean": torch.mean, + "min": torch.min, + "max": torch.max, + "sum": torch.sum, +} + class AxolotlTrainer( PackingMixin, @@ -89,7 +96,9 @@ class AxolotlTrainer( super().__init__(*_args, **kwargs) self.train_data_collator = self.data_collator - self._stored_metrics = defaultdict(lambda: defaultdict(list)) + self._stored_metrics = defaultdict( + lambda: defaultdict(lambda: {"values": [], "reduction": "mean"}) + ) if self.args.orpo_alpha: self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none") @@ -585,9 +594,17 @@ class AxolotlTrainer( """ # logs either has 'loss' or 'eval_loss' train_eval = "train" if "loss" in logs else "eval" - # Add averaged stored metrics to logs - for key, metrics in self._stored_metrics[train_eval].items(): - logs[key] = torch.tensor(metrics).mean().item() + + for key, metric_data in self._stored_metrics[train_eval].items(): + values = torch.tensor(metric_data["values"]) # type: ignore[arg-type] + reduction_type = metric_data["reduction"] + + fn = REDUCTION_FNS.get(reduction_type) + if fn is None: + raise NotImplementedError( + "Metric reduction must be one of [mean, min, max, sum]" + ) + logs[key] = round(fn(values).item(), 4) if is_main_process(): # Add memory usage @@ -611,10 +628,27 @@ class AxolotlTrainer( return super().log(logs, start_time) def store_metrics( - self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train" + self, + metrics: dict[str, float] | dict[str, tuple[int | float, str]], + train_eval: Literal["train", "eval"] = "train", + reduction: Literal["mean", "min", "max", "sum"] = "mean", ) -> None: + """ + Store metrics with specified reduction type. + + Args: + metrics: Dictionary of metric names to values, or metric names to (value, + reduction_type) tuples. + train_eval: Whether this is for training or evaluation. + """ for key, value in metrics.items(): - self._stored_metrics[train_eval][key].append(value) + if isinstance(value, tuple): + value, _reduction = value # type: ignore[assignment] + else: + value, _reduction = value, reduction + + self._stored_metrics[train_eval][key]["values"].append(value) + self._stored_metrics[train_eval][key]["reduction"] = _reduction def _save_checkpoint(self, model, trial, **kwargs): # make sure the checkpoint dir exists, since trainer is flakey diff --git a/src/axolotl/integrations/base.py b/src/axolotl/integrations/base.py index 8edee18a3..c66bc01c6 100644 --- a/src/axolotl/integrations/base.py +++ b/src/axolotl/integrations/base.py @@ -142,7 +142,7 @@ class BasePlugin: model: The loaded model. """ - def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None: + def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None: """Returns a custom class for the trainer. 
Args: diff --git a/src/axolotl/integrations/config.py b/src/axolotl/integrations/config.py index 2217b2819..8ae8aab39 100644 --- a/src/axolotl/integrations/config.py +++ b/src/axolotl/integrations/config.py @@ -20,8 +20,8 @@ from typing import Any, Dict, List, Type from axolotl.utils.schemas.config import ( AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase, + AxolotlInputConfig as AxolotlInputConfigBase, ) -from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase def merge_input_args(): diff --git a/src/axolotl/integrations/diffusion/README.md b/src/axolotl/integrations/diffusion/README.md new file mode 100644 index 000000000..c27f33de1 --- /dev/null +++ b/src/axolotl/integrations/diffusion/README.md @@ -0,0 +1,154 @@ +# Diffusion LM Training Plugin for Axolotl + +This plugin enables diffusion language model training using an approach inspired by +LLaDA (Large Language Diffusion Models) within Axolotl. + +## Overview + +LLaDA is a diffusion-based approach to language model training that uses: +- **Random token masking** during training instead of next-token prediction +- **Bidirectional attention** to allow the model to attend to the full context +- **Importance weighting** based on masking probabilities for stable training + +This approach can lead to more robust language models with better understanding of +bidirectional context. + +## Installation + +The plugin is included with Axolotl. See our +[installation docs](https://docs.axolotl.ai/docs/installation.html). + +## Quickstart + +Train with an example config (Llama‑3.2 1B): + - Pretrain: `axolotl train examples/llama-3/diffusion-3.2-1b-pretrain.yaml` + - SFT: `axolotl train examples/llama-3/diffusion-3.2-1b-sft.yaml` + +### Basic Configuration + +You can also modify your existing configs to enable / customize diffusion training. + +Add the following to your Axolotl config: + +```yaml +# Enable diffusion LM training plugin +plugins: + - axolotl.integrations.diffusion.DiffusionPlugin +``` + +And, configure the nested `diffusion` block (defaults shown): + +```yaml +diffusion: + noise_schedule: linear # or "cosine" + min_mask_ratio: 0.1 + max_mask_ratio: 0.9 + num_diffusion_steps: 128 + eps: 1e-3 + importance_weighting: true + + # Mask token (training auto-adds if missing, avoid pad/eos) + mask_token_str: "<|diffusion_mask|>" + # Or use an existing special token id (e.g., 128002 for Llama-3.x) + # mask_token_id: 128002 + + # Sample generation during training (optional) + generate_samples: true + generation_interval: 100 + num_generation_samples: 3 + generation_steps: 128 + generation_temperature: 0.0 + generation_max_length: 100 +``` + +## Supported Models + +Any models that support 4D attention masks should work out of the box. If not, please +create an [issue](https://github.com/axolotl-ai-cloud/axolotl/issues) or open a +[PR](https://github.com/axolotl-ai-cloud/axolotl/compare)! + +## How It Works + +### Random Masking +During training, tokens are randomly masked: +- Sample timestep `t` uniformly from [0, 1] +- Calculate masking probability: `p = (1 - eps) * t + eps` +- Randomly mask tokens with probability `p` + +### Diffusion Loss + +Loss is computed only on masked tokens with (optional) importance weighting: + +```python +loss = sum(cross_entropy(pred, target) / p_mask) / total_tokens +``` + +## Sample Generation + +When `diffusion.generate_samples: true`, the plugin generates samples during training: + +``` +Sample 1: + Original (45 tokens): The quick brown fox jumps over the lazy dog... 
+ Masked (18/45 tokens, 40.0%): The [MASK] [MASK] fox [MASK] over [MASK] lazy [MASK]... + Generated: The quick brown fox jumps over the lazy dog... +``` + +Samples are logged to console and wandb (if enabled). + +## Inference + +Diffusion inference is integrated into the standard Axolotl CLI. Use the same config +you trained with and run: + +``` +axolotl inference path/to/your-config.yaml +``` + +Optionally, pass `--gradio` to use a simple web interface. + +Interactive controls (prefix the prompt with commands): +- `:complete N` → completion mode with N new masked tokens appended (default 64) +- `:mask R` → random masking mode with target mask ratio R in [0.0, 1.0] + +Example session: + +``` +================================================================================ +Commands: +:complete N -> completion mode with N tokens (default 64) +:mask R -> random masking with ratio R (0.0–1.0) +================================================================================ +Give me an instruction (Ctrl + D to submit): + +:mask 0.4 The quick brown fox jumps over the lazy dog + +Masked (40.0%): +The [MASK] brown [MASK] jumps over the [MASK] dog + +Generated: +The quick brown fox jumps over the loud dog +``` + +## Metrics and Monitoring + +The plugin adds (or modifies) several metrics to track diffusion training: + +- `train/loss`: Weighted diffusion loss +- `train/accuracy`: Accuracy on masked tokens +- `train/mask_ratio`: Average fraction of tokens masked +- `train/num_masked_tokens`: Number of tokens masked +- `train/avg_p_mask`: Average masking probability +- `train/ce_loss`: Unweighted cross-entropy loss +- `train/importance_weight_avg`: Average importance weight + +## Limitations + +- No flash attention support +- No RL training support + +## References + +- [LLaDA Paper](https://arxiv.org/abs/2404.10406) +- [Axolotl Documentation](https://docs.axolotl.ai/) +- [API reference for plugin](https://docs.axolotl.ai/docs/api/integrations.diffusion.args.html#axolotl.integrations.diffusion.args) diff --git a/src/axolotl/integrations/diffusion/__init__.py b/src/axolotl/integrations/diffusion/__init__.py new file mode 100644 index 000000000..9e38cc5c1 --- /dev/null +++ b/src/axolotl/integrations/diffusion/__init__.py @@ -0,0 +1,19 @@ +"""Diffusion LM training plugin init.""" + +from .args import DiffusionArgs, DiffusionConfig +from .callbacks import DiffusionGenerationCallback +from .generation import generate +from .plugin import DiffusionPlugin +from .trainer import DiffusionTrainer +from .utils import create_bidirectional_attention_mask, resolve_mask_token_id + +__all__ = [ + "DiffusionArgs", + "DiffusionPlugin", + "DiffusionTrainer", + "generate", + "resolve_mask_token_id", + "create_bidirectional_attention_mask", + "DiffusionGenerationCallback", + "DiffusionConfig", +] diff --git a/src/axolotl/integrations/diffusion/args.py b/src/axolotl/integrations/diffusion/args.py new file mode 100644 index 000000000..4f5bfe499 --- /dev/null +++ b/src/axolotl/integrations/diffusion/args.py @@ -0,0 +1,95 @@ +"""Config args for diffusion LM training (nested under `diffusion:`).""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, Field, model_validator + + +class DiffusionConfig(BaseModel): + """Nested diffusion configuration available under the `diffusion` key.""" + + # Noise schedule config + noise_schedule: Literal["linear", "cosine"] = Field( + default="linear", description="Type of noise schedule for diffusion training" + ) + min_mask_ratio: float = 
Field( + default=0.1, + ge=0.0, + le=1.0, + description="Minimum masking ratio for diffusion noise schedule", + ) + max_mask_ratio: float = Field( + default=0.9, + ge=0.0, + le=1.0, + description="Maximum masking ratio for diffusion noise schedule", + ) + num_diffusion_steps: int = Field( + default=128, ge=1, description="Number of diffusion timesteps" + ) + eps: float = Field( + default=1e-3, + ge=0.0, + le=1.0, + description="Epsilon value for minimum masking probability in forward process", + ) + + # Training config + importance_weighting: bool = Field( + default=True, + description="Apply importance weighting to loss based on masking probability", + ) + mask_token_id: int | None = Field( + default=None, + description=( + "Token ID to use for masking. Unset by default; can use one of the " + "tokenizer's special tokens here." + ), + ) + mask_token_str: str | None = Field( + default=None, + description=( + "Token string to use as a mask. If `mask_token_id` is invalid or unset, " + "this token will be ensured to exist as an additional special token and " + "used. If absent, a default '<|diffusion_mask|>' will be added." + ), + ) + + # Sample generation config + generate_samples: bool = Field( + default=True, description="Enable sample generation during training" + ) + generation_interval: int = Field( + default=100, ge=1, description="Generate samples every N steps" + ) + num_generation_samples: int = Field( + default=3, ge=1, description="Number of samples to generate each time" + ) + generation_steps: int = Field( + default=128, ge=1, description="Number of diffusion steps for generation" + ) + generation_temperature: float = Field( + default=0.0, + ge=0.0, + description="Temperature for generation sampling (0.0 = deterministic)", + ) + generation_max_length: int = Field( + default=100, ge=1, description="Maximum sequence length for generation" + ) + + @model_validator(mode="after") + def _validate_mask_ratios(self) -> "DiffusionConfig": + if self.min_mask_ratio > self.max_mask_ratio: + raise ValueError("min_mask_ratio must be ≤ max_mask_ratio") + return self + + +class DiffusionArgs(BaseModel): + """Plugin entry that exposes the nested `diffusion` block to the core config.""" + + diffusion: DiffusionConfig = Field( + default_factory=DiffusionConfig, + description="Diffusion training configuration. 
Only nested block is supported.", + ) diff --git a/src/axolotl/integrations/diffusion/callbacks.py b/src/axolotl/integrations/diffusion/callbacks.py new file mode 100644 index 000000000..18a64023b --- /dev/null +++ b/src/axolotl/integrations/diffusion/callbacks.py @@ -0,0 +1,174 @@ +"""Callbacks for diffusion training.""" + +import logging +import sys + +import wandb +from colorama import Fore, Style +from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState +from transformers.training_args import TrainingArguments + +from .generation import generate_samples + +# Simpler logger for more readable sample generation +logger = logging.getLogger(__name__) +if not logger.handlers: + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(logging.Formatter("%(message)s")) + logger.addHandler(handler) + logger.propagate = False +logger.setLevel(logging.INFO) + + +class DiffusionGenerationCallback(TrainerCallback): + """Callback for generating samples during diffusion training.""" + + def __init__(self, trainer): + self.trainer = trainer + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """Generate samples at specified intervals.""" + if ( + state.global_step > 0 + and state.global_step % self.trainer.cfg.diffusion.generation_interval == 0 + ): + if not self.trainer.state.is_world_process_zero: + return + + # Use eval dataloader if available, otherwise use train dataloader + dataloader = None + try: + if getattr(self.trainer, "eval_dataset", None) is not None: + dataloader = self.trainer.get_eval_dataloader() + except Exception: + dataloader = None + if dataloader is None: + dataloader = self.trainer.get_train_dataloader() + + # Generate samples + diffusion_cfg = self.trainer.cfg.diffusion + samples = generate_samples( + model=self.trainer.model, + tokenizer=self.trainer.processing_class, + dataloader=dataloader, + num_generation_samples=diffusion_cfg.num_generation_samples, + max_length=diffusion_cfg.generation_max_length, + num_diffusion_steps=diffusion_cfg.generation_steps, + temperature=diffusion_cfg.generation_temperature, + mask_token_id=diffusion_cfg.mask_token_id, + ) + + # Log samples + self._log_samples(samples, state.global_step) + + def _log_samples(self, samples: list, step: int): + """Log generated samples.""" + if not samples: + return + + logger.info("=" * 60) + logger.info("GENERATED SAMPLES") + logger.info("=" * 60) + + for i, sample_data in enumerate(samples, 1): + original = sample_data["original"] + masked = sample_data["masked"] + generated = sample_data["generated"] + mask_ratio = sample_data["mask_ratio"] + masked_tokens = sample_data["masked_tokens"] + total_tokens = sample_data["total_tokens"] + + logger.info(f"\nSample {i}:") + logger.info(f"\tOriginal ({total_tokens} tokens): {original}") + logger.info( + f"\tMasked ({masked_tokens}/{total_tokens} tokens, " + f"{mask_ratio:.1%}): {masked}" + ) + + try: + gen_ids = sample_data.get("generated_ids") + orig_ids = sample_data.get("orig_ids") + masked_positions = set(sample_data.get("masked_positions") or []) + if isinstance(gen_ids, list) and isinstance(orig_ids, list): + styles: list[str] = [] + for i, tid in enumerate(gen_ids): + if i in masked_positions: + if i < len(orig_ids) and tid == orig_ids[i]: + styles.append("green") + elif i < len(orig_ids): + styles.append("red") + else: + styles.append("normal") + else: + same = i < len(orig_ids) and tid == orig_ids[i] + styles.append("dim" if same else "normal") + 
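+ # Merge consecutive tokens that share a style into (style, start, end)
+ # spans, then decode and color each span as one chunk rather than
+ # decoding token-by-token.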
+ spans: list[tuple[str, int, int]] = [] + if gen_ids: + cur = styles[0] + start = 0 + for i in range(1, len(gen_ids)): + s = styles[i] + if s != cur: + spans.append((cur, start, i)) + cur, start = s, i + spans.append((cur, start, len(gen_ids))) + + parts = [] + for style_name, a, b in spans: + chunk_text = self.trainer.processing_class.decode( + gen_ids[a:b], skip_special_tokens=False + ) + if style_name == "green": + parts.append(Fore.GREEN + chunk_text + Style.RESET_ALL) + elif style_name == "red": + parts.append(Fore.RED + chunk_text + Style.RESET_ALL) + else: + if style_name == "dim": + parts.append(Style.DIM + chunk_text + Style.RESET_ALL) + else: + parts.append(chunk_text) + logger.info("\tGenerated:\n%s", "".join(parts)) + else: + logger.info(f"\tGenerated: {generated}") + except Exception: + logger.info(f"\tGenerated: {generated}") + + logger.info("=" * 60) + + if self.trainer.cfg.use_wandb: + if wandb.run is not None: + wandb.log( + { + "generated_samples": wandb.Table( + columns=[ + "step", + "original", + "masked", + "generated", + "mask_ratio", + "masked_tokens", + "total_tokens", + ], + data=[ + [ + step, + sample["original"], + sample["masked"], + sample["generated"], + f"{sample['mask_ratio']:.1%}", + sample["masked_tokens"], + sample["total_tokens"], + ] + for sample in samples + ], + ) + }, + step=step, + ) diff --git a/src/axolotl/integrations/diffusion/generation.py b/src/axolotl/integrations/diffusion/generation.py new file mode 100644 index 000000000..49e3cdfae --- /dev/null +++ b/src/axolotl/integrations/diffusion/generation.py @@ -0,0 +1,409 @@ +"""Sample generation utilities for diffusion training.""" + +import re +from typing import Any, List, Literal, Optional + +import torch + +from axolotl.utils.logging import get_logger + +from .utils import create_bidirectional_attention_mask + +LOG = get_logger(__name__) + + +def generate_samples( + model: torch.nn.Module, + tokenizer: Any, + dataloader: Optional[Any] = None, + num_generation_samples: int = 3, + max_length: int = 100, + num_diffusion_steps: int = 128, + temperature: float = 0.0, + mask_token_id: int = 32000, + mode: Literal["random", "completion"] = "random", + completion_tokens: int = 0, + target_mask_ratio: Optional[float] = None, +) -> List[dict]: + """ + Generate text samples using the diffusion model by randomly masking sequences from + the given dataset and running the reverse diffusion process. 
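+ Called by `DiffusionGenerationCallback` every `generation_interval` steps
+ during training.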
+ 
+ Args:
+ model: The wrapped or unwrapped model
+ tokenizer: Tokenizer for encoding/decoding
+ dataloader: Validation dataloader (for sampling sequences)
+ num_generation_samples: Number of samples to generate
+ max_length: Maximum length of sequences to use
+ num_diffusion_steps: Number of diffusion steps for generation
+ temperature: Temperature for sampling (0.0 = deterministic)
+ mask_token_id: Token ID used for masking
+ mode: "random" masks tokens in place; "completion" appends masked tokens
+ completion_tokens: Number of mask tokens to append in completion mode
+ target_mask_ratio: Fixed mask ratio for random masking (sampled if None)
+
+ Returns:
+ List of dictionaries with original text, masked text, generated text,
+ and token-level diagnostics
+ """
+ if dataloader is None:
+ LOG.warning("No validation dataloader provided, cannot generate samples")
+ return []
+
+ unwrapped_model = model.module if hasattr(model, "module") else model
+ training = unwrapped_model.training
+ unwrapped_model.eval()
+
+ # Resolve device robustly (some modules don't expose `.device`)
+ device = getattr(unwrapped_model, "device", None)
+ if device is None:
+ try:
+ device = next(unwrapped_model.parameters()).device
+ except StopIteration:
+ device = torch.device("cpu")
+ generations = []
+
+ # Sample sequences from validation dataset
+ sampled_sequences = _sample_sequences_from_dataloader(
+ dataloader, num_generation_samples, max_length, device
+ )
+ LOG.info(f"Sampled {len(sampled_sequences)} sequences from validation dataset")
+
+ # Generate samples using reverse diffusion process
+ with torch.no_grad():
+ for sample in sampled_sequences:
+ if isinstance(sample, dict):
+ original_sequence = sample.get("input_ids")
+ labels_seq = sample.get("labels")
+ attn_seq = sample.get("attention_mask")
+ else:
+ original_sequence = sample
+ labels_seq = None
+ attn_seq = None
+ generation_result = generate(
+ unwrapped_model,
+ tokenizer,
+ original_sequence,
+ num_diffusion_steps,
+ temperature,
+ mask_token_id,
+ mode=mode,
+ completion_tokens=completion_tokens,
+ target_mask_ratio=target_mask_ratio,
+ labels=labels_seq,
+ attention_mask=attn_seq,
+ )
+ generations.append(generation_result)
+
+ # Restore prior training state
+ if training:
+ unwrapped_model.train()
+ else:
+ unwrapped_model.eval()
+
+ return generations
+
+
+def _sample_sequences_from_dataloader(
+ dataloader: Any, num_samples: int, max_length: int, device: torch.device
+) -> List[Any]:
+ """Sample sequences from validation dataloader."""
+ sampled_sequences: list[dict[str, torch.Tensor] | torch.Tensor] = []
+ sample_count = 0
+
+ # Skip a random number of batches (we could be more clever about this)
+ skip_batches = torch.randint(0, 10, (1,)).item()
+ batch_count = 0
+
+ for batch in dataloader:
+ # Skip some batches for variety
+ if batch_count < skip_batches:
+ batch_count += 1
+ continue
+
+ if sample_count >= num_samples:
+ break
+
+ batch_count += 1
+ input_ids = batch["input_ids"]
+ attention_mask = batch.get("attention_mask")
+ labels = batch.get("labels")
+
+ # Randomly sample from sequences in this batch
+ batch_indices = torch.randperm(input_ids.size(0)).tolist()
+
+ for i in batch_indices:
+ if sample_count >= num_samples:
+ break
+
+ # Get actual sequence length (non-padded)
+ if attention_mask is not None:
+ seq_len = attention_mask[i].sum().item()
+ else:
+ seq_len = input_ids.size(1)
+
+ if seq_len < 10:
+ continue
+
+ # Determine truncation length
+ max_total = min(seq_len, max_length)
+ if labels is not None:
+ labels_i = labels[i][:seq_len]
+ answer_mask = labels_i != -100
+ if not answer_mask.any():
+ # No answer tokens; skip for SFT masking
+ continue
+ first_ans_idx = int(
+ torch.nonzero(answer_mask, as_tuple=False)[0].item()
+ )
+ prompt_len = first_ans_idx
+ 
if prompt_len >= max_total: + # Prompt alone reaches cap; cannot include any answer + continue + remaining_answer = int(answer_mask[prompt_len:].sum().item()) + allowed_answer = max_total - prompt_len + take_answer = min(remaining_answer, allowed_answer) + if take_answer <= 0: + continue + actual_length = prompt_len + take_answer + else: + actual_length = max_total + + # Extract the (possibly truncated) sequence + sequence = input_ids[i][:actual_length].unsqueeze(0).to(device) + attn_seq = ( + attention_mask[i][:actual_length].unsqueeze(0).to(device) + if attention_mask is not None + else None + ) + if labels is not None: + labels_seq = labels[i][:actual_length].unsqueeze(0).to(device) + sampled_sequences.append( + { + "input_ids": sequence, + "labels": labels_seq, + "attention_mask": attn_seq, + } + ) + else: + if attn_seq is not None: + sampled_sequences.append( + {"input_ids": sequence, "attention_mask": attn_seq} + ) + else: + sampled_sequences.append(sequence) + sample_count += 1 + + return sampled_sequences + + +def generate( + model: torch.nn.Module, + tokenizer: Any, + original_sequence: torch.Tensor, + num_diffusion_steps: int, + temperature: float, + mask_token_id: int, + *, + mode: Literal["random", "completion"] = "random", + completion_tokens: int = 0, + target_mask_ratio: Optional[float] = None, + labels: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, +) -> dict: + """Generate a single sample using reverse diffusion.""" + # Get original text for comparison + original_text = tokenizer.decode( + original_sequence[0].cpu(), skip_special_tokens=True + ) + + # Build masked sequence + if ( + labels is not None + and labels.numel() > 0 + and (labels == -100).any() + and (labels != -100).any() + ): + # SFT case: completely mask all answer tokens (labels != -100) + total_tokens = original_sequence.size(1) + masked_indices = (labels != -100).to(dtype=torch.bool) + masked_sequence = original_sequence.clone() + masked_sequence[masked_indices] = mask_token_id + masked_tokens = int(masked_indices.sum().item()) + mask_ratio = masked_tokens / max(int(total_tokens), 1) + elif mode == "completion" and completion_tokens > 0: + # Append mask tokens to the right for completion + total_tokens = original_sequence.size(1) + int(completion_tokens) + masked_indices = torch.zeros( + 1, total_tokens, dtype=torch.bool, device=original_sequence.device + ) + masked_indices[0, -int(completion_tokens) :] = True + + append = torch.full( + (1, int(completion_tokens)), mask_token_id, device=original_sequence.device + ) + masked_sequence = torch.cat([original_sequence, append], dim=1) + masked_tokens = int(completion_tokens) + mask_ratio = masked_tokens / total_tokens + else: + # Apply random masking with optional fixed ratio + total_tokens = original_sequence.size(1) + if target_mask_ratio is None: + min_ratio, max_ratio = 0.1, 0.7 + target_mask_ratio = ( + torch.rand(1).item() * (max_ratio - min_ratio) + min_ratio + ) + target_masked_tokens = max(1, int(total_tokens * float(target_mask_ratio))) + + # Create random mask indices + mask_positions = torch.randperm(total_tokens)[:target_masked_tokens] + masked_indices = torch.zeros( + 1, total_tokens, dtype=torch.bool, device=original_sequence.device + ) + masked_indices[0, mask_positions] = True + + # Create masked sequence + masked_sequence = original_sequence.clone() + masked_sequence[masked_indices] = mask_token_id + + # Calculate actual mask ratio + masked_tokens = masked_indices.sum().item() + mask_ratio = masked_tokens / 
total_tokens + + # Get masked text for comparison + masked_text = tokenizer.decode(masked_sequence[0].cpu(), skip_special_tokens=False) + masked_text = _clean_masked_text(masked_text, tokenizer, mask_token_id) + + # Run reverse diffusion process + sequence = masked_sequence.clone() + attention_mask = create_bidirectional_attention_mask( + sequence, attention_mask, sample_packing=attention_mask is not None + ) + for step in range(num_diffusion_steps): + sequence = _diffusion_step( + model, + sequence, + step, + num_diffusion_steps, + temperature, + mask_token_id, + attention_mask, + ) + generated_text = tokenizer.decode(sequence[0].cpu(), skip_special_tokens=True) + + # Collect diagnostic info + final_ids = sequence[0].detach().cpu().tolist() + orig_ids_for_render = original_sequence[0].detach().cpu().tolist() + if masked_indices is not None: + masked_positions = ( + torch.where(masked_indices[0])[0].detach().cpu().tolist() + if masked_indices.ndim == 2 + else [] + ) + else: + masked_positions = [] + + result = { + "original": original_text, + "masked": masked_text, + "generated": generated_text, + "mask_ratio": mask_ratio, + "masked_tokens": masked_tokens, + "total_tokens": total_tokens, + "generated_ids": final_ids, + "masked_positions": masked_positions, + "orig_ids": orig_ids_for_render, + "formatted": ( + f"Original: '{original_text}' → Masked: '{masked_text}' " + f"({mask_ratio:.1%}) → Generated: '{generated_text}'" + ), + } + + return result + + +def _clean_masked_text(masked_text: str, tokenizer: Any, mask_token_id: int) -> str: + """Clean up masked text for display.""" + mask_token_repr = tokenizer.decode([mask_token_id], skip_special_tokens=False) + cleaned = masked_text.replace(mask_token_repr, "[MASK]") + + # Remove literal special token strings + if hasattr(tokenizer, "special_tokens_map"): + for token_value in tokenizer.special_tokens_map.values(): + if token_value and isinstance(token_value, str): + cleaned = cleaned.replace(token_value, "") + + # Normalize whitespace but preserve newlines + cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n") + cleaned = re.sub(r"[ \t]+", " ", cleaned) + cleaned = "\n".join(line.rstrip() for line in cleaned.split("\n")).strip() + return cleaned + + +def _diffusion_step( + model: torch.nn.Module, + sequence: torch.Tensor, + step: int, + num_diffusion_steps: int, + temperature: float, + mask_token_id: int, + attention_mask: torch.Tensor | None = None, +) -> torch.Tensor: + """Perform a single diffusion step with remasking.""" + # Only process if there are masked tokens remaining + current_mask = sequence == mask_token_id + if not current_mask.any(): + return sequence + + # Create or use provided attention mask + if attention_mask is None: + batch_size, seq_len = sequence.shape + attention_mask = torch.ones( + batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=sequence.device + ) + + # Forward pass + outputs = model(input_ids=sequence, attention_mask=attention_mask) + logits = outputs.logits + + # Only sample at currently masked positions + if current_mask.any(): + masked_logits = logits[current_mask] + + # Apply temperature scaling + if temperature > 0: + scaled_logits = masked_logits / temperature + else: + scaled_logits = masked_logits + + # Suppress mask token in outputs + scaled_logits[:, mask_token_id] = -float("inf") + + if temperature > 0: + # Add Gumbel noise for sampling + gumbel_noise = -torch.log( + -torch.log(torch.rand_like(scaled_logits, dtype=torch.float32)) + ) + gumbel_logits = scaled_logits + gumbel_noise + 
predicted_tokens = torch.argmax(gumbel_logits, dim=-1) + else: + predicted_tokens = torch.argmax(scaled_logits, dim=-1) + + # Calculate probabilities for confidence scoring + probs = torch.softmax(scaled_logits, dim=-1) + predicted_token_probs = probs[range(len(predicted_tokens)), predicted_tokens] + + # Determine how many tokens to unmask this step + remaining_masked = current_mask.sum().item() + if step == num_diffusion_steps - 1: + num_to_unmask = remaining_masked + else: + unmask_ratio = 1.0 / (num_diffusion_steps - step) + num_to_unmask = max(1, int(remaining_masked * unmask_ratio)) + + # Select highest confidence predictions to unmask + if num_to_unmask >= remaining_masked: + sequence[current_mask] = predicted_tokens + else: + _, top_indices = predicted_token_probs.topk(num_to_unmask) + mask_positions = torch.where(current_mask)[1] + positions_to_unmask = mask_positions[top_indices] + sequence[0, positions_to_unmask] = predicted_tokens[top_indices] + + return sequence diff --git a/src/axolotl/integrations/diffusion/plugin.py b/src/axolotl/integrations/diffusion/plugin.py new file mode 100644 index 000000000..c31f48b03 --- /dev/null +++ b/src/axolotl/integrations/diffusion/plugin.py @@ -0,0 +1,41 @@ +"""Diffusion LM training plugin for Axolotl.""" + +from peft import PeftModel +from transformers import PreTrainedModel + +from axolotl.integrations.base import BasePlugin +from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger + +from .trainer import DiffusionTrainer + +LOG = get_logger(__name__) + + +class DiffusionPlugin(BasePlugin): + """ + Plugin for diffusion language model training. + + This plugin enables diffusion-based training using the LLaDA approach, which uses + random masking and bidirectional attention to train language models. 
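+ Configuration lives under the nested `diffusion:` config block (see
+ `DiffusionArgs`).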
+ """ + + def __init__(self): + super().__init__() + self.cfg = None + + def get_input_args(self) -> str: + """Returns the pydantic model for LLaDA plugin arguments.""" + return "axolotl.integrations.diffusion.DiffusionArgs" + + def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel): + """Perform actions after model is loaded.""" + self.cfg = cfg + + def get_trainer_cls(self, cfg: DictDefault) -> type[DiffusionTrainer] | None: + """Return custom trainer class for diffusion training.""" + return DiffusionTrainer + + def post_trainer_create(self, cfg: DictDefault, trainer: DiffusionTrainer): + """Configure trainer after creation.""" + trainer.set_config(cfg) diff --git a/src/axolotl/integrations/diffusion/trainer.py b/src/axolotl/integrations/diffusion/trainer.py new file mode 100644 index 000000000..42b2468f4 --- /dev/null +++ b/src/axolotl/integrations/diffusion/trainer.py @@ -0,0 +1,301 @@ +"""Custom trainer for diffusion LM training.""" + +from typing import Any, Literal + +import torch +import torch.nn.functional as F +from torch import nn + +from axolotl.core.trainers.base import AxolotlTrainer +from axolotl.utils.dict import DictDefault +from axolotl.utils.logging import get_logger + +from .callbacks import DiffusionGenerationCallback +from .utils import create_bidirectional_attention_mask + +LOG = get_logger(__name__) + + +class DiffusionTrainer(AxolotlTrainer): + """Custom trainer for diffusion LM training that overrides loss computation.""" + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.cfg = None + self._special_token_ids = None + + def set_config(self, config: DictDefault): + """Set config for diffusion training.""" + self.cfg = config + self._cache_special_token_ids() + self._resolve_mask_token_id() + + token_id = int(getattr(self.cfg.diffusion, "mask_token_id", 0)) + LOG.info(f"Diffusion: using mask_token_id={token_id}") + + if getattr(config.diffusion, "generate_samples", True): + generation_callback = DiffusionGenerationCallback(self) + self.add_callback(generation_callback) + + def _resolve_mask_token_id(self) -> None: + """Ensure mask_token_id is valid for the current tokenizer.""" + from .utils import resolve_mask_token_id + + tokenizer = getattr(self, "processing_class", None) + if tokenizer is None: + return + + mid = resolve_mask_token_id( + tokenizer, + self.cfg, + allow_add=True, + model=getattr(self, "model", None), + ) + try: + self.cfg.diffusion.mask_token_id = int(mid) + except Exception: + pass + + def compute_loss( + self, + model: nn.Module, + inputs: dict[str, torch.Tensor], + return_outputs: bool = False, + num_items_in_batch: torch.Tensor | None = None, + ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]: + """Override compute_loss to use diffusion loss.""" + input_ids = inputs.get("input_ids") + attention_mask = inputs.get("attention_mask") + labels = inputs.get("labels") + + if input_ids is None: + raise ValueError("input_ids is required for diffusion training") + + loss, outputs = self._compute_diffusion_loss( + model, input_ids, attention_mask, labels + ) + + if return_outputs: + return loss, outputs + return loss + + def _cache_special_token_ids(self): + """Cache special token IDs to avoid repeated tokenizer access.""" + if self.processing_class is None: + self._special_token_ids = set() + return + + tokenizer = self.processing_class + special_tokens = set() + + if hasattr(tokenizer, "bos_token_id") and tokenizer.bos_token_id is not None: + 
special_tokens.add(tokenizer.bos_token_id) + if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None: + special_tokens.add(tokenizer.eos_token_id) + if hasattr(tokenizer, "pad_token_id") and tokenizer.pad_token_id is not None: + special_tokens.add(tokenizer.pad_token_id) + + self._special_token_ids = special_tokens + + def _forward_process( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor | None = None, + labels: torch.Tensor | None = None, + eps: float = 1e-3, + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Forward noising process. A timestep is sampled along the process, and tokens are + masked with probability determined by the configured noise schedule. + + Args: + input_ids: Input token ids [batch_size, seq_len]. + attention_mask: Attention mask [batch_size, seq_len]. + labels: Labels for SFT training [batch_size, seq_len]. + eps: Small epsilon value for minimum masking probability. + + Returns: + noisy_batch: Input with some tokens masked. + masked_indices: Boolean mask indicating which tokens were masked. + p_mask: Masking probabilities for each token [batch_size, seq_len]. + """ + batch_size, seq_len = input_ids.shape + device = input_ids.device + + # Sample random timesteps for each sample in batch + t = torch.rand(batch_size, device=device) + p_mask = (1 - eps) * t + eps # [batch_size] + p_mask = p_mask[:, None].repeat(1, seq_len) # [batch_size, seq_len] + + # Don't mask padding tokens if attention_mask is provided + if attention_mask is not None: + valid_mask = attention_mask.bool() + p_mask = p_mask * valid_mask.float() + + # Create mask to exclude special tokens + special_token_mask = torch.zeros_like(input_ids, dtype=torch.bool) + if self._special_token_ids: + for token_id in self._special_token_ids: + special_token_mask |= input_ids == token_id + + # Create random mask based on p_mask + masked_indices = torch.rand((batch_size, seq_len), device=device) < p_mask + masked_indices = masked_indices & ~special_token_mask + if attention_mask is not None: + masked_indices = masked_indices & attention_mask.bool() + + # For SFT data, only mask answer tokens + if labels is not None: + answer_mask = labels != -100 + masked_indices = masked_indices & answer_mask + + # Create masked input + mask_token_id = int(self.cfg.diffusion.mask_token_id) + mask_value = torch.full_like(input_ids, mask_token_id) + noisy_batch = torch.where(masked_indices, mask_value, input_ids) + + return noisy_batch, masked_indices, p_mask + + def _compute_diffusion_loss( + self, + model: nn.Module, + input_ids: torch.Tensor, + attention_mask: torch.Tensor | None = None, + labels: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | Any]: + """ + Compute diffusion loss. + + Args: + model: The model to compute loss for. + input_ids: Ground truth token ids [batch_size, seq_len]. + attention_mask: Attention mask [batch_size, seq_len]. + labels: Labels for SFT training [batch_size, seq_len]. + + Returns: + loss: Cross-entropy loss. + metrics: Dictionary of metrics. + """ + # Short-circuit empty sequences + if input_ids is None or input_ids.numel() == 0 or input_ids.shape[1] == 0: + zero = torch.tensor( + 0.0, + device=(input_ids.device if input_ids is not None else None), + requires_grad=True, + ) + return zero, {} + + # If an attention_mask is provided and all positions are padding for every + # sample in this batch, skip the step. 
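+ # Returning a zero tensor with requires_grad=True keeps backward() a
+ # harmless no-op for the skipped step instead of raising on a scalar
+ # without a grad_fn.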
+ if attention_mask is not None: + if attention_mask.dim() == 2 and (attention_mask.sum(dim=1) == 0).all(): + zero = torch.tensor(0.0, device=input_ids.device, requires_grad=True) + return zero, {} + + # Apply forward process + noisy_batch, masked_indices, p_mask = self._forward_process( + input_ids, attention_mask, labels, self.cfg.diffusion.eps + ) + + # Create bidirectional attention mask + bidirectional_mask = create_bidirectional_attention_mask( + input_ids, attention_mask, sample_packing=self.cfg.sample_packing + ) + + # Forward pass + outputs = model( + input_ids=noisy_batch.long(), + attention_mask=bidirectional_mask, + ) + logits = outputs.logits + + if masked_indices.sum() > 0: + valid_indices = torch.where(masked_indices) + batch_indices, seq_indices = valid_indices + + masked_logits = logits[batch_indices, seq_indices] + masked_targets = input_ids[batch_indices, seq_indices] + masked_p_mask = p_mask[batch_indices, seq_indices] + + # Compute cross-entropy loss without reduction + token_loss = F.cross_entropy( + masked_logits.float(), masked_targets, reduction="none" + ) + + if self.cfg.diffusion.importance_weighting: + masked_p_mask = masked_p_mask.float() + weighted_loss = token_loss / masked_p_mask + else: + weighted_loss = token_loss + + if labels is not None: + # For SFT data: normalize by answer token count per sample + answer_mask = labels != -100 + answer_lengths = answer_mask.sum(dim=1).float() # [batch_size] + + # Get batch indices for masked tokens + masked_batch_indices = batch_indices + + # Sum losses per sample and divide by answer length + batch_size = input_ids.shape[0] + loss_per_sample = torch.zeros(batch_size, device=input_ids.device) + for i in range(batch_size): + sample_mask = masked_batch_indices == i + if sample_mask.sum() > 0: + sample_loss = weighted_loss[sample_mask].sum() + denom = answer_lengths[i].clamp(min=1.0) + loss_per_sample[i] = sample_loss / denom + + loss = loss_per_sample.mean() + else: + # Non-SFT: when importance weighting is enabled, use unbiased estimator + # (sum(loss/p) / total_tokens). Otherwise, average over masked tokens + # for stable scaling across varying mask ratios. 
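+ # i.e. loss = sum_i(ce_i / p_mask_i) / (batch_size * seq_len); dividing
+ # by the full token count rather than the masked count is what makes
+ # the estimator unbiased under the random mask.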
+ if self.cfg.diffusion.importance_weighting: + loss = weighted_loss.sum() / ( + input_ids.shape[0] * input_ids.shape[1] + ) + else: + loss = weighted_loss.mean() + + ce_loss = token_loss.mean() + + # Compute accuracy on masked tokens + with torch.no_grad(): + pred_tokens = masked_logits.argmax(dim=-1) + accuracy = (pred_tokens == masked_targets).float().mean() + else: + loss = torch.tensor(0.0, device=input_ids.device, requires_grad=True) + accuracy = torch.tensor(0.0, device=input_ids.device) + ce_loss = torch.tensor(0.0, device=input_ids.device) + masked_p_mask = torch.tensor(1.0, device=input_ids.device) + + avg_p_mask = ( + p_mask[masked_indices].mean().item() if masked_indices.any() else 0.0 + ) + metrics = { + "loss": loss.item(), + "accuracy": accuracy.item(), + "mask_ratio": masked_indices.float().mean().item(), + "num_masked_tokens": (masked_indices.sum().item(), "sum"), + "avg_p_mask": avg_p_mask, + "ce_loss": ce_loss.item(), + } + + # If doing SFT training, log answer-specific metrics + if self.cfg.datasets is not None: + with torch.no_grad(): + answer_mask = labels != -100 + answer_lengths = answer_mask.sum(dim=1).float() # type: ignore + total_answer_tokens = answer_mask.sum().item() # type: ignore + total_tokens = labels.numel() # type: ignore + metrics["answer_ratio"] = total_answer_tokens / max(total_tokens, 1) + metrics["avg_answer_length"] = answer_lengths.mean().item() + + if self.cfg.diffusion.importance_weighting: + metrics["importance_weight_avg"] = (1.0 / masked_p_mask).mean().item() + + train_eval: Literal["train", "eval"] = "train" if model.training else "eval" + self.store_metrics(metrics, train_eval=train_eval) + + return loss, outputs diff --git a/src/axolotl/integrations/diffusion/utils.py b/src/axolotl/integrations/diffusion/utils.py new file mode 100644 index 000000000..47abf6fec --- /dev/null +++ b/src/axolotl/integrations/diffusion/utils.py @@ -0,0 +1,159 @@ +"""Shared utilities for diffusion integration.""" + +from __future__ import annotations + +from typing import Any, Optional + +import torch + +from axolotl.utils.dict import DictDefault + + +def resolve_mask_token_id( + tokenizer: Any, + cfg: DictDefault, + *, + allow_add: bool, + model: Any | None = None, + default_token: str = "<|diffusion_mask|>", +) -> int: + """Resolve mask token id. 
Training may add a new special token; inference won't.""" + # Determine vocab size if available + vocab_size = None + if tokenizer is not None: + if hasattr(tokenizer, "vocab_size") and tokenizer.vocab_size is not None: + try: + vocab_size = int(tokenizer.vocab_size) # type: ignore[arg-type] + except Exception: + vocab_size = None + elif hasattr(tokenizer, "__len__"): + try: + vocab_size = int(len(tokenizer)) + except Exception: + vocab_size = None + + # Use explicit id from config if provided + diffusion_cfg = getattr(cfg, "diffusion", None) + # Fallback to top-level attr names only if nested missing (shouldn't happen) + cfg_id = ( + getattr(diffusion_cfg, "mask_token_id", None) + if diffusion_cfg is not None + else getattr(cfg, "diffusion_mask_token_id", None) + ) + if isinstance(cfg_id, int) and cfg_id >= 0: + if vocab_size is None or cfg_id < vocab_size: + return int(cfg_id) + + def _existing_special_token_id(token_str: str | None) -> int | None: + """Attempt to resolve an existing special token string to a real ID.""" + if not token_str or not hasattr(tokenizer, "convert_tokens_to_ids"): + return None + try: + token_id = tokenizer.convert_tokens_to_ids(token_str) + except Exception: + return None + + if not isinstance(token_id, int) or token_id < 0: + return None + + # Ensure it's registered as special and not UNK, and within vocab + unk_id = getattr(tokenizer, "unk_token_id", None) + specials = set(getattr(tokenizer, "all_special_tokens", []) or []) + addl = set(getattr(tokenizer, "additional_special_tokens", []) or []) + is_special = token_str in specials or token_str in addl + in_vocab = vocab_size is None or token_id < vocab_size + if ( + (unk_id is not None and token_id == unk_id) + or not is_special + or not in_vocab + ): + return None + return token_id + + # Try mask token string if provided + token_str = ( + getattr(diffusion_cfg, "mask_token_str", None) + if diffusion_cfg is not None + else getattr(cfg, "diffusion_mask_token_str", None) + ) + for candidate in (token_str, default_token): + token_id = _existing_special_token_id(candidate) + if isinstance(token_id, int): + try: + if diffusion_cfg is None: + cfg.diffusion_mask_token_id = int(token_id) # legacy fallback + else: + diffusion_cfg.mask_token_id = int(token_id) + except Exception: + pass + return int(token_id) + + # Optionally add and return a dedicated special token during training + if allow_add and hasattr(tokenizer, "add_special_tokens"): + token_to_add = token_str or default_token + try: + tokenizer.add_special_tokens({"additional_special_tokens": [token_to_add]}) + + # Resize embeddings if possible + if ( + model is not None + and hasattr(tokenizer, "__len__") + and hasattr(model, "resize_token_embeddings") + ): + try: + model.resize_token_embeddings(len(tokenizer)) + except Exception: + pass + new_id = tokenizer.convert_tokens_to_ids(token_to_add) + if isinstance(new_id, int) and new_id >= 0: + try: + if diffusion_cfg is None: + cfg.diffusion_mask_token_id = int(new_id) # legacy fallback + else: + diffusion_cfg.mask_token_id = int(new_id) + except Exception: + pass + return int(new_id) + except Exception: + pass + + # Fallback to unk or 0 (do not update cfg) + fallback = getattr(tokenizer, "unk_token_id", 0) or 0 + return int(fallback) + + +def create_bidirectional_attention_mask( + input_ids: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + sample_packing: bool = False, +) -> torch.Tensor: + """ + Create bidirectional attention mask to override default causal masking. 
+ Handles sample-packed sequences where different samples are identified + by different attention mask values. + + Args: + input_ids: Input token ids [batch_size, seq_len] + attention_mask: Attention mask [batch_size, seq_len] + sample_packing: Whether sample packing is enabled + + Returns: + bidirectional_mask: 4D attention mask [batch_size, 1, seq_len, seq_len] + """ + batch_size, seq_len = input_ids.shape + device = input_ids.device + + if attention_mask is None or not sample_packing: + return torch.ones( + batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=device + ) + + # Handle sample packing: tokens can only attend within their sample + mask_i = attention_mask.unsqueeze(2) # [batch_size, seq_len, 1] + mask_j = attention_mask.unsqueeze(1) # [batch_size, 1, seq_len] + + # Tokens can attend to each other if they have the same non-zero sample ID + bidirectional_mask = (mask_i == mask_j) & (mask_i > 0) + + # Add head dimension: [batch_size, 1, seq_len, seq_len] + return bidirectional_mask.unsqueeze(1) diff --git a/src/axolotl/loaders/adapter.py b/src/axolotl/loaders/adapter.py index 989b34aee..bcde4bf96 100644 --- a/src/axolotl/loaders/adapter.py +++ b/src/axolotl/loaders/adapter.py @@ -14,6 +14,7 @@ from peft import ( PeftConfig, PeftMixedModel, PeftModel, + TaskType, get_peft_model, ) from transformers import PreTrainedModel @@ -101,6 +102,15 @@ def load_lora( if cfg.peft_trainable_token_indices: lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices + # Determine the correct PEFT task type + model_cls = type(model).__name__ + if "SequenceClassification" in model_cls: + task_type = TaskType.SEQ_CLS + elif "TokenClassification" in model_cls: + task_type = TaskType.TOKEN_CLS + else: + task_type = TaskType.CAUSAL_LM + lora_config = LoraConfig( r=cfg.lora_r, lora_alpha=cfg.lora_alpha, @@ -112,7 +122,7 @@ def load_lora( fan_in_fan_out=cfg.lora_fan_in_fan_out, modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None, bias="none", - task_type="CAUSAL_LM", + task_type=task_type, **lora_config_kwargs, ) diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py index a9507d685..f438d6b61 100644 --- a/src/axolotl/loaders/model.py +++ b/src/axolotl/loaders/model.py @@ -673,6 +673,33 @@ class ModelLoader: return hf_ds_cfg + def _load_model_from_config(self, model_loader_class=None) -> PreTrainedModel: + """ + Load model with random initialization using from_config. + + Uses the selected loader when provided; otherwise falls back to the auto loader. 
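+ Called when `reinit_weights` is enabled in the config.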
+ """ + loader = model_loader_class or self.auto_model_loader + if loader in [AutoModelForCausalLM, AutoModelForVision2Seq]: + model = loader.from_config( + config=self.model_config, + trust_remote_code=self.cfg.trust_remote_code or False, + ) + else: + model = loader(config=self.model_config) + + return model + + def _load_model_from_pretrained(self, model_loader_class=None) -> PreTrainedModel: + """Load model from pretrained weights.""" + loader = model_loader_class or self.auto_model_loader + kwargs = { + "config": self.model_config, + "trust_remote_code": self.cfg.trust_remote_code or False, + **self.model_kwargs, + } + return loader.from_pretrained(self.base_model, **kwargs) + def _build_model(self) -> bool: """Load model, with load strategy depending on config.""" skip_move_to_device = False @@ -687,7 +714,8 @@ class ModelLoader: if self.is_fsdp_enabled: if self.cfg.fsdp_config.cpu_ram_efficient_loading: skip_move_to_device = True - # Don't delete device_map for QLoRA + FSDP - it was set correctly in _set_device_map + # Don't delete device_map for QLoRA + FSDP - it was set correctly in + # _set_device_map if ( "device_map" in self.model_kwargs and not self.is_qlora_and_fsdp_enabled @@ -716,6 +744,11 @@ class ModelLoader: or self.cfg.qlora_sharded_model_loading ) ): + if self.cfg.reinit_weights: + LOG.warning( + "reinit_weights is not supported with sharded quantized loading. " + "Loading from pretrained weights instead." + ) quant_storage = self.cfg.torch_dtype quantization_config = getattr( self.model_config, "quantization_config", None @@ -731,33 +764,12 @@ class ModelLoader: quantization_config=quantization_config, ) skip_move_to_device = True - elif ( - self.model_config.model_type in ["llama", "llama4"] - and not self.cfg.trust_remote_code - and not self.cfg.gptq - ): - # Please don't remove underscore binding without reading the fn docstring. - _ = self._configure_zero3_memory_efficient_loading() - - # Load model with random initialization if specified - if self.cfg.random_init_weights: - # AutoModel classes support the from_config method - if self.auto_model_loader in [ - AutoModelForCausalLM, - AutoModelForVision2Seq, - ]: - self.model = self.auto_model_loader.from_config( - config=self.model_config, - ) - else: - self.model = self.auto_model_loader(config=self.model_config) - else: - self.model = self.auto_model_loader.from_pretrained( - self.base_model, - config=self.model_config, - **self.model_kwargs, - ) elif self.model_type == "MambaLMHeadModel": + if self.cfg.reinit_weights: + LOG.warning( + "reinit_weights is not supported with MambaLMHeadModel. " + "Loading from pretrained weights instead." 
+ ) # FIXME this is janky at best and hacked together to make it work MambaLMHeadModel = fix_mamba_attn_for_loss() @@ -770,41 +782,27 @@ class ModelLoader: self.base_model, **self.model_kwargs, ) - elif ( - self.model_type - and self.model_type != "AutoModelForCausalLM" - and not self.cfg.trust_remote_code - ): - if self.cfg.gptq: - self.model = self.auto_model_loader.from_pretrained( - self.base_model, - config=self.model_config, - trust_remote_code=self.cfg.trust_remote_code or False, - **self.model_kwargs, - ) - else: - self.model = getattr(transformers, self.model_type).from_pretrained( - self.base_model, - config=self.model_config, - trust_remote_code=self.cfg.trust_remote_code or False, - **self.model_kwargs, - ) - elif self.cfg.gptq: - self.model = self.auto_model_loader.from_pretrained( - self.base_model, - config=self.model_config, - trust_remote_code=self.cfg.trust_remote_code or False, - **self.model_kwargs, - ) else: - # Please don't remove underscore binding without reading the fn docstring. + # Please don't remove underscore binding without reading the fn docstring _ = self._configure_zero3_memory_efficient_loading() - self.model = self.auto_model_loader.from_pretrained( - self.base_model, - config=self.model_config, - trust_remote_code=self.cfg.trust_remote_code or False, - **self.model_kwargs, - ) + + if ( + self.model_type + and self.model_type != "AutoModelForCausalLM" + and not self.cfg.trust_remote_code + and not self.cfg.gptq + ): + # Use model type from transformers + model_loader_class = getattr(transformers, self.model_type) + else: + # Use auto model loader (handles gptq and default cases) + model_loader_class = self.auto_model_loader + + if self.cfg.reinit_weights: + self.model = self._load_model_from_config(model_loader_class) + else: + self.model = self._load_model_from_pretrained(model_loader_class) + if is_deepspeed_zero3_enabled(): skip_move_to_device = True diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 044c278a3..a5a630cb5 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -3,8 +3,8 @@ Applies pre- and post-model load patches for various fixes and optimizations. 
""" -import os import importlib.util +import os from functools import cached_property import addict @@ -468,9 +468,10 @@ class PatchManager: def _apply_patch_deepspeed_zero3(self): try: - from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled + from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches + if self.cfg.activation_offloading is True and ( is_deepspeed_zero3_enabled() or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3" diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py index 3b38a33b7..d8ba02cb2 100644 --- a/src/axolotl/monkeypatch/accelerate/fsdp2.py +++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py @@ -160,9 +160,11 @@ def get_state_dict(self, model, unwrap=True): state_dict[param_name] = param.cpu() torch.distributed.barrier() elif self.distributed_type == DistributedType.FSDP: - from torch.distributed.fsdp import FullStateDictConfig - from torch.distributed.fsdp import FullyShardedDataParallel as FSDP - from torch.distributed.fsdp import StateDictType + from torch.distributed.fsdp import ( + FullStateDictConfig, + FullyShardedDataParallel as FSDP, + StateDictType, + ) full_state_dict_config = FullStateDictConfig( offload_to_cpu=True, rank0_only=True diff --git a/src/axolotl/monkeypatch/attention/flex_attn.py b/src/axolotl/monkeypatch/attention/flex_attn.py index 65ccad533..678f65bee 100644 --- a/src/axolotl/monkeypatch/attention/flex_attn.py +++ b/src/axolotl/monkeypatch/attention/flex_attn.py @@ -1,11 +1,12 @@ """Flex attention monkey patch""" import sys -from packaging import version import torch import transformers +from packaging import version from transformers.utils.import_utils import _torch_version, is_torch_less_or_equal + from axolotl.utils.logging import get_logger LOG = get_logger(__name__) diff --git a/src/axolotl/monkeypatch/deepspeed_utils.py b/src/axolotl/monkeypatch/deepspeed_utils.py index 6740f556b..d7e69e112 100644 --- a/src/axolotl/monkeypatch/deepspeed_utils.py +++ b/src/axolotl/monkeypatch/deepspeed_utils.py @@ -1,5 +1,6 @@ import importlib import importlib.util + from axolotl.utils.logging import get_logger LOG = get_logger(__name__) diff --git a/src/axolotl/utils/config/__init__.py b/src/axolotl/utils/config/__init__.py index f40fe6687..7a2bbd6f9 100644 --- a/src/axolotl/utils/config/__init__.py +++ b/src/axolotl/utils/config/__init__.py @@ -17,8 +17,8 @@ from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger from axolotl.utils.schemas.config import ( AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase, + AxolotlInputConfig as AxolotlInputConfigBase, ) -from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset LOG = get_logger(__name__) diff --git a/src/axolotl/utils/data/__init__.py b/src/axolotl/utils/data/__init__.py index 788f13638..8b9e4e91d 100644 --- a/src/axolotl/utils/data/__init__.py +++ b/src/axolotl/utils/data/__init__.py @@ -1,14 +1,14 @@ """Init for `axolotl.utils.data` module.""" -from axolotl.utils.data.streaming import ( - encode_streaming, - wrap_streaming_dataset, -) from axolotl.utils.data.rl import prepare_preference_datasets from axolotl.utils.data.sft import ( get_dataset_wrapper, prepare_datasets, ) +from axolotl.utils.data.streaming import ( + encode_streaming, + wrap_streaming_dataset, +) from axolotl.utils.data.utils 
import md5 __all__ = [ diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py index 28732e01d..ba5aec2d6 100644 --- a/src/axolotl/utils/data/sft.py +++ b/src/axolotl/utils/data/sft.py @@ -16,7 +16,6 @@ from transformers import PreTrainedTokenizer, ProcessorMixin from axolotl.prompters import Prompter from axolotl.utils.data.lock import FileLockLoader -from axolotl.utils.data.streaming import wrap_streaming_dataset from axolotl.utils.data.shared import ( create_train_validation_split, datasets_with_name_generator, @@ -27,6 +26,7 @@ from axolotl.utils.data.shared import ( save_preprocessed_dataset, try_load_from_hub, ) +from axolotl.utils.data.streaming import wrap_streaming_dataset from axolotl.utils.data.utils import ( deduplicate_and_log_datasets, handle_long_seq_in_dataset, diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py index 751f7e253..192aca4e1 100644 --- a/src/axolotl/utils/environment.py +++ b/src/axolotl/utils/environment.py @@ -6,8 +6,6 @@ from importlib.metadata import version from accelerate.utils.environment import ( check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support, -) -from accelerate.utils.environment import ( get_gpu_info, ) from packaging.version import Version, parse diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index e4c1fdf29..d612ec8a5 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -106,6 +106,12 @@ class AxolotlInputConfig( "description": "Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs" }, ) + reinit_weights: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Reinitialize model weights randomly instead of loading pretrained weights" + }, + ) trainer_cls: str | None = Field( default=None, diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 49add8081..64018ca48 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -14,7 +14,6 @@ from transformers.utils.import_utils import is_torch_npu_available from axolotl.utils.logging import get_logger from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType - LOG = get_logger(__name__) SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"} diff --git a/tests/e2e/test_diffusion.py b/tests/e2e/test_diffusion.py new file mode 100644 index 000000000..cc3d8070b --- /dev/null +++ b/tests/e2e/test_diffusion.py @@ -0,0 +1,139 @@ +"""E2E smoke test for diffusion training plugin.""" + +from axolotl.common.datasets import load_datasets +from axolotl.train import train +from axolotl.utils.config import normalize_config, validate_config +from axolotl.utils.dict import DictDefault + +from tests.e2e.utils import check_model_output_exists + + +class TestDiffusion: + """Test case for diffusion training plugin.""" + + def test_diffusion_smoke_test(self, temp_dir): + """ + Smoke test for diffusion training to ensure the plugin loads and trains without + error. 
+ """ + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "trust_remote_code": True, + "sequence_len": 256, + "val_set_size": 0.1, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 3, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.0001, + "optimizer": "adamw_torch", + "lr_scheduler": "cosine", + "bf16": True, + "save_safetensors": True, + "save_first_step": False, + "logging_steps": 1, + "eval_steps": 3, + # Diffusion-specific config + "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"], + "diffusion": { + # sample generation + "generate_samples": True, + "generation_interval": 1, + "num_generation_samples": 1, + "generation_steps": 2, + "generation_max_length": 32, + "generation_temperature": 0.0, + # training-specific + "mask_token_id": 16, + "eps": 1e-3, + "importance_weighting": False, + }, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) + + def test_diffusion_sft_labels(self, temp_dir): + """Test that diffusion training properly handles SFT data with labels.""" + cfg = DictDefault( + { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "tokenizer_type": "AutoTokenizer", + "trust_remote_code": True, + "sequence_len": 256, + "val_set_size": 0.1, + "special_tokens": { + "pad_token": "<|endoftext|>", + }, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca", + }, + ], + "num_epochs": 1, + "max_steps": 3, + "micro_batch_size": 1, + "gradient_accumulation_steps": 1, + "output_dir": temp_dir, + "learning_rate": 0.0001, + "optimizer": "adamw_torch", + "lr_scheduler": "cosine", + "bf16": True, + "save_safetensors": True, + "save_first_step": False, + "logging_steps": 1, + "eval_steps": 2, + # Diffusion-specific config + "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"], + "diffusion": { + # sample generation + "generate_samples": True, + "generation_interval": 1, + "num_generation_samples": 1, + "generation_steps": 2, + "generation_max_length": 32, + "generation_temperature": 0.0, + # training-specific + "mask_token_id": 16, + "eps": 1e-3, + "importance_weighting": True, + }, + # Ensure we have proper SFT labels + "train_on_inputs": False, + } + ) + + cfg = validate_config(cfg) + normalize_config(cfg) + dataset_meta = load_datasets(cfg=cfg) + + # Verify that the dataset has labels + sample = dataset_meta.train_dataset[0] + assert "labels" in sample, "SFT dataset should have labels" + + # Check that some labels are -100 (prompt tokens) + labels = sample["labels"] + if hasattr(labels, "tolist"): + labels = labels.tolist() + assert -100 in labels, "SFT dataset should have -100 labels for prompt tokens" + + train(cfg=cfg, dataset_meta=dataset_meta) + check_model_output_exists(temp_dir, cfg) diff --git a/tests/integrations/test_diffusion.py b/tests/integrations/test_diffusion.py new file mode 100644 index 000000000..141d8d150 --- /dev/null +++ b/tests/integrations/test_diffusion.py @@ -0,0 +1,274 @@ +"""Tests for diffusion trainer integration.""" + +# pylint: disable=redefined-outer-name,protected-access + +from unittest.mock import Mock + +import pytest +import torch + +from axolotl.integrations.diffusion import DiffusionTrainer +from 
axolotl.integrations.diffusion.utils import create_bidirectional_attention_mask +from axolotl.utils.dict import DictDefault + + +@pytest.fixture +def mock_tokenizer(): + """Create a mock tokenizer.""" + tokenizer = Mock() + tokenizer.bos_token_id = 1 + tokenizer.eos_token_id = 2 + tokenizer.pad_token_id = 0 + return tokenizer + + +@pytest.fixture +def diffusion_config(): + """Create a diffusion config.""" + return DictDefault( + { + "diffusion": { + "mask_token_id": 32000, + "eps": 1e-3, + "importance_weighting": False, + }, + "sample_packing": False, + } + ) + + +@pytest.fixture +def diffusion_trainer_instance(mock_tokenizer, diffusion_config): + """Create a diffusion trainer instance for testing methods directly.""" + # Create a minimal trainer instance just for testing methods + trainer = object.__new__(DiffusionTrainer) # Bypass __init__ + trainer.cfg = diffusion_config + trainer._special_token_ids = {0, 1, 2} # pad, bos, eos + trainer.processing_class = mock_tokenizer + trainer.store_metrics = Mock() # Mock metrics storage + return trainer + + +class TestDiffusionTrainer: + """Test the DiffusionTrainer class.""" + + def test_forward_process_basic(self, diffusion_trainer_instance): + """Test basic forward process without labels.""" + input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) + + noisy_batch, masked_indices, p_mask = ( + diffusion_trainer_instance._forward_process(input_ids, eps=0.1) + ) + + # Check shapes + assert noisy_batch.shape == input_ids.shape + assert masked_indices.shape == input_ids.shape + assert p_mask.shape == input_ids.shape + + # Check that special tokens are not masked + special_token_positions = (input_ids == 1) | (input_ids == 2) | (input_ids == 0) + assert not masked_indices[special_token_positions].any() + + # Check that mask token is applied + mask_token_id = diffusion_trainer_instance.cfg.diffusion.mask_token_id + masked_positions = masked_indices + if masked_positions.any(): + assert (noisy_batch[masked_positions] == mask_token_id).all() + + def test_forward_process_with_labels(self, diffusion_trainer_instance): + """Test forward process with SFT labels.""" + input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) + labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long) + + noisy_batch, masked_indices, p_mask = ( + diffusion_trainer_instance._forward_process( + input_ids, labels=labels, eps=0.1 + ) + ) + + # Check shapes + assert noisy_batch.shape == input_ids.shape + assert masked_indices.shape == input_ids.shape + assert p_mask.shape == input_ids.shape + + # Check that only answer tokens can be masked (where labels != -100) + non_answer_mask = labels == -100 + + # No masking should occur on non-answer tokens + assert not masked_indices[non_answer_mask].any() + + # p_mask should be the same for all positions (sampled timestep), + # but masking is only applied to answer tokens + assert p_mask.shape == input_ids.shape + # Verify that masked_indices respects the answer mask + assert not masked_indices[non_answer_mask].any() + + def test_forward_process_with_attention_mask(self, diffusion_trainer_instance): + """Test forward process with attention mask.""" + input_ids = torch.tensor([[1, 10, 20, 0]], dtype=torch.long) + attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.long) + + _, masked_indices, p_mask = diffusion_trainer_instance._forward_process( + input_ids, attention_mask=attention_mask, eps=0.1 + ) + + # Check that padding tokens are not masked + padding_positions = attention_mask == 0 + assert not 
masked_indices[padding_positions].any() + assert (p_mask[padding_positions] == 0).all() + + def test_bidirectional_attention_mask_no_packing(self, diffusion_trainer_instance): + """Test bidirectional attention mask without sample packing.""" + input_ids = torch.tensor([[1, 10, 20, 2]], dtype=torch.long) + + mask = create_bidirectional_attention_mask(input_ids) + + # Should be all-to-all attention + expected_shape = (1, 1, 4, 4) + assert mask.shape == expected_shape + assert mask.all() + + def test_bidirectional_attention_mask_with_packing( + self, diffusion_trainer_instance + ): + """Test bidirectional attention mask with sample packing.""" + diffusion_trainer_instance.cfg.sample_packing = True + input_ids = torch.tensor([[1, 10, 20, 30, 40, 2]], dtype=torch.long) + # Sample IDs: first sample (1), second sample (2) + attention_mask = torch.tensor([[1, 1, 1, 2, 2, 2]], dtype=torch.long) + + mask = create_bidirectional_attention_mask( + input_ids, attention_mask, sample_packing=True + ) + + # Check that tokens within same sample can attend to each other + # but not across samples + assert mask[0, 0, 0, 1].item() # First sample tokens can attend to each other + assert mask[0, 0, 1, 2].item() + assert not mask[0, 0, 0, 3].item() # Can't attend across samples + assert not mask[0, 0, 2, 4].item() + assert mask[0, 0, 3, 4].item() # Second sample tokens can attend to each other + + def test_compute_loss_basic(self, diffusion_trainer_instance): + """Test basic loss computation.""" + # Mock model that returns logits + mock_model = Mock() + mock_outputs = Mock() + vocab_size = 1000 + seq_len = 5 + mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True) + mock_model.return_value = mock_outputs + mock_model.training = True + + input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) + + loss, outputs = diffusion_trainer_instance._compute_diffusion_loss( + mock_model, input_ids + ) + + # Check that loss is computed + assert isinstance(loss, torch.Tensor) + assert loss.requires_grad + assert outputs == mock_outputs + + # Check that metrics were stored + diffusion_trainer_instance.store_metrics.assert_called_once() + + def test_compute_loss_sft(self, diffusion_trainer_instance): + """Test loss computation with SFT labels.""" + # Mock model + mock_model = Mock() + mock_outputs = Mock() + vocab_size = 1000 + seq_len = 5 + mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True) + mock_model.return_value = mock_outputs + mock_model.training = True + diffusion_trainer_instance.cfg.datasets = Mock() + + input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long) + labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long) + + loss, _ = diffusion_trainer_instance._compute_diffusion_loss( + mock_model, input_ids, labels=labels + ) + + # Check that loss is computed + assert isinstance(loss, torch.Tensor) + assert loss.requires_grad + + # Check that SFT metrics were added + call_args = diffusion_trainer_instance.store_metrics.call_args[0][0] + assert "answer_ratio" in call_args + assert "avg_answer_length" in call_args + + def test_compute_loss_no_masked_tokens(self, diffusion_trainer_instance): + """Test loss computation when no tokens are masked.""" + # Mock model + mock_model = Mock() + mock_outputs = Mock() + vocab_size = 1000 + seq_len = 3 + mock_outputs.logits = torch.randn(1, seq_len, vocab_size) + mock_model.return_value = mock_outputs + mock_model.training = True + + # Only special tokens (which won't be masked) + input_ids = 
torch.tensor([[1, 0, 2]], dtype=torch.long) + + loss, _ = diffusion_trainer_instance._compute_diffusion_loss( + mock_model, input_ids + ) + + # Loss should be zero when no tokens are masked + assert loss.item() == 0.0 + assert loss.requires_grad + + def test_cache_special_token_ids(self, mock_tokenizer): + """Test caching of special token IDs.""" + trainer = object.__new__(DiffusionTrainer) + trainer.processing_class = mock_tokenizer + trainer._cache_special_token_ids() + assert trainer._special_token_ids == {0, 1, 2} + + def test_cache_special_token_ids_no_tokenizer(self): + """Test caching when no tokenizer is available.""" + trainer = object.__new__(DiffusionTrainer) + trainer.processing_class = None + trainer._cache_special_token_ids() + + assert trainer._special_token_ids == set() + + def test_main_compute_loss_interface(self, diffusion_trainer_instance): + """Test the main compute_loss interface.""" + # Mock model + mock_model = Mock() + mock_outputs = Mock() + mock_outputs.logits = torch.randn(1, 5, 1000) + mock_model.return_value = mock_outputs + mock_model.training = True + + inputs = { + "input_ids": torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long), + "attention_mask": torch.tensor([[1, 1, 1, 1, 1]], dtype=torch.long), + "labels": torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long), + } + + # Test without return_outputs + loss = diffusion_trainer_instance.compute_loss(mock_model, inputs) + assert isinstance(loss, torch.Tensor) + + # Test with return_outputs + loss, outputs = diffusion_trainer_instance.compute_loss( + mock_model, inputs, return_outputs=True + ) + assert isinstance(loss, torch.Tensor) + assert outputs == mock_outputs + + def test_missing_input_ids_raises_error(self, diffusion_trainer_instance): + """Test that missing input_ids raises ValueError.""" + mock_model = Mock() + inputs = {"attention_mask": torch.tensor([[1, 1, 1]])} + + with pytest.raises(ValueError, match="input_ids is required"): + diffusion_trainer_instance.compute_loss(mock_model, inputs) diff --git a/tests/integrations/test_diffusion_callback.py b/tests/integrations/test_diffusion_callback.py new file mode 100644 index 000000000..3e8785fe0 --- /dev/null +++ b/tests/integrations/test_diffusion_callback.py @@ -0,0 +1,92 @@ +"""Tests for diffusion generation callback dataloader selection and triggering.""" + +from types import SimpleNamespace +from unittest.mock import Mock + +import pytest + +from axolotl.integrations.diffusion import DiffusionGenerationCallback + + +class DummyTrainer: + """Minimal trainer double with required attributes/methods for the callback.""" + + def __init__(self, use_eval: bool): + # Config used by callback + self.cfg = SimpleNamespace( + diffusion=SimpleNamespace( + generation_interval=1, + num_generation_samples=1, + generation_max_length=32, + generation_steps=4, + generation_temperature=0.0, + mask_token_id=16, + ), + use_wandb=False, + ) + + # Model/tokenizer are passed through to generate_samples; not used here + self.model = Mock() + self.processing_class = Mock() + + # Datasets and loaders + self.eval_dataset = object() if use_eval else None + self._train_loader = object() + self._eval_loader = object() + + # State for world process check + self.state = SimpleNamespace(is_world_process_zero=True) + + # Track which loader was requested + self.requested: list[str] = [] + + def get_train_dataloader(self): + self.requested.append("train") + return self._train_loader + + def get_eval_dataloader(self): + self.requested.append("eval") + return self._eval_loader + + 
+@pytest.mark.parametrize("use_eval", [False, True]) +def test_callback_uses_correct_dataloader(monkeypatch, use_eval): + trainer = DummyTrainer(use_eval=use_eval) + callback = DiffusionGenerationCallback(trainer) + + captured = {} + + # Patch generate_samples in the callback module's namespace + def fake_generate_samples(**kwargs): + captured["dataloader"] = kwargs.get("dataloader") + # Return one dummy sample to exercise logging path + return [ + { + "original": "o", + "masked": "m", + "generated": "g", + "mask_ratio": 0.5, + "masked_tokens": 1, + "total_tokens": 2, + } + ] + + monkeypatch.setattr( + "axolotl.integrations.diffusion.callbacks.generate_samples", + fake_generate_samples, + ) + + # Trigger at step 1 (interval=1) + args = SimpleNamespace() + state = SimpleNamespace(global_step=1) + control = SimpleNamespace() + + callback.on_step_end(args=args, state=state, control=control) + + # Assert the expected dataloader path was used + if use_eval: + assert trainer.requested[0] == "eval" + assert captured["dataloader"] is trainer._eval_loader + else: + assert trainer.requested[0] == "train" + assert captured["dataloader"] is trainer._train_loader diff --git a/tests/test_streaming.py b/tests/test_streaming.py index 54acbb5e4..2c1f9f936 100644 --- a/tests/test_streaming.py +++ b/tests/test_streaming.py @@ -5,12 +5,12 @@ from unittest.mock import Mock, patch from datasets import IterableDataset -from axolotl.utils.dict import DictDefault +from axolotl.utils.config import validate_config from axolotl.utils.data.sft import ( _prepare_streaming_dataset, prepare_datasets, ) -from axolotl.utils.config import validate_config +from axolotl.utils.dict import DictDefault class TestStreamingConfig(unittest.TestCase): From 9406c0c488277ef9d7152568b9fda50600c4221e Mon Sep 17 00:00:00 2001 From: salman Date: Thu, 11 Sep 2025 11:19:30 +0100 Subject: [PATCH 046/115] log before eval step (#3148) [skip-ci] --- src/axolotl/core/trainers/base.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index 3427a0b86..627f8e3f8 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -371,6 +371,11 @@ class AxolotlTrainer( num_items_in_batch=num_items_in_batch, ) + @override + def evaluate(self, *args, **kwargs): + LOG.info("Running evaluation step...") + return super().evaluate(*args, **kwargs) + @staticmethod def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None): concatenated_batch = {} From fcfc13d7106fe965054e46f0ad6b4f478cc5ba7c Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 12 Sep 2025 14:45:18 +0700 Subject: [PATCH 047/115] feat(doc): update thinking and chat_template notes (#3114) [skip ci] * feat: update thinking and chat_template notes * fix: grammar --- examples/gpt-oss/README.md | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index 0aa04a71c..fb6c67498 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -106,6 +106,16 @@ See [Nanobit/text-tools-2k-test](https://huggingface.co/datasets/Nanobit/text-to Refer to [our docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use) for more info. +### Thinking and chat_template masking conflict + +OpenAI’s Harmony template hides `thinking` in all non-final turns, which conflicts with Axolotl’s `chat_template` masking. 
+ +If your dataset has `thinking` content mid-turn, there are two paths we recommend: + +- Train only on the last turn. This can be accomplished via chat_template's [train on last doc](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#training-on-last-message). + +- Adjust your dataset to only have `thinking` content in the last turn. + ### TIPS - Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html). From 0401a15888e480d03c7cd0fb6439b27e0dacd3a0 Mon Sep 17 00:00:00 2001 From: salman Date: Fri, 12 Sep 2025 10:55:11 +0100 Subject: [PATCH 048/115] SEO go brrr (#3153) [skip-ci] --- CITATION.cff | 2 +- README.md | 16 ++++++++++------ 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/CITATION.cff b/CITATION.cff index e6ecc7cb8..7bbfeec64 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,6 @@ cff-version: 1.2.0 type: software -title: "Axolotl: Post-Training for AI Models" +title: "Axolotl: Open Source LLM Post-Training" message: "If you use this software, please cite it as below." authors: - name: "Axolotl maintainers and contributors" diff --git a/README.md b/README.md index d4794124a..1a033acd9 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,9 @@ Axolotl

+

+ A Free and Open Source LLM Fine-tuning Framework
+

GitHub License @@ -50,20 +53,21 @@ ## ✨ Overview -Axolotl is a tool designed to streamline post-training for various AI models. +Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs). Features: -- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models. -- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM). -- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference. +- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub. +- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, and audio models like Voxtral with image, video, and audio support. +- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), and Reward Modelling (RM) / Process Reward Modelling (PRM). +- **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference. - **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more! - **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets. - **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware. 
-## 🚀 Quick Start +## 🚀 Quick Start - LLM Fine-tuning in Minutes **Requirements**: @@ -160,7 +164,7 @@ If you use Axolotl in your research or projects, please cite it as follows: ```bibtex @software{axolotl, - title = {Axolotl: Post-Training for AI Models}, + title = {Axolotl: Open Source LLM Post-Training}, author = {{Axolotl maintainers and contributors}}, url = {https://github.com/axolotl-ai-cloud/axolotl}, license = {Apache-2.0}, From 58d67bf98ddca63cb082374a04f8b2250ffc2c59 Mon Sep 17 00:00:00 2001 From: salman Date: Fri, 12 Sep 2025 10:55:50 +0100 Subject: [PATCH 049/115] Migrate QAT API; fix `axolotl quantize` for QAT-ed models; add NVFP4 (#3107) --- .github/workflows/multi-gpu-e2e.yml | 2 +- .github/workflows/tests.yml | 2 +- docs/quantize.qmd | 8 + examples/llama-3/3b-qat-fsdp2-nvfp4.yaml | 64 ++++ examples/llama-3/3b-qat-fsdp2.yaml | 18 +- requirements.txt | 2 +- setup.py | 1 + src/axolotl/cli/args.py | 1 + src/axolotl/cli/quantize.py | 50 ++- src/axolotl/train.py | 19 +- src/axolotl/utils/quantization.py | 244 +++++++------- src/axolotl/utils/schemas/enums.py | 25 +- src/axolotl/utils/schemas/quantization.py | 54 ++-- tests/e2e/test_qat.py | 4 +- tests/e2e/test_quantization.py | 369 ++++++++++++---------- tests/e2e/utils.py | 30 ++ 16 files changed, 554 insertions(+), 339 deletions(-) create mode 100644 examples/llama-3/3b-qat-fsdp2-nvfp4.yaml diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml index 6492e5d3e..05f9e0761 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -44,7 +44,7 @@ jobs: cuda_version: 12.8.1 python_version: "3.11" pytorch: 2.8.0 - axolotl_extras: + axolotl_extras: fbgemm-gpu num_gpus: 2 nightly_build: "true" runs-on: [self-hosted, modal] diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 337230d4a..cfd2c715d 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -304,7 +304,7 @@ jobs: pytorch: 2.8.0 num_gpus: 1 gpu_type: "B200" - axolotl_extras: + axolotl_extras: fbgemm-gpu steps: - name: Checkout uses: actions/checkout@v4 diff --git a/docs/quantize.qmd b/docs/quantize.qmd index 113fcafbe..43c817a5b 100644 --- a/docs/quantize.qmd +++ b/docs/quantize.qmd @@ -51,3 +51,11 @@ axolotl quantize qat.yml ``` This ensures that an identical quantization configuration is used to quantize the model as was used to train it. + + +::: {.callout-note} + +If you have configured pushing to hub with `hub_model_id`, your model hub name will have the quantization schema appended to it, +e.g. 
`axolotl-ai-cloud/qat-nvfp4-llama3B` will become `axolotl-ai-cloud/qat-nvfp4-llama3B-nvfp4`
+
+:::
diff --git a/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml b/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml
new file mode 100644
index 000000000..1ec809bbe
--- /dev/null
+++ b/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml
@@ -0,0 +1,64 @@
+base_model: meta-llama/Llama-3.2-3B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+plugins:
+  - axolotl.integrations.liger.LigerPlugin
+
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_layer_norm: true
+liger_fused_linear_cross_entropy: true
+
+datasets:
+  - path: yahma/alpaca-cleaned
+    type: alpaca
+    split: train[:95%]
+
+output_dir: ./outputs/qat_out/
+dataset_prepared_path: ./outputs/dataset_prepared
+
+sequence_len: 8192
+flash_attention: true
+
+qat:
+  activation_dtype: nvfp4
+  weight_dtype: nvfp4
+  group_size: 16 # only group_size of 16 is supported with nvfp4
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_checkpointing: true
+gradient_accumulation_steps: 1
+micro_batch_size: 64
+num_epochs: 1
+optimizer: adamw_torch_fused
+
+cosine_constant_lr_ratio: 0
+cosine_min_lr_ratio: 1.0
+learning_rate: 2e-5
+save_only_model: true
+bf16: true
+
+resume_from_checkpoint:
+logging_steps: 1
+
+evals_per_epoch: 1
+saves_per_epoch: 1
+
+warmup_ratio: 0.1
+weight_decay: 0.0
+
+special_tokens:
+  pad_token: <|finetune_right_pad_id|>
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/examples/llama-3/3b-qat-fsdp2.yaml b/examples/llama-3/3b-qat-fsdp2.yaml
index 35e3461e2..0c5a87891 100644
--- a/examples/llama-3/3b-qat-fsdp2.yaml
+++ b/examples/llama-3/3b-qat-fsdp2.yaml
@@ -15,20 +15,18 @@ liger_glu_activation: true
 liger_layer_norm: true
 liger_fused_linear_cross_entropy: true
 
+
 datasets:
   - path: yahma/alpaca-cleaned
     type: alpaca
+    split: train[:95%]
 
 output_dir: ./outputs/qat_out/
+dataset_prepared_path: ./outputs/qat_out/dataset_prepared
 
-sample_packing: true
-
-sequence_len: 512
-
-flex_attention: true
-flex_attn_compile_kwargs:
-  dynamic: false
-  mode: max-autotune-no-cudagraphs
+sample_packing: false
+sequence_len: 8192
+flash_attention: true
 
 qat:
   activation_dtype: int8
@@ -67,7 +65,7 @@ fsdp:
 fsdp_config:
   fsdp_version: 2
   fsdp_offload_params: false
-  fsdp_cpu_ram_efficient_loading: true
+  fsdp_cpu_ram_efficient_loading: false
   fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
   fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
   fsdp_state_dict_type: FULL_STATE_DICT
@@ -76,6 +74,6 @@ fsdp_config:
   fsdp_activation_checkpointing: true
 
 special_tokens:
-  pad_token: <|end_of_text|>
+  pad_token: <|finetune_right_pad_id|>
 
 # save_first_step: true # uncomment this to validate checkpoint saving works with your config
diff --git a/requirements.txt b/requirements.txt
index 1292a179a..6138707af 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -64,7 +64,7 @@ langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
 
-torchao==0.12.0
+torchao==0.13.0
 schedulefree==1.4.1
 axolotl-contribs-lgpl==0.0.6
diff --git a/setup.py b/setup.py
index 4cbc562e0..3a44f0ae9 100644
--- a/setup.py
+++ b/setup.py
@@ -162,6 +162,7 @@ extras_require = {
    "llmcompressor": [
        "llmcompressor==0.5.1",
    ],
+    "fbgemm-gpu": ["fbgemm-gpu-genai>=1.2.0"],
 }
 install_requires, dependency_links, extras_require_build = parse_requirements(
    extras_require
diff --git 
a/src/axolotl/cli/args.py b/src/axolotl/cli/args.py index 396e9a8af..14dafa43f 100644 --- a/src/axolotl/cli/args.py +++ b/src/axolotl/cli/args.py @@ -115,6 +115,7 @@ class QuantizeCliArgs: quantize_embedding: Optional[bool] = field(default=None) group_size: Optional[int] = field(default=None) output_dir: Optional[str] = field(default=None) + hub_model_id: Optional[str] = field(default=None) @dataclass diff --git a/src/axolotl/cli/quantize.py b/src/axolotl/cli/quantize.py index b8a8de781..6838f47d8 100644 --- a/src/axolotl/cli/quantize.py +++ b/src/axolotl/cli/quantize.py @@ -5,12 +5,17 @@ CLI to post-training quantize a model using torchao from pathlib import Path from typing import Union -from transformers import AutoModelForCausalLM +from transformers import AutoConfig, AutoModelForCausalLM, TorchAoConfig from axolotl.cli.config import load_cfg from axolotl.loaders import load_tokenizer from axolotl.utils.logging import get_logger -from axolotl.utils.quantization import TorchIntDType, quantize_model_for_ptq +from axolotl.utils.quantization import ( + TorchAOQuantDType, + get_quantization_config, + quantization_config_to_str, + quantize_model, +) LOG = get_logger(__name__) @@ -43,13 +48,13 @@ def do_quantize( "No quantization configuration found. Please specify either qat or quantization in your config file." ) - model_path = cli_args.get("model_path") or cfg.output_dir + model_path = cli_args.get("base_model") or cfg.output_dir if weight_dtype := cli_args.get("weight_dtype"): - weight_dtype = TorchIntDType[weight_dtype] + weight_dtype = TorchAOQuantDType.from_string(weight_dtype) else: weight_dtype = quantize_cfg.weight_dtype if activation_dtype := cli_args.get("activation_dtype"): - activation_dtype = TorchIntDType[activation_dtype] + activation_dtype = TorchAOQuantDType.from_string(activation_dtype) else: activation_dtype = quantize_cfg.activation_dtype group_size = cli_args.get("group_size") or quantize_cfg.group_size @@ -57,10 +62,15 @@ def do_quantize( cli_args.get("quantize_embedding") or quantize_cfg.quantize_embedding ) output_dir = cli_args.get("output_dir") or cfg.output_dir + hub_model_id = cli_args.get("hub_model_id") or cfg.hub_model_id - LOG.info(f"Loading model from {model_path}...") + LOG.info(f"Loading model from {model_path}.") tokenizer = load_tokenizer(cfg) - model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto") + config = AutoConfig.from_pretrained(model_path) + torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else None + model = AutoModelForCausalLM.from_pretrained( + model_path, device_map="auto", torch_dtype=torch_dtype + ) LOG.info( f"Quantizing model with configuration: \n" @@ -70,11 +80,21 @@ def do_quantize( f"\tquantize_embedding: {quantize_embedding}" ) - quantize_model_for_ptq( + quantize_model( model, weight_dtype, group_size, activation_dtype, quantize_embedding ) - LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}...") + quantization_config = get_quantization_config( + weight_dtype, activation_dtype, group_size + ) + + ao_config = TorchAoConfig( + quant_type=quantization_config, + include_input_output_embeddings=quantize_embedding, + ) + model.config.quantization_config = ao_config + + LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}.") model.save_pretrained( str(Path(output_dir) / "quantized"), safe_serialization=False, @@ -86,4 +106,14 @@ def do_quantize( progressbar=True, save_jinja_files=cfg.tokenizer_save_jinja_files, ) - LOG.info(f"Quantized model saved to: 
{str(Path(output_dir) / 'quantized')}...") + + if hub_model_id: + hub_model_id = ( + hub_model_id.rstrip("-") + + f"-{quantization_config_to_str[type(quantization_config)]}" + ) + model.push_to_hub(hub_model_id, safe_serialization=False) + tokenizer.push_to_hub(hub_model_id) + LOG.info(f"Quantized model pushed to: {hub_model_id}.") + + LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}.") diff --git a/src/axolotl/train.py b/src/axolotl/train.py index e8e314579..b0482bb1e 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -30,11 +30,7 @@ from axolotl.contribs.lgpl import ( # pylint: disable = no-name-in-module fix_untrained_tokens, ) from axolotl.integrations.base import PluginManager -from axolotl.loaders import ( - ModelLoader, - load_processor, - load_tokenizer, -) +from axolotl.loaders import ModelLoader, load_processor, load_tokenizer from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import cleanup_distributed @@ -234,16 +230,15 @@ def save_trained_model( # handle QAT if cfg.qat: - from axolotl.utils.quantization import convert_qat_model_for_ptq + from axolotl.utils.quantization import convert_qat_model - LOG.info("Processing QAT model for saving...") - convert_qat_model_for_ptq( + convert_qat_model( model, quantize_embedding=cfg.qat.quantize_embedding, ) LOG.info( - "QAT modules have been converted for PTQ. Please ensure you quantize " - "your model weights with `axolotl quantize`." + "QAT usage note: please ensure you quantize your model fine-tuned using QAT by running `axolotl quantize`" + " with the same config which you used for training." ) # Handle ReLoRA early return case if cfg.relora: @@ -337,9 +332,7 @@ def save_trained_model( if hasattr(cfg, "llmcompressor") and cfg.llmcompressor: # TODO: add integration support so this can be implemented completely within the plugin - from axolotl.integrations.llm_compressor.utils import ( - save_compressed_model, - ) + from axolotl.integrations.llm_compressor.utils import save_compressed_model save_compressed_model( model=model, diff --git a/src/axolotl/utils/quantization.py b/src/axolotl/utils/quantization.py index f9a30b660..6c29a5442 100644 --- a/src/axolotl/utils/quantization.py +++ b/src/axolotl/utils/quantization.py @@ -3,30 +3,47 @@ Utilities for quantization including QAT and PTQ using torchao. 
""" import torch -from torch import nn +from packaging import version from torchao.core.config import AOBaseConfig from torchao.quantization import quantize_ from torchao.quantization.qat import ( - FakeQuantizeConfig, - FromIntXQuantizationAwareTrainingConfig, - IntXQuantizationAwareTrainingConfig, + QATConfig, ) from torchao.quantization.quant_api import ( - Int4DynamicActivationInt4WeightConfig, - Int4WeightOnlyConfig, + Float8DynamicActivationFloat8WeightConfig, + Float8DynamicActivationInt4WeightConfig, Int8DynamicActivationInt4WeightConfig, - Int8DynamicActivationInt8WeightConfig, - Int8WeightOnlyConfig, - UIntXWeightOnlyConfig, - _is_linear, ) -from axolotl.utils.schemas.enums import TorchIntDType +from axolotl.utils.schemas.enums import TorchAOQuantDType + +quantization_config_to_str = { + Int8DynamicActivationInt4WeightConfig: "int8int4", + Float8DynamicActivationFloat8WeightConfig: "fp8fp8", + Float8DynamicActivationInt4WeightConfig: "fp8int4", +} + +if version.parse(torch.__version__) >= version.parse("2.8.0"): + try: + from torchao.prototype.mx_formats import NVFP4InferenceConfig + + quantization_config_to_str[NVFP4InferenceConfig] = "nvfp4" + except: + pass + + # int4 weight config imports will fail on machines with fbgemm-gpu installed + # without a CUDA runtime available so we do this safely + try: + from torchao.quantization.quant_api import Int4WeightOnlyConfig + + quantization_config_to_str[Int4WeightOnlyConfig] = "int4" + except: + pass -def get_ptq_config( - weight_dtype: TorchIntDType, - activation_dtype: TorchIntDType | None = None, +def get_quantization_config( + weight_dtype: TorchAOQuantDType, + activation_dtype: TorchAOQuantDType | None = None, group_size: int | None = None, ) -> AOBaseConfig: """ @@ -45,44 +62,101 @@ def get_ptq_config( or if the group size is not specified for int8 or int4 weight only quantization. """ if activation_dtype is None: - if not weight_dtype.value.is_signed: # type: ignore[attr-defined,union-attr] - return UIntXWeightOnlyConfig( - dtype=weight_dtype.value, - group_size=group_size, - set_inductor_config=False, - ) - if weight_dtype == TorchIntDType.int8: - if group_size is None: - raise ValueError( - "group_size must be specified for int8 weight only quantization" - ) - return Int8WeightOnlyConfig( - group_size=group_size, - ) - if weight_dtype == TorchIntDType.int4: - if group_size is None: - raise ValueError( - "group_size must be specified for int4 weight only quantization" - ) - return Int4WeightOnlyConfig( - group_size=group_size, - ) - if activation_dtype == TorchIntDType.int4 and weight_dtype == TorchIntDType.int4: - return Int4DynamicActivationInt4WeightConfig() - if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int8: - return Int8DynamicActivationInt8WeightConfig() - if activation_dtype == TorchIntDType.int8 and weight_dtype == TorchIntDType.int4: - return Int8DynamicActivationInt4WeightConfig() + if weight_dtype == TorchAOQuantDType.int8: + raise ValueError("Int8WeightOnlyConfig is not supported by torchao QAT.") + if weight_dtype == TorchAOQuantDType.int4: + from torchao.quantization.quant_api import Int4WeightOnlyConfig + + if group_size is not None: + return Int4WeightOnlyConfig(group_size=group_size, version=2) + else: + return Int4WeightOnlyConfig(version=2) + if ( + activation_dtype == TorchAOQuantDType.int4 + and weight_dtype == TorchAOQuantDType.int4 + ): + raise ValueError( + "Int4DynamicActivationInt4WeightConfig is not supported by torchao QAT." 
+ ) + if ( + activation_dtype == TorchAOQuantDType.int8 + and weight_dtype == TorchAOQuantDType.int8 + ): + raise ValueError( + "Int8DynamicActivationInt8WeightConfig is not supported by torchao QAT." + ) + if ( + activation_dtype == TorchAOQuantDType.int8 + and weight_dtype == TorchAOQuantDType.int4 + ): + if group_size is not None: + return Int8DynamicActivationInt4WeightConfig(group_size=group_size) + else: + return Int8DynamicActivationInt4WeightConfig() + if ( + activation_dtype == TorchAOQuantDType.float8_e4m3fn + and weight_dtype == TorchAOQuantDType.float8_e4m3fn + ): + return Float8DynamicActivationFloat8WeightConfig() + if ( + activation_dtype == TorchAOQuantDType.float8_e4m3fn + and weight_dtype == TorchAOQuantDType.int4 + ): + return Float8DynamicActivationInt4WeightConfig() + if weight_dtype == TorchAOQuantDType.nvfp4: + from torchao.prototype.mx_formats import NVFP4InferenceConfig + + if group_size is not None and group_size != 16: + raise ValueError("NVFP4 quantization must use a group_size of 16") + return NVFP4InferenceConfig() raise ValueError( f"Invalid activation/weight dtype combination: {activation_dtype}/{weight_dtype}" ) +def quantize_model( + model, + weight_dtype: TorchAOQuantDType, + group_size: int | None = None, + activation_dtype: TorchAOQuantDType | None = None, + quantize_embedding: bool | None = None, +): + """ + This function is used to quantize a model. + + Args: + model: The model to quantize. + weight_dtype: The dtype to use for weight quantization. + group_size: The group size to use for weight quantization. + activation_dtype: The dtype to use for activation quantization. + quantize_embedding: Whether to quantize the model's embedding weights. + + """ + linear_ptq_config = get_quantization_config( + weight_dtype=weight_dtype, + activation_dtype=activation_dtype, + group_size=group_size, + ) + quantize_(model, linear_ptq_config) + if quantize_embedding: + # activation fake quantization is not supported for embedding layers + embedding_quantize_config = get_quantization_config( + weight_dtype=weight_dtype, + activation_dtype=None, + group_size=group_size, + ) + quantize_( + model, + embedding_quantize_config, + filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding), + ) + + def prepare_model_for_qat( model, - weight_dtype: TorchIntDType, - group_size: int, - activation_dtype: TorchIntDType | None = None, + weight_dtype: TorchAOQuantDType, + group_size: int | None = None, + activation_dtype: TorchAOQuantDType | None = None, quantize_embedding: bool = False, ): """ @@ -100,86 +174,40 @@ def prepare_model_for_qat( Raises: ValueError: If the activation/weight dtype combination is invalid. 
""" - if activation_dtype: - activation_config = FakeQuantizeConfig( - dtype=activation_dtype.value, granularity="per_token", is_symmetric=False - ) - weight_config = FakeQuantizeConfig(dtype=weight_dtype.value, group_size=group_size) - linear_quantize_config = IntXQuantizationAwareTrainingConfig( - activation_config=None if activation_dtype is None else activation_config, - weight_config=weight_config, - ) - quantize_(model, linear_quantize_config) - if quantize_embedding: - # activation fake quantization is not supported for embedding layers - embedding_quantize_config = IntXQuantizationAwareTrainingConfig( - activation_config=None, - weight_config=weight_config, - ) - quantize_( - model, - embedding_quantize_config, - filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding), - ) - - -def quantize_model_for_ptq( - model, - weight_dtype: TorchIntDType, - group_size: int | None = None, - activation_dtype: TorchIntDType | None = None, - quantize_embedding: bool | None = None, -): - """ - This function is used to quantize a model for post-training quantization. - It swaps the model's linear layers with fake quantized linear layers. - If `quantize_embedding` is True, it will also swap the model's embedding weights with fake quantized embedding weights. - - Args: - model: The model to quantize. - weight_dtype: The dtype to use for weight quantization. - group_size: The group size to use for weight quantization. - activation_dtype: The dtype to use for activation quantization. - quantize_embedding: Whether to quantize the model's embedding weights. - - """ - linear_ptq_config = get_ptq_config( + base_config = get_quantization_config( weight_dtype=weight_dtype, activation_dtype=activation_dtype, group_size=group_size, ) - quantize_(model, linear_ptq_config) + qat_config = QATConfig(base_config) + quantize_(model, qat_config) if quantize_embedding: - embedding_quantize_config = get_ptq_config( + # activation fake quantization is not supported for embedding layers + embedding_base_config = get_quantization_config( weight_dtype=weight_dtype, activation_dtype=None, group_size=group_size, ) + embedding_qat_config = QATConfig(embedding_base_config) quantize_( model, - embedding_quantize_config, + embedding_qat_config, filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding), ) -def convert_qat_model_for_ptq( +def convert_qat_model( model, - *, - quantize_embedding: bool | None = None, + quantize_embedding: bool = False, ): """ - This function is used to convert a swap fake-quantized modules in a model - which has been trained with QAT back to the original modules, ready for PTQ. - - Args: - model: The model to convert. - quantize_embedding: Whether to quantize the model's embedding weights. + This function converts a QAT model which has fake quantized layers back to the original model. 
""" + config = QATConfig(step="convert") + quantize_(model, config) if quantize_embedding: - - def filter_fn(m, _): - return isinstance(m, nn.Embedding) or _is_linear(m) - - else: - filter_fn = _is_linear - quantize_(model, FromIntXQuantizationAwareTrainingConfig(), filter_fn=filter_fn) + quantize_( + model, + config, + filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding), + ) diff --git a/src/axolotl/utils/schemas/enums.py b/src/axolotl/utils/schemas/enums.py index 8f4718aa9..bcd03e1a2 100644 --- a/src/axolotl/utils/schemas/enums.py +++ b/src/axolotl/utils/schemas/enums.py @@ -5,18 +5,21 @@ from enum import Enum import torch -class TorchIntDType(Enum): - """Torch integer data types - `getattr` guards against torch < 2.6 which does not support int4""" +class TorchAOQuantDType(Enum): + int4 = torch.int4 + int8 = torch.int8 + float8_e4m3fn = torch.float8_e4m3fn + nvfp4 = "nvfp4" - uint1 = getattr(torch, "uint1", None) - uint2 = getattr(torch, "uint2", None) - uint3 = getattr(torch, "uint3", None) - uint4 = getattr(torch, "uint4", None) - uint5 = getattr(torch, "uint5", None) - uint6 = getattr(torch, "uint6", None) - uint7 = getattr(torch, "uint7", None) - int4 = getattr(torch, "int4", None) - int8 = getattr(torch, "int8", None) + def from_string(str): + if str == "int4": + return TorchAOQuantDType.int4 + if str == "int8": + return TorchAOQuantDType.int8 + if str in ["float8_e4m3fn", "fp8", "float8"]: + return TorchAOQuantDType.float8_e4m3fn + if str == "nvfp4": + return TorchAOQuantDType.nvfp4 class RLType(str, Enum): diff --git a/src/axolotl/utils/schemas/quantization.py b/src/axolotl/utils/schemas/quantization.py index 090640c7b..a7c130574 100644 --- a/src/axolotl/utils/schemas/quantization.py +++ b/src/axolotl/utils/schemas/quantization.py @@ -6,7 +6,23 @@ from typing import Any from pydantic import BaseModel, Field, field_validator -from axolotl.utils.schemas.enums import TorchIntDType +from axolotl.utils.schemas.enums import TorchAOQuantDType + + +def validate_ao_dtype(v: Any) -> TorchAOQuantDType | None: + if v is None: + return None + if v == "int4": + return TorchAOQuantDType.int4 + if v == "int8": + return TorchAOQuantDType.int8 + if v in ["float8_e4m3fn", "fp8", "float8"]: + return TorchAOQuantDType.float8_e4m3fn + if v == "nvfp4": + return TorchAOQuantDType.nvfp4 + raise ValueError( + f"Invalid dtype: '{v}'. Must be one of: {[e.name for e in TorchAOQuantDType] + ['fp8', 'float8']}" + ) class QATConfig(BaseModel): @@ -14,13 +30,13 @@ class QATConfig(BaseModel): QAT Config Schema """ - activation_dtype: TorchIntDType | None = Field( + activation_dtype: TorchAOQuantDType | None = Field( default=None, - description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"', + description="Fake quantization layout to use for activation quantization.", ) - weight_dtype: TorchIntDType = Field( - default=TorchIntDType.int8, - description='Fake quantization layout to use for weight quantization. 
Valid options are "int4" and "int8"', + weight_dtype: TorchAOQuantDType = Field( + default=TorchAOQuantDType.int8, + description="Fake quantization layout to use for weight quantization.", ) quantize_embedding: bool | None = Field( default=False, description="Quantize embedding" @@ -35,12 +51,8 @@ class QATConfig(BaseModel): @field_validator("activation_dtype", "weight_dtype", mode="before") @classmethod - def validate_dtype(cls, v: Any) -> TorchIntDType | None: - if v == "int4": - return TorchIntDType.int4 - if v == "int8": - return TorchIntDType.int8 - raise ValueError(f"Invalid dtype: '{v}'. Must be one of: ['int4', 'int8']") + def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None: + return validate_ao_dtype(v) class PTQConfig(BaseModel): @@ -48,13 +60,13 @@ class PTQConfig(BaseModel): PTQ Config Schema """ - weight_dtype: TorchIntDType = Field( - default=TorchIntDType.int8, - description="Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8", + weight_dtype: TorchAOQuantDType = Field( + default=TorchAOQuantDType.int8, + description="Fake quantization layout to use for weight quantization.", ) - activation_dtype: TorchIntDType | None = Field( + activation_dtype: TorchAOQuantDType | None = Field( default=None, - description='Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8"', + description="Fake quantization layout to use for activation quantization.", ) quantize_embedding: bool | None = Field( default=None, description="Whether to quantize the embedding layer." @@ -66,9 +78,5 @@ class PTQConfig(BaseModel): @field_validator("activation_dtype", "weight_dtype", mode="before") @classmethod - def validate_dtype(cls, v: Any) -> TorchIntDType | None: - if v == "int4": - return TorchIntDType.int4 - if v == "int8": - return TorchIntDType.int8 - raise ValueError(f"Invalid dtype: '{v}'. 
Must be one of: ['int4', 'int8']") + def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None: + return validate_ao_dtype(v) diff --git a/tests/e2e/test_qat.py b/tests/e2e/test_qat.py index 7d41dfb50..2f8398ef7 100644 --- a/tests/e2e/test_qat.py +++ b/tests/e2e/test_qat.py @@ -43,7 +43,7 @@ class TestQATLlama: "qat": { "quantize_embedding": True, "activation_dtype": "int8", - "weight_dtype": "int8", + "weight_dtype": "int4", "group_size": 8, }, "num_epochs": 1, @@ -111,7 +111,7 @@ class TestQATLlama: "qat": { "quantize_embedding": True, "activation_dtype": "int8", - "weight_dtype": "int8", + "weight_dtype": "int4", "group_size": 8, }, "save_first_step": False, diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py index cfbdfec38..b64aef51a 100644 --- a/tests/e2e/test_quantization.py +++ b/tests/e2e/test_quantization.py @@ -5,41 +5,40 @@ Tests for axolotl.utils.quantization import pytest import torch from torch import nn -from torchao.dtypes.affine_quantized_tensor import AffineQuantizedTensor -from torchao.quantization.granularity import PerAxis, PerGroup -from torchao.quantization.linear_activation_quantized_tensor import ( - LinearActivationQuantizedTensor, -) +from torchao.quantization import LinearActivationQuantizedTensor from torchao.quantization.qat.embedding import FakeQuantizedEmbedding from torchao.quantization.qat.linear import FakeQuantizedLinear from torchao.quantization.quant_api import ( - Int4DynamicActivationInt4WeightConfig, - Int4WeightOnlyConfig, - Int8DynamicActivationInt8WeightConfig, - Int8WeightOnlyConfig, - UIntXWeightOnlyConfig, + Float8DynamicActivationFloat8WeightConfig, + Float8DynamicActivationInt4WeightConfig, + Int8DynamicActivationInt4WeightConfig, ) +from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor from transformers import AutoModelForCausalLM from transformers.trainer_callback import TrainerState from axolotl.utils.callbacks.qat import QATCallback from axolotl.utils.quantization import ( - convert_qat_model_for_ptq, - get_ptq_config, + convert_qat_model, + get_quantization_config, prepare_model_for_qat, - quantize_model_for_ptq, + quantize_model, ) -from axolotl.utils.schemas.enums import TorchIntDType +from axolotl.utils.schemas.enums import TorchAOQuantDType from axolotl.utils.schemas.quantization import QATConfig -from tests.e2e.utils import require_torch_2_6_0 +from tests.e2e.utils import ( + require_torch_2_8_0, + requires_cuda_ge_8_9, + requires_sm_ge_100, +) @pytest.fixture() def model(): dummy_model = AutoModelForCausalLM.from_pretrained( - "HuggingFaceTB/SmolLM2-135M", - device_map="cuda", + "Qwen/Qwen2-0.5B", + device_map="auto", torch_dtype=torch.bfloat16, ) with torch.device(dummy_model.device): @@ -48,45 +47,56 @@ def model(): dummy_model.model.embed_tokens.weight.shape[1], dtype=dummy_model.model.embed_tokens.weight.dtype, ) - return dummy_model + yield dummy_model + del dummy_model ptq_config_test_cases = [ - # weight_dtype, activation_dtype, group_size, expected_type, expected_params + # weight_dtype, activation_dtype, group_size, expected_type ( - TorchIntDType.uint4, + TorchAOQuantDType.int4, + TorchAOQuantDType.int8, None, - None, - UIntXWeightOnlyConfig, - {"dtype": torch.uint4, "group_size": None}, - ), - (TorchIntDType.int8, None, 32, Int8WeightOnlyConfig, {"group_size": 32}), - (TorchIntDType.int4, None, 4, Int4WeightOnlyConfig, {"group_size": 4}), - ( - TorchIntDType.int4, - TorchIntDType.int4, - None, - Int4DynamicActivationInt4WeightConfig, - {}, + 
Int8DynamicActivationInt4WeightConfig, ), ( - TorchIntDType.int8, - TorchIntDType.int8, + TorchAOQuantDType.float8_e4m3fn, + TorchAOQuantDType.float8_e4m3fn, None, - Int8DynamicActivationInt8WeightConfig, - {}, + Float8DynamicActivationFloat8WeightConfig, + ), + ( + TorchAOQuantDType.int4, + TorchAOQuantDType.float8_e4m3fn, + None, + Float8DynamicActivationInt4WeightConfig, ), ] ptq_test_cases = [ - # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception - (TorchIntDType.int8, None, 8, False, None), - (TorchIntDType.int4, None, 4, True, None), - (TorchIntDType.uint4, None, 8, False, None), - (TorchIntDType.int4, TorchIntDType.int4, 8, False, None), - (TorchIntDType.int8, TorchIntDType.int8, 8, True, None), - (TorchIntDType.int8, None, None, False, ValueError), - (TorchIntDType.int4, None, None, False, ValueError), + # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception, expected_tensor_class + (TorchAOQuantDType.int4, None, 4, True, None, Int4Tensor), + ( + TorchAOQuantDType.int4, + TorchAOQuantDType.int8, + 8, + False, + None, + LinearActivationQuantizedTensor, + ), + # ( + # TorchAOQuantDType.int4, + # TorchAOQuantDType.float8_e4m3fn, + # None, + # False, + # None, + # Int4Tensor, + # ), + (TorchAOQuantDType.int4, None, None, False, None, Int4Tensor), + # Deprecated configs + (TorchAOQuantDType.int8, None, 8, False, ValueError, None), + (TorchAOQuantDType.int4, TorchAOQuantDType.int4, 8, False, ValueError, None), + (TorchAOQuantDType.int8, TorchAOQuantDType.int8, 8, True, ValueError, None), ] @@ -96,44 +106,132 @@ class TestQuantization: """ @pytest.mark.parametrize( - "weight_dtype,activation_dtype,group_size,expected_type,expected_params", + "weight_dtype,activation_dtype,group_size,expected_type", ptq_config_test_cases, ) - @require_torch_2_6_0 + @requires_cuda_ge_8_9 + @require_torch_2_8_0 def test_get_ptq_config( - self, weight_dtype, activation_dtype, group_size, expected_type, expected_params + self, weight_dtype, activation_dtype, group_size, expected_type ): - config = get_ptq_config(weight_dtype, activation_dtype, group_size) - + config = get_quantization_config(weight_dtype, activation_dtype, group_size) assert isinstance(config, expected_type) - for param_name, param_value in expected_params.items(): - if isinstance(param_value, (PerAxis, PerGroup)): - if isinstance(param_value, PerAxis): - assert isinstance(getattr(config, param_name), PerAxis) - assert getattr(config, param_name).axis == param_value.axis - else: - assert isinstance(getattr(config, param_name), PerGroup) - assert ( - getattr(config, param_name).group_size == param_value.group_size - ) - else: - assert getattr(config, param_name) == param_value + @requires_cuda_ge_8_9 + @require_torch_2_8_0 + def test_get_ptq_config_int4_weight_only(self): + from torchao.quantization.quant_api import Int4WeightOnlyConfig + + config = get_quantization_config(TorchAOQuantDType.int4, None, 4) + assert isinstance(config, Int4WeightOnlyConfig) @pytest.mark.parametrize( - "weight_dtype", [TorchIntDType.int8, TorchIntDType.int4, TorchIntDType.uint4] + "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception,expected_tensor_class", + ptq_test_cases, ) + @requires_cuda_ge_8_9 + @require_torch_2_8_0 + def test_quantize_model_for_ptq( + self, + model, + weight_dtype, + activation_dtype, + group_size, + quantize_embedding, + expected_exception, + expected_tensor_class, + ): + if expected_exception: + with pytest.raises(expected_exception): + 
quantize_model( + model, + weight_dtype, + group_size, + activation_dtype, + quantize_embedding, + ) + else: + quantize_model( + model, weight_dtype, group_size, activation_dtype, quantize_embedding + ) + if quantize_embedding: + assert isinstance( + model.model.embed_tokens.weight, expected_tensor_class + ), "Embedding weight should be quantized" + for child in list(model.children()): + if isinstance(child, torch.nn.Linear): + assert isinstance(child.weight, expected_tensor_class) + + @require_torch_2_8_0 + @requires_sm_ge_100 + def test_quantize_model_for_ptq_fp8( + self, + model, + ): + from torchao.quantization.quantize_.workflows.float8.float8_tensor import ( + Float8Tensor, + QuantizeTensorToFloat8Kwargs, + ) + + quantize_model( + model, + TorchAOQuantDType.float8_e4m3fn, + None, + TorchAOQuantDType.float8_e4m3fn, + ) + for child in list(model.children()): + if isinstance(child, torch.nn.Linear): + assert isinstance(child.weight, Float8Tensor) + assert child.weight.act_quant_kwargs is not None and isinstance( + child.weight.act_quant_kwargs, QuantizeTensorToFloat8Kwargs + ) + + @require_torch_2_8_0 + @requires_sm_ge_100 + def test_quantize_model_for_ptq_nvfp4( + self, + model, + ): + from torchao.prototype.mx_formats.nvfp4_tensor import ( + NVFP4Tensor, + QuantizeTensorToNVFP4Kwargs, + ) + + quantize_model(model, TorchAOQuantDType.nvfp4, 16, TorchAOQuantDType.nvfp4) + for child in list(model.children()): + if isinstance(child, torch.nn.Linear): + assert isinstance(child.weight, NVFP4Tensor) + assert child.weight.act_quant_kwargs is not None and isinstance( + child.weight.act_quant_kwargs, QuantizeTensorToNVFP4Kwargs + ) + @pytest.mark.parametrize( - "activation_dtype", [None, TorchIntDType.int4, TorchIntDType.int8] + "weight_dtype,activation_dtype,group_size,quantize_embedding", + [ + (TorchAOQuantDType.int4, None, 8, False), + (TorchAOQuantDType.int4, None, 16, True), + (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 8, False), + (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 16, True), + ( + TorchAOQuantDType.float8_e4m3fn, + TorchAOQuantDType.float8_e4m3fn, + None, + False, + ), + (TorchAOQuantDType.int4, TorchAOQuantDType.float8_e4m3fn, None, True), + ], ) - @pytest.mark.parametrize("group_size", [4, 8]) - @pytest.mark.parametrize("quantize_embedding", [False, True]) - @require_torch_2_6_0 + @require_torch_2_8_0 + @requires_cuda_ge_8_9 def test_prepare_model_for_qat( self, model, weight_dtype, activation_dtype, group_size, quantize_embedding ): prepare_model_for_qat( - model, weight_dtype, group_size, activation_dtype, quantize_embedding + model, + weight_dtype, + group_size, + activation_dtype, + quantize_embedding, ) if quantize_embedding: assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) @@ -142,17 +240,19 @@ class TestQuantization: model.model.embed_tokens.weight_fake_quantizer.config.dtype == weight_dtype.value ) - assert ( - model.model.embed_tokens.weight_fake_quantizer.config.group_size - == group_size - ) + if group_size: + assert ( + model.model.embed_tokens.weight_fake_quantizer.config.group_size + == group_size + ) for child in list(model.children()): if isinstance(child, torch.nn.Linear): assert isinstance(child, FakeQuantizedLinear) assert hasattr(child, "weight_fake_quantizer") assert child.weight_fake_quantizer.config.dtype == weight_dtype.value - assert child.weight_fake_quantizer.config.group_size == group_size + if group_size: + assert child.weight_fake_quantizer.config.group_size == group_size if activation_dtype: assert hasattr(child, 
"activation_fake_quantizer") assert ( @@ -162,49 +262,40 @@ class TestQuantization: else: assert child.activation_fake_quantizer is None - @pytest.mark.parametrize( - "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception", - ptq_test_cases, - ) - @require_torch_2_6_0 - def test_quantize_model_for_ptq( - self, - model, - weight_dtype, - activation_dtype, - group_size, - quantize_embedding, - expected_exception, - ): - if expected_exception: - with pytest.raises(expected_exception): - quantize_model_for_ptq( - model, - weight_dtype, - group_size, - activation_dtype, - quantize_embedding, - ) - else: - quantize_model_for_ptq( - model, weight_dtype, group_size, activation_dtype, quantize_embedding - ) - if quantize_embedding: - assert isinstance( - model.model.embed_tokens.weight, AffineQuantizedTensor - ), "Embedding weight should be quantized" - for child in list(model.children()): - if isinstance(child, torch.nn.Linear): - if activation_dtype: - assert isinstance( - child.weight, LinearActivationQuantizedTensor - ), ( - "Linear weight should be quantized with activation quantization" - ) - else: - assert isinstance(child.weight, AffineQuantizedTensor), ( - "Linear weight should be quantized without activation quantization" - ) + @require_torch_2_8_0 + @requires_cuda_ge_8_9 + def test_convert_qat_model(self, model): + config = QATConfig( + weight_dtype="int4", + activation_dtype="int8", + group_size=8, + quantize_embedding=True, + ) + + # quantize model for qat + prepare_model_for_qat( + model, + config.weight_dtype, + config.group_size, + config.activation_dtype, + config.quantize_embedding, + ) + + assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert isinstance(model.lm_head, FakeQuantizedLinear) + + # apply conversion + convert_qat_model( + model, + config.quantize_embedding, + ) + # ensure modules have been swapped out + assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding) + assert not isinstance(model.lm_head, FakeQuantizedLinear) + + # ensure weights have been quantized + assert isinstance(model.model.embed_tokens.weight, nn.Parameter) + assert isinstance(model.lm_head.weight, nn.Parameter) class TestQuantizationCallback: @@ -218,10 +309,10 @@ class TestQuantizationCallback: global_step=0, ) - @require_torch_2_6_0 + @require_torch_2_8_0 def test_qat_callback_fake_quant_after_n_steps(self, model, trainer_state): cfg = QATConfig( - weight_dtype="int8", + weight_dtype="int4", activation_dtype="int8", group_size=8, quantize_embedding=True, @@ -268,10 +359,10 @@ class TestQuantizationCallback: assert model.model.embed_tokens.weight_fake_quantizer.enabled assert model.lm_head.weight_fake_quantizer.enabled - @require_torch_2_6_0 + @require_torch_2_8_0 def test_qat_callback_fake_quant_after_n_steps_is_none(self, model, trainer_state): cfg = QATConfig( - weight_dtype="int8", + weight_dtype="int4", activation_dtype="int8", group_size=8, quantize_embedding=True, @@ -304,43 +395,3 @@ class TestQuantizationCallback: # quantization should be enabled from the get-go assert model.model.embed_tokens.weight_fake_quantizer.enabled assert model.lm_head.weight_fake_quantizer.enabled - - -class TestConvertQATModelForPTQ: - """ - Test convert_qat_model_for_ptq - """ - - @require_torch_2_6_0 - def test_convert_qat_model_for_ptq(self, model): - config = QATConfig( - weight_dtype="int8", - activation_dtype="int8", - group_size=8, - quantize_embedding=True, - ) - - # quantize model for qat - prepare_model_for_qat( - model, - 
config.weight_dtype,
-            config.group_size,
-            config.activation_dtype,
-            config.quantize_embedding,
-        )
-
-        assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
-        assert isinstance(model.lm_head, FakeQuantizedLinear)
-
-        # apply conversion
-        convert_qat_model_for_ptq(
-            model,
-            quantize_embedding=config.quantize_embedding,
-        )
-        # ensure modules have been swapped out
-        assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
-        assert not isinstance(model.lm_head, FakeQuantizedLinear)
-
-        # ensure weights have been quantized
-        assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
-        assert isinstance(model.lm_head.weight, nn.Parameter)
diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py
index 7db6cf74e..a2dd8bc5e 100644
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -90,6 +90,18 @@ def require_torch_2_7_0(test_case):
     return unittest.skipUnless(is_min_2_7_0(), "test requires torch>=2.7.0")(test_case)
 
 
+def require_torch_2_8_0(test_case):
+    """
+    Decorator marking a test that requires torch >= 2.8.0
+    """
+
+    def is_min_2_8_0():
+        torch_version = version.parse(torch.__version__)
+        return torch_version >= version.parse("2.8.0")
+
+    return unittest.skipUnless(is_min_2_8_0(), "test requires torch>=2.8.0")(test_case)
+
+
 def require_torch_lt_2_6_0(test_case):
     """
     Decorator marking a test that requires torch < 2.6.0
@@ -128,6 +140,24 @@ def require_llmcompressor(test_case):
     )(test_case)
 
 
+def requires_sm_ge_100(test_case):
+    is_sm_ge_100 = (
+        torch.cuda.is_available()
+        and torch.version.cuda
+        and torch.cuda.get_device_capability() >= (10, 0)
+    )
+    return unittest.skipUnless(is_sm_ge_100, "test requires sm>=100")(test_case)
+
+
+def requires_cuda_ge_8_9(test_case):
+    is_cuda_ge_8_9 = (
+        torch.cuda.is_available()
+        and torch.version.cuda
+        and torch.cuda.get_device_capability() >= (8, 9)
+    )
+    return unittest.skipUnless(is_cuda_ge_8_9, "test requires cuda>=8.9")(test_case)
+
+
 def is_hopper():
     compute_capability = torch.cuda.get_device_capability()
     return compute_capability == (9, 0)

From 1ef6c196f7d1cffb2010accd4f0ef716ddab405a Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 16 Sep 2025 14:52:29 -0400
Subject: [PATCH 050/115] setup env vars for ray train for FSDP (#3130) [skip
 ci]

---
 src/axolotl/cli/train.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/axolotl/cli/train.py b/src/axolotl/cli/train.py
index 5e766de37..8d33c0b84 100644
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -17,6 +17,7 @@ from axolotl.integrations.base import PluginManager
 from axolotl.train import train
 from axolotl.utils.config import normalize_config, resolve_dtype
 from axolotl.utils.dict import DictDefault
+from axolotl.utils.trainer import prepare_optim_env
 
 
 def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
@@ -92,6 +93,7 @@ def ray_train_func(kwargs: dict):
     # cast `cfg` back to DictDefault (ray tune deepcopy has issues with DictDefault so needed it to be dict)
     # also renormalize the config now that TorchTrainer has spawned distributed workers
     cfg = DictDefault(kwargs["cfg"])
+    prepare_optim_env(cfg)
     normalize_config(cfg)
 
     # now that we are on the worker node, we can check `is_torch_bf16_gpu_available` to resolve dtype

From d4cff1b7bbd43d546d95b31943cf2810e30efe8f Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Tue, 16 Sep 2025 14:52:45 -0400
Subject: [PATCH 051/115] improve setting of NCCL_P2P_DISABLE on runpod (#3132)
 [skip ci]

* improve setting of NCCL_P2P_DISABLE on runpod

* use recs from review

---
 src/axolotl/utils/environment.py 
| 37 ++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py index 192aca4e1..7b2348413 100644 --- a/src/axolotl/utils/environment.py +++ b/src/axolotl/utils/environment.py @@ -2,6 +2,8 @@ utils to get GPU info for the current environment """ +import os +import subprocess # nosec B404 from importlib.metadata import version from accelerate.utils.environment import ( @@ -14,6 +16,8 @@ from packaging.version import Version, parse def check_cuda_p2p_ib_support(): if not accelerate_check_cuda_p2p_ib_support(): return False + if not check_runpod_p2p_support(): + return False unsupported_devices = {"RTX 6000 Ada", "L40S"} try: device_names, device_count = get_gpu_info() @@ -29,6 +33,39 @@ def check_cuda_p2p_ib_support(): return True +def check_runpod_p2p_support() -> bool: + if "RUNPOD_GPU_COUNT" not in os.environ: + return True + try: + gpu_count = int(os.environ.get("RUNPOD_GPU_COUNT", "1")) + except ValueError: + return True + if gpu_count >= 2: + # run `nvidia-smi topo -p2p n` and inspect the GPU0 row + try: + result = subprocess.run( # nosec B603 B607 + ["nvidia-smi", "topo", "-p2p", "n"], + check=True, + capture_output=True, + text=True, + timeout=5, + ) + except ( + subprocess.CalledProcessError, + FileNotFoundError, + subprocess.TimeoutExpired, + ): + return True # fail-open if detection fails + output_lines = result.stdout.strip().split("\n") + # filter rows that start with "GPU0" (avoid header row) + gpu0_rows = [line for line in output_lines if line.lstrip().startswith("GPU0")] + if not gpu0_rows: + return True + # consider P2P supported if any OK is present in the GPU0 row + return "OK" in gpu0_rows[-1] + return True + + def get_package_version(package: str) -> Version: version_str = version(package) return parse(version_str) From 86d6ee7c0551393dd537a2c1c5e5c6362e1b3e41 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 16 Sep 2025 14:53:01 -0400 Subject: [PATCH 052/115] upgrade trl and accelerate (#3161) * upgrade trl==0.23.0 * upgrade accelerate patch fix * add hints when using gradient_checkpointing with DPO * set gradient-checpointing properly --- requirements.txt | 4 ++-- src/axolotl/core/builders/base.py | 2 +- src/axolotl/utils/schemas/validation.py | 15 +++++++++++++++ tests/e2e/multigpu/test_llama.py | 4 ++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6138707af..44a3c0277 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,10 +15,10 @@ huggingface_hub>=0.33.0 peft>=0.17.0 transformers==4.56.1 tokenizers>=0.21.1 -accelerate==1.10.0 +accelerate==1.10.1 datasets==4.0.0 deepspeed>=0.17.0 -trl==0.21.0 +trl==0.23.0 hf_xet==1.1.5 kernels==0.9.0 trackio diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py index 1ec818004..3ad8012f9 100644 --- a/src/axolotl/core/builders/base.py +++ b/src/axolotl/core/builders/base.py @@ -435,7 +435,7 @@ class TrainerBuilderBase(abc.ABC): # don't use the HF gradient checkpointing, manually wrap training_args_kwargs["gradient_checkpointing"] = False training_args_kwargs["activation_offloading"] = True - elif self.cfg.gradient_checkpointing: + elif self.cfg.gradient_checkpointing is not None: training_args_kwargs["gradient_checkpointing"] = ( self.cfg.gradient_checkpointing ) diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 64018ca48..9671b10ae 100644 --- a/src/axolotl/utils/schemas/validation.py +++ 
b/src/axolotl/utils/schemas/validation.py @@ -1378,6 +1378,21 @@ class ComplexValidationMixin: return self + def hint_gradient_checkpointing_dpo_lora_ddp(self): + if ( + (self.gradient_checkpointing is True or self.gradient_checkpointing is None) + and self.capabilities + and self.capabilities.get("n_gpu", 1) > 1 + and self.adapter in ("lora", "qlora") + and self.rl == RLType.DPO + and not self.fsdp + and not self.deepspeed + ): + LOG.warning( + "gradient_checkpointing with DPO + DDP + LoRA is not recommended." + ) + return self + class DistributedValidationMixin: """validation for distributed training.""" diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index ad15d628b..c16ef0c60 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -199,7 +199,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, - # "gradient_checkpointing": True, + "gradient_checkpointing": False, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, @@ -278,7 +278,7 @@ class TestMultiGPULlama: "max_steps": 2, "micro_batch_size": 2, "gradient_accumulation_steps": 2, - # "gradient_checkpointing": True, + "gradient_checkpointing": False, "output_dir": temp_dir, "dataset_prepared_path": temp_dir + "/last_run_prepared", "warmup_steps": 0, From e5c427f6dee386ad2f1fa4cbe855e6de637b5662 Mon Sep 17 00:00:00 2001 From: salman Date: Wed, 17 Sep 2025 10:38:15 +0100 Subject: [PATCH 053/115] qat doc updates (#3162) [skip-ci] --- docs/qat.qmd | 11 +++++++++-- docs/quantize.qmd | 7 +++---- .../{3b-qat-fsdp2-nvfp4.yaml => 3b-qat-nvfp4.yaml} | 0 3 files changed, 12 insertions(+), 6 deletions(-) rename examples/llama-3/{3b-qat-fsdp2-nvfp4.yaml => 3b-qat-nvfp4.yaml} (100%) diff --git a/docs/qat.qmd b/docs/qat.qmd index e0d000a79..ad9779066 100644 --- a/docs/qat.qmd +++ b/docs/qat.qmd @@ -23,10 +23,17 @@ To enable QAT in axolotl, add the following to your configuration file: ```yaml qat: - activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8" - weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4" and "int8" + activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8" + weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4". group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after ``` +We support the following quantization schemas: +- `Int4WeightOnly` (requires the `fbgemm-gpu` extra when installing Axolotl) +- `Int8DynamicActivationInt4Weight` +- `Float8DynamicActivationFloat8Weight` +- `Float8DynamicActivationInt4Weight` +- `NVFP4` + Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this. 
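+
+For example, a minimal sketch of that post-training step, assuming the QAT settings above live in a config file named `qat.yml` whose `output_dir` points at the finished checkpoint:
+
+```bash
+# re-use the exact QAT settings from training for the final quantization pass
+axolotl quantize qat.yml
+```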
diff --git a/docs/quantize.qmd b/docs/quantize.qmd index 43c817a5b..9c3de1ef1 100644 --- a/docs/quantize.qmd +++ b/docs/quantize.qmd @@ -22,8 +22,8 @@ Quantization is configured using the `quantization` key in your configuration fi ```yaml base_model: # The path to the model to quantize. quantization: - weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are uintX for X in [1, 2, 3, 4, 5, 6, 7], or int4, or int8 - activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4" and "int8" + activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8" + weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4". group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer. @@ -39,9 +39,8 @@ you used to train the model: # qat.yml qat: activation_dtype: int8 - weight_dtype: int8 + weight_dtype: int4 group_size: 256 - quantize_embedding: true output_dir: # The path to the output directory used during training where the final checkpoint has been saved. ``` diff --git a/examples/llama-3/3b-qat-fsdp2-nvfp4.yaml b/examples/llama-3/3b-qat-nvfp4.yaml similarity index 100% rename from examples/llama-3/3b-qat-fsdp2-nvfp4.yaml rename to examples/llama-3/3b-qat-nvfp4.yaml From 4065bc14c616e12c4da037c01de8a0defd9e7c10 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Wed, 17 Sep 2025 13:27:03 -0400 Subject: [PATCH 054/115] Debug log, logging improvements (#3159) * simplify logging * remove comment * progress on debug.log * add debug-level logger for file log * simplify * case insensitivity; 3rd party logging improvements * simplify * fix * tests * lint * nits * nit * Update tests/test_utils_tee.py Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> * cleanup / comments * fix * oops --------- Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> --- .gitignore | 3 + .pre-commit-config.yaml | 2 +- .../colab-axolotl-example.ipynb | 6 +- pyproject.toml | 2 +- src/axolotl/cli/__init__.py | 4 +- src/axolotl/cli/config.py | 9 +- src/axolotl/cli/inference.py | 2 - src/axolotl/cli/main.py | 4 +- src/axolotl/cli/train.py | 1 - src/axolotl/cli/utils/diffusion.py | 1 - src/axolotl/logging_config.py | 75 +++++--- .../transformers/trainer_loss_calc.py | 6 +- src/axolotl/train.py | 3 +- src/axolotl/utils/__init__.py | 9 - src/axolotl/utils/logging.py | 7 +- src/axolotl/utils/tee.py | 166 ++++++++++++++++++ src/axolotl/utils/train.py | 4 +- src/axolotl/utils/trainer.py | 9 - tests/test_logging_config_file_capture.py | 103 +++++++++++ tests/test_utils_tee.py | 107 +++++++++++ 20 files changed, 454 insertions(+), 69 deletions(-) create mode 100644 src/axolotl/utils/tee.py create mode 100644 tests/test_logging_config_file_capture.py create mode 100644 tests/test_utils_tee.py diff --git a/.gitignore b/.gitignore index 40084b408..b75becc7c 100644 --- a/.gitignore +++ b/.gitignore @@ -190,3 +190,6 @@ out/ # vim *.swp + +# scm auto-versioning +src/axolotl/_version.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 9c80898ff..92ddc7f41 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -14,7 +14,7 @@ 
repos: rev: v0.12.12 hooks: - id: ruff - args: [--fix, --select, I] + args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy rev: v1.17.1 diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index 0e6ba984e..774b78b82 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -251,10 +251,10 @@ }, "outputs": [], "source": [ - "from axolotl.utils import patch_optimized_env\n", + "from axolotl.utils import set_pytorch_cuda_alloc_conf\n", "\n", - "# speedup downloads from HF 🤗 and set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n", - "patch_optimized_env()" + "# Set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n", + "set_pytorch_cuda_alloc_conf()" ] }, { diff --git a/pyproject.toml b/pyproject.toml index 932219d9e..4213bc963 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -32,7 +32,7 @@ line-length = 88 target-version = "py310" [tool.ruff.lint] -select = ["E", "F", "W", "C90", "B"] +select = ["E", "F", "W", "C90", "B", "I"] ignore = [ "E203", # Whitespace before ':' "E501", # Line too long diff --git a/src/axolotl/cli/__init__.py b/src/axolotl/cli/__init__.py index 8955eca3e..fa647be65 100644 --- a/src/axolotl/cli/__init__.py +++ b/src/axolotl/cli/__init__.py @@ -4,5 +4,7 @@ import os from axolotl.logging_config import configure_logging -os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") +os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1") + configure_logging() diff --git a/src/axolotl/cli/config.py b/src/axolotl/cli/config.py index 20e341a0b..93ac6147d 100644 --- a/src/axolotl/cli/config.py +++ b/src/axolotl/cli/config.py @@ -23,7 +23,8 @@ from axolotl.utils.config import ( from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger from axolotl.utils.mlflow_ import setup_mlflow_env_vars -from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env +from axolotl.utils.tee import prepare_debug_log +from axolotl.utils.trainer import prepare_optim_env from axolotl.utils.wandb_ import setup_wandb_env_vars LOG = get_logger(__name__) @@ -227,8 +228,11 @@ def load_cfg( }, ) + # NOTE(djsaunde): We start outputting to output_dir/debug.log at this point since we + # have to wait for cfg.output to be resolved. We could call this earlier if we write + # to a temporary file, and then move it later. 
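+    # prepare_debug_log opens output_dir/debug.log and, unless AXOLOTL_TEE_STDOUT is disabled, tees stdout/stderr into it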
+ prepare_debug_log(cfg) prepare_optim_env(cfg) - prepare_opinionated_env(cfg) normalize_config(cfg) normalize_cfg_datasets(cfg) setup_wandb_env_vars(cfg) @@ -241,7 +245,6 @@ def load_cfg( for k, v in cfg.items() if v is not None } - LOG.info( "config:\n%s", json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True), diff --git a/src/axolotl/cli/inference.py b/src/axolotl/cli/inference.py index 30d407713..3e1c01520 100644 --- a/src/axolotl/cli/inference.py +++ b/src/axolotl/cli/inference.py @@ -17,8 +17,6 @@ from axolotl.cli.utils import load_model_and_tokenizer from axolotl.cli.utils.diffusion import ( diffusion_inference, launch_diffusion_gradio_ui, - render_html, - run_diffusion, ) from axolotl.integrations.base import PluginManager from axolotl.utils.chat_templates import get_chat_template_from_config diff --git a/src/axolotl/cli/main.py b/src/axolotl/cli/main.py index acfa81389..dc6cca489 100644 --- a/src/axolotl/cli/main.py +++ b/src/axolotl/cli/main.py @@ -26,7 +26,7 @@ from axolotl.cli.utils import ( launch_training, ) from axolotl.integrations.lm_eval.cli import lm_eval -from axolotl.utils import patch_optimized_env +from axolotl.utils import set_pytorch_cuda_alloc_conf from axolotl.utils.logging import get_logger from axolotl.utils.schemas.config import AxolotlInputConfig @@ -44,7 +44,7 @@ def cli(): """Axolotl CLI - Train and fine-tune large language models""" print_axolotl_text_art() load_dotenv() - patch_optimized_env() + set_pytorch_cuda_alloc_conf() @cli.command() diff --git a/src/axolotl/cli/train.py b/src/axolotl/cli/train.py index 8d33c0b84..2332717e7 100644 --- a/src/axolotl/cli/train.py +++ b/src/axolotl/cli/train.py @@ -60,7 +60,6 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs): config: Path to `axolotl` config YAML file. kwargs: Additional keyword arguments to override config file values. """ - parsed_cfg = load_cfg(config, **kwargs) parser = HfArgumentParser(TrainerCliArgs) parsed_cli_args, _ = parser.parse_args_into_dataclasses( diff --git a/src/axolotl/cli/utils/diffusion.py b/src/axolotl/cli/utils/diffusion.py index f83d9077b..1157bfd66 100644 --- a/src/axolotl/cli/utils/diffusion.py +++ b/src/axolotl/cli/utils/diffusion.py @@ -3,7 +3,6 @@ from __future__ import annotations import gradio as gr -import torch from colorama import Fore, Style from axolotl.integrations.diffusion import generate, resolve_mask_token_id diff --git a/src/axolotl/logging_config.py b/src/axolotl/logging_config.py index 10c5ae9dc..67b1d32f1 100644 --- a/src/axolotl/logging_config.py +++ b/src/axolotl/logging_config.py @@ -1,10 +1,7 @@ -""" -Common logging module for axolotl -""" +"""Common logging module for axolotl.""" import logging import os -import sys from logging import Formatter, Logger, LogRecord from logging.config import dictConfig from typing import Any, Dict @@ -17,9 +14,9 @@ DEFAULT_LOG_LEVEL = "WARNING" class AxolotlOrWarnErrorFilter(logging.Filter): """ - Allows ANY WARNING or higher (unless overridden by LOG_LEVEL) - Allows axolotl.* at INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL) - Drops all other records (i.e. non-axolotl.INFO, DEBUG, etc. by default) + Allows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at + INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records + (i.e. non-axolotl.INFO, DEBUG, etc. by default). 
""" def __init__(self, **kwargs): @@ -52,13 +49,12 @@ class AxolotlOrWarnErrorFilter(logging.Filter): class AxolotlLogger(Logger): - """A Logger that automatically rejects non-axolotl INFOs.""" + """Logger that applies filtering to non-axolotl loggers.""" def __init__(self, name: str, level: int = logging.NOTSET): super().__init__(name, level) - - # set global filter on the logger itself - self.addFilter(AxolotlOrWarnErrorFilter()) + if not name.startswith("axolotl"): + self.addFilter(AxolotlOrWarnErrorFilter()) class ColorfulFormatter(Formatter): @@ -74,6 +70,7 @@ class ColorfulFormatter(Formatter): def format(self, record): record.rank = int(os.getenv("LOCAL_RANK", "0")) + record.rank_fmt = f" [RANK:{record.rank}]" if record.rank != 0 else "" log_message = super().format(record) return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET @@ -87,32 +84,54 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = { }, "colorful": { "()": ColorfulFormatter, - "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] [RANK:%(rank)d] %(message)s", + "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d]%(rank_fmt)s %(message)s", + }, + "concise": { + "format": "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s", + }, + "concise_color": { + "()": ColorfulFormatter, + "format": "[%(asctime)s] [%(levelname)s] [%(name)s]%(rank_fmt)s %(message)s", + }, + }, + "filters": { + "ax_or_warn": { + "()": "axolotl.logging_config.AxolotlOrWarnErrorFilter", }, }, - "filters": {}, "handlers": { "console": { "class": "logging.StreamHandler", - "formatter": "simple", - "filters": [], - "stream": sys.stdout, + "formatter": "concise", + "filters": ["ax_or_warn"], + "stream": "ext://sys.stdout", }, "color_console": { "class": "logging.StreamHandler", - "formatter": "colorful", - "filters": [], - "stream": sys.stdout, + "formatter": "concise_color", + "filters": ["ax_or_warn"], + "stream": "ext://sys.stdout", + }, + "ax_file_only": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + "stream": "ext://axolotl.utils.tee.file_only_stream", + }, + "root_file_only": { + "class": "logging.StreamHandler", + "level": "DEBUG", + "formatter": "simple", + "stream": "ext://axolotl.utils.tee.file_only_stream", }, }, - # log level will be superseded by the AxolotlLogger "root": { - "handlers": ["console"], - "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL), + "handlers": ["console", "root_file_only"], + "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper(), }, "loggers": { "axolotl": { - "handlers": ["color_console"], + "handlers": ["color_console", "ax_file_only"], "level": os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL).upper(), "propagate": False, }, @@ -123,9 +142,15 @@ DEFAULT_LOGGING_CONFIG: Dict[str, Any] = { def configure_logging(): """Configure with default logging""" init() # Initialize colorama + dictConfig(DEFAULT_LOGGING_CONFIG) logging.setLoggerClass(AxolotlLogger) - # set default `ACCELERATE_LOG_LEVEL` to `LOG_LEVEL` if available and not set + # Route Python warnings through logging so they reach file handlers + logging.captureWarnings(True) + + # Set default `ACCELERATE_LOG_LEVEL` to `LOG_LEVEL` if available and not set if "ACCELERATE_LOG_LEVEL" not in os.environ: - os.environ["ACCELERATE_LOG_LEVEL"] = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL) + os.environ["ACCELERATE_LOG_LEVEL"] = os.getenv( + "LOG_LEVEL", DEFAULT_LOG_LEVEL + ).upper() diff --git 
a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py index c9b968d71..b8172bbe6 100644 --- a/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py +++ b/src/axolotl/monkeypatch/transformers/trainer_loss_calc.py @@ -41,7 +41,7 @@ def patch_evaluation_loop(): """Patch the evaluation_loop method.""" # Check if already patched if hasattr(Trainer, "_original_evaluation_loop"): - LOG.info("Trainer.evaluation_loop already patched") + LOG.debug("Trainer.evaluation_loop already patched") return # Check if the patterns exist @@ -84,7 +84,7 @@ def patch_evaluation_loop(): ) exec(evaluation_loop_source, globals()) - LOG.info("Patched Trainer.evaluation_loop with nanmean loss calculation") + LOG.debug("Patched Trainer.evaluation_loop with nanmean loss calculation") Trainer.evaluation_loop = axolotl_evaluation_loop @@ -135,5 +135,5 @@ def patch_maybe_log_save_evaluate(): ) exec(maybe_log_source, globals()) - LOG.info("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation") + LOG.debug("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation") Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate diff --git a/src/axolotl/train.py b/src/axolotl/train.py index b0482bb1e..2a70d9712 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -196,10 +196,11 @@ def execute_training( ) ) - LOG.info("Starting trainer...") # TODO: disabling for now as not compatible with FSDP2 + torchao low bit optimizers # if cfg.bf16: # torch.set_default_dtype(torch.bfloat16) + + LOG.info("Starting trainer...") trainer.train(resume_from_checkpoint=resume_from_checkpoint) plugin_manager = PluginManager.get_instance() diff --git a/src/axolotl/utils/__init__.py b/src/axolotl/utils/__init__.py index e5050116a..7256a5700 100644 --- a/src/axolotl/utils/__init__.py +++ b/src/axolotl/utils/__init__.py @@ -44,15 +44,6 @@ def set_pytorch_cuda_alloc_conf(): ) -def patch_optimized_env(): - """ - Patch environment variables to improve VRAM usage and increase download speed - """ - if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None: - os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1" - set_pytorch_cuda_alloc_conf() - - def get_not_null(value, default=None): """ return the value if it's not None, otherwise return the default value diff --git a/src/axolotl/utils/logging.py b/src/axolotl/utils/logging.py index 7cc3530ae..35810897a 100644 --- a/src/axolotl/utils/logging.py +++ b/src/axolotl/utils/logging.py @@ -2,7 +2,6 @@ import functools import logging -import os from axolotl.utils.distributed import is_main_process @@ -40,10 +39,6 @@ class MultiProcessAdapter(logging.LoggerAdapter): def get_logger(name: str, log_level: str | None = None) -> MultiProcessAdapter: - if log_level is None: - log_level = os.environ.get("AXOLOTL_LOG_LEVEL", None) logger = logging.getLogger(name) - if log_level is not None: - logger.setLevel(log_level.upper()) - logger.root.setLevel(log_level.upper()) + logger.setLevel(logging.DEBUG) return MultiProcessAdapter(logger, extra={}) diff --git a/src/axolotl/utils/tee.py b/src/axolotl/utils/tee.py new file mode 100644 index 000000000..1209ad1dd --- /dev/null +++ b/src/axolotl/utils/tee.py @@ -0,0 +1,166 @@ +""" +Utilities for managing the debug log file and providing a file-only stream for logging +handlers. 
+""" + +from __future__ import annotations + +import io +import os +import sys +import threading +from pathlib import Path +from typing import TextIO, cast + +_lock = threading.Lock() +_file_handle: io.TextIOWrapper | None = None +_log_path: str | None = None +_tee_installed: bool = False +_orig_stdout: TextIO | None = None +_orig_stderr: TextIO | None = None + + +class _FileOnlyWriter(io.TextIOBase): + """A stream-like object that writes only to the tee file. + + Before the file is prepared, writes are dropped (no-op). + """ + + def write(self, s: str) -> int: # type: ignore[override] + with _lock: + if _file_handle is not None: + _file_handle.write(s) + return len(s) + return len(s) + + def flush(self) -> None: # type: ignore[override] + with _lock: + if _file_handle is not None: + try: + _file_handle.flush() + except Exception: + pass + + +file_only_stream: io.TextIOBase = _FileOnlyWriter() + + +class _StreamTee(io.TextIOBase): + """A minimal tee that mirrors writes to the debug log file. + + Installed only after the debug log is prepared; no buffering. + """ + + def __init__(self, stream: io.TextIOBase): + self._stream = stream + + def write(self, s: str) -> int: # type: ignore[override] + with _lock: + n = self._stream.write(s) + if _file_handle is not None: + _file_handle.write(s) + return n + + def flush(self) -> None: # type: ignore[override] + with _lock: + self._stream.flush() + if _file_handle is not None: + try: + _file_handle.flush() + except Exception: + pass + + @property + def encoding(self): # type: ignore[override] + return getattr(self._stream, "encoding", None) + + @property + def errors(self): # type: ignore[override] + return getattr(self._stream, "errors", None) + + def isatty(self): # type: ignore[override] + return getattr(self._stream, "isatty", lambda: False)() + + def fileno(self): # type: ignore[override] + if hasattr(self._stream, "fileno"): + return self._stream.fileno() + raise OSError("Underlying stream has no fileno") + + +def prepare_debug_log(cfg, filename: str = "debug.log") -> str: + """ + Prepare the debug log. + + Creates the output directory, handles append/truncate logic based on cfg, and opens + the debug log file for subsequent writes via file-only handlers. + """ + global _file_handle, _log_path, _tee_installed + + with _lock: + # If already initialized, reuse existing path + if _log_path is not None: + return _log_path + + output_dir = cfg.output_dir + os.makedirs(output_dir, exist_ok=True) + + log_path = Path(output_dir) / filename + append = bool( + cfg.get("resume_from_checkpoint") or cfg.get("auto_resume_from_checkpoints") + ) + + if not append and log_path.exists(): + log_path.unlink() + + fh = open(log_path, "a", encoding="utf-8") + fh.flush() + + _file_handle = fh + _log_path = str(log_path) + + # Install a tee so stdout/stderr are mirrored to the debug file + # Allow disabling via env for testing or advanced usage. + tee_enabled = os.getenv("AXOLOTL_TEE_STDOUT", "1").lower() not in { + "0", + "false", + "no", + } + if tee_enabled and not _tee_installed: + # Save originals so we can restore later (e.g., tests) + global _orig_stdout, _orig_stderr + _orig_stdout = sys.stdout + _orig_stderr = sys.stderr + sys.stdout = _StreamTee(cast(io.TextIOBase, sys.stdout)) + sys.stderr = _StreamTee(cast(io.TextIOBase, sys.stderr)) + _tee_installed = True + + return _log_path + + +def close_debug_log() -> None: + """Flush and close the debug log and uninstall the stdout/stderr tee. + + Safe to call even if not initialized. 
+ """ + global _file_handle, _log_path, _tee_installed, _orig_stdout, _orig_stderr + with _lock: + # Restore original stdout/stderr if we installed a tee + if _tee_installed: + if _orig_stdout is not None: + sys.stdout = _orig_stdout + if _orig_stderr is not None: + sys.stderr = _orig_stderr + _tee_installed = False + _orig_stdout = None + _orig_stderr = None + + # Close the file handle if open + if _file_handle is not None: + try: + _file_handle.flush() + _file_handle.close() + except Exception: + pass + finally: + _file_handle = None + _log_path = None diff --git a/src/axolotl/utils/train.py b/src/axolotl/utils/train.py index 1393459d9..ad3f72be4 100644 --- a/src/axolotl/utils/train.py +++ b/src/axolotl/utils/train.py @@ -31,6 +31,7 @@ def determine_last_checkpoint(cfg: DictDefault, update: bool = True) -> str | No if checkpoints: last_checkpoint = str(checkpoints[-1]) if not update: + LOG.info(f"Resuming from last checkpoint at {last_checkpoint}") return last_checkpoint if ( @@ -40,6 +41,7 @@ def determine_last_checkpoint(cfg: DictDefault, update: bool = True) -> str | No ): cfg.resume_from_checkpoint = last_checkpoint LOG.info( - f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}" + "Using auto-resume functionality to resume from checkpoint at " + f"{cfg.resume_from_checkpoint}" ) return cfg.resume_from_checkpoint diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index a0f4fd567..662a54655 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -655,15 +655,6 @@ def prepare_optim_env(cfg): os.environ["ACCELERATE_MIXED_PRECISION"] = "no" -def prepare_opinionated_env(cfg): - if cfg.qlora_sharded_model_loading: - # model loading is forked after the tokenizer - os.environ["TOKENIZERS_PARALLELISM"] = "false" - if cfg.sample_packing: - # multipack parallel packing sampler defaults to using fork - os.environ["TOKENIZERS_PARALLELISM"] = "false" - - def setup_trainer( cfg, train_dataset, diff --git a/tests/test_logging_config_file_capture.py b/tests/test_logging_config_file_capture.py new file mode 100644 index 000000000..44b0ee5e6 --- /dev/null +++ b/tests/test_logging_config_file_capture.py @@ -0,0 +1,103 @@ +import logging +import tempfile + +import pytest + + +def read(path: str) -> str: + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +@pytest.fixture(autouse=True) +def _reset_logging_state(): + # Ensure a clean slate for logging between tests + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + logging.shutdown() + # Note: dictConfig in configure_logging will set up handlers again + yield + for handler in logging.root.handlers[:]: + logging.root.removeHandler(handler) + logging.shutdown() + + +def test_axolotl_logs_captured_at_all_levels(monkeypatch): + from axolotl.logging_config import configure_logging + from axolotl.utils import tee + from axolotl.utils.logging import get_logger + + with tempfile.TemporaryDirectory() as td: + # Avoid stdout tee in this test to simplify interaction with pytest capture + monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0") + configure_logging() + path = tee.prepare_debug_log( + type("Cfg", (), {"output_dir": td, "get": lambda *_: False}) + ) + + log = get_logger("axolotl.test") + log.info("AX-INFO") + log.debug("AX-DEBUG") + tee.file_only_stream.flush() + + data = read(path) + assert "AX-INFO" in data + assert "AX-DEBUG" in data + tee.close_debug_log() + + +def test_third_party_logs_filtered_and_warning_captured(monkeypatch): 
+ from axolotl.logging_config import configure_logging + from axolotl.utils import tee + + with tempfile.TemporaryDirectory() as td: + monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0") + configure_logging() + path = tee.prepare_debug_log( + type("Cfg", (), {"output_dir": td, "get": lambda *_: False}) + ) + + # Third-party logger (non-axolotl) + other = logging.getLogger("thirdparty.lib") + other.info("TP-INFO") + other.warning("TP-WARN") + + # Simulate Python warnings routed through logging + logging.getLogger("py.warnings").warning("PY-WARN") + + # Push through buffers + tee.file_only_stream.flush() + + data = read(path) + # INFO from non-axolotl should be filtered out (not present) + assert "TP-INFO" not in data + # WARNING+ should be present + assert "TP-WARN" in data + # Python warnings captured (via py.warnings logger) + assert "PY-WARN" in data + tee.close_debug_log() + tee.close_debug_log() + + +def test_prepare_debug_log_idempotent_and_no_duplicate(monkeypatch): + from axolotl.logging_config import configure_logging + from axolotl.utils import tee + from axolotl.utils.logging import get_logger + + with tempfile.TemporaryDirectory() as td: + monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0") + configure_logging() + cfg = type("Cfg", (), {"output_dir": td, "get": lambda *_: False}) + p1 = tee.prepare_debug_log(cfg) + p2 = tee.prepare_debug_log(cfg) + assert p1 == p2 + + log = get_logger("axolotl.test") + marker = "UNIQUE-MARKER-12345" + log.info(marker) + tee.file_only_stream.flush() + + data = read(p1) + # Ensure the marker appears once (not duplicated via propagation) + assert data.count(marker) == 1 + tee.close_debug_log() diff --git a/tests/test_utils_tee.py b/tests/test_utils_tee.py new file mode 100644 index 000000000..e2c153667 --- /dev/null +++ b/tests/test_utils_tee.py @@ -0,0 +1,107 @@ +import os +import tempfile + + +def _dummy_cfg(output_dir: str, append: bool = False): + # Minimal object with attributes used by prepare_debug_log + class Cfg: + def __init__(self, out, append): + self.output_dir = out + self._append = append + + def get(self, key, default=None): + if key in {"resume_from_checkpoint", "auto_resume_from_checkpoints"}: + return self._append + return default + + return Cfg(output_dir, append) + + +def read(path: str) -> str: + with open(path, "r", encoding="utf-8") as f: + return f.read() + + +def test_file_only_stream_writes_after_prepare(monkeypatch): + from axolotl.utils import tee + + with tempfile.TemporaryDirectory() as td: + # Avoid stdout tee in this test + monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0") + cfg = _dummy_cfg(td, append=False) + + # before prepare: writing to file_only_stream creates no file + tee.file_only_stream.write("before\n") + tee.file_only_stream.flush() + assert not os.path.exists(os.path.join(td, "debug.log")) + + # prepare and write + path = tee.prepare_debug_log(cfg) + assert os.path.basename(path) == "debug.log" + tee.file_only_stream.write("hello\n") + tee.file_only_stream.flush() + + content = read(path) + assert "hello" in content + + tee.close_debug_log() + + +def test_stdout_is_mirrored_after_prepare(capsys, monkeypatch): + from axolotl.utils import tee + + with tempfile.TemporaryDirectory() as td: + cfg = _dummy_cfg(td, append=False) + try: + # Install tee while capture is disabled so stdout tee wraps real stdout. 
+ with capsys.disabled(): + monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "1") + path = tee.prepare_debug_log(cfg) + import sys + + print("printed-line") + sys.stdout.flush() + + # Now verify file contains the line + content = read(path) + assert "printed-line" in content + finally: + tee.close_debug_log() + + +def test_truncate_vs_append_behavior(monkeypatch): + from axolotl.utils import tee + + with tempfile.TemporaryDirectory() as td: + # Avoid stdout tee in this test + monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0") + # First run creates file with A + cfg = _dummy_cfg(td, append=False) + _ = tee.prepare_debug_log(cfg) + try: + tee.file_only_stream.write("A\n") + tee.file_only_stream.flush() + finally: + tee.close_debug_log() + + # Second run with append=False truncates + cfg2 = _dummy_cfg(td, append=False) + path2 = tee.prepare_debug_log(cfg2) + try: + tee.file_only_stream.write("B\n") + tee.file_only_stream.flush() + content = read(path2) + assert "A\n" not in content and "B\n" in content + finally: + tee.close_debug_log() + + # Third run with append=True preserves existing + cfg3 = _dummy_cfg(td, append=True) + path3 = tee.prepare_debug_log(cfg3) + try: + tee.file_only_stream.write("C\n") + tee.file_only_stream.flush() + content = read(path3) + assert "B\n" in content and "C\n" in content + finally: + tee.close_debug_log() From 09959fac70e7a31c2feee78e841d71cba1d3b411 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 18 Sep 2025 15:42:20 +0700 Subject: [PATCH 055/115] Feat: add Magistral Small 2509 and native mistral3 tokenizer support (#3165) * feat: update mistral common * feat: add mistral3processor * fix: loading * fix: cast pixel_values to fp32 * fix: image tensor conversion * feat: add FA2 support for pixtral based models * fix: update mistral small 3.1 to use native tokenizer * fix: install tips * fix: improve info on sample dataset files * chore: move mistral configs into subfolders * fix: remove unneeded patch * fix: indent * feat: add integration tests * chore: move * feat: add magistral 2509 docs and example * fix: convert tensor to bool * feat: expand tests * chore: move tests --- docs/multimodal.qmd | 15 +- examples/gemma3n/README.md | 10 +- examples/magistral/README.md | 30 +--- examples/magistral/think/README.md | 73 ++++++++ .../magistral-small-think-qlora.yaml | 0 examples/magistral/vision/README.md | 60 +++++++ .../magistral-small-vision-24B-qlora.yml | 64 +++++++ .../{ => bigstral}/bigstral-ds-zero3.yaml | 0 .../mistral/{ => dpo}/mistral-dpo-qlora.yml | 0 .../mistral-small-3.1-24B-lora.yml | 14 +- .../mixtral-8x22b-qlora-fsdp.yml | 0 .../{ => mixtral}/mixtral-qlora-fsdp.yml | 0 examples/mistral/{ => mixtral}/mixtral.yml | 0 examples/mistral/{ => mixtral}/mixtral_22.yml | 0 examples/mistral/{ => mps}/lora-mps.yml | 0 .../mistral/{ => orpo}/mistral-qlora-orpo.yml | 0 examples/pixtral/lora-12b.yml | 3 +- examples/voxtral/README.md | 9 +- requirements.txt | 2 +- src/axolotl/loaders/patch_manager.py | 14 ++ src/axolotl/loaders/processor.py | 7 + src/axolotl/loaders/tokenizer.py | 5 - .../monkeypatch/models/mistral3/__init__.py | 0 .../mistral3/mistral_common_tokenizer.py | 85 +++++++++ .../monkeypatch/models/pixtral/__init__.py | 0 .../pixtral/modeling_flash_attention_utils.py | 42 +++++ src/axolotl/processing_strategies.py | 36 ++++ src/axolotl/utils/mistral/__init__.py | 3 +- .../utils/mistral/mistral3_processor.py | 169 ++++++++++++++++++ .../test_mistral_tokenizer_patch.py | 35 ++++ .../test_pixtral_flash_attention_patch.py | 77 ++++++++ 
.../test_voxtral_modeling_patch.py                 |  43 +++++
 32 files changed, 757 insertions(+), 39 deletions(-)
 create mode 100644 examples/magistral/think/README.md
 rename examples/magistral/{ => think}/magistral-small-think-qlora.yaml (100%)
 create mode 100644 examples/magistral/vision/README.md
 create mode 100644 examples/magistral/vision/magistral-small-vision-24B-qlora.yml
 rename examples/mistral/{ => bigstral}/bigstral-ds-zero3.yaml (100%)
 rename examples/mistral/{ => dpo}/mistral-dpo-qlora.yml (100%)
 rename examples/mistral/{ => mistral-small}/mistral-small-3.1-24B-lora.yml (78%)
 rename examples/mistral/{ => mixtral}/mixtral-8x22b-qlora-fsdp.yml (100%)
 rename examples/mistral/{ => mixtral}/mixtral-qlora-fsdp.yml (100%)
 rename examples/mistral/{ => mixtral}/mixtral.yml (100%)
 rename examples/mistral/{ => mixtral}/mixtral_22.yml (100%)
 rename examples/mistral/{ => mps}/lora-mps.yml (100%)
 rename examples/mistral/{ => orpo}/mistral-qlora-orpo.yml (100%)
 create mode 100644 src/axolotl/monkeypatch/models/mistral3/__init__.py
 create mode 100644 src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
 create mode 100644 src/axolotl/monkeypatch/models/pixtral/__init__.py
 create mode 100644 src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py
 create mode 100644 src/axolotl/utils/mistral/mistral3_processor.py
 create mode 100644 tests/monkeypatch/test_mistral_tokenizer_patch.py
 create mode 100644 tests/monkeypatch/test_pixtral_flash_attention_patch.py
 create mode 100644 tests/monkeypatch/test_voxtral_modeling_patch.py

diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd
index d839ce211..413404195 100644
--- a/docs/multimodal.qmd
+++ b/docs/multimodal.qmd
@@ -13,6 +13,7 @@ format:
 - [Pixtral](#sec-pixtral)
 - [Llava-1.5](#sec-llava-15)
 - [Mistral-Small-3.1](#sec-mistral-small-31)
+- [Magistral-Small-2509](#sec-magistral-small-2509)
 - [Voxtral](#sec-voxtral)
 - [Gemma-3](#sec-gemma-3)
 - [Gemma-3n](#sec-gemma-3n)
@@ -94,10 +95,22 @@ chat_template: llava
 
 ### Mistral-Small-3.1 {#sec-mistral-small-31}
 
+::: {.callout-tip}
+Please make sure to install the vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
+:::
+
 ```yaml
 base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
+```
 
-chat_template: mistral_v7_tekken
 ```
+### Magistral-Small-2509 {#sec-magistral-small-2509}
+
+::: {.callout-tip}
+Please make sure to install the vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
+:::
+
+```yaml
+base_model: mistralai/Magistral-Small-2509
 ```
 
 ### Voxtral {#sec-voxtral}
diff --git a/examples/gemma3n/README.md b/examples/gemma3n/README.md
index 8c4e02a1d..ff3946c90 100644
--- a/examples/gemma3n/README.md
+++ b/examples/gemma3n/README.md
@@ -23,7 +23,15 @@ pip3 install timm==1.0.17
 pip3 install librosa==0.11.0
 ```
 
-3. Run the finetuning example:
+3. Download sample dataset files
+
+```bash
+# for text + vision + audio only
+wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg
+wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga
+```
+
+4. Run the finetuning example:
 
 ```bash
 # text only
diff --git a/examples/magistral/README.md b/examples/magistral/README.md
index f4f278208..a09138744 100644
--- a/examples/magistral/README.md
+++ b/examples/magistral/README.md
@@ -1,10 +1,10 @@
 # Finetune Magistral Small with Axolotl
 
-Magistral Small is a 24B parameter open-source model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506) and [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)). This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
+Magistral Small is a 24B parameter open-source model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506), [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)), and [2509](https://huggingface.co/mistralai/Magistral-Small-2509) (see [Vision](#vision)). This guide shows how to fine-tune it with Axolotl on multi-turn conversations with proper masking.
 
 MistralAI has also released a proprietary medium-sized version called Magistral Medium.
 
-Thanks to the team at MistralAI for giving us early access to prepare for this release.
+Thanks to the team at MistralAI for giving us early access to prepare for these releases.
 
 ## Getting started
 
@@ -36,29 +36,17 @@ Let us know how it goes. Happy finetuning! 🚀
 
 ### Thinking
 
-MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities. The model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages.
+MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.
 
-Example format:
+📚 **[See the Thinking fine-tuning guide →](./think/README.md)**
 
-```json
-{
-    "messages": [
-        {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
-        {"role": "user", "content": [{ "type": "text", "text": "..."}]},
-        {"role": "assistant", "content": [{ "type": "thinking", "thinking": "..."}, { "type": "text", "text": "..." }]},
-    ],
-}
-```
+### Vision
 
-Example config: `./magistral-small-think-qlora.yaml`.
+MistralAI has released their [2509](https://huggingface.co/mistralai/Magistral-Small-2509) model with vision capabilities.
 
-The `thinking` section also supports an optional arg `closed: bool` (`True` default) which controls adding the closing `[/THINK]` tag.
+📚 **[See the Vision fine-tuning guide →](./vision/README.md)**
 
-Limitations:
-- You cannot mix `content: str` with `content: list[dict]` as the `dataset.load_dataset` may complain about different types for `content` key.
-- This mode does not work with custom `train_detail` and `training` at the moment.
-
-### TIPS
+### Tips
 
 - We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
 - For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
@@ -89,5 +77,5 @@ In addition, we do not support overriding tokens yet.
 
 ## Future Work
 
-- Add parity to Preference Tuning, RL, Multi-modal, etc.
+- Add parity to Preference Tuning, RL, etc.
 - Add parity to other tokenizer configs like overriding tokens.
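+
+In the meantime, the standard Axolotl CLI flow works for trying out a finished run. A minimal sketch, assuming the adapter was saved to the example configs' default `./outputs/out`:
+
+```bash
+# fold the trained LoRA/QLoRA adapter back into the base model
+axolotl merge-lora magistral-small-think-qlora.yaml --lora-model-dir="./outputs/out"
+
+# quick interactive smoke test with the adapter applied
+axolotl inference magistral-small-think-qlora.yaml --lora-model-dir="./outputs/out"
+```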
diff --git a/examples/magistral/think/README.md b/examples/magistral/think/README.md new file mode 100644 index 000000000..29950f59e --- /dev/null +++ b/examples/magistral/think/README.md @@ -0,0 +1,73 @@ +# Magistral Small Thinking Fine-tuning + +This guide covers fine-tuning [Magistral Small 2507](https://huggingface.co/mistralai/Magistral-Small-2507) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections. + +## Prerequisites + +Before starting, ensure you have: +- Installed Axolotl (see [main README](../README.md)) + +## Getting Started + +Run the thinking model fine-tuning: + +```bash +axolotl train magistral-small-think-qlora.yaml +``` + +This config uses about 19.1 GiB VRAM. + +### Tips + +- Dataset uses multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below. +- You cannot mix `content: str` and `content: list[dict]`, otherwise, dataset loading will fail. Keep it consistent. + +## Dataset Format + +The thinking model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages. + +Example format: + +```json +{ + "messages": [ + { + "role": "system", + "content": [ + { "type": "text", "text": "{SYSTEM_PROMPT}"} + ] + }, + { + "role": "user", + "content": [ + { "type": "text", "text": "Solve this step by step: What is 15% of 240?"} + ] + }, + { + "role": "assistant", + "content": [ + { + "type": "thinking", + "thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36." + }, + { + "type": "text", + "text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36." + } + ] + } + ] +} +``` + +### Advanced Options + +The `thinking` section supports an optional `closed` parameter: + +```json +{ + "type": "thinking", + "thinking": "Internal reasoning here...", + "closed": true // Default: true, controls adding the closing [/THINK] tag +} +``` diff --git a/examples/magistral/magistral-small-think-qlora.yaml b/examples/magistral/think/magistral-small-think-qlora.yaml similarity index 100% rename from examples/magistral/magistral-small-think-qlora.yaml rename to examples/magistral/think/magistral-small-think-qlora.yaml diff --git a/examples/magistral/vision/README.md b/examples/magistral/vision/README.md new file mode 100644 index 000000000..932a3631e --- /dev/null +++ b/examples/magistral/vision/README.md @@ -0,0 +1,60 @@ +# Magistral Small Vision Fine-tuning + +This guide covers fine-tuning [Magistral Small 2509](https://huggingface.co/mistralai/Magistral-Small-2509) with vision capabilities using Axolotl. + +## Prerequisites + +Before starting, ensure you have: +- Installed Axolotl from source (see [main README](../README.md#getting-started)) + +## Getting started + +1. Install the required vision lib: + ```bash + pip install 'mistral-common[opencv]==1.8.5' + ``` + +2. Download the example dataset image: + ```bash + wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg + ``` + +3. Run the fine-tuning: + ```bash + axolotl train magistral-small-vision-24B-qlora.yml + ``` + +This config uses about 17GiB VRAM. + +WARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look. 
+ +### Tips + +Key differences from text-only model: +- `max_tokens: 131072` for inference +- Multi-modal dataset format required +- Sample packing not supported + +## Dataset Format + +The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). + +One exception is that, passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now. + +Example: +```json +{ + "messages": [ + {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]}, + {"role": "user", "content": [ + { "type": "text", "text": "What's in this image?"}, + {"type": "image", "path": "path/to/image.jpg" } + ]}, + {"role": "assistant", "content": [{ "type": "text", "text": "..." }]}, + ], +} +``` + +## Limitations + +- Sample Packing is not supported for multi-modality training currently. diff --git a/examples/magistral/vision/magistral-small-vision-24B-qlora.yml b/examples/magistral/vision/magistral-small-vision-24B-qlora.yml new file mode 100644 index 000000000..397db383e --- /dev/null +++ b/examples/magistral/vision/magistral-small-vision-24B-qlora.yml @@ -0,0 +1,64 @@ +base_model: mistralai/Magistral-Small-2509 +processor_type: AutoProcessor + +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_4bit: true + +# these 3 lines are needed for now to handle vision chat templates w images +skip_prepare_dataset: true +remove_unused_columns: false +sample_packing: false + +# sample dataset below requires downloading image in advance +# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg +datasets: + - path: Nanobit/text-vision-2k-test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.01 +output_dir: ./outputs/out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 1 +micro_batch_size: 1 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: true +fp16: +tf32: true + +gradient_checkpointing: true +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 +weight_decay: 0.0 +special_tokens: + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/mistral/bigstral-ds-zero3.yaml b/examples/mistral/bigstral/bigstral-ds-zero3.yaml similarity index 100% rename from examples/mistral/bigstral-ds-zero3.yaml rename to examples/mistral/bigstral/bigstral-ds-zero3.yaml diff --git a/examples/mistral/mistral-dpo-qlora.yml b/examples/mistral/dpo/mistral-dpo-qlora.yml similarity index 100% rename from examples/mistral/mistral-dpo-qlora.yml rename to examples/mistral/dpo/mistral-dpo-qlora.yml diff --git a/examples/mistral/mistral-small-3.1-24B-lora.yml b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml similarity index 78% rename from examples/mistral/mistral-small-3.1-24B-lora.yml rename to examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml index 3e477645e..ec197f333 100644 --- a/examples/mistral/mistral-small-3.1-24B-lora.yml +++ 
b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml @@ -1,6 +1,9 @@ base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503 processor_type: AutoProcessor +# Enable to use mistral-common tokenizer +tokenizer_use_mistral_common: true + load_in_8bit: true # these 3 lines are needed for now to handle vision chat templates w images @@ -8,12 +11,12 @@ skip_prepare_dataset: true remove_unused_columns: false sample_packing: false -chat_template: mistral_v7_tekken +# sample dataset below requires downloading image in advance +# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg datasets: - - path: HuggingFaceH4/llava-instruct-mix-vsft + - path: Nanobit/text-vision-2k-test type: chat_template - split: train[:1%] - field_messages: messages + dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out @@ -48,8 +51,7 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -# flash_attention: false # PixtralVisionModel does not support Flash Attention 2.0 yet. -sdp_attention: true +flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/mistral/mixtral-8x22b-qlora-fsdp.yml b/examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml similarity index 100% rename from examples/mistral/mixtral-8x22b-qlora-fsdp.yml rename to examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml diff --git a/examples/mistral/mixtral-qlora-fsdp.yml b/examples/mistral/mixtral/mixtral-qlora-fsdp.yml similarity index 100% rename from examples/mistral/mixtral-qlora-fsdp.yml rename to examples/mistral/mixtral/mixtral-qlora-fsdp.yml diff --git a/examples/mistral/mixtral.yml b/examples/mistral/mixtral/mixtral.yml similarity index 100% rename from examples/mistral/mixtral.yml rename to examples/mistral/mixtral/mixtral.yml diff --git a/examples/mistral/mixtral_22.yml b/examples/mistral/mixtral/mixtral_22.yml similarity index 100% rename from examples/mistral/mixtral_22.yml rename to examples/mistral/mixtral/mixtral_22.yml diff --git a/examples/mistral/lora-mps.yml b/examples/mistral/mps/lora-mps.yml similarity index 100% rename from examples/mistral/lora-mps.yml rename to examples/mistral/mps/lora-mps.yml diff --git a/examples/mistral/mistral-qlora-orpo.yml b/examples/mistral/orpo/mistral-qlora-orpo.yml similarity index 100% rename from examples/mistral/mistral-qlora-orpo.yml rename to examples/mistral/orpo/mistral-qlora-orpo.yml diff --git a/examples/pixtral/lora-12b.yml b/examples/pixtral/lora-12b.yml index fea2a60ff..0e6489914 100644 --- a/examples/pixtral/lora-12b.yml +++ b/examples/pixtral/lora-12b.yml @@ -45,8 +45,7 @@ tf32: true gradient_checkpointing: true logging_steps: 1 -# flash_attention: # PixtralVisionModel does not support Flash Attention 2.0 yet -sdp_attention: true +flash_attention: true warmup_ratio: 0.1 evals_per_epoch: 1 diff --git a/examples/voxtral/README.md b/examples/voxtral/README.md index 984af4ddb..b77691d72 100644 --- a/examples/voxtral/README.md +++ b/examples/voxtral/README.md @@ -27,7 +27,14 @@ pip3 install 'mistral_common[audio]==1.8.3' python scripts/cutcrossentropy_install.py | sh ``` -3. Run the finetuning example: +3. Download sample dataset files + +```bash +# for text + audio only +wget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga +``` + +4. 
Run the finetuning example: ```bash # text only diff --git a/requirements.txt b/requirements.txt index 44a3c0277..86013374f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -70,4 +70,4 @@ schedulefree==1.4.1 axolotl-contribs-lgpl==0.0.6 axolotl-contribs-mit==0.0.5 -mistral-common==1.8.3 +mistral-common==1.8.5 diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index a5a630cb5..98eb07b0f 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -168,6 +168,13 @@ class PatchManager: patch_llama4_linearized_modeling() + if self.cfg.model_config_type == "mistral3" and self.cfg.processor_type: + from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import ( + apply_mistral_tokenizer_image_patch, + ) + + apply_mistral_tokenizer_image_patch() + def _apply_fp8_patches(self): """Apply patches for FP8 support.""" if self.cfg.fp8: @@ -334,6 +341,13 @@ class PatchManager: replace_stablelm_attn_with_flash_attn(self.cfg.base_model) + if self.model_config.model_type in ("mistral3", "llava"): + from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import ( + apply_patch_is_packed_sequence, + ) + + apply_patch_is_packed_sequence() + def _patch_loss_llama(self): """Patch loss functions and other optimizations for LLaMA models.""" if not self.cfg.is_llama_derived_model: diff --git a/src/axolotl/loaders/processor.py b/src/axolotl/loaders/processor.py index 2e3ec8d7f..7580b2008 100644 --- a/src/axolotl/loaders/processor.py +++ b/src/axolotl/loaders/processor.py @@ -21,6 +21,13 @@ def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase): if cfg.processor_type: processor_cls = getattr(transformers, cfg.processor_type) + if cfg.tokenizer_use_mistral_common: + from axolotl.utils.mistral import Mistral3Processor + + return Mistral3Processor( + tokenizer=tokenizer, + ) + processor = processor_cls.from_pretrained( cfg.processor_config, trust_remote_code=cfg.trust_remote_code or False, diff --git a/src/axolotl/loaders/tokenizer.py b/src/axolotl/loaders/tokenizer.py index 37b66ac83..69455dd77 100644 --- a/src/axolotl/loaders/tokenizer.py +++ b/src/axolotl/loaders/tokenizer.py @@ -124,13 +124,8 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer: def _load_mistral_common_tokenizer(cfg: DictDefault): """Load mistral-common tokenizer""" - from transformers import tokenization_mistral_common - from axolotl.utils.mistral import HFMistralTokenizer - # patch - tokenization_mistral_common.MistralCommonTokenizer = HFMistralTokenizer - # Load the HF-compatible wrapper around MistralTokenizer tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config) diff --git a/src/axolotl/monkeypatch/models/mistral3/__init__.py b/src/axolotl/monkeypatch/models/mistral3/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py new file mode 100644 index 000000000..9e7259a05 --- /dev/null +++ b/src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py @@ -0,0 +1,85 @@ +""" +Monkeypatch to fix inefficient tensor conversion in MistralCommonTokenizer.apply_chat_template +""" + +import importlib +import inspect + +from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def apply_mistral_tokenizer_image_patch(): + """Apply patch to 
MistralCommonTokenizer.apply_chat_template to fix image tensor conversion.""" + from transformers.tokenization_mistral_common import MistralCommonTokenizer + + # Get original source + original_source = inspect.getsource(MistralCommonTokenizer.apply_chat_template) + original_source, _ = detab_code(original_source) + + # Define the replacement + original_tensor_conversion = ( + " pixel_values = torch.tensor(images)" + ) + + patched_tensor_conversion = """ if isinstance(images, list) and len(images) > 0 and isinstance(images[0], np.ndarray): + pixel_values = torch.tensor(np.array(images)) + else: + pixel_values = torch.tensor(images)""" + + # Apply the replacement + if original_tensor_conversion in original_source: + patched_source = original_source.replace( + original_tensor_conversion, patched_tensor_conversion + ) + patched_source = patched_source.replace( + "def apply_chat_template(", + "def patched_apply_chat_template(", + 1, + ) + + # Load necessary imports from the module + module_name = MistralCommonTokenizer.__module__ + module = importlib.import_module(module_name) + + # Detect what needs to be imported + items_to_import = [] + for item in dir(module): + if item in patched_source and not item.startswith("_"): + items_to_import.append(item) + + # Execute imports in global scope + if items_to_import: + exec( # nosec B102 + f"from {module_name} import ({', '.join(items_to_import)})", + globals(), + ) + + # Also need standard imports that might be used + exec("import numpy as np", globals()) # nosec B102 + exec("import torch", globals()) # nosec B102 + exec("from typing import Union, Optional, List, Dict, Any, Callable", globals()) # nosec B102 + exec("from pathlib import Path", globals()) # nosec B102 + + # Import other dependencies that might be needed + try: + exec("from transformers.utils import is_torch_available", globals()) # nosec B102 + exec( + "from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType", + globals(), + ) # nosec B102 + exec("from transformers.utils import logging", globals()) # nosec B102 + exec("logger = logging.get_logger(__name__)", globals()) # nosec B102 + except ImportError as e: + LOG.warning(f"Could not import some dependencies: {e}") + + # Execute the patched source + exec(patched_source, globals()) # nosec B102 + + # Replace the method + MistralCommonTokenizer.apply_chat_template = patched_apply_chat_template + LOG.info("Successfully applied MistralCommonTokenizer tensor conversion patch") + else: + LOG.warning("Could not find target code for MistralCommonTokenizer patching") diff --git a/src/axolotl/monkeypatch/models/pixtral/__init__.py b/src/axolotl/monkeypatch/models/pixtral/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py b/src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py new file mode 100644 index 000000000..d2b482f19 --- /dev/null +++ b/src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py @@ -0,0 +1,42 @@ +"""Monkeypatch for FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid""" + +import torch + + +def apply_patch_is_packed_sequence(): + """Apply patch to FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid""" + from transformers import modeling_flash_attention_utils + + def fixed_is_packed_sequence(position_ids, batch_size): + """ + Check the position ids whether packed sequences are indicated or not + 1. 
Position ids exist + 2. Flattened sequences only are supported + 3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. we have multiple increasing sequences + """ + if position_ids is None: + return False + + if position_ids.ndim == 1: + position_ids = position_ids.unsqueeze(0) # [N] -> [1, N] + + increasing_position_sequences = ( + torch.arange(position_ids.shape[1], device=position_ids.device) + + position_ids.min() + ) + return ( + batch_size == 1 + and (increasing_position_sequences - position_ids).abs().sum().bool().item() + ) + + # Store original method + old_fn = modeling_flash_attention_utils._is_packed_sequence + + # Apply the patch + modeling_flash_attention_utils._is_packed_sequence = fixed_is_packed_sequence + + def unpatch(): + """Restore the original method""" + modeling_flash_attention_utils._is_packed_sequence = old_fn + + return unpatch diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 4b06eb4c8..5e7c1456a 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -11,6 +11,7 @@ from transformers.image_utils import load_image from axolotl.utils.dict import remove_none_values from axolotl.utils.logging import get_logger +from axolotl.utils.mistral.mistral3_processor import Mistral3Processor LOG = get_logger(__name__) @@ -421,6 +422,36 @@ class SmolVLM2ProcessingStrategy(ProcessingStrategy): ] +class Mistral3ProcessingStrategy(ProcessingStrategy): + """Processing Strategy class for Mistral3""" + + def __init__( + self, + processor: Mistral3Processor, + chat_template: Optional[str] = None, + image_size: int | tuple[int, int] | None = None, + image_resize_algorithm: Resampling | None = None, + ): + super().__init__(processor, chat_template, image_size, image_resize_algorithm) + special_ids = ( + processor.tokenizer.tokenizer.instruct_tokenizer.image_encoder.special_ids + ) + + self.image_token = special_ids.img + self.image_break_token = special_ids.img_break + self.image_end_token = special_ids.img_end + + def process_labels(self, input_ids): + labels = input_ids.clone() + + labels[labels == self.processor.tokenizer.pad_token_id] = -100 + labels[labels == self.image_token] = -100 + labels[labels == self.image_break_token] = -100 + labels[labels == self.image_end_token] = -100 + + return labels + + def get_processing_strategy( processor: ProcessorMixin, chat_template, @@ -463,6 +494,11 @@ def get_processing_strategy( **processing_kwargs, ) + if isinstance(processor, Mistral3Processor): + return Mistral3ProcessingStrategy( + **processing_kwargs, + ) + # llama3_2_vision, llama4, llava # mistral_v7_tekken, pixtral, lfm2vl return ProcessingStrategy( diff --git a/src/axolotl/utils/mistral/__init__.py b/src/axolotl/utils/mistral/__init__.py index eb1e2df89..eb51031ec 100644 --- a/src/axolotl/utils/mistral/__init__.py +++ b/src/axolotl/utils/mistral/__init__.py @@ -1,5 +1,6 @@ """Init for `axolotl.utils.mistral` module.""" +from axolotl.utils.mistral.mistral3_processor import Mistral3Processor from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer -__all__ = ["HFMistralTokenizer"] +__all__ = ["HFMistralTokenizer", "Mistral3Processor"] diff --git a/src/axolotl/utils/mistral/mistral3_processor.py b/src/axolotl/utils/mistral/mistral3_processor.py new file mode 100644 index 000000000..85479ca7b --- /dev/null +++ b/src/axolotl/utils/mistral/mistral3_processor.py @@ -0,0 +1,169 @@ +"""Processor for Mistral3 multimodal models with image support""" + +from typing import 
Any, Dict, Optional, Union + +import torch +from transformers import ProcessorMixin +from transformers.feature_extraction_utils import BatchFeature +from transformers.processing_utils import ProcessingKwargs +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput + +from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer + + +class Mistral3ProcessorKwargs(ProcessingKwargs): + _defaults: Dict[str, Dict[str, Any]] = { + "text_kwargs": { + "padding": True, + }, + "common_kwargs": { + "return_tensors": "pt", + "return_dict": True, + "tokenize": True, + }, + } + + +class Mistral3Processor(ProcessorMixin): + """ + Processor for Mistral3 multimodal models that handles text and images. + Wraps HFMistralTokenizer and adds image processing capabilities. + """ + + attributes = ["tokenizer"] + tokenizer_class = "HFMistralTokenizer" + + def __init__(self, tokenizer: HFMistralTokenizer): + # Don't call super().__init__ to avoid the class validation issue + self.tokenizer = tokenizer + + @property + def chat_template(self) -> None: + """Chat template is not supported. Dummy method to satisfy HuggingFace API.""" + return None + + @property + def audio_tokenizer(self) -> None: + """Audio tokenizer is not supported. Dummy method to satisfy HuggingFace API.""" + return None + + def _merge_kwargs( + self, processor_kwargs_class: Any, **kwargs: Any + ) -> Dict[str, Dict[str, Any]]: + """Merge kwargs with defaults similar to ProcessorMixin""" + defaults = processor_kwargs_class._defaults + output_kwargs: Dict[str, Dict[str, Any]] = {} + + for kwarg_type, default_values in defaults.items(): + output_kwargs[kwarg_type] = {**default_values} + + # Update with provided kwargs + for key, value in kwargs.items(): + # Try to match key to appropriate kwarg type + if key in ["padding", "truncation", "max_length"]: + output_kwargs.setdefault("text_kwargs", {}).update({key: value}) + elif key in ["return_tensors", "return_dict", "tokenize"]: + output_kwargs.setdefault("common_kwargs", {}).update({key: value}) + else: + # Add to text_kwargs by default + output_kwargs.setdefault("text_kwargs", {}).update({key: value}) + + return output_kwargs + + def apply_chat_template( + self, + conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]], + **kwargs: Any, + ) -> Union[BatchFeature, str, list[str]]: + """ + Apply chat template with image support for Mistral3. + + Similar to VoxtralProcessor, this method extracts images from the conversation, + calls the tokenizer's apply_chat_template, then adds pixel_values and image_sizes + to the result. + """ + output_kwargs = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs) + text_kwargs = output_kwargs["text_kwargs"] + common_kwargs = output_kwargs["common_kwargs"] + + return_tensors = common_kwargs.pop("return_tensors", "pt") + if return_tensors != "pt": + raise ValueError( + f"{self.__class__.__name__} only supports `return_tensors='pt'`." 
+ ) + + return_dict = common_kwargs.pop("return_dict", False) + tokenize = common_kwargs.pop("tokenize", False) + + # Determine if batched + if isinstance(conversation, (list, tuple)) and ( + isinstance(conversation[0], (list, tuple)) + or hasattr(conversation[0], "content") + ): + is_batched = True + conversations = conversation + else: + is_batched = False + conversations = [conversation] # type: ignore + + # Call tokenizer's apply_chat_template + tokenizer_kwargs = {**text_kwargs, **common_kwargs} + tokenizer_kwargs["return_tensors"] = return_tensors + tokenizer_kwargs["tokenize"] = tokenize + tokenizer_kwargs["return_dict"] = return_dict + + encoded_instruct_inputs = self.tokenizer.apply_chat_template( + conversations, + **tokenizer_kwargs, + ) + + if tokenize: + if return_dict: + # The tokenizer already handles pixel_values, we just need to add image_sizes + if hasattr(encoded_instruct_inputs, "items"): + data: Dict[str, Any] = dict(encoded_instruct_inputs) # type: ignore + elif hasattr(encoded_instruct_inputs, "data"): + data = encoded_instruct_inputs.data # type: ignore + else: + raise ValueError("Unknown data type") + + if "pixel_values" in data: + pixel_values = data["pixel_values"] + + # MistralTokenizer returns a Double, so we convert to fp32 + data["pixel_values"] = pixel_values.to(dtype=torch.float32) + + # Always batched: [B, C, H, W] -> image_sizes: [B, 2] + # Since tensor is homogeneous, all images have same H, W + batch_size = pixel_values.shape[0] + image_sizes = torch.tensor([pixel_values.shape[-2:]] * batch_size) + data["image_sizes"] = image_sizes + + return BatchFeature(data=data, tensor_type=return_tensors) + + if not is_batched: + return encoded_instruct_inputs[0] + + return encoded_instruct_inputs + + def __call__( + self, + text: Optional[ + Union[ + TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput] + ] + ], + **kwargs: Any, + ) -> BatchFeature: + """ + Forward text processing to the tokenizer. + This method does not support images - use apply_chat_template instead. 
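+
+        Illustrative usage (a sketch, assuming a loaded `HFMistralTokenizer`
+        instance named `tokenizer`; not part of the original API docs)::
+
+            processor = Mistral3Processor(tokenizer=tokenizer)
+            batch = processor("Hello, world!", return_tensors="pt")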
+ """ + output_kwargs = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs) + text_kwargs = output_kwargs["text_kwargs"] + common_kwargs = output_kwargs["common_kwargs"] + + out = self.tokenizer(text, **text_kwargs) + return BatchFeature( + data=out, tensor_type=common_kwargs.pop("return_tensors", None) + ) diff --git a/tests/monkeypatch/test_mistral_tokenizer_patch.py b/tests/monkeypatch/test_mistral_tokenizer_patch.py new file mode 100644 index 000000000..cb82c0890 --- /dev/null +++ b/tests/monkeypatch/test_mistral_tokenizer_patch.py @@ -0,0 +1,35 @@ +"""Integration tests for MistralCommonTokenizer patches.""" + +import pytest + + +class TestMistralTokenizerPatchIntegration: + """Test MistralCommonTokenizer patch integration.""" + + @pytest.mark.integration + def test_mistral_tokenizer_image_patch(self): + """Test that MistralCommonTokenizer image patch can be applied.""" + try: + from transformers.tokenization_mistral_common import MistralCommonTokenizer + except ImportError: + pytest.skip("MistralCommonTokenizer not available") + + from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import ( + apply_mistral_tokenizer_image_patch, + ) + + # Store original method + original_apply_chat_template = MistralCommonTokenizer.apply_chat_template + + # Apply patch + apply_mistral_tokenizer_image_patch() + + # Verify patch was applied + assert ( + MistralCommonTokenizer.apply_chat_template != original_apply_chat_template + ), "apply_chat_template was not patched" + + # Verify the method is still callable + assert callable(MistralCommonTokenizer.apply_chat_template), ( + "Patched method is not callable" + ) diff --git a/tests/monkeypatch/test_pixtral_flash_attention_patch.py b/tests/monkeypatch/test_pixtral_flash_attention_patch.py new file mode 100644 index 000000000..285fde41e --- /dev/null +++ b/tests/monkeypatch/test_pixtral_flash_attention_patch.py @@ -0,0 +1,77 @@ +"""Integration tests for Pixtral Flash Attention patches.""" + +import pytest +import torch + + +class TestPixtralFlashAttentionPatchIntegration: + """Test Pixtral Flash Attention patch integration.""" + + @pytest.mark.integration + def test_pixtral_flash_attention_patch(self): + """Test that Pixtral Flash Attention patch can be applied and works correctly.""" + try: + from transformers import modeling_flash_attention_utils + except ImportError: + pytest.skip("Flash Attention utils not available") + + from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import ( + apply_patch_is_packed_sequence, + ) + + # Store original method + original_is_packed_sequence = modeling_flash_attention_utils._is_packed_sequence + + # Apply patch and get unpatch function + unpatch_fn = apply_patch_is_packed_sequence() + + # Verify patch was applied + assert ( + modeling_flash_attention_utils._is_packed_sequence + != original_is_packed_sequence + ), "_is_packed_sequence was not patched" + + # Test the patched function with 1D position_ids + patched_fn = modeling_flash_attention_utils._is_packed_sequence + + # Test 1D position_ids 1 sequence + position_ids_1d = torch.tensor([0, 1, 2, 3]) + result = patched_fn(position_ids_1d, batch_size=1) + assert isinstance(result, bool), "Function should return a boolean" + assert result is False, "1D sequential position_ids should not be packed" + + # Test 1D packed 2 sequences + position_ids_1d_packed = torch.tensor([0, 1, 2, 0, 1, 2]) + result = patched_fn(position_ids_1d_packed, batch_size=1) + assert isinstance(result, bool), "Function should return a boolean" + assert result is 
True, "1D packed position_ids should be detected as packed" + + # Test 2D packed 2 sequences + position_ids_2d_packed = torch.tensor([[0, 1, 2, 3, 0, 1]]) + result = patched_fn(position_ids_2d_packed, batch_size=1) + assert isinstance(result, bool), "Function should return a boolean" + assert result is True, "2D packed position_ids should be detected as packed" + + # Test 2D 1 sequence + position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5]]) + result = patched_fn(position_ids_2d_normal, batch_size=1) + assert isinstance(result, bool), "Function should return a boolean" + assert result is False, "2D sequential position_ids should not be packed" + + # Test 2D batch size 2 + position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]]) + result = patched_fn(position_ids_2d_normal, batch_size=2) + assert isinstance(result, bool), "Function should return a boolean" + assert result is False, "2D position_ids batch 2 should not be packed" + + # Test None case + result = patched_fn(None, batch_size=1) + assert isinstance(result, bool), "Function should return a boolean" + assert result is False, "None position_ids should return False" + + # Test unpatch function + unpatch_fn() + assert ( + modeling_flash_attention_utils._is_packed_sequence + == original_is_packed_sequence + ), "unpatch function did not restore original method" diff --git a/tests/monkeypatch/test_voxtral_modeling_patch.py b/tests/monkeypatch/test_voxtral_modeling_patch.py new file mode 100644 index 000000000..878bbc185 --- /dev/null +++ b/tests/monkeypatch/test_voxtral_modeling_patch.py @@ -0,0 +1,43 @@ +"""Integration tests for Voxtral modeling patches.""" + +import pytest + + +class TestVoxtralModelingPatchIntegration: + """Test Voxtral modeling patch integration.""" + + @pytest.mark.integration + def test_voxtral_conditional_generation_patch(self): + """Test that Voxtral conditional generation patch can be applied.""" + try: + from transformers.models.voxtral.modeling_voxtral import ( + VoxtralForConditionalGeneration, + ) + except ImportError: + pytest.skip("VoxtralForConditionalGeneration not available") + + from axolotl.monkeypatch.models.voxtral.modeling import ( + patch_voxtral_conditional_generation_forward, + ) + + # Store original method + original_forward = VoxtralForConditionalGeneration.forward + + # Apply patch and get unpatch function + unpatch_fn = patch_voxtral_conditional_generation_forward() + + # Verify patch was applied + assert VoxtralForConditionalGeneration.forward != original_forward, ( + "forward method was not patched" + ) + + # Verify the method is still callable + assert callable(VoxtralForConditionalGeneration.forward), ( + "Patched method is not callable" + ) + + # Test unpatch function + unpatch_fn() + assert VoxtralForConditionalGeneration.forward == original_forward, ( + "unpatch function did not restore original method" + ) From c51d6b06c3b25cfc94dd2ca08fc31b3a73ac4c29 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 19 Sep 2025 17:34:04 +0700 Subject: [PATCH 056/115] feat: add apertus model and cce (#3144) [skip ci] * feat: add apertus, glm4v, glm4v_moe cce * fix: arcee docs * feat: add apertus * feat: added vram usage * fix: add apertus note * feat: update doc on apertus xielu * fix: add monkeypatch for xielu activation issue * fix: simplify env * feat: pin commit * feat: add packing * chore: move patch calling * Update examples/apertus/README.md Co-authored-by: salman * Update examples/apertus/README.md Co-authored-by: salman * Update examples/apertus/README.md Co-authored-by: 
salman

---------

Co-authored-by: salman
---
 examples/apertus/README.md                    | 110 ++++++++++++++++++
 examples/apertus/apertus-8b-qlora.yaml        |  64 ++++++++++
 examples/arcee/README.md                      |   3 +
 .../colab-axolotl-example.ipynb               |   2 +-
 scripts/cutcrossentropy_install.py            |   2 +-
 .../integrations/cut_cross_entropy/README.md  |   2 +-
 .../cut_cross_entropy/__init__.py             |   2 +-
 src/axolotl/loaders/patch_manager.py          |  12 +-
 .../monkeypatch/models/apertus/__init__.py    |   0
 .../monkeypatch/models/apertus/activation.py  |  52 +++++++
 src/axolotl/monkeypatch/multipack.py          |   1 +
 11 files changed, 245 insertions(+), 5 deletions(-)
 create mode 100644 examples/apertus/README.md
 create mode 100644 examples/apertus/apertus-8b-qlora.yaml
 create mode 100644 src/axolotl/monkeypatch/models/apertus/__init__.py
 create mode 100644 src/axolotl/monkeypatch/models/apertus/activation.py

diff --git a/examples/apertus/README.md b/examples/apertus/README.md
new file mode 100644
index 000000000..774286333
--- /dev/null
+++ b/examples/apertus/README.md
@@ -0,0 +1,110 @@
+# Finetune Swiss-AI's Apertus with Axolotl
+
+[Apertus](https://huggingface.co/collections/swiss-ai/apertus-llm-68b699e65415c231ace3b059) is a family of open-source models trained by Swiss AI.
+
+This guide shows how to fine-tune it with Axolotl using multi-turn conversations and proper masking.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from `main`, as Apertus support is currently nightly-only, or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+
+   Here is an example of how to install from `main` with pip:
+
+```bash
+# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'
+
+# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+python scripts/cutcrossentropy_install.py | sh
+```
+
+2. (Optional, highly recommended) Install the XIELU CUDA extension:
+
+```bash
+## Recommended for reduced VRAM and faster speeds
+
+# Point to CUDA toolkit directory
+# For those using our Docker image, use the below path.
+export CUDA_HOME=/usr/local/cuda
+
+pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
+```
+
+For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues).
+
+3. Run the finetuning example:
+
+```bash
+axolotl train examples/apertus/apertus-8b-qlora.yaml
+```
+
+This config uses about 8.7 GiB VRAM.
+
+Let us know how it goes. Happy finetuning! 🚀
+
+### Tips
+
+- For inference, the official Apertus team recommends `top_p=0.9` and `temperature=0.8`.
+- You can instead use full-parameter fine-tuning by removing `adapter: qlora` and `load_in_4bit: true` from the config.
+- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
+
+### XIELU Installation Issues
+
+#### `ModuleNotFoundError: No module named 'torch'`
+
+Please check these one by one:
+- You are running in the correct environment
+- The environment has PyTorch installed
+- The CUDA toolkit is at `CUDA_HOME`
+
+If those didn't help, please try the solutions below:
+
+1.
Pass env for CMAKE and try install again: + + ```bash + Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps + ``` + +2. Git clone the repo and manually hardcode python path: + + ```bash + git clone https://github.com/nickjbrowning/XIELU + cd xielu + git checkout 59d6031 + + cd xielu + nano CMakeLists.txt # or vi depending on your preference + ``` + + ```diff + execute_process( + - COMMAND ${Python_EXECUTABLE} -c "import torch.utils; print(torch.utils.cmake_prefix_path)" + + COMMAND /root/miniconda3/envs/py3.11/bin/python -c "import torch.utils; print(torch.utils.cmake_prefix_path)" + RESULT_VARIABLE TORCH_CMAKE_PATH_RESULT + OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT + ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR + ) + ``` + + ```bash + pip3 install . --no-build-isolation --no-deps + ``` + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Related Resources + +- [Apertus Tech Report](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/apertus/apertus-8b-qlora.yaml b/examples/apertus/apertus-8b-qlora.yaml new file mode 100644 index 000000000..521b282da --- /dev/null +++ b/examples/apertus/apertus-8b-qlora.yaml @@ -0,0 +1,64 @@ +base_model: swiss-ai/Apertus-8B-Instruct-2509 + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true +lora_target_modules: + - gate_proj + - down_proj + - up_proj + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/arcee/README.md b/examples/arcee/README.md index 217893306..23f63663e 100644 --- a/examples/arcee/README.md +++ b/examples/arcee/README.md @@ -19,6 +19,9 @@ cd axolotl pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja pip3 install --no-build-isolation -e '.[flash-attn]' + +# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy +python scripts/cutcrossentropy_install.py | sh ``` 2. 
Run the finetuning example: diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index 774b78b82..e63632e7c 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -40,7 +40,7 @@ "%%capture\n", "# This step can take ~5-10 minutes to install dependencies\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", - "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5\"" + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc\"" ] }, { diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index 5b49e7427..ada574805 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else "" print( UNINSTALL_PREFIX - + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc"' ) diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index 393412f64..2361dde4a 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc" ``` ## Usage diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index d0eb1ebdb..dad3f7f89 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -35,7 +35,7 @@ LOG = get_logger(__name__) _CCE_INSTALL_MESSAGE = ( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c6a32c5"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc"`' ) diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 98eb07b0f..a78f8b965 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -68,11 +68,12 @@ class PatchManager: self._apply_self_attention_lora_patch() self._apply_fsdp2_bnb_patches() self._apply_patch_deepspeed_zero3() + self._apply_voxtral_patches() + self._apply_apertus_patches() def apply_post_plugin_pre_model_load_patches(self): """Apply post plugin-pre_model_load load patches based on config.""" self._apply_tiled_mlp(self.cfg.model_config_type) - self._apply_voxtral_patches() def _apply_transformers_patches(self): from axolotl.monkeypatch.transformers.trainer_loss_calc import ( @@ -493,3 +494,12 @@ class PatchManager: apply_deepspeed_patches() except ImportError as e: LOG.warning(f"DeepSpeed patches not applied: {e}") + + def 
_apply_apertus_patches(self):
+        """Apply patches for Apertus model."""
+        if self.cfg.model_config_type == "apertus":
+            from axolotl.monkeypatch.models.apertus.activation import (
+                patch_apertus_xielu_activation,
+            )
+
+            patch_apertus_xielu_activation()
diff --git a/src/axolotl/monkeypatch/models/apertus/__init__.py b/src/axolotl/monkeypatch/models/apertus/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/src/axolotl/monkeypatch/models/apertus/activation.py b/src/axolotl/monkeypatch/models/apertus/activation.py
new file mode 100644
index 000000000..d5470aceb
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/apertus/activation.py
@@ -0,0 +1,52 @@
+"""Monkeypatch for Apertus to fix a dtype mismatch in the xIELU activation."""
+
+from torch import Tensor
+
+
+def patch_apertus_xielu_activation():
+    """Patch `XIELUActivation._xielu_cuda` to cast its parameters to the input dtype."""
+    try:
+        from transformers.activations import XIELUActivation
+    except ImportError as err:
+        raise ImportError(
+            "Cannot import XIELUActivation. "
+            "Please make sure to update your transformers version >= 4.56.1."
+        ) from err
+
+    from transformers.activations import logger
+
+    # Store the original method
+    old_fn = XIELUActivation._xielu_cuda
+
+    def _xielu_cuda_fixed(self, x: Tensor) -> Tensor:
+        """Firewall function to prevent torch.compile from seeing .item() calls"""
+        original_shape = x.shape
+        # CUDA kernel expects 3D tensors, reshape if needed
+        while x.dim() < 3:
+            x = x.unsqueeze(0)
+        if x.dim() > 3:
+            x = x.view(-1, 1, x.size(-1))
+        if original_shape != x.shape:
+            logger.warning_once(
+                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
+                original_shape,
+                x.shape,
+            )
+        result = self._xielu_cuda_obj.forward(
+            x,
+            self.alpha_p.to(x.dtype),
+            self.alpha_n.to(x.dtype),
+            # Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
+            self._beta_scalar,
+            self._eps_scalar,
+            self.with_vector_loads,
+        )
+        return result.view(original_shape)
+
+    # Apply the patch
+    XIELUActivation._xielu_cuda = _xielu_cuda_fixed
+
+    def unpatch():
+        """Restore the original method"""
+        XIELUActivation._xielu_cuda = old_fn
+
+    return unpatch
diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py
index a32430d9f..726e60111 100644
--- a/src/axolotl/monkeypatch/multipack.py
+++ b/src/axolotl/monkeypatch/multipack.py
@@ -11,6 +11,7 @@ from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
 from axolotl.monkeypatch.utils import get_unpad_data
 
 SUPPORTED_MULTIPACK_MODEL_TYPES = [
+    "apertus",
     "mllama_text_model",
     "llama",
     "llama4",

From 7be8740c5c0a2ced61eb9c3ac2afa220baf4ff1e Mon Sep 17 00:00:00 2001
From: AlexHT Hung
Date: Fri, 19 Sep 2025 18:34:28 +0800
Subject: [PATCH 057/115] fix(rl): pass max_prompt_len to training args as
 max_prompt_length (#3113)

* pass max_prompt_len to training args as max_prompt_length

* Update rl.py

* refactor

* format

* fix: default for max_prompt_length

* fix: defaults for trainer

---------

Co-authored-by: NanoCode012
---
 src/axolotl/core/builders/rl.py           | 18 +++++++++++++-----
 src/axolotl/core/trainers/dpo/__init__.py |  1 -
 src/axolotl/utils/schemas/config.py       |  4 ++--
 3 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/src/axolotl/core/builders/rl.py b/src/axolotl/core/builders/rl.py
index a6e8355f4..0ceb80008 100644
--- a/src/axolotl/core/builders/rl.py
+++ b/src/axolotl/core/builders/rl.py
@@ -120,6 +120,11 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         if self.cfg.use_wandb:
             training_args_kwargs["run_name"] = self.cfg.wandb_name
 
+        if self.cfg.max_prompt_len:
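+            # use the configured prompt cap when set; otherwise fall back to
+            # sequence_len (see the else branch below)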
+ training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len + else: + training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len + training_args_cls = None blocklist_args_kwargs = [] if self.cfg.rl is RLType.SIMPO: @@ -129,10 +134,16 @@ class HFRLTrainerBuilder(TrainerBuilderBase): if self.cfg.cpo_alpha is not None: training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha + # Handle when max_prompt_length == max_length from defaults + # CPOTrainer requires strictly less than + if ( + training_args_kwargs["max_prompt_length"] + == training_args_kwargs["max_length"] + ): + training_args_kwargs["max_prompt_length"] -= 1 + elif self.cfg.rl is RLType.ORPO: training_args_cls = AxolotlORPOConfig - if self.cfg.max_prompt_len: - training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len elif self.cfg.rl is RLType.KTO: training_args_cls = AxolotlKTOConfig @@ -144,9 +155,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase): self.cfg.kto_undesirable_weight or 1.0 ) - if self.cfg.max_prompt_len: - training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len - elif self.cfg.rl is RLType.GRPO: training_args_cls = GRPOStrategy.get_training_args_class() training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg)) diff --git a/src/axolotl/core/trainers/dpo/__init__.py b/src/axolotl/core/trainers/dpo/__init__.py index 4b40d4085..3aa79c484 100644 --- a/src/axolotl/core/trainers/dpo/__init__.py +++ b/src/axolotl/core/trainers/dpo/__init__.py @@ -27,7 +27,6 @@ class DPOStrategy: training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing training_args_kwargs["max_completion_length"] = None training_args_kwargs["max_length"] = cfg.sequence_len - training_args_kwargs["max_prompt_length"] = cfg.sequence_len training_args_kwargs["generate_during_eval"] = cfg.dpo_generate_during_eval if cfg.dpo_use_weighting is not None: training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index d612ec8a5..0177b19f6 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -436,8 +436,8 @@ class AxolotlInputConfig( }, ) min_sample_len: int | None = None - max_prompt_len: int = Field( - default=512, + max_prompt_len: int | None = Field( + default=None, json_schema_extra={"description": "maximum prompt length for RL training"}, ) sample_packing: bool | None = Field( From 08d831c3d5b567b76bfa03df536fca17af9f4a58 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 23 Sep 2025 11:31:15 +0700 Subject: [PATCH 058/115] Feat: add qwen3-next (w packing+cce) (#3150) * feat: upgrade cce for qwen3-next * feat: add sample qwen3 config * feat: add packing patch for chunk_gated_delta_rule * feat: add qwen3 link * fix: tuple name * feat: add tested qwen3 config * fix: improve log * feat: add patch for fla without packing * fix: remove fla patch for standard mode * feat: enable packing * feat: add qwen3-next tests * chore: move tests --- .../colab-axolotl-example.ipynb | 2 +- examples/qwen3-next/README.md | 64 ++++ .../qwen3-next/qwen3-next-80b-a3b-qlora.yaml | 60 ++++ scripts/cutcrossentropy_install.py | 2 +- .../integrations/cut_cross_entropy/README.md | 3 +- .../cut_cross_entropy/__init__.py | 2 +- src/axolotl/loaders/patch_manager.py | 7 + .../monkeypatch/models/qwen3_next/__init__.py | 1 + .../monkeypatch/models/qwen3_next/modeling.py | 317 ++++++++++++++++++ src/axolotl/monkeypatch/multipack.py | 1 + .../test_qwen3_next_modeling_patch.py | 111 ++++++ 11 files 
changed, 566 insertions(+), 4 deletions(-)
 create mode 100644 examples/qwen3-next/README.md
 create mode 100644 examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
 create mode 100644 src/axolotl/monkeypatch/models/qwen3_next/__init__.py
 create mode 100644 src/axolotl/monkeypatch/models/qwen3_next/modeling.py
 create mode 100644 tests/monkeypatch/test_qwen3_next_modeling_patch.py

diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb
index e63632e7c..b48331063 100644
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef\""
   ]
  },
 {
diff --git a/examples/qwen3-next/README.md b/examples/qwen3-next/README.md
new file mode 100644
index 000000000..eb0d5fd28
--- /dev/null
+++ b/examples/qwen3-next/README.md
@@ -0,0 +1,64 @@
+# Finetune Qwen3-Next with Axolotl
+
+[Qwen3-Next](https://huggingface.co/collections/Qwen/qwen3-next-68c25fd6838e585db8eeea9d) is the next generation of foundation models, optimized for extreme context length and large-scale parameter efficiency. The series introduces architectural innovations including Hybrid Attention (Gated DeltaNet + Gated Attention), High-Sparsity MoE with a 1:50 activation ratio, and Multi-Token Prediction for enhanced performance and inference acceleration.
+
+This guide shows how to fine-tune it with Axolotl using multi-turn conversations and proper masking.
+
+## Getting started
+
+1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from `main`, as Qwen3-Next support is currently nightly-only, or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
+
+   Here is an example of how to install from `main` with pip:
+
+```bash
+# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
+git clone https://github.com/axolotl-ai-cloud/axolotl.git
+cd axolotl
+
+pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
+pip3 install --no-build-isolation -e '.[flash-attn]'
+
+# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+python scripts/cutcrossentropy_install.py | sh
+```
+
+2. Install the pinned Qwen3-Next transformers commit:
+```bash
+pip3 uninstall -y transformers && pip3 install "git+https://github.com/huggingface/transformers.git@b9282355bea846b54ed850a066901496b19da654"
+```
+
+3. Install FLA (flash-linear-attention) for improved performance:
+```bash
+pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.3.2
+```
+
+4. Run the finetuning example:
+
+```bash
+axolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
+```
+
+This config uses about 41.7 GiB VRAM.
+
+Let us know how it goes. Happy finetuning! 🚀
+
+### Tips
+
+- For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
+- You can run a full finetuning by removing `adapter: qlora` and `load_in_4bit: true` from the config. See the [Multi-GPU](#optimization-guides) section below.
+- Read more on how to load your own dataset in the [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
+- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template). + +## Optimization Guides + +- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) +- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) + +## Related Resources + +- [Qwen3-Next Blog](https://qwenlm.github.io/blog/qwen3_next/) +- [Axolotl Docs](https://docs.axolotl.ai) +- [Axolotl Website](https://axolotl.ai) +- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) +- [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml new file mode 100644 index 000000000..11481dcd3 --- /dev/null +++ b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml @@ -0,0 +1,60 @@ +base_model: Qwen/Qwen3-Next-80B-A3B-Instruct + +# Automatically upload checkpoint and final model to HF +# hub_model_id: username/custom_model_name + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: false +load_in_4bit: true + +datasets: + - path: fozziethebeat/alpaca_messages_2k_test + type: chat_template + +dataset_prepared_path: last_run_prepared +val_set_size: 0.1 +output_dir: ./outputs/lora-out + +adapter: qlora +lora_model_dir: + +sequence_len: 2048 +sample_packing: true + +lora_r: 16 +lora_alpha: 8 +lora_dropout: 0.05 +lora_target_modules: + - q_proj + - v_proj + - k_proj + - o_proj + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 2 +num_epochs: 1 +optimizer: adamw_bnb_8bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 1 +saves_per_epoch: 1 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index ada574805..dc117604a 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else "" print( UNINSTALL_PREFIX - + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"' ) diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index 2361dde4a..cc73eebb7 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef" ``` ## Usage @@ -65,6 +65,7 @@ plugins: - qwen2_5_vl - qwen3 - qwen3_moe +- qwen3_next - smollm3 - seed_oss - voxtral diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py 
b/src/axolotl/integrations/cut_cross_entropy/__init__.py index dad3f7f89..812baf33f 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -35,7 +35,7 @@ LOG = get_logger(__name__) _CCE_INSTALL_MESSAGE = ( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c564afc"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"`' ) diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index a78f8b965..3d4b7b96b 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -169,6 +169,13 @@ class PatchManager: patch_llama4_linearized_modeling() + if self.cfg.model_config_type == "qwen3_next" and self.cfg.sample_packing: + from axolotl.monkeypatch.models.qwen3_next.modeling import ( + patch_qwen3_next_modeling_packing, + ) + + patch_qwen3_next_modeling_packing() + if self.cfg.model_config_type == "mistral3" and self.cfg.processor_type: from axolotl.monkeypatch.models.mistral3.mistral_common_tokenizer import ( apply_mistral_tokenizer_image_patch, diff --git a/src/axolotl/monkeypatch/models/qwen3_next/__init__.py b/src/axolotl/monkeypatch/models/qwen3_next/__init__.py new file mode 100644 index 000000000..39bcd4115 --- /dev/null +++ b/src/axolotl/monkeypatch/models/qwen3_next/__init__.py @@ -0,0 +1 @@ +"""Qwen3_Next model monkeypatches.""" diff --git a/src/axolotl/monkeypatch/models/qwen3_next/modeling.py b/src/axolotl/monkeypatch/models/qwen3_next/modeling.py new file mode 100644 index 000000000..d68992d0e --- /dev/null +++ b/src/axolotl/monkeypatch/models/qwen3_next/modeling.py @@ -0,0 +1,317 @@ +"""Monkeypatch for Qwen3_Next model to pass position_ids to linear attention.""" + +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F + +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + + +def get_cu_seqlens(position_ids): + """ + Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids. 
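+
+    Illustrative example (values mirror `test_get_cu_seqlens_utility`): packed
+    position_ids [0, 1, 2, 0, 1] yield cu_seqlens [0, 3, 5], i.e. one offset per
+    sequence start plus the total token count.
+
+    Reference: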
+ + https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316 + """ + tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device} + + position_ids = position_ids.view(-1) + indices_q = (position_ids == 0).nonzero().view(-1) + + cu_seq_lens_q = torch.cat( + ( + indices_q.to(**tensor_kwargs), + torch.tensor(position_ids.size(), **tensor_kwargs), + ) + ) + + return cu_seq_lens_q + + +def patch_qwen3_next_decoder_layer(): + """Patch Qwen3NextDecoderLayer to pass position_ids to linear attention.""" + try: + from transformers.models.qwen3_next.modeling_qwen3_next import ( + Qwen3NextDecoderLayer, + ) + except ImportError: + LOG.warning("Qwen3Next model not found, skipping patch") + return + + # Store original forward method + original_decoder_forward = Qwen3NextDecoderLayer.forward + + def patched_decoder_forward( + self, + hidden_states: torch.Tensor, + position_embeddings: Tuple[torch.Tensor, torch.Tensor], + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Tuple[torch.Tensor]] = None, + cache_position: Optional[torch.LongTensor] = None, + **kwargs, + ) -> torch.FloatTensor: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Token Mixer + if self.layer_type == "linear_attention": + hidden_states = self.linear_attn( + hidden_states=hidden_states, + cache_params=past_key_values, + cache_position=cache_position, + attention_mask=attention_mask, + position_ids=position_ids, + ) + elif self.layer_type == "full_attention": + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states) + # For the MoE layers, we need to unpack + if isinstance(hidden_states, Tuple): + hidden_states, _ = hidden_states + hidden_states = residual + hidden_states + + return hidden_states + + # Apply the patches + Qwen3NextDecoderLayer.forward = patched_decoder_forward + + def unpatch(): + """Restore the original forward method""" + Qwen3NextDecoderLayer.forward = original_decoder_forward + + return unpatch + + +def patch_qwen3_next_gateddelta_layer(): + """Patch Qwen3NextGatedDeltaNet to parse cu_seqlens and pass to chunk_gated_delta_rule""" + try: + from transformers.models.qwen3_next.modeling_qwen3_next import ( + Qwen3NextDynamicCache, + Qwen3NextGatedDeltaNet, + apply_mask_to_padding_states, + ) + except ImportError: + LOG.warning("Qwen3Next model not found, skipping patch") + return + + # Store original forward method + original_gated_delta_net_forward = Qwen3NextGatedDeltaNet.forward + + def patched_gated_delta_net_forward( + self, + hidden_states: torch.Tensor, + cache_params: Optional[Qwen3NextDynamicCache] = None, + cache_position: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + ): + hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask) + + # Set up dimensions for reshapes later + batch_size, seq_len, _ = hidden_states.shape + + use_precomputed_states = ( + cache_params is not None + and 
cache_params.has_previous_state + and seq_len == 1 + and cache_position is not None + ) + + # getting projected states from cache if it exists + if cache_params is not None: + conv_state = cache_params.conv_states[self.layer_idx] + recurrent_state = cache_params.recurrent_states[self.layer_idx] + + projected_states_qkvz = self.in_proj_qkvz(hidden_states) + projected_states_ba = self.in_proj_ba(hidden_states) + query, key, value, z, b, a = self.fix_query_key_value_ordering( + projected_states_qkvz, projected_states_ba + ) + query, key, value = ( + x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value) + ) + + mixed_qkv = torch.cat((query, key, value), dim=-1) + mixed_qkv = mixed_qkv.transpose(1, 2) + + if use_precomputed_states: + # 2. Convolution sequence transformation + # NOTE: the conv state is updated in `causal_conv1d_update` + mixed_qkv = self.causal_conv1d_update( + mixed_qkv, + conv_state, + self.conv1d.weight.squeeze(1), + self.conv1d.bias, + self.activation, + ) + else: + if cache_params is not None: + conv_state = F.pad( + mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0) + ) + cache_params.conv_states[self.layer_idx] = conv_state + if self.causal_conv1d_fn is not None: + mixed_qkv = self.causal_conv1d_fn( + x=mixed_qkv, + weight=self.conv1d.weight.squeeze(1), + bias=self.conv1d.bias, + activation=self.activation, + seq_idx=None, + ) + else: + mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len]) + + mixed_qkv = mixed_qkv.transpose(1, 2) + query, key, value = torch.split( + mixed_qkv, + [ + self.key_dim, + self.key_dim, + self.value_dim, + ], + dim=-1, + ) + query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim) + key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim) + value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim) + + beta = b.sigmoid() + # If the model is loaded in fp16, without the .float() here, A might be -inf + g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias) + if self.num_v_heads // self.num_k_heads > 1: + query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2) + + if not use_precomputed_states: + cu_seqlens = get_cu_seqlens(position_ids=position_ids) + core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule( + query, + key, + value, + g=g, + beta=beta, + initial_state=None, + output_final_state=cache_params is not None, + use_qk_l2norm_in_kernel=True, + cu_seqlens=cu_seqlens, + ) + + else: + core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule( + query, + key, + value, + g=g, + beta=beta, + initial_state=recurrent_state, + output_final_state=cache_params is not None, + use_qk_l2norm_in_kernel=True, + ) + + # Update cache + if cache_params is not None: + cache_params.recurrent_states[self.layer_idx] = last_recurrent_state + + z_shape_og = z.shape + # reshape input data into 2D tensor + core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1]) + z = z.reshape(-1, z.shape[-1]) + core_attn_out = self.norm(core_attn_out, z) + core_attn_out = core_attn_out.reshape(z_shape_og) + core_attn_out = core_attn_out.reshape( + core_attn_out.shape[0], core_attn_out.shape[1], -1 + ) + + output = self.out_proj(core_attn_out) + return output + + # Apply the patches + Qwen3NextGatedDeltaNet.forward = patched_gated_delta_net_forward + + def unpatch(): + """Restore the original forward method""" + Qwen3NextGatedDeltaNet.forward = 
original_gated_delta_net_forward + + return unpatch + + +def patch_qwen3_next_imports(): + """Patch Qwen3Next imports to use try/except instead of is_flash_linear_attention_available.""" + try: + import transformers.models.qwen3_next.modeling_qwen3_next as qwen3_modeling + except ImportError: + LOG.warning("Qwen3Next model not found, skipping import patch") + return + + # Save original values for unpatch + original_FusedRMSNormGated = getattr(qwen3_modeling, "FusedRMSNormGated", None) + original_chunk_gated_delta_rule = getattr( + qwen3_modeling, "chunk_gated_delta_rule", None + ) + original_fused_recurrent_gated_delta_rule = getattr( + qwen3_modeling, "fused_recurrent_gated_delta_rule", None + ) + original_is_fast_path_available = getattr( + qwen3_modeling, "is_fast_path_available", False + ) + + try: + from fla.modules import FusedRMSNormGated + from fla.ops.gated_delta_rule import ( + chunk_gated_delta_rule, + fused_recurrent_gated_delta_rule, + ) + + qwen3_modeling.FusedRMSNormGated = FusedRMSNormGated + qwen3_modeling.chunk_gated_delta_rule = chunk_gated_delta_rule + qwen3_modeling.fused_recurrent_gated_delta_rule = ( + fused_recurrent_gated_delta_rule + ) + + # Force is_fast_path_available to be True + # fla has triton kernels for causal_conv1d + qwen3_modeling.is_fast_path_available = True + except ImportError: + qwen3_modeling.chunk_gated_delta_rule = None + qwen3_modeling.fused_recurrent_gated_delta_rule = None + qwen3_modeling.FusedRMSNormGated = None + + def unpatch(): + """Restore the original import values""" + qwen3_modeling.FusedRMSNormGated = original_FusedRMSNormGated + qwen3_modeling.chunk_gated_delta_rule = original_chunk_gated_delta_rule + qwen3_modeling.fused_recurrent_gated_delta_rule = ( + original_fused_recurrent_gated_delta_rule + ) + qwen3_modeling.is_fast_path_available = original_is_fast_path_available + + return unpatch + + +def patch_qwen3_next_modeling_packing(): + """Apply all Qwen3Next model patches.""" + patch_qwen3_next_imports() + patch_qwen3_next_decoder_layer() + patch_qwen3_next_gateddelta_layer() + + LOG.info("Applied Qwen3Next patch for packing") diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index 726e60111..4741245e1 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -21,6 +21,7 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [ "qwen2_moe", "qwen3", "qwen3_moe", + "qwen3_next", "falcon", "phi", "phi3", diff --git a/tests/monkeypatch/test_qwen3_next_modeling_patch.py b/tests/monkeypatch/test_qwen3_next_modeling_patch.py new file mode 100644 index 000000000..91d9fc1cf --- /dev/null +++ b/tests/monkeypatch/test_qwen3_next_modeling_patch.py @@ -0,0 +1,111 @@ +"""Integration tests for Qwen3 Next modeling patches.""" + +import pytest +import torch + +# Skip entire module if qwen3_next not available +qwen3_next = pytest.importorskip("transformers.models.qwen3_next.modeling_qwen3_next") + + +class TestQwen3NextModelingPatchIntegration: + """Test Qwen3 Next modeling patch integration.""" + + @pytest.mark.integration + def test_qwen3_next_decoder_layer_patch(self): + """Test that Qwen3Next decoder layer patch can be applied.""" + from axolotl.monkeypatch.models.qwen3_next.modeling import ( + patch_qwen3_next_decoder_layer, + ) + + # Store original method + original_forward = qwen3_next.Qwen3NextDecoderLayer.forward + + # Apply patch and get unpatch function + unpatch_fn = patch_qwen3_next_decoder_layer() + + # Verify patch was applied + assert 
qwen3_next.Qwen3NextDecoderLayer.forward != original_forward, ( + "decoder layer forward method was not patched" + ) + + # Verify the method is still callable + assert callable(qwen3_next.Qwen3NextDecoderLayer.forward), ( + "Patched method is not callable" + ) + + # Test unpatch function + if unpatch_fn: + unpatch_fn() + assert qwen3_next.Qwen3NextDecoderLayer.forward == original_forward, ( + "unpatch function did not restore original method" + ) + + @pytest.mark.integration + def test_qwen3_next_gateddelta_layer_patch(self): + """Test that Qwen3Next GatedDeltaNet patch can be applied.""" + from axolotl.monkeypatch.models.qwen3_next.modeling import ( + patch_qwen3_next_gateddelta_layer, + ) + + # Store original method + original_forward = qwen3_next.Qwen3NextGatedDeltaNet.forward + + # Apply patch and get unpatch function + unpatch_fn = patch_qwen3_next_gateddelta_layer() + + # Verify patch was applied + assert qwen3_next.Qwen3NextGatedDeltaNet.forward != original_forward, ( + "GatedDeltaNet forward method was not patched" + ) + + # Verify the method is still callable + assert callable(qwen3_next.Qwen3NextGatedDeltaNet.forward), ( + "Patched method is not callable" + ) + + # Test unpatch function + if unpatch_fn: + unpatch_fn() + assert qwen3_next.Qwen3NextGatedDeltaNet.forward == original_forward, ( + "unpatch function did not restore original method" + ) + + @pytest.mark.integration + def test_qwen3_next_imports_patch(self): + """Test that Qwen3Next imports patch can be applied without errors.""" + from axolotl.monkeypatch.models.qwen3_next.modeling import ( + patch_qwen3_next_imports, + ) + + # Apply patch - should not raise any exceptions even if modules unavailable + unpatch_fn = patch_qwen3_next_imports() + + # Test that unpatch function is returned (or None if skipped) + assert unpatch_fn is None or callable(unpatch_fn), ( + "patch_qwen3_next_imports should return None or callable unpatch function" + ) + + @pytest.mark.integration + def test_qwen3_next_modeling_packing_patch(self): + """Test that all Qwen3Next modeling patches can be applied together.""" + from axolotl.monkeypatch.models.qwen3_next.modeling import ( + patch_qwen3_next_modeling_packing, + ) + + # This should not raise any exceptions + patch_qwen3_next_modeling_packing() + + +@pytest.mark.integration +def test_get_cu_seqlens_utility(): + """Test the get_cu_seqlens utility function.""" + from axolotl.monkeypatch.models.qwen3_next.modeling import get_cu_seqlens + + # Test with simple position_ids + position_ids = torch.tensor([[0, 1, 2, 0, 1]]) + cu_seqlens = get_cu_seqlens(position_ids) + assert cu_seqlens.dtype == torch.int32, "Should be int32 dtype" + + # Should return tensor with start positions and total length + expected = torch.tensor([0, 3, 5], dtype=torch.int32) + assert torch.equal(cu_seqlens, expected), f"Expected {expected}, got {cu_seqlens}" From 55d1be2ae6f037081d7bcca55d93753fc2e10702 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 23 Sep 2025 21:22:15 +0700 Subject: [PATCH 059/115] fix: unify default for conversations_field [skip-e2e] (#3070) * fix: unify default for conversations_field * fix: suggestion to remove defaults --- docs/multimodal.qmd | 1 - .../archived/deepcoder/deepcoder-14B-preview-lora.yml | 4 ---- .../deepcogito/cogito-v1-preview-llama-3B-lora.yml | 4 ---- .../deepcogito/cogito-v1-preview-qwen-14B-lora.yml | 4 ---- examples/gemma3/gemma-3-4b-vision-qlora.yml | 2 +- examples/llama-3/instruct-lora-8b.yml | 9 --------- .../llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml | 1 - 
examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml | 1 - examples/phi/lora-3.5.yaml | 9 --------- examples/qwen2-vl/lora-7b.yaml | 2 +- examples/qwen2_5-vl/lora-7b.yaml | 2 +- src/axolotl/core/datasets/transforms/chat_builder.py | 10 +++++----- 12 files changed, 8 insertions(+), 41 deletions(-) diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd index 413404195..3a28b579a 100644 --- a/docs/multimodal.qmd +++ b/docs/multimodal.qmd @@ -42,7 +42,6 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages # (optional) if doing lora, only finetune the Language model, # leave the vision model and vision tower frozen diff --git a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml index 2202091d5..3223ec19a 100644 --- a/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml +++ b/examples/archived/deepcoder/deepcoder-14B-preview-lora.yml @@ -9,10 +9,6 @@ strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template - field_messages: messages - message_property_mappings: - role: role - content: content dataset_prepared_path: val_set_size: 0.05 diff --git a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml index fc9a75e3f..97d1bb6b3 100644 --- a/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml @@ -9,10 +9,6 @@ strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template - field_messages: messages - message_property_mappings: - role: role - content: content dataset_prepared_path: val_set_size: 0.05 diff --git a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml index b527edc6f..b80cc5bc0 100644 --- a/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml +++ b/examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml @@ -9,10 +9,6 @@ strict: false datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template - field_messages: messages - message_property_mappings: - role: role - content: content dataset_prepared_path: val_set_size: 0.05 diff --git a/examples/gemma3/gemma-3-4b-vision-qlora.yml b/examples/gemma3/gemma-3-4b-vision-qlora.yml index e9e606b69..b42b6b492 100644 --- a/examples/gemma3/gemma-3-4b-vision-qlora.yml +++ b/examples/gemma3/gemma-3-4b-vision-qlora.yml @@ -18,7 +18,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages + dataset_prepared_path: last_run_prepared val_set_size: 0.01 output_dir: ./outputs/out diff --git a/examples/llama-3/instruct-lora-8b.yml b/examples/llama-3/instruct-lora-8b.yml index 69e17b9cf..401df1d72 100644 --- a/examples/llama-3/instruct-lora-8b.yml +++ b/examples/llama-3/instruct-lora-8b.yml @@ -12,15 +12,6 @@ chat_template: llama3 datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template - field_messages: messages - message_property_mappings: - role: role - content: content - roles: - user: - - user - assistant: - - assistant dataset_prepared_path: val_set_size: 0.05 diff --git a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml index 9975949bb..4136dc14a 100644 --- a/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml +++ b/examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml @@ -46,7 
+46,6 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages dataset_prepared_path: last_run_prepared val_set_size: 0.0 diff --git a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml index ac7e05659..5972c2ae3 100644 --- a/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml +++ b/examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml @@ -45,7 +45,6 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages dataset_prepared_path: last_run_prepared val_set_size: 0.0 diff --git a/examples/phi/lora-3.5.yaml b/examples/phi/lora-3.5.yaml index a6fa15d98..c10014dab 100644 --- a/examples/phi/lora-3.5.yaml +++ b/examples/phi/lora-3.5.yaml @@ -12,15 +12,6 @@ chat_template: phi_3 datasets: - path: fozziethebeat/alpaca_messages_2k_test type: chat_template - field_messages: messages - message_property_mappings: - role: role - content: content - roles: - user: - - user - assistant: - - assistant dataset_prepared_path: val_set_size: 0.05 diff --git a/examples/qwen2-vl/lora-7b.yaml b/examples/qwen2-vl/lora-7b.yaml index 8ea608199..285a35cbb 100644 --- a/examples/qwen2-vl/lora-7b.yaml +++ b/examples/qwen2-vl/lora-7b.yaml @@ -11,7 +11,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages + dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out diff --git a/examples/qwen2_5-vl/lora-7b.yaml b/examples/qwen2_5-vl/lora-7b.yaml index 13a97dec3..7d499d841 100644 --- a/examples/qwen2_5-vl/lora-7b.yaml +++ b/examples/qwen2_5-vl/lora-7b.yaml @@ -11,7 +11,7 @@ datasets: - path: HuggingFaceH4/llava-instruct-mix-vsft type: chat_template split: train[:1%] - field_messages: messages + dataset_prepared_path: last_run_prepared val_set_size: 0.0 output_dir: ./outputs/out diff --git a/src/axolotl/core/datasets/transforms/chat_builder.py b/src/axolotl/core/datasets/transforms/chat_builder.py index 8f2013027..0de0ecb40 100644 --- a/src/axolotl/core/datasets/transforms/chat_builder.py +++ b/src/axolotl/core/datasets/transforms/chat_builder.py @@ -8,7 +8,7 @@ from typing import Any, Mapping def chat_message_transform_builder( train_on_inputs=False, - conversations_field: str = "conversations", + conversations_field: str = "messages", message_field_role: str | list[str] | None = None, # commonly "role" message_field_content: str | list[str] | None = None, # commonly "content" message_field_training: str | list[str] | None = None, # commonly "weight" @@ -20,13 +20,13 @@ def chat_message_transform_builder( If True, the transform will train on the inputs. If False, the transform will train on the targets. Defaults to False. conversations_field (str, optional): - The field name of the conversations. Defaults to "conversations". + The field name of the conversations. Defaults to "messages". message_field_role (str | list[str], optional): - The field name of the role. Defaults to "role". + The field name of the role. message_field_content (str | list[str], optional): - The field name of the message content. Defaults to "content". + The field name of the message content. message_field_training (str | list[str], optional): - The field name of the train/weight. Defaults to "weight". + The field name of the train/weight. 
Returns: Callable: From b3b92687c4ba8792d343b6b1a616f541840db8b3 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Wed, 24 Sep 2025 13:48:38 +0700 Subject: [PATCH 060/115] chore: rename gemma3 270m config (#3174) --- examples/gemma3/{270m-qlora.yml => gemma-3-270m-qlora.yml} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/gemma3/{270m-qlora.yml => gemma-3-270m-qlora.yml} (100%) diff --git a/examples/gemma3/270m-qlora.yml b/examples/gemma3/gemma-3-270m-qlora.yml similarity index 100% rename from examples/gemma3/270m-qlora.yml rename to examples/gemma3/gemma-3-270m-qlora.yml From 6bc959342b3ff687cbaca700a310594190f3ab57 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Wed, 24 Sep 2025 13:18:44 -0400 Subject: [PATCH 061/115] remove unused dep (#3180) --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index 3a44f0ae9..3e642b57f 100644 --- a/setup.py +++ b/setup.py @@ -124,7 +124,6 @@ extras_require = { "ring-flash-attn": [ "flash-attn==2.8.3", "ring-flash-attn>=0.1.7", - "yunchang==0.6.0", ], "deepspeed": [ "deepspeed==0.17.5", From 856ff12171e262dc56f6b6b890a5e3e8e31b7dee Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 25 Sep 2025 03:13:49 +0700 Subject: [PATCH 062/115] feat(doc): add optimizations table of content to our improvements (#3175) [skip ci] * chore: format * feat: add usage for alst * chore: wording * feat: add optimizations doc * Apply suggestion from @SalmanMohammadi Co-authored-by: salman * Update docs/dataset-formats/index.qmd Co-authored-by: salman * feat: add alst, act offloading, nd parallelism, use relative links, and fix format * chore: comments --------- Co-authored-by: salman --- _quarto.yml | 1 + docs/dataset-formats/index.qmd | 2 +- docs/optimizations.qmd | 133 +++++++++++++++++++++++++++++++++ docs/qat.qmd | 1 + examples/alst/README.md | 21 ++++++ 5 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 docs/optimizations.qmd diff --git a/_quarto.yml b/_quarto.yml index 3ffb0e627..fad3f6786 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -267,6 +267,7 @@ website: - docs/dataset_loading.qmd - docs/qat.qmd - docs/quantize.qmd + - docs/optimizations.qmd - section: "Core Concepts" contents: diff --git a/docs/dataset-formats/index.qmd b/docs/dataset-formats/index.qmd index a0113db07..715e3ef20 100644 --- a/docs/dataset-formats/index.qmd +++ b/docs/dataset-formats/index.qmd @@ -61,7 +61,7 @@ While we recommend `.jsonl`, you can also use the other formats (`csv`, `parquet ### Pre-training without streaming -On the rare case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the `completion` format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming. +In the case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the `completion` format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming. One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs. diff --git a/docs/optimizations.qmd b/docs/optimizations.qmd new file mode 100644 index 000000000..967ec2d34 --- /dev/null +++ b/docs/optimizations.qmd @@ -0,0 +1,133 @@ +--- +title: Optimizations Guide +description: A guide to the performance and memory optimizations available in Axolotl. 
+---
+
+Axolotl includes numerous optimizations to speed up training, reduce memory usage, and handle large models.
+
+This guide provides a high-level overview and directs you to the detailed documentation for each feature.
+
+## Speed Optimizations
+
+These optimizations focus on increasing training throughput and reducing total training time.
+
+### Sample Packing
+
+Improves GPU utilization by combining multiple short sequences into a single packed sequence for training. This requires enabling one of the [attention](#attention-implementations) implementations below.
+
+- **Config:** `sample_packing: true`
+- **Learn more:** [Sample Packing](multipack.qmd)
+
+### Attention Implementations
+
+Using an optimized attention implementation is critical for training speed.
+
+- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
+- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
+- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
+- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.
+
+*Note: You should only enable one attention backend.*
+
+### LoRA Optimizations
+
+Leverages optimized kernels to accelerate LoRA training and reduce memory usage.
+
+- **Learn more:** [LoRA Optimizations Documentation](lora_optims.qmd)
+
+## Memory Optimizations
+
+These techniques help you fit larger models or use bigger batch sizes on your existing hardware.
+
+### Parameter Efficient Finetuning (LoRA & QLoRA)
+
+Drastically reduces memory by training a small set of "adapter" parameters instead of the full model. This is the most common and effective memory-saving technique.
+
+- **Examples:** Find configs with `lora` or `qlora` in the [examples directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-3).
+- **Config Reference:** See `adapter`, `load_in_4bit`, and `load_in_8bit` in the [Configuration Reference](config-reference.qmd).
+
+### Gradient Checkpointing & Activation Offloading
+
+These techniques save VRAM by changing how activations are handled.
+
+- **Gradient Checkpointing:** Re-computes activations during the backward pass, trading compute time for VRAM.
+- **Activation Offloading:** Moves activations to CPU RAM or disk, trading I/O overhead for VRAM.
+- **Learn more:** [Gradient Checkpointing and Offloading Docs](gradient_checkpointing.qmd)
+
+### Cut Cross Entropy (CCE)
+
+Reduces VRAM usage by using an optimized cross-entropy loss calculation.
+
+- **Learn more:** [Custom Integrations - CCE](custom_integrations.qmd#cut-cross-entropy)
+
+### Liger Kernels
+
+Provides efficient Triton kernels to improve training speed and reduce memory usage.
+
+- **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)
+
+## Long Context Models
+
+Techniques to train models on sequences longer than their original context window.
+
+### RoPE Scaling
+
+Extends a model's context window by interpolating its Rotary Position Embeddings.
+
+- **Config:** Pass the `rope_scaling` config under `overrides_of_model_config:`. To learn how to set RoPE, check the respective model's config.
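+
+A minimal sketch of what this can look like (the `rope_scaling` keys shown here, `rope_type` and `factor`, are illustrative only; the supported fields and values depend on the model, so check its config before copying this):
+
+```yaml
+overrides_of_model_config:
+  rope_scaling:
+    rope_type: linear
+    factor: 2.0
+```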
+
+### Sequence Parallelism
+
+Splits long sequences across multiple GPUs, enabling training with sequence lengths that would not fit on a single device.
+
+- **Learn more:** [Sequence Parallelism Documentation](sequence_parallelism.qmd)
+
+### Arctic Long Sequence Training (ALST)
+
+ALST is a recipe that combines several techniques to train long-context models efficiently. It typically involves:
+
+- TiledMLP to reduce memory usage in MLP layers.
+- Tiled Loss functions (like [CCE](#cut-cross-entropy-cce) or [Liger](#liger-kernels)).
+- Activation Offloading to CPU.
+
+- **Example:** [ALST Example Configuration](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst)
+
+## Large Models (Distributed Training)
+
+To train models that don't fit on a single GPU, you'll need to use a distributed training strategy like FSDP or DeepSpeed. These frameworks shard the model weights, gradients, and optimizer states across multiple GPUs and nodes.
+
+- **Learn more:** [Multi-GPU Guide](multi-gpu.qmd)
+- **Learn more:** [Multi-Node Guide](multi-node.qmd)
+
+### N-D Parallelism (Beta)
+
+For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence Parallelism). This is a powerful approach to train an extremely large model by overcoming multiple bottlenecks at once.
+
+- **Learn more:** [N-D Parallelism Guide](nd_parallelism.qmd)
+
+## Quantization
+
+Techniques to reduce the precision of model weights for memory savings.
+
+### 4-bit Training (QLoRA)
+
+The recommended approach for quantization-based training. It loads the base model in 4-bit using `bitsandbytes` and then trains QLoRA adapters. See [Parameter Efficient Finetuning](#parameter-efficient-finetuning-lora-qlora) for details.
+
+### FP8 Training
+
+Enables training with 8-bit floating point precision on supported hardware (e.g., NVIDIA Hopper series GPUs) for significant speed and memory gains.
+
+- **Example:** [Llama 3 FP8 FSDP Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-3/3b-fp8-fsdp2.yaml)
+
+### Quantization Aware Training (QAT)
+
+Simulates quantization effects during training, helping the model adapt and potentially improving the final accuracy of the quantized model.
+
+- **Learn more:** [QAT Documentation](qat.qmd)
+
+### GPTQ
+
+Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.
+
+- **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml)
diff --git a/docs/qat.qmd b/docs/qat.qmd
index ad9779066..91fe5180c 100644
--- a/docs/qat.qmd
+++ b/docs/qat.qmd
@@ -30,6 +30,7 @@ qat:
 ```
 
 We support the following quantization schemas:
+
 - `Int4WeightOnly` (requires the `fbgemm-gpu` extra when installing Axolotl)
 - `Int8DynamicActivationInt4Weight`
 - `Float8DynamicActivationFloat8Weight`
diff --git a/examples/alst/README.md b/examples/alst/README.md
index 7f194d299..6d201f826 100644
--- a/examples/alst/README.md
+++ b/examples/alst/README.md
@@ -7,3 +7,24 @@ techniques. It is a combination of:
 - Activation Offloading: Offload activations to CPU RAM to reduce memory usage
 
 For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).
+
+## Usage
+
+```yaml
+tiled_mlp: true
+
+# See Sequence Parallelism docs
+# https://docs.axolotl.ai/docs/sequence_parallelism.html
+context_parallel_size: int
+
+plugins:
+# See Cut Cross Entropy docs
+# https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+# or Liger Kernel docs
+# https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels
+  - axolotl.integrations.liger.LigerPlugin
+# ...
+
+```

From e8b962d47f89041fd6ca6e84c7ece38b8baa34a9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=99=88=E5=8D=8E=E6=9D=B0?=
Date: Thu, 25 Sep 2025 13:06:21 +0800
Subject: [PATCH 063/115] feat: support training with JSON string tool
 arguments (#3136)

* feat: support training with JSON string tool arguments; fix PyArrow data type inconsistent error

* feat: raise error for tool call arguments decode

* Add test_chat_templates_tool_call_string_arguments.py

Add test for string arguments

* fix: change to correct qwen3 tokenizer

* fix: update docs to clarify arguments json

* chore: lint

* fix: duplicate

* chore: revert

* feat: add error to faq

* fix: remove duplicate fixture

---------

Co-authored-by: caoqinping
Co-authored-by: gamersover-blog <1611885128@qq.com>
Co-authored-by: NanoCode012
---
 docs/dataset-formats/conversation.qmd          |   8 +
 docs/faq.qmd                                   |   4 +
 .../prompt_strategies/chat_template.py         |  17 ++
 tests/prompt_strategies/conftest.py            |   9 +
 ...est_chat_template_ds_schema_unification.py  |  10 -
 .../test_chat_templates_thinking.py            |  10 -
 ...at_templates_tool_call_string_arguments.py  | 214 ++++++++++++++++++
 7 files changed, 252 insertions(+), 20 deletions(-)
 create mode 100644 tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py

diff --git a/docs/dataset-formats/conversation.qmd b/docs/dataset-formats/conversation.qmd
index d53c68598..870a2b67d 100644
--- a/docs/dataset-formats/conversation.qmd
+++ b/docs/dataset-formats/conversation.qmd
@@ -212,6 +212,14 @@ Instead of passing `tools` via the system prompt, an alternative method would be
 Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
 :::
 
+::: {.callout-warning}
+If you have tool arguments with the same name but different dtypes (like `"time": string` and `"time": number`), please save `arguments` as a JSON string to prevent `datasets` from having casting issues.
+
+```
+"arguments": "{\"...\": \"...\"}"
+```
+:::
+
 Example config for Llama4:
 ```yaml
 chat_template: llama4
diff --git a/docs/faq.qmd b/docs/faq.qmd
index 08d439af7..ffc29d35d 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -140,3 +140,7 @@ description: Frequently asked questions
 **Q: `ValueError("Backward pass should have cleared tracker of all tensors")`
 
 > A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `offload_activations: legacy` in your YAML.
+
+**Q: `Error parsing tool_calls arguments as JSON.`
+
+> A: This means a tool call's string `arguments` could not be parsed into a dict. Please check your dataset and the error message for more details.
diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py index cb3e3dfb1..f4dcbd7cd 100644 --- a/src/axolotl/prompt_strategies/chat_template.py +++ b/src/axolotl/prompt_strategies/chat_template.py @@ -2,6 +2,7 @@ HF Chat Templates prompt strategy """ +import json from collections import defaultdict from typing import TYPE_CHECKING, Any, Dict, List, Set, Union @@ -794,6 +795,22 @@ class ChatTemplateStrategy(PromptTokenizingStrategy): if val is not None: transformed_message[key] = val + if "tool_calls" in transformed_message and transformed_message["tool_calls"]: + for tool_call in transformed_message["tool_calls"]: + if "function" in tool_call and "arguments" in tool_call["function"]: + args = tool_call["function"]["arguments"] + if isinstance(args, str): + try: + tool_call["function"]["arguments"] = json.loads(args) + except json.JSONDecodeError as e: + LOG.error( + f"Error parsing tool_calls arguments as JSON. " + f"Function: {tool_call.get('function', {}).get('name', 'unknown')}, " + f"Arguments string: {args!r}, " + f"Error: {e}" + ) + raise + return transformed_message def _get_images(self, prompt): diff --git a/tests/prompt_strategies/conftest.py b/tests/prompt_strategies/conftest.py index 12c4bcd93..0af7b3e93 100644 --- a/tests/prompt_strategies/conftest.py +++ b/tests/prompt_strategies/conftest.py @@ -177,6 +177,15 @@ def fixture_devstral_1_1_tokenizer(): return tokenizer +@pytest.fixture(name="qwen3_tokenizer") +def qwen3_tokenizer_fixture( + download_qwen3_half_billion_model, +): # pylint: disable=unused-argument,redefined-outer-name + tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") + + return tokenizer + + @pytest.fixture(name="mistralv03_tokenizer_chat_template_jinja") def fixture_mistralv03_chat_template_jinja_w_system() -> str: return '{%- if messages[0]["role"] == "system" %}\n {%- set system_message = messages[0]["content"] %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n {%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}\n {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}\n {%- endif %}\n {%- set ns.index = ns.index + 1 %}\n {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n {%- if message["role"] == "user" %}\n {%- if tools is not none and (message == user_messages[-1]) %}\n {{- "[AVAILABLE_TOOLS] [" }}\n {%- for tool in tools %}\n {%- set tool = tool.function %}\n {{- \'{"type": "function", "function": {\' }}\n {%- for key, val in tool.items() if key != "return" %}\n {%- if val is string %}\n {{- \'"\' + key + \'": "\' + val + \'"\' }}\n {%- else %}\n {{- \'"\' + key + \'": \' + val|tojson }}\n {%- endif %}\n {%- if not loop.last %}\n {{- ", " }}\n {%- endif %}\n {%- endfor %}\n {{- "}}" }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" }}\n {%- endif %}\n {%- endfor %}\n {{- "[/AVAILABLE_TOOLS]" }}\n {%- endif %}\n {%- 
if loop.first and system_message is defined %}\n {{- "[INST] " + system_message + "\\n\\n" + message["content"] + "[/INST]" }}\n {%- else %}\n {{- "[INST] " + message["content"] + "[/INST]" }}\n {%- endif %}\n {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n {{- "[TOOL_CALLS] [" }}\n {%- for tool_call in message.tool_calls %}\n {%- set out = tool_call.function|tojson %}\n {{- out[:-1] }}\n {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \', "id": "\' + tool_call.id + \'"}\' }}\n {%- if not loop.last %}\n {{- ", " }}\n {%- else %}\n {{- "]" + eos_token }}\n {%- endif %}\n {%- endfor %}\n {%- elif message["role"] == "assistant" %}\n {{- " " + message["content"]|trim + eos_token}}\n {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {{- \'[TOOL_RESULTS] {"content": \' + content|string + ", " }}\n {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n {%- endif %}\n {{- \'"call_id": "\' + message.tool_call_id + \'"}[/TOOL_RESULTS]\' }}\n {%- else %}\n {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}\n {%- endif %}\n{%- endfor %}\n' diff --git a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py index e8d35e974..4f4e32208 100644 --- a/tests/prompt_strategies/test_chat_template_ds_schema_unification.py +++ b/tests/prompt_strategies/test_chat_template_ds_schema_unification.py @@ -6,7 +6,6 @@ import json import pytest from datasets import Dataset -from transformers import AutoTokenizer from axolotl.prompt_strategies.chat_template import StrategyLoader from axolotl.utils.dict import DictDefault @@ -23,15 +22,6 @@ def fixture_messages_w_tools(): return Dataset.from_list(rows) -@pytest.fixture(name="qwen3_tokenizer") -def qwen3_tokenizer_fixture( - download_qwen3_half_billion_model, -): - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") - - return tokenizer - - @pytest.fixture(name="qwen3_prompt_strategy") def qwen3_chat_template_strategy(qwen3_tokenizer): cfg = DictDefault( diff --git a/tests/prompt_strategies/test_chat_templates_thinking.py b/tests/prompt_strategies/test_chat_templates_thinking.py index 5475666a5..054012e00 100644 --- a/tests/prompt_strategies/test_chat_templates_thinking.py +++ b/tests/prompt_strategies/test_chat_templates_thinking.py @@ -4,7 +4,6 @@ Tests for splitting reasoning/thinking from content into separate field import pytest from datasets import Dataset -from transformers import AutoTokenizer from axolotl.prompt_strategies.chat_template import ( load, @@ -56,15 +55,6 @@ def messages_w_reasoning_fixture(): ) -@pytest.fixture(name="qwen3_tokenizer") -def qwen3_tokenizer_fixture( - download_qwen3_half_billion_model, -): - tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B") - - return tokenizer - - class TestSplitThinking: """ test class to make sure datasets with reasoning content conforms to the chat_template strategy diff --git a/tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py 
b/tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py
new file mode 100644
index 000000000..7de21b940
--- /dev/null
+++ b/tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py
@@ -0,0 +1,214 @@
+"""
+Tests for handling JSON string tool call arguments
+"""
+
+import json
+
+import pytest
+from datasets import Dataset
+
+from axolotl.prompt_strategies.chat_template import (
+    load,
+)
+from axolotl.utils.dict import DictDefault
+
+
+@pytest.fixture(name="qwen3_instruct_prompt_strategy")
+def qwen3_instruct_chat_template_strategy(qwen3_tokenizer):
+    strategy = load(
+        qwen3_tokenizer,
+        DictDefault(
+            {
+                "train_on_inputs": False,
+                "sequence_len": 512,
+            }
+        ),
+        DictDefault(
+            {
+                "chat_template": "qwen3",
+                "message_field_role": "role",
+                "message_field_content": "content",
+                "message_property_mappings": {
+                    "role": "role",
+                    "content": "content",
+                },
+                "roles": {
+                    "user": ["user"],
+                    "assistant": ["assistant"],
+                    "system": ["system"],
+                },
+                "field_messages": "messages",
+            }
+        ),
+    )
+    return strategy
+
+
+class TestQwen3IdenticalConversationArgs:
+    """
+    Test that Qwen3 tool call tokenization is identical between JSON string and dict arguments
+    """
+
+    @pytest.fixture(name="conversation_dict_args_dataset")
+    def fixture_conversation_dict_args_dataset(self):
+        """
+        Provides a dataset with a conversation where arguments is a dict.
+        """
+        user_content = "What is the weather in Boston?"
+        function_name = "get_current_weather"
+        arguments_dict = {"location": "Boston, MA", "unit": "celsius"}
+
+        data = [
+            {
+                "messages": [
+                    {"role": "user", "content": user_content},
+                    {
+                        "role": "assistant",
+                        "content": "",
+                        "tool_calls": [
+                            {
+                                "function": {
+                                    "name": function_name,
+                                    "arguments": arguments_dict,  # dict format
+                                }
+                            }
+                        ],
+                    },
+                ],
+            }
+        ]
+        return Dataset.from_list(data)
+
+    @pytest.fixture(name="conversation_str_args_dataset")
+    def fixture_conversation_str_args_dataset(self):
+        """
+        Provides a dataset with a conversation where arguments is a JSON string.
+        """
+        user_content = "What is the weather in Boston?"
+        function_name = "get_current_weather"
+        arguments_dict = {"location": "Boston, MA", "unit": "celsius"}
+        arguments_str = json.dumps(arguments_dict)
+
+        data = [
+            {
+                "messages": [
+                    {"role": "user", "content": user_content},
+                    {
+                        "role": "assistant",
+                        "content": "",
+                        "tool_calls": [
+                            {
+                                "function": {
+                                    "name": function_name,
+                                    "arguments": arguments_str,  # str format
+                                }
+                            }
+                        ],
+                    },
+                ],
+            }
+        ]
+        return Dataset.from_list(data)
+
+    @pytest.fixture(name="conversation_mixed_time_types_dataset")
+    def fixture_conversation_mixed_time_types_dataset(self):
+        """
+        Provides a dataset where the 'time' field has different types in different tool calls.
+        """
+        data = [
+            {
+                "messages": [
+                    {
+                        "role": "user",
+                        "content": "Get weather information at different times",
+                    },
+                    {
+                        "role": "assistant",
+                        "content": "",
+                        "tool_calls": [
+                            {
+                                "function": {
+                                    "name": "func1",
+                                    "arguments": json.dumps(
+                                        {"time": "2025-08-01"}
+                                    ),  # string type
+                                }
+                            },
+                            {
+                                "function": {
+                                    "name": "func2",
+                                    "arguments": json.dumps(
+                                        {"time": 1690876800}
+                                    ),  # number type
+                                }
+                            },
+                        ],
+                    },
+                ],
+            }
+        ]
+        return Dataset.from_list(data)
+
+    def test_dict_and_str_args_produce_identical_output(
+        self,
+        conversation_dict_args_dataset,
+        conversation_str_args_dataset,
+        qwen3_instruct_prompt_strategy,
+        qwen3_tokenizer,
+    ):
+        """
+        Tests that after tokenization and decoding, the outputs for both
+        dict and string `arguments` are exactly the same.
+ """ + processed_dict_args = conversation_dict_args_dataset.map( + qwen3_instruct_prompt_strategy.tokenize_prompt, + batched=True, + remove_columns=["messages"], + ) + + processed_str_args = conversation_str_args_dataset.map( + qwen3_instruct_prompt_strategy.tokenize_prompt, + batched=True, + remove_columns=["messages"], + ) + + decoded_prompt_from_dict = qwen3_tokenizer.decode( + processed_dict_args[0]["input_ids"] + ) + + decoded_prompt_from_str = qwen3_tokenizer.decode( + processed_str_args[0]["input_ids"] + ) + + assert decoded_prompt_from_dict == decoded_prompt_from_str, ( + f"Dict format output:\n{decoded_prompt_from_dict}\n" + f"String format output:\n{decoded_prompt_from_str}" + ) + + assert ( + processed_dict_args[0]["input_ids"] == processed_str_args[0]["input_ids"] + ), "The tokenized input_ids should be identical for dict and str arguments" + + def test_str_args_with_mixed_time_types_no_error( + self, + conversation_mixed_time_types_dataset, + qwen3_instruct_prompt_strategy, + qwen3_tokenizer, + ): + """ + Tests that when 'time' field has different types (string vs number) + in different tool calls, str format arguments don't cause errors. + """ + processed = conversation_mixed_time_types_dataset.map( + qwen3_instruct_prompt_strategy.tokenize_prompt, + batched=True, + remove_columns=["messages"], + ) + + assert len(processed) == 1 + assert "input_ids" in processed[0] + assert len(processed[0]["input_ids"]) > 0 + + decoded = qwen3_tokenizer.decode(processed[0]["input_ids"]) + assert "2025-08-01" in decoded, "String time value should be present" + assert "1690876800" in decoded, "Number time value should be present" From 33975ce4bcec0f34221e1d54fe985d2c743b4033 Mon Sep 17 00:00:00 2001 From: miketung Date: Thu, 25 Sep 2025 19:06:16 +0900 Subject: [PATCH 064/115] feat(qwen3-next): Adds targeting of shared expert and attention modules (#3183) * Adds targetting of shared expert and attention modules in each layer * Update VRAM usage --------- Co-authored-by: Mike Tung --- examples/qwen3-next/README.md | 2 +- examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/examples/qwen3-next/README.md b/examples/qwen3-next/README.md index eb0d5fd28..678175fd4 100644 --- a/examples/qwen3-next/README.md +++ b/examples/qwen3-next/README.md @@ -38,7 +38,7 @@ pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.3.2 axolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml ``` -This config uses about 41.7 GiB VRAM. +This config uses about 45.62 GiB VRAM. Let us know how it goes. Happy finetuning! 
🚀 diff --git a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml index 11481dcd3..db841beab 100644 --- a/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml +++ b/examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml @@ -27,6 +27,14 @@ lora_r: 16 lora_alpha: 8 lora_dropout: 0.05 lora_target_modules: + - linear_attn.in_proj_ba + - linear_attn.in_proj_qkvz + - linear_attn.out_proj + - shared_expert.up_proj + - shared_expert.down_proj + - shared_expert.gate_proj + - shared_expert_gate + - mlp.gate - q_proj - v_proj - k_proj From f9748c4dc54430d7a12defa4ea76748db0a07f4f Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Thu, 25 Sep 2025 12:03:50 -0400 Subject: [PATCH 065/115] Cp fix (#3182) * patch transformers to allow CP + FA2 * nits * only patch in CP > 1 case --- src/axolotl/loaders/patch_manager.py | 7 ++ .../transformers/trainer_context_parallel.py | 68 +++++++++++++++++++ .../test_trainer_context_parallel_patch.py | 66 ++++++++++++++++++ 3 files changed, 141 insertions(+) create mode 100644 src/axolotl/monkeypatch/transformers/trainer_context_parallel.py create mode 100644 tests/monkeypatch/test_trainer_context_parallel_patch.py diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 3d4b7b96b..1e46f5c34 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -84,6 +84,13 @@ class PatchManager: patch_evaluation_loop() patch_maybe_log_save_evaluate() + if self.cfg.context_parallel_size > 1: + from axolotl.monkeypatch.transformers.trainer_context_parallel import ( + patch_prepare_context_parallel_inputs, + ) + + patch_prepare_context_parallel_inputs() + def apply_post_model_load_patches(self, model: PreTrainedModel): """Apply patches that require the model instance.""" self._apply_llama_flash_attn_patches(model) diff --git a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py new file mode 100644 index 000000000..74a35e83f --- /dev/null +++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py @@ -0,0 +1,68 @@ +"""Monkey patch to allow context parallelism with FlashAttention in HF Trainer.""" + +from __future__ import annotations + +import importlib +import inspect + +from transformers import Trainer + +from axolotl.monkeypatch.utils import detab_code +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + +GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":' +PATCHED_GUARD = ( + 'if model.config._attn_implementation not in ("sdpa", "flash_attention_2"):' +) + + +def patch_prepare_context_parallel_inputs() -> None: + """Relax the SDPA-only guard when running context parallelism with FlashAttention.""" + if getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False): + LOG.debug("Trainer._prepare_context_parallel_inputs already patched") + return + + try: + original_source = inspect.getsource(Trainer._prepare_context_parallel_inputs) + except OSError as exc: # pragma: no cover - occurs when source is unavailable + LOG.warning("Unable to patch Trainer._prepare_context_parallel_inputs: %s", exc) + return + + if GUARD_PATTERN not in original_source: + LOG.warning( + "Expected guard not found in Trainer._prepare_context_parallel_inputs; \n" + "skipping FlashAttention context parallelism patch" + ) + return + + patched_source = original_source.replace(GUARD_PATTERN, PATCHED_GUARD) + patched_source, _ = detab_code(patched_source) + 
patched_source = patched_source.replace( + "def _prepare_context_parallel_inputs(", + "def axolotl_prepare_context_parallel_inputs(", + 1, + ) + + module_name = Trainer.__module__ + module = importlib.import_module(module_name) + + # import symbols referenced in the method so exec can succeed + items_to_import = [] + for item in dir(module): + if item in patched_source: + items_to_import.append(item) + + exec(f"from {module_name} import ({', '.join(items_to_import)})", globals()) + exec(patched_source, globals()) + + Trainer._original_prepare_context_parallel_inputs = ( + Trainer._prepare_context_parallel_inputs + ) + Trainer._prepare_context_parallel_inputs = axolotl_prepare_context_parallel_inputs + Trainer._axolotl_prepare_context_parallel_inputs_source = patched_source + Trainer._axolotl_prepare_context_parallel_inputs_patched = True + LOG.debug( + "Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP" + ) diff --git a/tests/monkeypatch/test_trainer_context_parallel_patch.py b/tests/monkeypatch/test_trainer_context_parallel_patch.py new file mode 100644 index 000000000..84c883e91 --- /dev/null +++ b/tests/monkeypatch/test_trainer_context_parallel_patch.py @@ -0,0 +1,66 @@ +"""Tests for the HF Trainer context parallel patch.""" + +import pytest +from transformers import Trainer + +from axolotl.monkeypatch.transformers.trainer_context_parallel import ( + GUARD_PATTERN, + PATCHED_GUARD, + patch_prepare_context_parallel_inputs, +) + + +@pytest.fixture +def restore_trainer_prepare_method(): + """Ensure Trainer._prepare_context_parallel_inputs is restored after a test.""" + original_method = getattr( + Trainer, + "_original_prepare_context_parallel_inputs", + Trainer._prepare_context_parallel_inputs, + ) + patched_attr_present = hasattr( + Trainer, "_axolotl_prepare_context_parallel_inputs_patched" + ) + + yield + + Trainer._prepare_context_parallel_inputs = original_method + if patched_attr_present: + delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched") + if hasattr(Trainer, "_original_prepare_context_parallel_inputs"): + delattr(Trainer, "_original_prepare_context_parallel_inputs") + if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source"): + delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source") + + +def test_patch_attention_guard(restore_trainer_prepare_method): + """Patch should swap the guard to allow sdpa or flash attention.""" + # Ensure we start from the unpatched method + if hasattr(Trainer, "_original_prepare_context_parallel_inputs"): + Trainer._prepare_context_parallel_inputs = ( + Trainer._original_prepare_context_parallel_inputs + ) + delattr(Trainer, "_original_prepare_context_parallel_inputs") + if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched"): + delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched") + + patch_prepare_context_parallel_inputs() + + patched_method = Trainer._prepare_context_parallel_inputs + assert patched_method is not None + assert getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False) + + source = Trainer._axolotl_prepare_context_parallel_inputs_source + assert GUARD_PATTERN not in source + assert PATCHED_GUARD in source + + +def test_patch_is_idempotent(restore_trainer_prepare_method): + """Calling the patch twice should leave the same patched function in place.""" + patch_prepare_context_parallel_inputs() + first_patched = Trainer._prepare_context_parallel_inputs + + patch_prepare_context_parallel_inputs() + second_patched = 
Trainer._prepare_context_parallel_inputs + + assert first_patched is second_patched From 7fa8ac40cd344e3578dcdddb8e426b5445487997 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 26 Sep 2025 12:11:29 +0700 Subject: [PATCH 066/115] Feat(cce): add qwen3_vl, qwen3_vl_moe, granitemoeshared, granitemoehybrid, and upgraded all cce patches (#3178) * feat: upgrade cce with patches for transformers 4.56 * feat: add missing models to cce readme --- examples/colab-notebooks/colab-axolotl-example.ipynb | 2 +- scripts/cutcrossentropy_install.py | 2 +- src/axolotl/integrations/cut_cross_entropy/README.md | 9 ++++++++- src/axolotl/integrations/cut_cross_entropy/__init__.py | 2 +- 4 files changed, 11 insertions(+), 4 deletions(-) diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index b48331063..9e18757f6 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -40,7 +40,7 @@ "%%capture\n", "# This step can take ~5-10 minutes to install dependencies\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", - "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef\"" + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\"" ] }, { diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index dc117604a..32f585858 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else "" print( UNINSTALL_PREFIX - + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"' ) diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index cc73eebb7..c33d45f00 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28" ``` ## Usage @@ -31,6 +31,7 @@ plugins: ## Supported Models +- apertus - arcee - cohere - cohere2 @@ -44,9 +45,13 @@ plugins: - glm - glm4 - glm4_moe +- glm4v +- glm4v_moe - gpt_oss - granite - granitemoe +- granitemoeshared +- granitemoehybrid - hunyuan_v1_dense - hunyuan_v1_moe - llama @@ -65,6 +70,8 @@ plugins: - qwen2_5_vl - qwen3 - qwen3_moe +- qwen3_vl +- qwen3_vl_moe - qwen3_next - smollm3 - seed_oss diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index 812baf33f..e8c6c23a3 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -35,7 +35,7 @@ LOG = get_logger(__name__) _CCE_INSTALL_MESSAGE = ( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install 
"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"`' ) From 850c1a5f8db4abd4cce2aab5d041cdf0dc90e03a Mon Sep 17 00:00:00 2001 From: "Grant Holmes (Ren)" <60802511+gholmes829@users.noreply.github.com> Date: Fri, 26 Sep 2025 04:23:59 -0500 Subject: [PATCH 067/115] Add FSDP v2 swap memory support + QLoRA compatibility fixes (#3167) Co-authored-by: salman --- docs/fsdp_qlora.qmd | 8 ++++- examples/llama-2/qlora-fsdp.yml | 1 + src/axolotl/monkeypatch/accelerate/fsdp2.py | 7 +++- src/axolotl/utils/schemas/validation.py | 27 +++++++------- src/axolotl/utils/trainer.py | 4 +++ tests/utils/schemas/validation/test_fsdp.py | 40 ++++++++++++++++++++- 6 files changed, 71 insertions(+), 16 deletions(-) diff --git a/docs/fsdp_qlora.qmd b/docs/fsdp_qlora.qmd index 2f1b0358f..01f57e627 100644 --- a/docs/fsdp_qlora.qmd +++ b/docs/fsdp_qlora.qmd @@ -1,5 +1,5 @@ --- -title: "FDSP + QLoRA" +title: "FSDP + QLoRA" description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs. format: html: @@ -23,6 +23,12 @@ To enable `QLoRA` with `FSDP`, you need to perform the following steps: 2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp). 3. Use one of the supported model types: `llama`, `mistral` or `mixtral`. +## Enabling Swap for FSDP2 + +If available memory is insufficient even after FSDP's CPU offloading, you can enable swap memory usage by setting `cpu_offload_pin_memory: false` alongside `offload_params: true` in FSDP config. + +This disables memory pinning, allowing FSDP to use disk swap space as fallback. Disabling memory pinning itself incurs performance overhead, and actually having to use swap adds more, but it may enable training larger models that would otherwise cause OOM errors on resource constrained systems. + ## Example Config [examples/llama-2/qlora-fsdp.yml](../examples/llama-2/qlora-fsdp.yml) contains an example of how to enable QLoRA + FSDP in axolotl. 
diff --git a/examples/llama-2/qlora-fsdp.yml b/examples/llama-2/qlora-fsdp.yml index 54f4b86b4..1e7064de8 100644 --- a/examples/llama-2/qlora-fsdp.yml +++ b/examples/llama-2/qlora-fsdp.yml @@ -66,6 +66,7 @@ fsdp_config: fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer fsdp_state_dict_type: FULL_STATE_DICT + # fsdp_cpu_offload_pin_memory: false # uncomment to enable swap memory usage when RAM is insufficient special_tokens: # save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/src/axolotl/monkeypatch/accelerate/fsdp2.py b/src/axolotl/monkeypatch/accelerate/fsdp2.py index d8ba02cb2..af6f24a63 100644 --- a/src/axolotl/monkeypatch/accelerate/fsdp2.py +++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py @@ -4,6 +4,7 @@ monkeypatch for accelerate fsdp2 fix when modifying ordereddict during interatio import copy import functools +import os import sys import torch @@ -277,6 +278,11 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module: mesh = getattr(accelerator.state, "device_mesh", None) + # Disable memory pinning if requested + offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy) + if offload_to_cpu and os.environ.get("FSDP_CPU_OFFLOAD_PIN_MEMORY", "") == "false": + fsdp2_plugin.cpu_offload.pin_memory = False + fsdp2_kwargs = { "reshard_after_forward": fsdp2_plugin.reshard_after_forward, "offload_policy": fsdp2_plugin.cpu_offload, @@ -341,7 +347,6 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module: ) if fsdp2_plugin.cpu_ram_efficient_loading: - offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy) fsdp2_load_full_state_dict( accelerator, model, original_sd, offload_to_cpu=offload_to_cpu ) diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 9671b10ae..0ec3e854f 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -816,21 +816,22 @@ class OptimizationValidationMixin: ) return data - @model_validator(mode="after") - def check_fsdp2_base_model_quant_ram_efficient_loading(self): - fsdp_config = self.fsdp_config if hasattr(self, "fsdp_config") else None - fsdp_version = self.fsdp_version if hasattr(self, "fsdp_version") else None - load_in_8bit = self.load_in_8bit if hasattr(self, "load_in_8bit") else None - load_in_4bit = self.load_in_4bit if hasattr(self, "load_in_4bit") else None - if fsdp_config and fsdp_version == 2: - if fsdp_config.get("cpu_ram_efficient_loading") and ( - load_in_8bit or load_in_4bit - ): + @model_validator(mode="before") + @classmethod + def check_fsdp2_cpu_offload_pin_memory(cls, data): + if not (fsdp_config := data.get("fsdp_config")): + return data + + if fsdp_config.get("cpu_offload_pin_memory") is False: + if str(data.get("fsdp_version")) != "2": raise ValueError( - "FSDP2 does not support load_in_8bit or load_in_4bit with cpu_ram_efficient_loading. Please do one of the following: use DeepSpeed, " - "set fsdp_version to 1, or disable cpu_ram_efficient_loading." 
+ "FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2" ) - return self + if not fsdp_config.get("offload_params"): + raise ValueError( + "disabling cpu_offload_pin_memory requires enabling offload_params" + ) + return data @model_validator(mode="before") @classmethod diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 662a54655..56fbe34c0 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -595,6 +595,10 @@ def setup_fsdp_envs(cfg): os.environ["FSDP_USE_ORIG_PARAMS"] = "true" if cfg.fsdp_config.state_dict_type: os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.state_dict_type + if cfg.fsdp_config.cpu_offload_pin_memory is not None: + os.environ["FSDP_CPU_OFFLOAD_PIN_MEMORY"] = str( + cfg.fsdp_config.cpu_offload_pin_memory + ).lower() if cfg.fsdp_config.auto_wrap_policy: os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.auto_wrap_policy if cfg.fsdp_config.transformer_layer_cls_to_wrap: diff --git a/tests/utils/schemas/validation/test_fsdp.py b/tests/utils/schemas/validation/test_fsdp.py index 08fc50c61..65f9c66a3 100644 --- a/tests/utils/schemas/validation/test_fsdp.py +++ b/tests/utils/schemas/validation/test_fsdp.py @@ -61,12 +61,50 @@ class TestFSDPValidation: }, fsdp_version=2, ) + validated_cfg = validate_config(cfg) + assert validated_cfg.fsdp_version == 2 + assert validated_cfg.fsdp_config.cpu_ram_efficient_loading is True + + def test_fsdp2_cpu_offload_pin_memory_requires_offload_params(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "cpu_offload_pin_memory": False, + "offload_params": False, + }, + fsdp_version=2, + ) with pytest.raises( ValueError, - match="FSDP2 does not support load_in_8bit or load_in_4bit with cpu_ram_efficient_loading.", + match="disabling cpu_offload_pin_memory requires enabling offload_params", ): validate_config(cfg) + def test_fsdp1_cpu_offload_pin_memory_not_supported(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "cpu_offload_pin_memory": False, + "offload_params": True, + }, + fsdp_version=1, + ) + with pytest.raises( + ValueError, + match="FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2", + ): + validate_config(cfg) + + def test_fsdp2_cpu_offload_pin_memory_w_offload_params(self, min_base_cfg): + cfg = min_base_cfg | DictDefault( + fsdp_config={ + "cpu_offload_pin_memory": False, + "offload_params": True, + }, + fsdp_version=2, + ) + validated_cfg = validate_config(cfg) + assert validated_cfg.fsdp_config.cpu_offload_pin_memory is False + assert validated_cfg.fsdp_config.offload_params is True + def test_fsdp_prefixes_removed(self, min_base_cfg): cfg = min_base_cfg | DictDefault( fsdp_config={ From 740d5a1d31e100833974f8ad2a7891b6c4dc1f9c Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Fri, 26 Sep 2025 09:55:15 -0400 Subject: [PATCH 068/115] doc fix (#3187) --- docs/lora_optims.qmd | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/docs/lora_optims.qmd b/docs/lora_optims.qmd index 7cdf53975..40893387b 100644 --- a/docs/lora_optims.qmd +++ b/docs/lora_optims.qmd @@ -5,10 +5,11 @@ description: "Custom autograd functions and Triton kernels in Axolotl for optimi Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU -(in the DDP and DeepSpeed settings) training. 
These include (1) SwiGLU and GEGLU activation function -Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was -to leverage operator fusion and tensor re-use in order to improve speed and reduce -memory usage during the forward and backward passes of these calculations. +(including the DDP, DeepSpeed, and FSDP2 settings) training. These include (1) SwiGLU +and GEGLU activation function Triton kernels, and (2) LoRA MLP and attention custom +autograd functions. Our goal was to leverage operator fusion and tensor re-use in order +to improve speed and reduce memory usage during the forward and backward passes of +these calculations. We currently support several common model architectures, including (but not limited to): @@ -131,6 +132,5 @@ computation path. ## Future Work - Support for additional model architectures -- Support for the FSDP setting - Support for dropout and bias - Additional operator fusions From f4376748f38abaf1eed8f6d592453dd94ab9c0d7 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Fri, 26 Sep 2025 15:07:39 -0400 Subject: [PATCH 069/115] debug log: multiprocess race condition fix (#3188) --- src/axolotl/utils/tee.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/axolotl/utils/tee.py b/src/axolotl/utils/tee.py index 1209ad1dd..7bc8efab0 100644 --- a/src/axolotl/utils/tee.py +++ b/src/axolotl/utils/tee.py @@ -109,8 +109,8 @@ def prepare_debug_log(cfg, filename: str = "debug.log") -> str: cfg.get("resume_from_checkpoint") or cfg.get("auto_resume_from_checkpoints") ) - if not append and log_path.exists(): - log_path.unlink() + if not append: + log_path.unlink(missing_ok=True) fh = open(log_path, "a", encoding="utf-8") fh.flush() From a6bfbe34009686c59bbf5a198f8ad047019a78d6 Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Wed, 1 Oct 2025 13:32:51 +0530 Subject: [PATCH 070/115] torch_dtype -> dtype (#3177) * torch_dtype -> dtype * torch_dtype -> dtype --- src/axolotl/cli/delinearize_llama4.py | 4 +--- src/axolotl/cli/quantize.py | 2 +- src/axolotl/utils/model_shard_quant.py | 4 ++-- tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py | 2 +- tests/e2e/test_quantization.py | 2 +- 5 files changed, 6 insertions(+), 8 deletions(-) diff --git a/src/axolotl/cli/delinearize_llama4.py b/src/axolotl/cli/delinearize_llama4.py index 90227fccd..4f5448a14 100644 --- a/src/axolotl/cli/delinearize_llama4.py +++ b/src/axolotl/cli/delinearize_llama4.py @@ -85,9 +85,7 @@ def do_cli(model: Union[Path, str], output: Union[Path, str]) -> None: unpatch_llama4 = patch_llama4_linearized_modeling() from transformers import Llama4ForConditionalGeneration - model_ = Llama4ForConditionalGeneration.from_pretrained( - model, torch_dtype=torch.bfloat16 - ) + model_ = Llama4ForConditionalGeneration.from_pretrained(model, dtype=torch.bfloat16) processor = AutoProcessor.from_pretrained(model) processor.save_pretrained(output) diff --git a/src/axolotl/cli/quantize.py b/src/axolotl/cli/quantize.py index 6838f47d8..c11bcc6d9 100644 --- a/src/axolotl/cli/quantize.py +++ b/src/axolotl/cli/quantize.py @@ -69,7 +69,7 @@ def do_quantize( config = AutoConfig.from_pretrained(model_path) torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else None model = AutoModelForCausalLM.from_pretrained( - model_path, device_map="auto", torch_dtype=torch_dtype + model_path, device_map="auto", dtype=torch_dtype ) LOG.info( diff --git a/src/axolotl/utils/model_shard_quant.py b/src/axolotl/utils/model_shard_quant.py 
index f20a9625e..ca152113a 100644 --- a/src/axolotl/utils/model_shard_quant.py +++ b/src/axolotl/utils/model_shard_quant.py @@ -148,7 +148,7 @@ def load_sharded_model( model = AutoModelForCausalLM.from_pretrained( model_name, use_cache=False, - torch_dtype=torch.float32, + dtype=torch.float32, _attn_implementation=model_config._attn_implementation, trust_remote_code=cfg.trust_remote_code, ) @@ -158,7 +158,7 @@ def load_sharded_model( with init_empty_weights(): model = AutoModelForCausalLM.from_config( model_config, - torch_dtype=torch_dtype, + dtype=torch_dtype, trust_remote_code=cfg.trust_remote_code, ) return model diff --git a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py index 2180eb99d..73f883858 100644 --- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py +++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py @@ -160,7 +160,7 @@ def test_geglu_model_integration(): """Test GeGLU activation with Gemma model.""" model = AutoModelForCausalLM.from_pretrained( "trl-internal-testing/tiny-Gemma2ForCausalLM", - torch_dtype=torch.float16, + dtype=torch.float16, device_map="cuda:0", ) peft_config = get_peft_config( diff --git a/tests/e2e/test_quantization.py b/tests/e2e/test_quantization.py index b64aef51a..706279c6c 100644 --- a/tests/e2e/test_quantization.py +++ b/tests/e2e/test_quantization.py @@ -39,7 +39,7 @@ def model(): dummy_model = AutoModelForCausalLM.from_pretrained( "Qwen/Qwen2-0.5B", device_map="auto", - torch_dtype=torch.bfloat16, + dtype=torch.bfloat16, ) with torch.device(dummy_model.device): dummy_model.model.embed_tokens = torch.nn.Embedding( From ce74c20109d60df4cb023254f3a58b80b6a4cfc8 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 1 Oct 2025 11:11:39 -0400 Subject: [PATCH 071/115] don't cache pip install (#3194) * don't cache pip install * no cache dir for disk space for sdist too --- .github/workflows/tests.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index cfd2c715d..5d5bdb5ac 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -81,12 +81,12 @@ jobs: - name: Install PyTorch run: | - pip3 install torch==${{ matrix.pytorch_version }} torchvision + pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision - name: Install dependencies run: | pip3 show torch - pip3 install --no-build-isolation -U -e . + pip3 install --no-cache-dir --no-build-isolation -U -e . 
python scripts/unsloth_install.py | sh python scripts/cutcrossentropy_install.py | sh pip3 install -r requirements-dev.txt -r requirements-tests.txt @@ -156,13 +156,13 @@ jobs: - name: Install PyTorch run: | - pip3 install torch==${{ matrix.pytorch_version }} torchvision + pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision - name: Install dependencies run: | pip3 show torch python -m build --no-isolation --sdist - pip3 install --no-build-isolation dist/axolotl*.tar.gz + pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz python scripts/unsloth_install.py | sh python scripts/cutcrossentropy_install.py | sh pip3 install -r requirements-dev.txt -r requirements-tests.txt From 409cfb8a87287bb2314e5ce9d9ba2585bff2da9f Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 7 Oct 2025 11:23:41 -0400 Subject: [PATCH 072/115] deprecate torch 2.6.0 support (#3197) [skip ci] --- .github/workflows/base.yml | 21 --------------------- .github/workflows/main.yml | 15 --------------- .github/workflows/multi-gpu-e2e.yml | 7 ------- .github/workflows/nightlies.yml | 20 ++++++++++---------- .github/workflows/tests-nightly.yml | 10 +++++----- .github/workflows/tests.yml | 10 ++-------- README.md | 2 +- 7 files changed, 18 insertions(+), 67 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 160ed7df9..7af6059c8 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -25,20 +25,6 @@ jobs: fail-fast: false matrix: include: - - cuda: "124" - cuda_version: 12.4.1 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.6.0 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-base" - - cuda: "126" - cuda_version: 12.6.3 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.6.0 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-base" - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" @@ -122,13 +108,6 @@ jobs: fail-fast: false matrix: include: - - cuda: "126" - cuda_version: 12.6.3 - cudnn_version: "" - python_version: "3.11" - pytorch: 2.6.0 - torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" - dockerfile: "Dockerfile-uv-base" - cuda: "126" cuda_version: 12.6.3 cudnn_version: "" diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 3f98dd2b4..4040ccdc9 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -15,11 +15,6 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" @@ -88,11 +83,6 @@ jobs: strategy: matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" @@ -162,11 +152,6 @@ jobs: strategy: matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" diff --git a/.github/workflows/multi-gpu-e2e.yml b/.github/workflows/multi-gpu-e2e.yml index 05f9e0761..6a92de352 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -26,13 +26,6 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - axolotl_extras: - num_gpus: 2 - nightly_build: "true" - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" diff --git 
a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml index 49bce470b..18b036a0d 100644 --- a/.github/workflows/nightlies.yml +++ b/.github/workflows/nightlies.yml @@ -12,16 +12,16 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.7.1 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout @@ -65,16 +65,16 @@ jobs: strategy: matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - axolotl_extras: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.7.1 axolotl_extras: + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.8.0 + axolotl_extras: runs-on: axolotl-gpu-runner steps: - name: Checkout diff --git a/.github/workflows/tests-nightly.yml b/.github/workflows/tests-nightly.yml index fc6c2b396..35cb707eb 100644 --- a/.github/workflows/tests-nightly.yml +++ b/.github/workflows/tests-nightly.yml @@ -26,7 +26,7 @@ jobs: max-parallel: 2 matrix: python_version: ["3.11"] - pytorch_version: ["2.6.0", "2.7.0"] + pytorch_version: ["2.7.1", "2.8.0"] timeout-minutes: 20 steps: @@ -102,14 +102,14 @@ jobs: - cuda: 126 cuda_version: 12.6.3 python_version: "3.11" - pytorch: 2.6.0 + pytorch: 2.7.1 num_gpus: 1 axolotl_extras: nightly_build: "true" - - cuda: 126 - cuda_version: 12.6.3 + - cuda: 128 + cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.1 + pytorch: 2.8.0 num_gpus: 1 axolotl_extras: nightly_build: "true" diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5d5bdb5ac..8f368b517 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -55,7 +55,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.6.0", "2.7.1", "2.8.0"] + pytorch_version: ["2.7.1", "2.8.0"] timeout-minutes: 20 steps: @@ -130,7 +130,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.6.0", "2.7.1", "2.8.0"] + pytorch_version: ["2.7.1", "2.8.0"] timeout-minutes: 20 steps: @@ -286,12 +286,6 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.6.0 - num_gpus: 1 - axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" diff --git a/README.md b/README.md index 1a033acd9..6313a73ca 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Features: - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU - Python 3.11 -- PyTorch ≥2.6.0 +- PyTorch ≥2.7.1 ### Google Colab From 377c510e955c4db01b9e26858ed0945d92dc6e8d Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Wed, 8 Oct 2025 17:09:21 +0530 Subject: [PATCH 073/115] sleep model support (#3135) Co-authored-by: salman --- src/axolotl/core/trainers/grpo/__init__.py | 1 + src/axolotl/utils/schemas/trl.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/src/axolotl/core/trainers/grpo/__init__.py b/src/axolotl/core/trainers/grpo/__init__.py index 7eda7a0ba..d1a6b7fd9 100644 --- a/src/axolotl/core/trainers/grpo/__init__.py +++ b/src/axolotl/core/trainers/grpo/__init__.py @@ -52,6 +52,7 @@ class GRPOStrategy: if trl.vllm_mode: grpo_args_kwargs["vllm_mode"] = trl.vllm_mode if trl.vllm_mode == "colocate": + grpo_args_kwargs["enable_sleep_mode"] = trl.vllm_enable_sleep_mode # type: 
ignore[attr-defined] grpo_args_kwargs["vllm_gpu_memory_utilization"] = ( vllm_cfg.gpu_memory_utilization ) diff --git a/src/axolotl/utils/schemas/trl.py b/src/axolotl/utils/schemas/trl.py index 980474e87..624f7663e 100644 --- a/src/axolotl/utils/schemas/trl.py +++ b/src/axolotl/utils/schemas/trl.py @@ -167,3 +167,9 @@ class TRLConfig(BaseModel): "description": "Whether to exclude truncated completions from loss calculation." }, ) + vllm_enable_sleep_mode: bool | None = Field( + default=None, + json_schema_extra={ + "description": "Enable sleep mode for vLLM to offload VRAM when idle" + }, + ) From 130637a3fa4cb476b2452aefd1710be7501ad221 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 8 Oct 2025 08:43:46 -0400 Subject: [PATCH 074/115] upgrade transformers to 4.57.0 (#3201) * upgrade transformers to 4.57.0 * remove deprecated autoawq and use latest peft * remove autoawq from setuptools script * fix imports * make sure torchvision is installed * remove support for BetterTransformer * skip fsdp_qlora_prequant test * more robust error reporting --- cicd/Dockerfile-uv.jinja | 1 + cicd/single_gpu.py | 9 ++++- docker/Dockerfile-uv-base | 2 +- requirements.txt | 5 +-- setup.py | 3 -- src/axolotl/core/builders/causal.py | 7 ---- src/axolotl/processing_strategies.py | 4 +- src/axolotl/train.py | 10 ----- src/axolotl/utils/callbacks/__init__.py | 37 ------------------- ...setuptools_axolotl_dynamic_dependencies.py | 2 - tests/e2e/multigpu/test_llama.py | 1 + 11 files changed, 15 insertions(+), 66 deletions(-) diff --git a/cicd/Dockerfile-uv.jinja b/cicd/Dockerfile-uv.jinja index 860386187..6a4d8a7d3 100644 --- a/cicd/Dockerfile-uv.jinja +++ b/cicd/Dockerfile-uv.jinja @@ -32,6 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \ fi RUN uv pip install packaging==23.2 setuptools==75.8.0 +RUN uv pip install torchvision RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index 5a06a34f0..3bca5806f 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -68,5 +68,10 @@ def run_cmd(cmd: str, run_folder: str): sp_env["AXOLOTL_DATASET_PROCESSES"] = "8" # Propagate errors from subprocess. 
- if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec - exit(exit_code) + try: + exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env) # nosec + if exit_code: + print(f"Command '{cmd}' failed with exit code {exit_code}") + return exit_code + except Exception as e: # pylint: disable=broad-except + print(f"Command '{cmd}' failed with exception {e}") diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base index 4b08e55f8..eaa49b9e9 100644 --- a/docker/Dockerfile-uv-base +++ b/docker/Dockerfile-uv-base @@ -30,7 +30,7 @@ RUN uv venv --no-project --relocatable axolotl-venv ENV PATH="/workspace/axolotl-venv/bin:${PATH}" RUN uv pip install packaging setuptools wheel psutil \ - && uv pip install torch==${PYTORCH_VERSION} \ + && uv pip install torch==${PYTORCH_VERSION} torchvision \ && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \ && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \ && uv pip install awscli pydantic diff --git a/requirements.txt b/requirements.txt index 86013374f..9c56638a3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,16 +5,15 @@ bitsandbytes==0.47.0 triton>=3.0.0 mamba-ssm==1.2.0.post1 xformers>=0.0.23.post1 -autoawq==0.2.7.post3 liger-kernel==0.6.1 # END section packaging==23.2 huggingface_hub>=0.33.0 -peft>=0.17.0 -transformers==4.56.1 +peft>=0.17.1 tokenizers>=0.21.1 +transformers==4.57.0 accelerate==1.10.1 datasets==4.0.0 deepspeed>=0.17.0 diff --git a/setup.py b/setup.py index 3e642b57f..b2eeb92d6 100644 --- a/setup.py +++ b/setup.py @@ -26,7 +26,6 @@ def parse_requirements(extras_require_map): _install_requires.append(line) try: xformers_version = [req for req in _install_requires if "xformers" in req][0] - autoawq_version = [req for req in _install_requires if "autoawq" in req][0] if "Darwin" in platform.system(): # skip packages not compatible with OSX skip_packages = [ @@ -34,7 +33,6 @@ def parse_requirements(extras_require_map): "triton", "mamba-ssm", "xformers", - "autoawq", "liger-kernel", ] _install_requires = [ @@ -87,7 +85,6 @@ def parse_requirements(extras_require_map): _install_requires.append("xformers==0.0.28.post2") else: _install_requires.append("xformers>=0.0.28.post3") - _install_requires.pop(_install_requires.index(autoawq_version)) extras_require_map.pop("vllm") elif (major, minor) >= (2, 4): extras_require_map.pop("vllm") diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py index f7f350e1a..820304230 100644 --- a/src/axolotl/core/builders/causal.py +++ b/src/axolotl/core/builders/causal.py @@ -28,7 +28,6 @@ from axolotl.processing_strategies import get_processing_strategy from axolotl.utils import is_comet_available, is_mlflow_available from axolotl.utils.callbacks import ( LossWatchDogCallback, - SaveBetterTransformerModelCallback, bench_eval_callback_factory, causal_lm_bench_eval_callback_factory, colab_inference_post_train_callback, @@ -63,12 +62,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase): if self.cfg.relora: callbacks.append(ReLoRACallback(self.cfg)) - if ( - hasattr(self.model, "use_bettertransformer") - and self.model.use_bettertransformer is True - ): - callbacks.append(SaveBetterTransformerModelCallback()) - # TODO: check if can move to base class if self.cfg.loss_watchdog_threshold is not None: callbacks.append(LossWatchDogCallback(self.cfg)) diff --git a/src/axolotl/processing_strategies.py b/src/axolotl/processing_strategies.py index 
5e7c1456a..07b114163 100644 --- a/src/axolotl/processing_strategies.py +++ b/src/axolotl/processing_strategies.py @@ -6,8 +6,10 @@ from typing import Optional from PIL import Image, ImageOps from PIL.Image import Resampling from torch import Tensor, zeros_like -from transformers import ProcessorMixin, SmolVLMProcessor, VoxtralProcessor +from transformers import ProcessorMixin from transformers.image_utils import load_image +from transformers.models.smolvlm import SmolVLMProcessor +from transformers.models.voxtral import VoxtralProcessor from axolotl.utils.dict import remove_none_values from axolotl.utils.logging import get_logger diff --git a/src/axolotl/train.py b/src/axolotl/train.py index 2a70d9712..da7b63121 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -40,11 +40,6 @@ from axolotl.utils.schemas.enums import RLType from axolotl.utils.train import determine_last_checkpoint from axolotl.utils.trainer import setup_trainer -try: - from optimum.bettertransformer import BetterTransformer -except ImportError: - BetterTransformer = None - if typing.TYPE_CHECKING: from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder @@ -141,8 +136,6 @@ def setup_signal_handler( def terminate_handler(_, __, model_weakref): if model_weakref() is not None: _model = model_weakref() - if cfg.flash_optimum and BetterTransformer: - _model = BetterTransformer.reverse(_model) _model.save_pretrained( cfg.output_dir, safe_serialization=safe_serialization ) @@ -321,9 +314,6 @@ def save_trained_model( except FileNotFoundError: pass elif cfg.local_rank == 0: - if cfg.flash_optimum and BetterTransformer: - model = BetterTransformer.reverse(model) - if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model: trainer.model.save_pretrained( cfg.output_dir, safe_serialization=safe_serialization diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py index 6c5512223..b54cf10c9 100644 --- a/src/axolotl/utils/callbacks/__init__.py +++ b/src/axolotl/utils/callbacks/__init__.py @@ -17,7 +17,6 @@ import torch import torch.distributed as dist import wandb from datasets import load_dataset -from optimum.bettertransformer import BetterTransformer from tqdm import tqdm from transformers import ( GenerationConfig, @@ -28,8 +27,6 @@ from transformers import ( TrainingArguments, ) from transformers.trainer_utils import ( - PREFIX_CHECKPOINT_DIR, - IntervalStrategy, SaveStrategy, ) from trl.models import unwrap_model_for_generation @@ -56,40 +53,6 @@ IGNORE_INDEX = -100 LOG = get_logger(__name__) -class SaveBetterTransformerModelCallback(TrainerCallback): - """Callback to save the BetterTransformer wrapped model""" - - def on_step_end( - self, - args: TrainingArguments, - state: TrainerState, - control: TrainerControl, - **kwargs, - ) -> TrainerControl: - # Save - if ( - args.save_strategy == IntervalStrategy.STEPS - and args.save_steps > 0 - and state.global_step % args.save_steps == 0 - ): - control.should_save = True - - if control.should_save: - checkpoint_folder = os.path.join( - args.output_dir, - f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", - ) - - model = BetterTransformer.reverse(kwargs["model"]) - model.save_pretrained(checkpoint_folder) - # FIXME - need to cleanup old checkpoints - - # since we're saving here, we don't need the trainer loop to attempt to save too b/c - # the trainer will raise an exception since it can't save a BetterTransformer wrapped model - control.should_save = False - return control - - class 
LossWatchDogCallback(TrainerCallback): """Callback to track loss and stop training if loss is too high""" diff --git a/src/setuptools_axolotl_dynamic_dependencies.py b/src/setuptools_axolotl_dynamic_dependencies.py index ccd7c72d7..3bb54cda8 100644 --- a/src/setuptools_axolotl_dynamic_dependencies.py +++ b/src/setuptools_axolotl_dynamic_dependencies.py @@ -33,7 +33,6 @@ def parse_requirements(): try: xformers_version = [req for req in _install_requires if "xformers" in req][0] torchao_version = [req for req in _install_requires if "torchao" in req][0] - autoawq_version = [req for req in _install_requires if "autoawq" in req][0] if "Darwin" in platform.system(): # don't install xformers on MacOS @@ -63,7 +62,6 @@ def parse_requirements(): _install_requires.append("xformers==0.0.28.post2") else: _install_requires.append("xformers==0.0.28.post3") - _install_requires.pop(_install_requires.index(autoawq_version)) elif (major, minor) >= (2, 4): if patch == 0: _install_requires.pop(_install_requires.index(xformers_version)) diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index c16ef0c60..b836291e5 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -548,6 +548,7 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) + @pytest.mark.skip("regression failure from v4.57.0") def test_fsdp_qlora_prequant_packed(self, temp_dir): cfg = DictDefault( { From 4c3488cc9f5377d66ed96fc1e12e350c6a8cb21b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 8 Oct 2025 08:58:02 -0400 Subject: [PATCH 075/115] chore: update pre-commit hooks (#3160) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com> --- .pre-commit-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 92ddc7f41..e853243cd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,13 +11,13 @@ repos: - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.12.12 + rev: v0.13.3 hooks: - id: ruff args: [--fix] - id: ruff-format - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.17.1 + rev: v1.18.2 hooks: - id: mypy additional_dependencies: From d0e9c3c1c58a093f7ee8e18ddfbe0702f3cf7333 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 8 Oct 2025 10:43:41 -0400 Subject: [PATCH 076/115] When using Ray use prepare for dataloader fixes (#3198) * make sure to use ray prepare for dataloader fixes * ray tests use 2.7.0+ * don't call init_distributed w ray and deepspeed * handle dict deepspeed config * better handling of dict deepspeed config * use json.dumps * guard to_dict * wrap import for optional ray --- src/axolotl/cli/train.py | 2 +- src/axolotl/train.py | 11 +++++++++++ src/axolotl/utils/trainer.py | 16 +++++++++++++++- tests/e2e/multigpu/test_ray.py | 5 ++--- 4 files changed, 29 insertions(+), 5 deletions(-) diff --git a/src/axolotl/cli/train.py b/src/axolotl/cli/train.py index 2332717e7..6b3bfbd57 100644 --- a/src/axolotl/cli/train.py +++ b/src/axolotl/cli/train.py @@ -99,7 +99,7 @@ def ray_train_func(kwargs: dict): resolve_dtype(cfg) # ray serializing objects gets rid of frozen attribute - HF expects dict not DefaultDict - if cfg.deepspeed: + if cfg.deepspeed and hasattr(cfg.deepspeed, "to_dict"): cfg.deepspeed = cfg.deepspeed.to_dict() # initialize accelerator before model 
instantiation diff --git a/src/axolotl/train.py b/src/axolotl/train.py index da7b63121..441c50871 100644 --- a/src/axolotl/train.py +++ b/src/axolotl/train.py @@ -525,6 +525,17 @@ def setup_model_and_trainer( plugin_manager = PluginManager.get_instance() plugin_manager.post_trainer_create(cfg, trainer) + if cfg.use_ray: + try: + import ray.train.huggingface.transformers + + trainer = ray.train.huggingface.transformers.prepare_trainer(trainer) + except ImportError: + LOG.warning( + "The Ray integration with Hugging Face Transformers is not available. " + "To use Ray, install the 'ray[train]' package." + ) + return ( trainer, model, diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index 56fbe34c0..c7fa0a647 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -6,6 +6,7 @@ import os import random from contextlib import contextmanager from functools import partial +from tempfile import NamedTemporaryFile from typing import List, Optional import numpy as np @@ -15,6 +16,7 @@ from datasets import IterableDataset, disable_caching, enable_caching from torch.utils.data import DataLoader, RandomSampler, SequentialSampler from transformers.utils import is_torch_bf16_gpu_available +from axolotl.utils.dict import DictDefault from axolotl.utils.distributed import init_distributed_state, reduce_and_broadcast from axolotl.utils.environment import check_cuda_p2p_ib_support from axolotl.utils.logging import get_logger @@ -540,6 +542,13 @@ def setup_deepspeed_env(cfg, stage=None): ) os.environ["ACCELERATE_USE_DEEPSPEED"] = "true" + if isinstance(cfg.deepspeed, DictDefault): + with NamedTemporaryFile( + mode="w", delete=False, suffix=".json", prefix="deepspeed_config_" + ) as temp_file: + temp_file.write(json.dumps(cfg.deepspeed.to_dict(), indent=4)) + temp_file.close() + cfg.deepspeed = str(temp_file.name) os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed os.environ["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str( cfg.gradient_accumulation_steps @@ -562,6 +571,7 @@ def setup_deepspeed_env(cfg, stage=None): if ( int(os.environ.get("WORLD_SIZE", "1")) == 1 and os.environ.get("AXOLOTL_IS_PREPROCESS", "0") != "1" + and cfg.use_ray is not True ): os.environ["WORLD_SIZE"] = "1" # force it in case not set os.environ["LOCAL_RANK"] = "0" # force it in case not set @@ -638,11 +648,15 @@ def prepare_optim_env(cfg): setup_fsdp_envs(cfg) elif cfg.deepspeed: stage = None + deepspeed_config = None # check if the cfg.deepspeed is a file - if os.path.isfile(cfg.deepspeed): + if isinstance(cfg.deepspeed, DictDefault): + deepspeed_config = cfg.deepspeed + elif os.path.isfile(cfg.deepspeed): # parse with json with open(cfg.deepspeed, "r", encoding="utf-8") as fin: deepspeed_config = json.load(fin) + if deepspeed_config: stage = deepspeed_config.get("zero_optimization", {}).get("stage", None) setup_deepspeed_env(cfg, stage=stage) diff --git a/tests/e2e/multigpu/test_ray.py b/tests/e2e/multigpu/test_ray.py index 7c6ea8a1f..df41b1444 100644 --- a/tests/e2e/multigpu/test_ray.py +++ b/tests/e2e/multigpu/test_ray.py @@ -13,7 +13,6 @@ from axolotl.utils.dict import DictDefault from tests.e2e.utils import ( check_tensorboard, require_torch_2_7_0, - require_torch_lt_2_6_0, ) AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent @@ -24,7 +23,7 @@ class TestMultiGPURay: Test cases for AnyScale Ray post training """ - @require_torch_lt_2_6_0 + @require_torch_2_7_0 def test_lora_ddp(self, temp_dir): cfg = DictDefault( { @@ -83,7 +82,7 @@ class TestMultiGPURay: temp_dir + 
"/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high" ) - @require_torch_lt_2_6_0 + @require_torch_2_7_0 @pytest.mark.parametrize( "gradient_accumulation_steps", [1, 2], From 6f8ce024d1ede51d03ece47c40f0e7b73e04488b Mon Sep 17 00:00:00 2001 From: Manh Nguyen <144032789+nguyen599@users.noreply.github.com> Date: Wed, 8 Oct 2025 22:27:01 +0700 Subject: [PATCH 077/115] Remove check_torch_compile_deepspeed (#3195) [skip ci] Signed-off-by: nguyen599 --- src/axolotl/utils/schemas/validation.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 0ec3e854f..4abe45e64 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -783,15 +783,6 @@ class OptimizationValidationMixin: return data - @model_validator(mode="before") - @classmethod - def check_torch_compile_deepspeed(cls, data): - if data.get("deepspeed") and data.get("torch_compile"): - raise ValueError( - "torch_compile should be set within your deepspeed config file" - ) - return data - @model_validator(mode="before") @classmethod def check_xentropy_patch_conflicts(cls, data): From ab63b92c384d07d83260e0ed3109afccefaf0d84 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 9 Oct 2025 21:47:41 +0700 Subject: [PATCH 078/115] feat: add lfm2 family and latest moe model (#3208) * feat: add lfm2 family and latest moe model * fix: use ml-cross-entropy for lfm2 examples --- examples/LiquidAI/README.md | 15 ++++- examples/LiquidAI/lfm2-350m-fft.yaml | 3 +- examples/LiquidAI/lfm2-8b-a1b-lora.yaml | 59 +++++++++++++++++++ examples/LiquidAI/lfm2-vl-lora.yaml | 3 + .../colab-axolotl-example.ipynb | 2 +- scripts/cutcrossentropy_install.py | 2 +- src/axolotl/common/architectures.py | 1 + .../integrations/cut_cross_entropy/README.md | 6 +- .../cut_cross_entropy/__init__.py | 2 +- src/axolotl/monkeypatch/multipack.py | 2 + 10 files changed, 87 insertions(+), 8 deletions(-) create mode 100644 examples/LiquidAI/lfm2-8b-a1b-lora.yaml diff --git a/examples/LiquidAI/README.md b/examples/LiquidAI/README.md index 96fc74a92..8a18d9eb1 100644 --- a/examples/LiquidAI/README.md +++ b/examples/LiquidAI/README.md @@ -6,6 +6,8 @@ LFM2 features a new hybrid Liquid architecture with multiplicative gates, short- This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl. +Thanks to the team at LiquidAI for giving us early access to prepare for these releases. + ## Getting Started 1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). @@ -31,6 +33,14 @@ This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl. axolotl train examples/LiquidAI/lfm2-vl-lora.yaml ``` + **LFM2-MoE** + ```bash + pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6 + + # LoRA SFT (1x48GB @ 16.2GiB) + axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml + ``` + ### TIPS - **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it: @@ -45,14 +55,13 @@ This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl. 
## Optimization Guides -- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html) -- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html) -- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html) +- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html) ## Related Resources - [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) - [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models) +- [LFM2-MoE Blog](https://www.liquid.ai/blog/lfm2-8b-a1b-an-efficient-on-device-mixture-of-experts) - [Axolotl Docs](https://docs.axolotl.ai) - [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl) - [Axolotl Discord](https://discord.gg/7m9sfhzaf3) diff --git a/examples/LiquidAI/lfm2-350m-fft.yaml b/examples/LiquidAI/lfm2-350m-fft.yaml index d19815008..145b56dd1 100644 --- a/examples/LiquidAI/lfm2-350m-fft.yaml +++ b/examples/LiquidAI/lfm2-350m-fft.yaml @@ -1,6 +1,7 @@ base_model: LiquidAI/LFM2-350M -chunked_cross_entropy: true +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin eot_tokens: - "<|im_end|>" diff --git a/examples/LiquidAI/lfm2-8b-a1b-lora.yaml b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml new file mode 100644 index 000000000..73cbfcce7 --- /dev/null +++ b/examples/LiquidAI/lfm2-8b-a1b-lora.yaml @@ -0,0 +1,59 @@ +base_model: LiquidAI/LFM2-8B-A1B + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +load_in_8bit: true + +eot_tokens: + - "<|im_end|>" +datasets: + - path: mlabonne/FineTome-100k + type: chat_template + split: train[:20%] + field_messages: conversations + message_field_role: from + message_field_content: value +dataset_prepared_path: last_run_prepared +val_set_size: 0.05 +output_dir: ./outputs/out + +sequence_len: 4096 +sample_packing: true + +adapter: lora +lora_model_dir: + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj' + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 2 +micro_batch_size: 4 +num_epochs: 1 +optimizer: adamw_torch_fused +lr_scheduler: cosine +learning_rate: 5e-5 + +bf16: true +tf32: true + +gradient_checkpointing: true +resume_from_checkpoint: +logging_steps: 1 +flash_attention: true + +warmup_ratio: 0.1 +evals_per_epoch: 2 +saves_per_epoch: 1 + +weight_decay: 0.0 + +# save_first_step: true # uncomment this to validate checkpoint saving works with your config diff --git a/examples/LiquidAI/lfm2-vl-lora.yaml b/examples/LiquidAI/lfm2-vl-lora.yaml index 7fee17f92..313da8274 100644 --- a/examples/LiquidAI/lfm2-vl-lora.yaml +++ b/examples/LiquidAI/lfm2-vl-lora.yaml @@ -3,6 +3,9 @@ trust_remote_code: true model_type: AutoModelForImageTextToText processor_type: AutoProcessor +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + # these 3 lines are needed for now to handle vision chat templates w images skip_prepare_dataset: true remove_unused_columns: false diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index 9e18757f6..ee99c283f 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -40,7 +40,7 @@ "%%capture\n", "# This step can take ~5-10 minutes to install dependencies\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", - "!pip install 
\"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\"" + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308\"" ] }, { diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index 32f585858..cf8bd57e7 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else "" print( UNINSTALL_PREFIX - + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308"' ) diff --git a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py index ce945e670..b754e56ba 100644 --- a/src/axolotl/common/architectures.py +++ b/src/axolotl/common/architectures.py @@ -14,4 +14,5 @@ MOE_ARCH_BLOCK = { "qwen3_moe": "Qwen3MoeSparseMoeBlock", "deepseek_v2": "DeepseekV2MoE", "gpt_oss": "GptOssDecoderLayer", + "lfm2_moe": "Lfm2MoeSparseMoeBlock", } diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index c33d45f00..08cd41200 100644 --- a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308" ``` ## Usage @@ -54,9 +54,13 @@ plugins: - granitemoehybrid - hunyuan_v1_dense - hunyuan_v1_moe +- lfm2 +- lfm2_moe +- lfm2_vl - llama - llama4 - llama4_text +- llava - mistral - mistral3 - mixtral diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index e8c6c23a3..ed6ebe62a 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -35,7 +35,7 @@ LOG = get_logger(__name__) _CCE_INSTALL_MESSAGE = ( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308"`' ) diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index 4741245e1..48b4ea10e 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -45,6 +45,8 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [ "gpt_oss", "arcee", "seed_oss", + "lfm2", + "lfm2_moe", ] From 37f78c8592afee7699cdda7ac8d59f685949060c Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Thu, 9 Oct 2025 21:35:54 +0530 Subject: [PATCH 079/115] add chat_template_jinja to wandb (#3192) [skip ci] * add chat_template_jinja to wandb * temp_ct_file.flush() * Update src/axolotl/utils/callbacks/__init__.py Co-authored-by: Wing Lian * Update src/axolotl/utils/callbacks/__init__.py Co-authored-by: Wing Lian * Apply suggestion from 
@winglian --------- Co-authored-by: Wing Lian --- src/axolotl/utils/callbacks/__init__.py | 32 +++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py index b54cf10c9..36370ef13 100644 --- a/src/axolotl/utils/callbacks/__init__.py +++ b/src/axolotl/utils/callbacks/__init__.py @@ -16,6 +16,7 @@ import pandas as pd import torch import torch.distributed as dist import wandb +import yaml from datasets import load_dataset from tqdm import tqdm from transformers import ( @@ -759,6 +760,37 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback): except (FileNotFoundError, ConnectionError) as err: LOG.warning(f"Error while saving Axolotl config to WandB: {err}") + try: + with open(self.axolotl_config_path, "r", encoding="utf-8") as f: + cfg = yaml.safe_load(f) or {} + + chat_tpl = cfg.get("chat_template_jinja") + if chat_tpl: + with NamedTemporaryFile( + mode="w", delete=True, suffix=".jinja", prefix="chat_template_" + ) as temp_ct_file: + if ( + isinstance(chat_tpl, str) + and os.path.exists(chat_tpl) + and os.path.isfile(chat_tpl) + ): + copyfile(chat_tpl, temp_ct_file.name) + else: + temp_ct_file.write(str(chat_tpl)) + temp_ct_file.flush() + + artifact = wandb.Artifact( + f"chat-template-{wandb.run.id}", type="jinja-template" + ) + artifact.add_file(temp_ct_file.name) + wandb.log_artifact(artifact) + wandb.save(temp_ct_file.name) + LOG.info( + "The chat_template_jinja has been saved to the WandB run under files." + ) + except (FileNotFoundError, ConnectionError, yaml.YAMLError) as err: + LOG.warning(f"Error while saving chat_template_jinja to WandB: {err}") + if args.deepspeed: try: # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later. 
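[editor's note — not part of the patch series] For readers following along, the chat-template artifact logged by the callback above can be pulled back down with the public wandb client. This is a minimal sketch only: the entity, project, and run id are hypothetical placeholders, while `wandb.Api().artifact()` and `Artifact.download()` are the standard client calls.

```python
# Minimal sketch (hypothetical entity/project/run id) showing how the
# "chat-template-<run_id>" artifact logged by the callback above could be
# retrieved after training.
import wandb

api = wandb.Api()
# The callback names the artifact "chat-template-<run_id>" with type "jinja-template".
artifact = api.artifact("my-entity/my-project/chat-template-abc123:latest")
template_dir = artifact.download()  # downloads the saved .jinja file locally
print(f"Chat template downloaded to {template_dir}")
```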
From 3a5c97e6e5899cbeb7ea284c658712217bf9721c Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 9 Oct 2025 14:17:31 -0400 Subject: [PATCH 080/115] use can_device_access_peer for P2P checks (#3209) [skip ci] * use can_device_access_peer for P2P checks * also log warn when automatically setting NCCL_P2P_DISABLE=1 --- src/axolotl/utils/environment.py | 60 +++++++++++--------------------- src/axolotl/utils/trainer.py | 1 + 2 files changed, 21 insertions(+), 40 deletions(-) diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py index 7b2348413..d5f2d9f78 100644 --- a/src/axolotl/utils/environment.py +++ b/src/axolotl/utils/environment.py @@ -3,66 +3,46 @@ utils to get GPU info for the current environment """ import os -import subprocess # nosec B404 from importlib.metadata import version +import torch from accelerate.utils.environment import ( check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support, - get_gpu_info, ) from packaging.version import Version, parse +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + def check_cuda_p2p_ib_support(): if not accelerate_check_cuda_p2p_ib_support(): return False - if not check_runpod_p2p_support(): + if not check_cuda_p2p_support(): return False - unsupported_devices = {"RTX 6000 Ada", "L40S"} - try: - device_names, device_count = get_gpu_info() - if 1 < device_count < 8: - if any( - unsupported_device in device_name - for device_name in device_names - for unsupported_device in unsupported_devices - ): - return False - except Exception: # nosec B110 - pass return True -def check_runpod_p2p_support() -> bool: - if "RUNPOD_GPU_COUNT" not in os.environ: - return True +def check_cuda_p2p_support() -> bool: try: - gpu_count = int(os.environ.get("RUNPOD_GPU_COUNT", "1")) + world_size = int(os.environ.get("WORLD_SIZE", "1")) + local_rank = int(os.environ.get("LOCAL_RANK", "0")) except ValueError: return True - if gpu_count >= 2: - # run `nvidia-smi topo -p2p n` and inspect the GPU0 row + + if world_size > 1: + node_world_size = int(os.environ.get("NODE_WORLD_SIZE", "8")) + local_other_rank = (local_rank // node_world_size) * node_world_size + local_other_rank += 1 if (local_rank % node_world_size) == 0 else 0 try: - result = subprocess.run( # nosec B603 B607 - ["nvidia-smi", "topo", "-p2p", "n"], - check=True, - capture_output=True, - text=True, - timeout=5, - ) - except ( - subprocess.CalledProcessError, - FileNotFoundError, - subprocess.TimeoutExpired, - ): - return True # fail-open if detection fails - output_lines = result.stdout.strip().split("\n") - # filter rows that start with "GPU0" (avoid header row) - gpu0_rows = [line for line in output_lines if line.lstrip().startswith("GPU0")] - if not gpu0_rows: + can_p2p = torch.cuda.can_device_access_peer(local_rank, local_other_rank) + except AssertionError as exc: + # some sort of logic error in indexing processes, assume p2p is fine for now + LOG.warning(exc) return True - # consider P2P supported if any OK is present in the GPU0 row - return "OK" in gpu0_rows[-1] + return can_p2p + return True diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index c7fa0a647..f2f8279f3 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -641,6 +641,7 @@ def setup_parallelism_envs(cfg): def prepare_optim_env(cfg): if not check_cuda_p2p_ib_support(): if os.getenv("NCCL_P2P_DISABLE") is None: + LOG.warning("P2P support not detected, setting `NCCL_P2P_DISABLE=1`") os.environ["NCCL_P2P_DISABLE"] = "1" # TODO 
@SalmanMohammadi remove the cfg.fsdp check in 0.12 if cfg.fsdp or cfg.fsdp_config: From 08b8fa62cc27d0c8bd7b8cb9bba91d6fcf9067ac Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 9 Oct 2025 14:18:46 -0400 Subject: [PATCH 081/115] only calculate packed ds length once if using a large world size (#3210) --- src/axolotl/utils/samplers/multipack.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py index d07988613..662c63caa 100644 --- a/src/axolotl/utils/samplers/multipack.py +++ b/src/axolotl/utils/samplers/multipack.py @@ -5,6 +5,7 @@ into fixed-capacity batches to optimize memory usage and training throughput. import gc import math +import os import time from concurrent.futures import ProcessPoolExecutor from multiprocessing import cpu_count, get_context @@ -291,7 +292,10 @@ class MultipackBatchSampler(BatchSampler): self.total_token_slots = 0 # The number of times to calculate batches to determine minimum packed dataset length - self.num_count_samples = num_count_samples + world_size = int(os.environ.get("WORLD_SIZE", "1")) + self.num_count_samples = ( + 1 if world_size >= num_count_samples else num_count_samples + ) if self.sequential and not isinstance(sampler, SequentialSampler): LOG.warning( From 153edcfe7903170ceb9f71e36f2638add455e5c8 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 10 Oct 2025 10:57:50 +0700 Subject: [PATCH 082/115] fix(doc): add act checkpointing migration to fsdp2 docs (#3193) [skip ci] --- docs/multi-gpu.qmd | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/multi-gpu.qmd b/docs/multi-gpu.qmd index fb91f81e5..57a941b04 100644 --- a/docs/multi-gpu.qmd +++ b/docs/multi-gpu.qmd @@ -88,6 +88,7 @@ fsdp_sync_module_states | **REMOVED** fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading fsdp_state_dict_type | state_dict_type fsdp_use_orig_params | **REMOVED** +fsdp_activation_checkpointing | activation_checkpointing For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). 
In Axolotl, if you were using the following FSDP1 config: From bc2ffb8204466d4df0235c5df37a7bbeb2c384d4 Mon Sep 17 00:00:00 2001 From: Hitesh Sagtani Date: Fri, 10 Oct 2025 18:27:00 +0530 Subject: [PATCH 083/115] fix: Enable KD plugin support for PEFT/LoRA adapters (#3207) - Fix _loss_function attribute not found on base model with PEFT - Fix mismatched attribute name (loss_function vs _loss_function) - Set _loss_function on unwrapped base model for PEFT - Enable previously skipped test_llama_lora_kd test - Add test config fixes for LoRA kernel compatibility Fixes https://github.com/axolotl-ai-cloud/axolotl/issues/3206 --- src/axolotl/integrations/kd/kernels/models.py | 4 ++-- src/axolotl/integrations/kd/trainer.py | 11 ++++++++++- tests/e2e/integrations/test_kd.py | 5 ++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/src/axolotl/integrations/kd/kernels/models.py b/src/axolotl/integrations/kd/kernels/models.py index f7b468669..badb3460d 100644 --- a/src/axolotl/integrations/kd/kernels/models.py +++ b/src/axolotl/integrations/kd/kernels/models.py @@ -72,9 +72,9 @@ def kldiv_forward_llama_like( # Only compute necessary logits, and do not upcast them to float if we are not computing the loss # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100 - # self.loss_function should be LigerFusedLinearKLTopKLogprobLoss + # self._loss_function should be LigerFusedLinearKLTopKLogprobLoss - loss = self.loss_function( + loss = self._loss_function( self.lm_head.weight, hidden_states, target_token_ids, diff --git a/src/axolotl/integrations/kd/trainer.py b/src/axolotl/integrations/kd/trainer.py index 7ec43333a..0e98497a7 100644 --- a/src/axolotl/integrations/kd/trainer.py +++ b/src/axolotl/integrations/kd/trainer.py @@ -29,7 +29,8 @@ class AxolotlKDTrainer(AxolotlTrainer): def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) self.model_accepts_loss_kwargs = True - self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss( + + loss_fn = LigerFusedLinearKLTopKLogprobLoss( self.args.kd_ce_alpha, # hard label loss self.args.kd_alpha, # kd loss self.args.kd_temperature, @@ -37,6 +38,14 @@ class AxolotlKDTrainer(AxolotlTrainer): compute_ce_loss=bool(self.args.kd_ce_alpha), normalize_topk=self.args.kd_normalize_topk, ) + target = self.model + + # Unwrap PEFT wrapper + if hasattr(target, "get_base_model"): + target = target.get_base_model() + + # Set on the actual model instance + target._loss_function = loss_fn def _set_signature_columns_if_needed(self): super()._set_signature_columns_if_needed() diff --git a/tests/e2e/integrations/test_kd.py b/tests/e2e/integrations/test_kd.py index ff47b9427..d89044247 100644 --- a/tests/e2e/integrations/test_kd.py +++ b/tests/e2e/integrations/test_kd.py @@ -104,7 +104,6 @@ class TestKnowledgeDistillation: temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high" ) - @pytest.mark.skip(reason="Chunked KD loss doesn't support PEFT/LoRA") @pytest.mark.parametrize( "load_in_8bit", [True, False], @@ -120,6 +119,10 @@ class TestKnowledgeDistillation: "lora_r": 16, "lora_alpha": 32, "lora_dropout": 0.0, + "lora_modules_to_save": ["embed_tokens", "lm_head"], + "lora_mlp_kernel": False, + "lora_qkv_kernel": False, + "lora_o_kernel": False, } | kd_min_cfg ) From 143dea4753fe4a9ff5d9ef0f303e41a32091e355 Mon Sep 17 00:00:00 2001 From: salman Date: Fri, 10 Oct 2025 14:44:25 +0100 Subject: [PATCH 084/115] `FSDPConfig` (#3170) --- examples/llama-3/3b-fp8-fsdp2.yaml | 2 +- 
src/axolotl/core/trainers/base.py | 7 --- src/axolotl/utils/schemas/config.py | 4 +- src/axolotl/utils/schemas/fsdp.py | 71 +++++++++++++++++++++++++ src/axolotl/utils/schemas/validation.py | 2 +- tests/e2e/multigpu/test_llama.py | 3 -- tests/test_normalize_config.py | 4 -- 7 files changed, 75 insertions(+), 18 deletions(-) create mode 100644 src/axolotl/utils/schemas/fsdp.py diff --git a/examples/llama-3/3b-fp8-fsdp2.yaml b/examples/llama-3/3b-fp8-fsdp2.yaml index bea698c0e..b7de7ca52 100644 --- a/examples/llama-3/3b-fp8-fsdp2.yaml +++ b/examples/llama-3/3b-fp8-fsdp2.yaml @@ -29,7 +29,7 @@ flex_attention: true flex_attn_compile_kwargs: dynamic: false mode: max-autotune-no-cudagraphs - +save_strategy: no torch_compile: true wandb_project: diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index 627f8e3f8..11dfecb98 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -560,13 +560,6 @@ class AxolotlTrainer( super().create_accelerator_and_postprocess() - if self.is_fsdp_enabled: - if ( - "limit_all_gathers" in self.args.fsdp_config - and self.args.fsdp_config["limit_all_gathers"] - ): - self.accelerator.state.fsdp_plugin.limit_all_gathers = True - def additional_accelerator_args( self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs ) -> dict[str, Any]: diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 0177b19f6..7cf8c3b4a 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -24,6 +24,7 @@ from axolotl.utils.schemas.datasets import ( ) from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType +from axolotl.utils.schemas.fsdp import FSDPConfig from axolotl.utils.schemas.integrations import ( CometConfig, GradioConfig, @@ -667,8 +668,7 @@ class AxolotlInputConfig( json_schema_extra={"description": "FSDP configuration"}, deprecated="Configuring FSDP using `fsdp` is deprecated. Please use `fsdp_config` instead. 
", ) - # TODO @SalmanMohammadi strongly type this as its own schema - fsdp_config: dict[str, Any] | None = Field( + fsdp_config: FSDPConfig | None = Field( default=None, json_schema_extra={"description": "FSDP configuration options"} ) fsdp_version: int | None = Field( diff --git a/src/axolotl/utils/schemas/fsdp.py b/src/axolotl/utils/schemas/fsdp.py new file mode 100644 index 000000000..f34f40e8e --- /dev/null +++ b/src/axolotl/utils/schemas/fsdp.py @@ -0,0 +1,71 @@ +""" +FSDP Configuration Schema +""" + +from typing import Literal + +from pydantic import BaseModel, Field + + +class FSDPConfig(BaseModel): + """ + FSDP Configuration Schema + """ + + activation_checkpointing: bool | None = Field( + default=None, + description="Enable activation checkpointing to reduce memory usage during forward passes", + ) + offload_params: bool | None = Field( + default=None, + description="Offload parameters to CPU to reduce GPU memory usage", + ) + sync_module_states: bool | None = Field( + default=None, + description="Synchronize module states across all processes", + ) + cpu_ram_efficient_loading: bool | None = Field( + default=None, + description="Enable CPU RAM efficient loading to reduce memory usage during model loading", + ) + cpu_offload_pin_memory: bool | None = Field( + default=None, + description="Disabling this enables swap memory usage for resource-constrained setups when offload_params is enabled.", + ) + use_orig_params: bool | None = Field( + default=None, + description="Use original parameters instead of flattened parameters", + ) + + state_dict_type: ( + Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None + ) = Field( + default=None, + description="Type of state dict to use for saving/loading checkpoints", + ) + final_state_dict_type: ( + Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None + ) = Field( + default=None, + description="Final state dict type to use after training completion", + ) + + auto_wrap_policy: Literal["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP"] | None = ( + Field( + default=None, + description="Policy for automatically wrapping modules with FSDP", + ) + ) + transformer_layer_cls_to_wrap: str | None = Field( + default=None, + description="Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')", + ) + + reshard_after_forward: bool | None = Field( + default=None, + description="Reshard parameters after forward pass to save memory", + ) + mixed_precision_policy: str | None = Field( + default=None, + description="Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')", + ) diff --git a/src/axolotl/utils/schemas/validation.py b/src/axolotl/utils/schemas/validation.py index 4abe45e64..368976831 100644 --- a/src/axolotl/utils/schemas/validation.py +++ b/src/axolotl/utils/schemas/validation.py @@ -881,7 +881,7 @@ class OptimizationValidationMixin: and self.fsdp_config and self.optimizer and "8bit" in self.optimizer.value - and self.fsdp_config["offload_params"] + and self.fsdp_config.offload_params and str(self.fsdp_version) != "2" ): raise ValueError( diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index b836291e5..ffdbad942 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -353,7 +353,6 @@ class TestMultiGPULlama: "auto_wrap", ], "fsdp_config": { - "fsdp_limit_all_gathers": True, "fsdp_offload_params": False, "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, @@ -431,7 +430,6 @@ class TestMultiGPULlama: "auto_wrap", ], 
"fsdp_config": { - "fsdp_limit_all_gathers": True, "fsdp_offload_params": False, "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, @@ -595,7 +593,6 @@ class TestMultiGPULlama: "auto_wrap", ], "fsdp_config": { - "fsdp_limit_all_gathers": True, "fsdp_offload_params": False, "fsdp_sync_module_states": True, "fsdp_use_orig_params": False, diff --git a/tests/test_normalize_config.py b/tests/test_normalize_config.py index 658e06fcb..f0d3a2d72 100644 --- a/tests/test_normalize_config.py +++ b/tests/test_normalize_config.py @@ -111,7 +111,6 @@ class NormalizeConfigTestCase(unittest.TestCase): "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP", "fsdp_offload_params": False, "fsdp_cpu_ram_efficient_loading": True, - "regular_param": "value", } } ) @@ -124,7 +123,6 @@ class NormalizeConfigTestCase(unittest.TestCase): ) self.assertEqual(cfg_with_version.fsdp_config.offload_params, False) self.assertEqual(cfg_with_version.fsdp_config.cpu_ram_efficient_loading, True) - self.assertEqual(cfg_with_version.fsdp_config.regular_param, "value") self.assertNotIn("fsdp_auto_wrap_policy", cfg_with_version.fsdp_config) self.assertNotIn("fsdp_offload_params", cfg_with_version.fsdp_config) @@ -137,7 +135,6 @@ class NormalizeConfigTestCase(unittest.TestCase): "fsdp_config": { "fsdp_auto_wrap_policy": "SIZE_BASED_WRAP", "fsdp_offload_params": True, - "regular_param": "value", } } ) @@ -149,7 +146,6 @@ class NormalizeConfigTestCase(unittest.TestCase): cfg_without_version.fsdp_config.auto_wrap_policy, "SIZE_BASED_WRAP" ) self.assertEqual(cfg_without_version.fsdp_config.offload_params, True) - self.assertEqual(cfg_without_version.fsdp_config.regular_param, "value") self.assertNotIn("fsdp_auto_wrap_policy", cfg_without_version.fsdp_config) self.assertNotIn("fsdp_offload_params", cfg_without_version.fsdp_config) From cd856b45b168b41f2286f59efe13facc55a36eb5 Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Mon, 13 Oct 2025 15:48:12 +0530 Subject: [PATCH 085/115] feat:add support dataset_num_processes (#3129) [skip ci] * feat:add support dataset_num_processes * chore * required changes * requested chnages * required chnages * required changes * required changes * elif get_default_process_count() * add:del data * Update cicd/Dockerfile.jinja Co-authored-by: NanoCode012 * Update cicd/single_gpu.py Co-authored-by: NanoCode012 --------- Co-authored-by: salman Co-authored-by: NanoCode012 --- cicd/Dockerfile.jinja | 2 +- cicd/single_gpu.py | 2 +- devtools/dev_chat_template.yml | 2 +- docs/debugging.qmd | 4 +-- src/axolotl/core/builders/base.py | 4 +-- src/axolotl/utils/data/rl.py | 4 +-- src/axolotl/utils/data/shared.py | 2 +- src/axolotl/utils/data/utils.py | 2 +- src/axolotl/utils/data/wrappers.py | 2 +- src/axolotl/utils/datasets.py | 2 ++ src/axolotl/utils/schemas/config.py | 31 ++++++++++++++++--- src/axolotl/utils/trainer.py | 10 +++--- tests/core/test_builders.py | 2 +- .../patched/test_activation_checkpointing.py | 2 +- tests/e2e/test_llama_pretrain.py | 2 +- tests/test_datasets.py | 14 ++++----- tests/test_exact_deduplication.py | 2 +- tests/test_packed_dataset.py | 2 +- 18 files changed, 57 insertions(+), 34 deletions(-) diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index 94c9a67e3..6a1ddb66d 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -9,7 +9,7 @@ ENV GITHUB_REF="{{ GITHUB_REF }}" ENV GITHUB_SHA="{{ GITHUB_SHA }}" ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}" ENV HF_HOME="{{ HF_HOME }}" -ENV AXOLOTL_DATASET_PROCESSES="8" +ENV 
AXOLOTL_DATASET_NUM_PROC="8" RUN apt-get update && \ apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index 3bca5806f..cd73f60b8 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -65,7 +65,7 @@ def run_cmd(cmd: str, run_folder: str): import subprocess # nosec sp_env = os.environ.copy() - sp_env["AXOLOTL_DATASET_PROCESSES"] = "8" + sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8" # Propagate errors from subprocess. try: diff --git a/devtools/dev_chat_template.yml b/devtools/dev_chat_template.yml index 27dc9be1a..32d5e56a0 100644 --- a/devtools/dev_chat_template.yml +++ b/devtools/dev_chat_template.yml @@ -13,7 +13,7 @@ datasets: val_set_size: 0 output_dir: temp_debug/axolotl_outputs/model dataset_prepared_path: temp_debug/axolotl_outputs/data -dataset_processes: 1 +dataset_num_proc: 1 sequence_len: 4096 sample_packing: false diff --git a/docs/debugging.qmd b/docs/debugging.qmd index bf3c6fe7e..04b4faa64 100644 --- a/docs/debugging.qmd +++ b/docs/debugging.qmd @@ -29,7 +29,7 @@ While debugging it's helpful to simplify your test scenario as much as possible. 1. **Make sure you are using the latest version of axolotl**: This project changes often and bugs get fixed fast. Check your git branch and make sure you have pulled the latest changes from `main`. 1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing: - Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`. - - Set `dataset_processes: 1` in your axolotl config or run the training command with `--dataset_processes=1`. + - Set `dataset_num_proc: 1` in your axolotl config or run the training command with `--dataset_num_proc=1`. 2. **Use a small dataset**: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure `sample_packing: False` and `eval_sample_packing: False` to avoid errors. If you are in a pinch and don't have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training. For example, to shard the dataset into 20 pieces, add the following to your axolotl config): ```yaml @@ -101,7 +101,7 @@ For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 acceler "-m", "axolotl.cli.train", "dev_chat_template.yml", // The flags below simplify debugging by overriding the axolotl config // with the debugging tips above. Modify as needed. 
- "--dataset_processes=1", // limits data preprocessing to one process + "--dataset_num_proc=1", // limits data preprocessing to one process "--max_steps=1", // limits training to just one step "--batch_size=1", // minimizes batch size "--micro_batch_size=1", // minimizes batch size diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py index 3ad8012f9..8c86e335e 100644 --- a/src/axolotl/core/builders/base.py +++ b/src/axolotl/core/builders/base.py @@ -491,6 +491,7 @@ class TrainerBuilderBase(abc.ABC): "dion_momentum", "dion_rank_fraction", "dion_rank_multiple_of", + "dataset_num_proc", ]: if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None: training_args_kwargs[arg] = getattr(self.cfg, arg) @@ -514,9 +515,6 @@ class TrainerBuilderBase(abc.ABC): training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1 training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs - if self.cfg.dataset_processes: - training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes - # max_length is not used in CausalTrainer if self.cfg.reward_model or self.cfg.rl: training_args_kwargs["max_length"] = self.cfg.sequence_len diff --git a/src/axolotl/utils/data/rl.py b/src/axolotl/utils/data/rl.py index d371c9acb..f7a5ec04c 100644 --- a/src/axolotl/utils/data/rl.py +++ b/src/axolotl/utils/data/rl.py @@ -113,7 +113,7 @@ def _map_dataset( dataset = dataset.map( ds_transform_fn, - num_proc=cfg.dataset_processes, + num_proc=cfg.dataset_num_proc, load_from_cache_file=not cfg.is_preprocess, desc="Mapping RL Dataset", **map_kwargs, @@ -234,7 +234,7 @@ def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset: prior_len = len(split_datasets[i]) split_datasets[i] = split_datasets[i].filter( drop_long, - num_proc=cfg.dataset_processes, + num_proc=cfg.dataset_num_proc, load_from_cache_file=not cfg.is_preprocess, desc="Dropping Long Sequences", ) diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py index 6b6e0e281..c9a91b829 100644 --- a/src/axolotl/utils/data/shared.py +++ b/src/axolotl/utils/data/shared.py @@ -409,7 +409,7 @@ def save_preprocessed_dataset( ) -> None: """Save preprocessed dataset to disk and optionally push to the HF Hub.""" prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash) - num_workers = cfg.dataset_processes or get_default_process_count() + num_workers = cfg.dataset_num_proc or get_default_process_count() if isinstance(dataset, IterableDataset): ds_from_iter = Dataset.from_generator( functools.partial(_generate_from_iterable_dataset, dataset), diff --git a/src/axolotl/utils/data/utils.py b/src/axolotl/utils/data/utils.py index 445a65d6c..2d0ca9d0e 100644 --- a/src/axolotl/utils/data/utils.py +++ b/src/axolotl/utils/data/utils.py @@ -223,7 +223,7 @@ def handle_long_seq_in_dataset( filter_map_kwargs = {} if not isinstance(dataset, IterableDataset): - filter_map_kwargs["num_proc"] = cfg.dataset_processes + filter_map_kwargs["num_proc"] = cfg.dataset_num_proc filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess drop_long_kwargs = {} diff --git a/src/axolotl/utils/data/wrappers.py b/src/axolotl/utils/data/wrappers.py index cb9e2c6b4..3a10bde00 100644 --- a/src/axolotl/utils/data/wrappers.py +++ b/src/axolotl/utils/data/wrappers.py @@ -80,7 +80,7 @@ def get_dataset_wrapper( """ # Common parameters for dataset wrapping dataset_kwargs: dict[str, Any] = { - "process_count": cfg.dataset_processes, + "process_count": cfg.dataset_num_proc, "keep_in_memory": 
cfg.dataset_keep_in_memory is True, } diff --git a/src/axolotl/utils/datasets.py b/src/axolotl/utils/datasets.py index 93e1a2416..9b8a8e25a 100644 --- a/src/axolotl/utils/datasets.py +++ b/src/axolotl/utils/datasets.py @@ -4,6 +4,8 @@ import os def get_default_process_count(): + if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"): + return int(axolotl_dataset_num_proc) if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"): return int(axolotl_dataset_processes) if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"): diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 7cf8c3b4a..4d1d0aab2 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -234,6 +234,7 @@ class AxolotlInputConfig( ) dataset_processes: int | None = Field( default=None, + deprecated="Use `dataset_num_proc` instead. This parameter will be removed in a future version.", json_schema_extra={ "description": ( "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n" @@ -241,6 +242,16 @@ class AxolotlInputConfig( ) }, ) + dataset_num_proc: int | None = Field( + default=None, + json_schema_extra={ + "description": ( + "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n" + "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT." + ) + }, + ) + dataset_exact_deduplication: bool | None = Field( default=None, json_schema_extra={ @@ -1314,10 +1325,22 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig): @model_validator(mode="before") @classmethod - def default_dataset_processes(cls, data): - if data.get("dataset_processes") is None: - data["dataset_processes"] = get_default_process_count() - + def default_dataset_num_proc(cls, data): + if data.get("dataset_processes") is not None: + if data.get("dataset_num_proc") is None: + data["dataset_num_proc"] = data["dataset_processes"] + LOG.warning( + "dataset_processes is deprecated and will be removed in a future version. " + "Please use dataset_num_proc instead." + ) + else: + LOG.warning( + "Both dataset_processes and dataset_num_proc are set. " + "Using dataset_num_proc and ignoring dataset_processes." 
+ ) + del data["dataset_processes"] + elif data.get("dataset_num_proc") is None: + data["dataset_num_proc"] = get_default_process_count() return data @model_validator(mode="before") diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index f2f8279f3..d97577d86 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -278,7 +278,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset): prior_len = None filter_map_kwargs = {} if not isinstance(train_dataset, IterableDataset): - filter_map_kwargs["num_proc"] = cfg.dataset_processes + filter_map_kwargs["num_proc"] = cfg.dataset_num_proc filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess drop_long_kwargs = {} @@ -318,7 +318,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset): if cfg.group_by_length: train_dataset = train_dataset.map( add_length, - num_proc=cfg.dataset_processes, + num_proc=cfg.dataset_num_proc, load_from_cache_file=not cfg.is_preprocess, desc="Group By Length", ) @@ -335,7 +335,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset): ) train_dataset = train_dataset.map( pose_fn, - num_proc=cfg.dataset_processes, + num_proc=cfg.dataset_num_proc, load_from_cache_file=not cfg.is_preprocess, desc="Add position_id column (PoSE)", ) @@ -344,7 +344,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset): if eval_dataset: eval_dataset = eval_dataset.map( pose_fn, - num_proc=cfg.dataset_processes, + num_proc=cfg.dataset_num_proc, load_from_cache_file=not cfg.is_preprocess, desc="Add position_id column (PoSE)", ) @@ -469,7 +469,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True): bin_size=cfg.sample_packing_bin_size, sequential=cfg.sample_packing_sequentially, drop_last=True, - num_processes=cfg.dataset_processes, + num_processes=cfg.dataset_num_proc, mp_start_method=cfg.sample_packing_mp_start_method or "fork", ) diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py index 6428aa977..67481b2ad 100644 --- a/tests/core/test_builders.py +++ b/tests/core/test_builders.py @@ -440,7 +440,7 @@ def rand_reward_func(prompts, completions) -> list[float]: ] else: raise ValueError(f"Unhandled cfg_string: {cfg_string}") - cfg["dataset_processes"] = 4 + cfg["dataset_num_proc"] = 4 if cfg_string == "grpo_cfg": rewards_dir = tmp_path / "rewards_test" diff --git a/tests/e2e/patched/test_activation_checkpointing.py b/tests/e2e/patched/test_activation_checkpointing.py index ddace8ef1..e8006c162 100644 --- a/tests/e2e/patched/test_activation_checkpointing.py +++ b/tests/e2e/patched/test_activation_checkpointing.py @@ -69,7 +69,7 @@ class TestActivationCheckpointing: "save_safetensors": True, "gradient_checkpointing": gradient_checkpointing, "save_first_step": False, - "dataset_processes": 4, + "dataset_num_proc": 4, } ) diff --git a/tests/e2e/test_llama_pretrain.py b/tests/e2e/test_llama_pretrain.py index a041244e7..f0daa9dd6 100644 --- a/tests/e2e/test_llama_pretrain.py +++ b/tests/e2e/test_llama_pretrain.py @@ -29,7 +29,7 @@ class TestPretrainLlama: "sequence_len": 1024, "sample_packing": sample_packing, "pretrain_multipack_attn": pretrain_multipack_attn, - "dataset_processes": 1, + "dataset_num_proc": 1, "special_tokens": { "pad_token": "<|endoftext|>", }, diff --git a/tests/test_datasets.py b/tests/test_datasets.py index ea5ee368d..bd1c8f2c2 100644 --- a/tests/test_datasets.py +++ b/tests/test_datasets.py @@ -141,7 +141,7 @@ class TestDatasetPreparation: "type": "alpaca", }, ], - 
"dataset_processes": 4, + "dataset_num_proc": 4, } ) @@ -180,7 +180,7 @@ class TestDatasetPreparation: "type": "alpaca", }, ], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) @@ -219,7 +219,7 @@ class TestDatasetPreparation: "type": "alpaca", }, ], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) @@ -252,7 +252,7 @@ class TestDatasetPreparation: "type": "alpaca", }, ], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) @@ -285,7 +285,7 @@ class TestDatasetPreparation: "type": "alpaca", }, ], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) @@ -370,7 +370,7 @@ class TestDatasetPreparation: "rl": "dpo", "chat_template": "llama3", "datasets": [ALPACA_MESSAGES_CONFIG_REVISION], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) @@ -471,7 +471,7 @@ class TestDatasetPreparation: "type": "alpaca", }, ], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) diff --git a/tests/test_exact_deduplication.py b/tests/test_exact_deduplication.py index 65deb5209..a519db525 100644 --- a/tests/test_exact_deduplication.py +++ b/tests/test_exact_deduplication.py @@ -210,7 +210,7 @@ class TestDeduplicateRLDataset: ALPACA_MESSAGES_CONFIG_REVISION, ALPACA_MESSAGES_CONFIG_REVISION, ], - "dataset_processes": 4, + "dataset_num_proc": 4, } ) yield fixture diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py index 64f314e2e..953d523af 100644 --- a/tests/test_packed_dataset.py +++ b/tests/test_packed_dataset.py @@ -55,7 +55,7 @@ class TestPacking(unittest.TestCase): "type": "alpaca", }, ], - "dataset_processes": 4, + "dataset_num_proc": 4, "num_epochs": 1, "max_steps": 20, "save_steps": 10, From 8c7f63cf971fbaf5627f770881a38c99816da631 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 13 Oct 2025 17:19:15 +0700 Subject: [PATCH 086/115] fix: unpack cce imported incorrectly (#3212) [skip ci] --- examples/colab-notebooks/colab-axolotl-example.ipynb | 2 +- scripts/cutcrossentropy_install.py | 2 +- src/axolotl/integrations/cut_cross_entropy/README.md | 2 +- src/axolotl/integrations/cut_cross_entropy/__init__.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/colab-notebooks/colab-axolotl-example.ipynb b/examples/colab-notebooks/colab-axolotl-example.ipynb index ee99c283f..cea1aeda0 100644 --- a/examples/colab-notebooks/colab-axolotl-example.ipynb +++ b/examples/colab-notebooks/colab-axolotl-example.ipynb @@ -40,7 +40,7 @@ "%%capture\n", "# This step can take ~5-10 minutes to install dependencies\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", - "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308\"" + "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec\"" ] }, { diff --git a/scripts/cutcrossentropy_install.py b/scripts/cutcrossentropy_install.py index cf8bd57e7..cb498c002 100644 --- a/scripts/cutcrossentropy_install.py +++ b/scripts/cutcrossentropy_install.py @@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else "" print( UNINSTALL_PREFIX - + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308"' + + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"' ) diff --git a/src/axolotl/integrations/cut_cross_entropy/README.md b/src/axolotl/integrations/cut_cross_entropy/README.md index 08cd41200..5c7c5166b 100644 --- 
a/src/axolotl/integrations/cut_cross_entropy/README.md +++ b/src/axolotl/integrations/cut_cross_entropy/README.md @@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh - If you are installing from pip ```bash -pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308" +pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec" ``` ## Usage diff --git a/src/axolotl/integrations/cut_cross_entropy/__init__.py b/src/axolotl/integrations/cut_cross_entropy/__init__.py index ed6ebe62a..bd0124b93 100644 --- a/src/axolotl/integrations/cut_cross_entropy/__init__.py +++ b/src/axolotl/integrations/cut_cross_entropy/__init__.py @@ -35,7 +35,7 @@ LOG = get_logger(__name__) _CCE_INSTALL_MESSAGE = ( "Please install Axolotl's fork of cut_cross_entropy with transformers support using " - '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@49f3308"`' + '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@8a1a0ec"`' ) From 6e2f5ccf9f03040e5de3252999aa0733fc88261b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 14 Oct 2025 10:21:49 -0400 Subject: [PATCH 087/115] chore: update pre-commit hooks (#3211) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index e853243cd..0e455f52c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.13.3 + rev: v0.14.0 hooks: - id: ruff args: [--fix] From 4cdfdfebb51d6a53d4468c6512b75c500ab85293 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 14 Oct 2025 15:54:05 -0400 Subject: [PATCH 088/115] upgrade transformers==4.57.1 and peft==0.23.1 (#3214) --- requirements.txt | 4 ++-- tests/e2e/multigpu/test_llama.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 9c56638a3..e1f1b10a5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -13,11 +13,11 @@ packaging==23.2 huggingface_hub>=0.33.0 peft>=0.17.1 tokenizers>=0.21.1 -transformers==4.57.0 +transformers==4.57.1 accelerate==1.10.1 datasets==4.0.0 deepspeed>=0.17.0 -trl==0.23.0 +trl==0.23.1 hf_xet==1.1.5 kernels==0.9.0 trackio diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py index ffdbad942..3383e71d1 100644 --- a/tests/e2e/multigpu/test_llama.py +++ b/tests/e2e/multigpu/test_llama.py @@ -546,7 +546,6 @@ class TestMultiGPULlama: temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high" ) - @pytest.mark.skip("regression failure from v4.57.0") def test_fsdp_qlora_prequant_packed(self, temp_dir): cfg = DictDefault( { From aa1240acd8d7e9640a01a78e9da8a0725b158041 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 16 Oct 2025 16:07:27 +0700 Subject: [PATCH 089/115] fix: transformers deprecate load_in_Xbit in model_kwargs (#3205) * fix: transformers deprecate load_in_Xbit in model_kwargs * fix: test to read from quantization_config kwarg * fix: test * fix: access * fix: test weirdly entering incorrect config --- 
src/axolotl/loaders/model.py | 16 ++-------------- tests/test_loaders.py | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 23 deletions(-) diff --git a/src/axolotl/loaders/model.py b/src/axolotl/loaders/model.py index f438d6b61..aeec46584 100644 --- a/src/axolotl/loaders/model.py +++ b/src/axolotl/loaders/model.py @@ -515,9 +515,6 @@ class ModelLoader: if self.cfg.model_quantization_config_kwargs: mxfp4_kwargs = self.cfg.model_quantization_config_kwargs self.model_kwargs["quantization_config"] = Mxfp4Config(**mxfp4_kwargs) - else: - self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit - self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit if self.cfg.gptq: if not hasattr(self.model_config, "quantization_config"): @@ -552,9 +549,7 @@ class ModelLoader: self.model_kwargs["quantization_config"] = BitsAndBytesConfig( **self.model_config.quantization_config ) - elif self.cfg.adapter == "qlora" and self.model_kwargs.get( - "load_in_4bit", False - ): + elif self.cfg.adapter == "qlora" and self.cfg.load_in_4bit: bnb_config = { "load_in_4bit": True, "llm_int8_threshold": 6.0, @@ -580,9 +575,7 @@ class ModelLoader: self.model_kwargs["quantization_config"] = BitsAndBytesConfig( **bnb_config, ) - elif self.cfg.adapter == "lora" and self.model_kwargs.get( - "load_in_8bit", False - ): + elif self.cfg.adapter == "lora" and self.cfg.load_in_8bit: bnb_config = { "load_in_8bit": True, } @@ -596,11 +589,6 @@ class ModelLoader: **bnb_config, ) - # no longer needed per https://github.com/huggingface/transformers/pull/26610 - if "quantization_config" in self.model_kwargs or self.cfg.gptq: - self.model_kwargs.pop("load_in_8bit", None) - self.model_kwargs.pop("load_in_4bit", None) - def _set_attention_config(self): """Sample packing uses custom FA2 patch""" if self.cfg.attn_implementation: diff --git a/tests/test_loaders.py b/tests/test_loaders.py index f516d0ca4..913090566 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -80,16 +80,26 @@ class TestModelsUtils: hasattr(self.model_loader.model_kwargs, "load_in_8bit") and hasattr(self.model_loader.model_kwargs, "load_in_4bit") ) - elif load_in_8bit and self.cfg.adapter is not None: - assert self.model_loader.model_kwargs["load_in_8bit"] - elif load_in_4bit and self.cfg.adapter is not None: - assert self.model_loader.model_kwargs["load_in_4bit"] - if (self.cfg.adapter == "qlora" and load_in_4bit) or ( - self.cfg.adapter == "lora" and load_in_8bit - ): - assert self.model_loader.model_kwargs.get( - "quantization_config", BitsAndBytesConfig + if self.cfg.adapter == "qlora" and load_in_4bit: + assert isinstance( + self.model_loader.model_kwargs.get("quantization_config"), + BitsAndBytesConfig, + ) + + assert ( + self.model_loader.model_kwargs["quantization_config"]._load_in_4bit + is True + ) + if self.cfg.adapter == "lora" and load_in_8bit: + assert isinstance( + self.model_loader.model_kwargs.get("quantization_config"), + BitsAndBytesConfig, + ) + + assert ( + self.model_loader.model_kwargs["quantization_config"]._load_in_8bit + is True ) def test_message_property_mapping(self): From 93ba57396f103778dc4e02cb954bdb46ef155fe2 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Fri, 17 Oct 2025 10:35:03 +0700 Subject: [PATCH 090/115] fix: qwen3_vl attention config (#3216) --- src/axolotl/monkeypatch/lora_kernels.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/axolotl/monkeypatch/lora_kernels.py b/src/axolotl/monkeypatch/lora_kernels.py index e845dc6ce..8e335fe4c 100644 --- a/src/axolotl/monkeypatch/lora_kernels.py 
+++ b/src/axolotl/monkeypatch/lora_kernels.py @@ -134,6 +134,11 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]: return Qwen2Attention + if model_type == "qwen3_vl": + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention + + return Qwen3VLTextAttention + if model_type == "mllama": from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention From 87565ecc05f1b8fd1f8b907dd750d3a5d09adf9a Mon Sep 17 00:00:00 2001 From: Leonard Date: Fri, 17 Oct 2025 19:00:26 +0900 Subject: [PATCH 091/115] Add chat_template.argilla_chat support for DPO datasets (#3202) * Add chat_template.argilla_chat support for DPO datasets Creates a new chat_template.argilla_chat prompt strategy for handling DPO datasets where chosen/rejected fields contain full conversations (messages + final response), following the pattern of chatml.argilla_chat and llama3.argilla_chat. - Add argilla_chat() function to chat_template.py - Add chat_template.argilla_chat to RLHF documentation - Add test coverage for argilla_chat with multiple tokenizers Dataset format: { "chosen": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ], "rejected": [ {"role": "user", "content": "..."}, {"role": "assistant", "content": "..."} ] } * Fix chat_template.argilla_chat return value contract and add docstring - Return (transform_fn, dataset_kwargs) tuple instead of bare transform_fn - Add remove_columns specification for field_chosen and field_rejected - Add comprehensive docstring with Args/Returns sections - Update tests to unpack tuple return value Addresses PR feedback to maintain consistency with chat_template.default() and properly specify columns to remove after dataset transformation. * Update tests/prompt_strategies/test_dpo_chat_templates.py Co-authored-by: Wing Lian --------- Co-authored-by: Wing Lian --- docs/rlhf.qmd | 15 +++ .../prompt_strategies/dpo/chat_template.py | 120 ++++++++++++++++++ .../test_dpo_chat_templates.py | 78 +++++++++++- 3 files changed, 212 insertions(+), 1 deletion(-) diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd index 4a67b7559..594ebc743 100644 --- a/docs/rlhf.qmd +++ b/docs/rlhf.qmd @@ -219,6 +219,21 @@ DPO supports the following types with the following dataset format: } ``` +#### chat_template.argilla_chat + +```json +{ + "chosen": [ + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."} + ], + "rejected": [ + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."} + ] +} +``` + #### chat_template.default ```yaml diff --git a/src/axolotl/prompt_strategies/dpo/chat_template.py b/src/axolotl/prompt_strategies/dpo/chat_template.py index 85c4d2182..58b4d75bd 100644 --- a/src/axolotl/prompt_strategies/dpo/chat_template.py +++ b/src/axolotl/prompt_strategies/dpo/chat_template.py @@ -120,3 +120,123 @@ def default(cfg, dataset_idx=0, **kwargs): return result return transform_fn, {"remove_columns": [field_messages]} + + +def argilla_chat(cfg, dataset_idx=0, **kwargs): + """ + DPO chat template strategy for argilla-style datasets. + + For argilla-style datasets where chosen/rejected contain full conversations + instead of single response messages. Extracts the conversation history from + the chosen field and formats both chosen/rejected responses using the + configured chat template. 
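Before the Args section below, a quick usage sketch of the new strategy. It mirrors the tests added later in this patch; the tokenizer checkpoint name is illustrative, not part of the commit:

```python
from transformers import AutoTokenizer

from axolotl.prompt_strategies.dpo.chat_template import argilla_chat
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "chat_template": "llama3",
        "datasets": [{"type": "chat_template.argilla_chat"}],
    }
)
# ds_kwargs == {"remove_columns": ["chosen", "rejected"]}
transform_fn, ds_kwargs = argilla_chat(cfg)

sample = {
    "chosen": [
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "goodbye"},
    ],
    "rejected": [
        {"role": "user", "content": "hello"},
        {"role": "assistant", "content": "party on"},
    ],
}

# Any Llama-3-style chat tokenizer works; this checkpoint name is an assumption.
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
result = transform_fn(sample, tokenizer=tokenizer)
# result["prompt"] ends with the assistant generation header, while
# result["chosen"] / result["rejected"] hold only the response texts,
# e.g. "goodbye<|eot_id|>" and "party on<|eot_id|>" for llama3.
```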
+ + Args: + cfg: Configuration object containing chat_template and dataset settings + dataset_idx: Index of the dataset in the config (default: 0) + **kwargs: Additional keyword arguments (unused) + + Returns: + tuple: (transform_fn, dataset_kwargs) where: + - transform_fn: Function to transform dataset samples + - dataset_kwargs: Dict with 'remove_columns' specifying columns to drop + + Dataset format: + { + "chosen": [ + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."} + ], + "rejected": [ + {"role": "user", "content": "..."}, + {"role": "assistant", "content": "..."} + ] + } + """ + ds_cfg = cfg["datasets"][dataset_idx] + ds_cfg = handle_legacy_message_fields_logic(ds_cfg) + + chat_template_choice, chat_template_jinja = extract_chat_template_args( + cfg=cfg, ds_cfg=ds_cfg + ) + field_chosen = ds_cfg.get("field_chosen", "chosen") + field_rejected = ds_cfg.get("field_rejected", "rejected") + message_property_mappings = ds_cfg.get( + "message_property_mappings", + { + "role": "role", + "content": "content", + }, + ) + role_map_inv = ds_cfg.get( + "roles", + { + "user": ["user"], + "assistant": ["assistant"], + "system": ["system"], + }, + ) + role_map = {} + for target, sources in role_map_inv.items(): + for source in sources: + role_map[source] = target + + def transform_fn(sample, tokenizer=None): + chat_template_string = get_chat_template( + user_choice=chat_template_choice, + jinja_template=chat_template_jinja, + tokenizer=tokenizer, + ) + + chosen_raw = sample[field_chosen] + rejected_raw = sample[field_rejected] + + # Extract messages (all but last) and responses (last message) + chosen_messages = [ + { + "role": role_map[m[message_property_mappings["role"]]], + "content": m[message_property_mappings["content"]], + } + for m in chosen_raw[:-1] + ] + chosen_response = { + "role": role_map[chosen_raw[-1][message_property_mappings["role"]]], + "content": chosen_raw[-1][message_property_mappings["content"]], + } + + rejected_response = { + "role": role_map[rejected_raw[-1][message_property_mappings["role"]]], + "content": rejected_raw[-1][message_property_mappings["content"]], + } + + dummy_user_message = {"role": "user", "content": "[[dummy_message]]"} + + result = {} + result["prompt"] = tokenizer.apply_chat_template( + chosen_messages, + add_generation_prompt=True, + chat_template=chat_template_string, + tokenize=False, + ) + + result["chosen"] = tokenizer.apply_chat_template( + [dummy_user_message, chosen_response], + add_generation_prompt=False, + chat_template=chat_template_string, + tokenize=False, + ) + chosen_strip_index = result["chosen"].find(chosen_response["content"]) + result["chosen"] = result["chosen"][chosen_strip_index:].rstrip() + + result["rejected"] = tokenizer.apply_chat_template( + [dummy_user_message, rejected_response], + add_generation_prompt=False, + chat_template=chat_template_string, + tokenize=False, + ) + rejected_strip_index = result["rejected"].find(rejected_response["content"]) + result["rejected"] = result["rejected"][rejected_strip_index:].rstrip() + + return result + + return transform_fn, {"remove_columns": [field_chosen, field_rejected]} diff --git a/tests/prompt_strategies/test_dpo_chat_templates.py b/tests/prompt_strategies/test_dpo_chat_templates.py index e570cfc9d..b5c121726 100644 --- a/tests/prompt_strategies/test_dpo_chat_templates.py +++ b/tests/prompt_strategies/test_dpo_chat_templates.py @@ -8,7 +8,7 @@ import pytest from datasets import Dataset from transformers import AutoTokenizer -from 
axolotl.prompt_strategies.dpo.chat_template import default +from axolotl.prompt_strategies.dpo.chat_template import argilla_chat, default from axolotl.utils.dict import DictDefault from tests.hf_offline_utils import enable_hf_offline @@ -78,6 +78,36 @@ def fixture_custom_assistant_dataset(): ) +@pytest.fixture(name="argilla_chat_dataset") +def fixture_argilla_chat_dataset(): + return Dataset.from_list( + [ + { + "chosen": [ + { + "role": "user", + "content": "hello", + }, + { + "role": "assistant", + "content": "goodbye", + }, + ], + "rejected": [ + { + "role": "user", + "content": "hello", + }, + { + "role": "assistant", + "content": "party on", + }, + ], + } + ] + ) + + @pytest.fixture(name="phi3_tokenizer") @enable_hf_offline def fixture_phi3_tokenizer(): @@ -216,5 +246,51 @@ class TestAssistantDPOChatTemplateGemma: assert result["rejected"] == "party on" +class TestArgillaChatDPOChatTemplate: + """ + Test class for argilla_chat style datasets (chosen/rejected contain full conversations). + """ + + def test_llama3_argilla_chat(self, llama3_tokenizer, argilla_chat_dataset): + transform_fn, _ = argilla_chat( + DictDefault( + { + "chat_template": "llama3", + "datasets": [ + { + "type": "chat_template.argilla_chat", + } + ], + } + ) + ) + result = transform_fn(argilla_chat_dataset[0], tokenizer=llama3_tokenizer) + assert result["prompt"] == ( + "<|begin_of_text|>" + + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>" + + "<|start_header_id|>assistant<|end_header_id|>\n\n" + ) + assert result["chosen"] == "goodbye<|eot_id|>" + assert result["rejected"] == "party on<|eot_id|>" + + def test_phi3_argilla_chat(self, phi3_tokenizer, argilla_chat_dataset): + transform_fn, _ = argilla_chat( + DictDefault( + { + "chat_template": "tokenizer_default", + "datasets": [ + { + "type": "chat_template.argilla_chat", + } + ], + } + ) + ) + result = transform_fn(argilla_chat_dataset[0], tokenizer=phi3_tokenizer) + assert result["prompt"] == "<|user|>\nhello<|end|>\n" + "<|assistant|>\n" + assert result["chosen"] == "goodbye<|end|>" + assert result["rejected"] == "party on<|end|>" + + if __name__ == "__main__": unittest.main() From 8bb871b5cf0810fd4034069821250d718db366ca Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 20 Oct 2025 14:06:58 +0700 Subject: [PATCH 092/115] fix: deepspeed with context parallel (#3220) --- .../monkeypatch/transformers/trainer_context_parallel.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py index 74a35e83f..ba8b16dda 100644 --- a/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py +++ b/src/axolotl/monkeypatch/transformers/trainer_context_parallel.py @@ -13,9 +13,7 @@ from axolotl.utils.logging import get_logger LOG = get_logger(__name__) GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":' -PATCHED_GUARD = ( - 'if model.config._attn_implementation not in ("sdpa", "flash_attention_2"):' -) +PATCHED_GUARD = 'if (attn_impl := (getattr(model.config, "_attn_implementation", None) or getattr(model.model.config, "_attn_implementation", None))) and attn_impl not in ("sdpa", "flash_attention_2"):' def patch_prepare_context_parallel_inputs() -> None: From 383f220cfd658804f4c508a0686c988861ecffbe Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Mon, 20 Oct 2025 08:53:49 -0400 Subject: [PATCH 093/115] build torch 2.9.0 base images (#3221) --- .github/workflows/base.yml | 14 ++++++++++++++ 1 file 
changed, 14 insertions(+) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index 7af6059c8..b2681bb5d 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -53,6 +53,13 @@ jobs: pytorch: 2.8.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.9.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-base" # - cuda: "128" # cuda_version: 12.8.1 # cudnn_version: "" @@ -129,6 +136,13 @@ jobs: pytorch: 2.8.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" + - cuda: "128" + cuda_version: 12.8.1 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.9.0 + torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" + dockerfile: "Dockerfile-uv-base" steps: - name: Checkout uses: actions/checkout@v4 From 613bcf90e58f3ab81d3827e7fc572319908db9fb Mon Sep 17 00:00:00 2001 From: Matthew Hambrecht <14303543+matthambrecht@users.noreply.github.com> Date: Wed, 22 Oct 2025 09:55:26 -0400 Subject: [PATCH 094/115] fix: enable_sleep_mode -> vllm_enable_sleep_mode (#3225) Co-authored-by: Matthew Hambrecht --- src/axolotl/core/trainers/grpo/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/axolotl/core/trainers/grpo/__init__.py b/src/axolotl/core/trainers/grpo/__init__.py index d1a6b7fd9..bd77489eb 100644 --- a/src/axolotl/core/trainers/grpo/__init__.py +++ b/src/axolotl/core/trainers/grpo/__init__.py @@ -52,7 +52,7 @@ class GRPOStrategy: if trl.vllm_mode: grpo_args_kwargs["vllm_mode"] = trl.vllm_mode if trl.vllm_mode == "colocate": - grpo_args_kwargs["enable_sleep_mode"] = trl.vllm_enable_sleep_mode # type: ignore[attr-defined] + grpo_args_kwargs["vllm_enable_sleep_mode"] = trl.vllm_enable_sleep_mode # type: ignore[attr-defined] grpo_args_kwargs["vllm_gpu_memory_utilization"] = ( vllm_cfg.gpu_memory_utilization ) From 3750fdcf79313f5c626d9508c72ea167f7da2985 Mon Sep 17 00:00:00 2001 From: Qingyang Wu Date: Wed, 22 Oct 2025 07:22:14 -0700 Subject: [PATCH 095/115] Fix trainer dataloader slow loading issue (#3219) * Fix trainer dataloader handling in src/axolotl/core/trainers/base.py * update comment to reflect torch version --------- Co-authored-by: Wing Lian --- setup.py | 2 +- src/axolotl/core/trainers/base.py | 23 ++++++++++++----------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/setup.py b/setup.py index b2eeb92d6..a93d8d49e 100644 --- a/setup.py +++ b/setup.py @@ -49,7 +49,7 @@ def parse_requirements(extras_require_map): try: torch_version = version("torch") except PackageNotFoundError: - torch_version = "2.6.0" # default to torch 2.6 + torch_version = "2.8.0" # default to torch 2.8.0 _install_requires.append(f"torch=={torch_version}") version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version) diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py index 11dfecb98..7d7420fb8 100644 --- a/src/axolotl/core/trainers/base.py +++ b/src/axolotl/core/trainers/base.py @@ -225,17 +225,6 @@ class AxolotlTrainer( data_collator = self.data_collator if is_training else self.eval_data_collator - if dataset.column_names and "length" in dataset.column_names: - dataset = dataset.remove_columns(["length"]) - if ( - dataset.column_names - and "position_ids" in dataset.column_names - and "attention_mask" in dataset.column_names - and self.args.sample_packing - and 
self.args.sample_packing_drop_attention_mask + ): + dataset = dataset.remove_columns(["attention_mask"]) + + dataloader = DataLoader(dataset, **dataloader_params) # Accelerator.free_memory() will destroy the references, so From 243620394a2576db507b1f6ab033c4183a18233e Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Thu, 23 Oct 2025 05:23:20 +0700 Subject: [PATCH 096/115] fix: force train split for json,csv,txt for test_datasets and misc doc changes (#3226) * fix: force train split for json,csv,txt for test_datasets * feat(doc): add info on mixing datasets for VLM * feat(doc): max memory * fix(doc): clarify lr groups * fix: add info on vision not being dropped * feat: add qwen3-vl to multimodal docs * fix: add moe blocks to arch list * feat(doc): improve mistral docs * chore: add helpful link [skip-e2e] * fix: add vram usage for mistral small * Update link in docs/faq.qmd Co-authored-by: salman --------- Co-authored-by: Wing Lian Co-authored-by: salman --- docs/faq.qmd | 8 +++ docs/lr_groups.qmd | 6 +++ docs/multimodal.qmd | 14 ++++- examples/magistral/think/README.md | 2 +- examples/magistral/vision/README.md | 2 +- examples/mistral/mistral-small/README.md | 51 +++++++++++++++++++ .../mistral-small-3.1-24B-lora.yml | 2 +- src/axolotl/common/architectures.py | 2 + src/axolotl/utils/data/shared.py | 5 ++ 9 files changed, 88 insertions(+), 4 deletions(-) create mode 100644 examples/mistral/mistral-small/README.md diff --git a/docs/faq.qmd b/docs/faq.qmd index ffc29d35d..92b432f2d 100644 --- a/docs/faq.qmd +++ b/docs/faq.qmd @@ -63,6 +63,14 @@ description: Frequently asked questions > A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717. +**Q: Can we mix text and text+image datasets for VLM training?** + +> A: Yes, you can for newer VLM architectures. The ones that would not work are LLaVA / Pixtral architectures. If you notice one not working, please let us know! + +**Q: Why is `memory/max_*` different from `nvidia-smi`?** + +> A: We use `torch` APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information. + ### Chat templates **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`** diff --git a/docs/lr_groups.qmd b/docs/lr_groups.qmd index 52059016c..ce5350722 100644 --- a/docs/lr_groups.qmd +++ b/docs/lr_groups.qmd @@ -27,3 +27,9 @@ learning_rate: 2e-5 In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning rate of 1e-5 to the 3rd layer's self attention `q_proj` module. + +::: {.callout-note} + +We currently only support varying `lr` for now. 
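To make that extension point concrete, here is a minimal sketch of how per-module `lr_groups` overrides typically become torch optimizer param groups. This is an editor's illustration rather than the project's actual implementation (see the mixin linked below), and the override structure is assumed from the YAML example above:

```python
import torch
from torch import nn

def build_param_groups(model: nn.Module, default_lr: float, lr_groups: list[dict]):
    """Assumed shape: lr_groups = [{"modules": ["self_attn.o_proj"], "lr": 1e-6}, ...]."""
    taken: set[str] = set()
    groups: list[dict] = []
    for group in lr_groups:
        # Parameters whose qualified name matches an override pattern get that lr.
        matched = [
            (name, param)
            for name, param in model.named_parameters()
            if name not in taken and any(pat in name for pat in group["modules"])
        ]
        taken.update(name for name, _ in matched)
        if matched:
            groups.append({"params": [p for _, p in matched], "lr": group["lr"]})
    # Everything unmatched falls back to the top-level learning_rate.
    remainder = [p for name, p in model.named_parameters() if name not in taken]
    groups.append({"params": remainder, "lr": default_lr})
    return torch.optim.AdamW(groups)
```

Supporting `weight_decay` would amount to carrying one more key per group.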
If you're interested in adding support for others (`weight_decay`), we welcome PRs. See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17 + +::: diff --git a/docs/multimodal.qmd b/docs/multimodal.qmd index 3a28b579a..1c4e28ea7 100644 --- a/docs/multimodal.qmd +++ b/docs/multimodal.qmd @@ -56,10 +56,14 @@ image_resize_algorithm: bilinear Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs. -::: {.callout-warning} +::: {.callout-tip} Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs. ::: +::: {.callout-note} +As of now, we do not truncate nor drop samples based on `sequence_len` as each arch has different ways to process non-text tokens. We are looking for help on this. +::: + ### Mllama {#sec-mllama} ```yaml @@ -168,6 +172,14 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct chat_template: qwen2_vl # same as qwen2-vl ``` +### Qwen3-VL {#sec-qwen3-vl} + +```yaml +base_model: Qwen/Qwen3-VL-4B-Instruct + +chat_template: qwen2_vl # same as qwen2-vl +``` + ### SmolVLM2 {#sec-smolvlm2} ::: {.callout-tip} diff --git a/examples/magistral/think/README.md b/examples/magistral/think/README.md index 29950f59e..a87579775 100644 --- a/examples/magistral/think/README.md +++ b/examples/magistral/think/README.md @@ -12,7 +12,7 @@ Before starting, ensure you have: Run the thinking model fine-tuning: ```bash -axolotl train magistral-small-think-qlora.yaml +axolotl train examples/magistral/think/magistral-small-think-qlora.yaml ``` This config uses about 19.1 GiB VRAM. diff --git a/examples/magistral/vision/README.md b/examples/magistral/vision/README.md index 932a3631e..fc614c850 100644 --- a/examples/magistral/vision/README.md +++ b/examples/magistral/vision/README.md @@ -21,7 +21,7 @@ Before starting, ensure you have: 3. Run the fine-tuning: ```bash - axolotl train magistral-small-vision-24B-qlora.yml + axolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml ``` This config uses about 17GiB VRAM. diff --git a/examples/mistral/mistral-small/README.md b/examples/mistral/mistral-small/README.md new file mode 100644 index 000000000..3c606a897 --- /dev/null +++ b/examples/mistral/mistral-small/README.md @@ -0,0 +1,51 @@ +# Mistral Small 3.1/3.2 Fine-tuning + +This guide covers fine-tuning [Mistral Small 3.1](mistralai/Mistral-Small-3.1-24B-Instruct-2503) and [Mistral Small 3.2](mistralai/Mistral-Small-3.2-24B-Instruct-2506) with vision capabilities using Axolotl. + +## Prerequisites + +Before starting, ensure you have: +- Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html)) + +## Getting Started + +1. Install the required vision lib: + ```bash + pip install 'mistral-common[opencv]==1.8.5' + ``` + +2. Download the example dataset image: + ```bash + wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg + ``` + +3. Run the fine-tuning: + ```bash + axolotl train examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml + ``` + +This config uses about 29.4 GiB VRAM. + +## Dataset Format + +The vision model requires the multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format). + +One exception is that passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now. 
+ +Example: +```json +{ + "messages": [ + {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]}, + {"role": "user", "content": [ + { "type": "text", "text": "What's in this image?"}, + {"type": "image", "path": "path/to/image.jpg" } + ]}, + {"role": "assistant", "content": [{ "type": "text", "text": "..." }]}, + ], +} +``` + +## Limitations + +- Sample Packing is not supported for multi-modality training currently. diff --git a/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml index ec197f333..d45d13ac6 100644 --- a/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml +++ b/examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml @@ -39,7 +39,7 @@ wandb_name: wandb_log_model: gradient_accumulation_steps: 1 -micro_batch_size: 1 +micro_batch_size: 2 num_epochs: 1 optimizer: adamw_bnb_8bit lr_scheduler: cosine diff --git a/src/axolotl/common/architectures.py b/src/axolotl/common/architectures.py index b754e56ba..c8a2f0836 100644 --- a/src/axolotl/common/architectures.py +++ b/src/axolotl/common/architectures.py @@ -12,7 +12,9 @@ MOE_ARCH_BLOCK = { "mixtral": "MixtralSparseMoeBlock", "qwen2_moe": "Qwen2MoeSparseMoeBlock", "qwen3_moe": "Qwen3MoeSparseMoeBlock", + "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock", "deepseek_v2": "DeepseekV2MoE", + "deepseek_v3": "DeepseekV3MoE", "gpt_oss": "GptOssDecoderLayer", "lfm2_moe": "Lfm2MoeSparseMoeBlock", } diff --git a/src/axolotl/utils/data/shared.py b/src/axolotl/utils/data/shared.py index c9a91b829..a8ed55ae2 100644 --- a/src/axolotl/utils/data/shared.py +++ b/src/axolotl/utils/data/shared.py @@ -239,6 +239,11 @@ def _load_from_local_path( return load_dataset(dataset_config.path, **load_dataset_kwargs) elif local_path.is_file(): dataset_type = get_dataset_type(dataset_config) + + # For single file datasets, HF always creates only a "train" split + if dataset_type in ("json", "csv", "text"): + load_dataset_kwargs["split"] = "train" + return load_dataset( dataset_type, data_files=dataset_config.path, From 4dc018992dccba6fa5e239d0453cbbd565e47e96 Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Thu, 23 Oct 2025 07:46:55 +0530 Subject: [PATCH 097/115] Feat/opentelemetry (#3215) --- examples/llama-3/opentelemetry-qlora.yml | 50 +++ setup.py | 6 + src/axolotl/core/builders/base.py | 12 +- src/axolotl/utils/__init__.py | 7 + src/axolotl/utils/callbacks/opentelemetry.py | 238 +++++++++++++ src/axolotl/utils/schemas/config.py | 2 + src/axolotl/utils/schemas/integrations.py | 24 ++ tests/test_opentelemetry_callback.py | 349 +++++++++++++++++++ 8 files changed, 687 insertions(+), 1 deletion(-) create mode 100644 examples/llama-3/opentelemetry-qlora.yml create mode 100644 src/axolotl/utils/callbacks/opentelemetry.py create mode 100644 tests/test_opentelemetry_callback.py diff --git a/examples/llama-3/opentelemetry-qlora.yml b/examples/llama-3/opentelemetry-qlora.yml new file mode 100644 index 000000000..d8ce7b1ec --- /dev/null +++ b/examples/llama-3/opentelemetry-qlora.yml @@ -0,0 +1,50 @@ +base_model: NousResearch/Llama-3.2-1B +model_type: AutoModelForCausalLM +tokenizer_type: AutoTokenizer + +load_in_4bit: true + +datasets: + - path: mhenrichsen/alpaca_2k_test + type: alpaca + +output_dir: ./outputs/opentelemetry-example + +adapter: qlora +sequence_len: 512 +sample_packing: false + +lora_r: 32 +lora_alpha: 16 +lora_dropout: 0.05 +lora_target_linear: true + +# OpenTelemetry Configuration +use_otel_metrics: true 
+otel_metrics_host: "localhost" +otel_metrics_port: 8000 + +# Disable WandB +use_wandb: false + +gradient_accumulation_steps: 4 +micro_batch_size: 2 +num_epochs: 1 +optimizer: paged_adamw_32bit +lr_scheduler: cosine +learning_rate: 0.0002 + +bf16: auto +tf32: false + +gradient_checkpointing: true +logging_steps: 1 +flash_attention: false + +warmup_ratio: 0.1 +evals_per_epoch: 2 +saves_per_epoch: 1 +weight_decay: 0.0 + +special_tokens: + pad_token: "<|end_of_text|>" diff --git a/setup.py b/setup.py index a93d8d49e..9e3de48b5 100644 --- a/setup.py +++ b/setup.py @@ -159,6 +159,12 @@ extras_require = { "llmcompressor==0.5.1", ], "fbgemm-gpu": ["fbgemm-gpu-genai>=1.2.0"], + "opentelemetry": [ + "opentelemetry-api", + "opentelemetry-sdk", + "opentelemetry-exporter-prometheus", + "prometheus-client", + ], } install_requires, dependency_links, extras_require_build = parse_requirements( extras_require diff --git a/src/axolotl/core/builders/base.py b/src/axolotl/core/builders/base.py index 8c86e335e..2c949f8e7 100644 --- a/src/axolotl/core/builders/base.py +++ b/src/axolotl/core/builders/base.py @@ -29,7 +29,11 @@ from transformers.trainer_pt_utils import AcceleratorConfig from axolotl.integrations.base import PluginManager from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr -from axolotl.utils import is_comet_available, is_mlflow_available +from axolotl.utils import ( + is_comet_available, + is_mlflow_available, + is_opentelemetry_available, +) from axolotl.utils.callbacks import ( GCCallback, SaveAxolotlConfigtoWandBCallback, @@ -134,6 +138,12 @@ class TrainerBuilderBase(abc.ABC): callbacks.append( SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path) ) + if self.cfg.use_otel_metrics and is_opentelemetry_available(): + from axolotl.utils.callbacks.opentelemetry import ( + OpenTelemetryMetricsCallback, + ) + + callbacks.append(OpenTelemetryMetricsCallback(self.cfg)) if self.cfg.save_first_step: callbacks.append(SaveModelOnFirstStepCallback()) diff --git a/src/axolotl/utils/__init__.py b/src/axolotl/utils/__init__.py index 7256a5700..72f8173f3 100644 --- a/src/axolotl/utils/__init__.py +++ b/src/axolotl/utils/__init__.py @@ -17,6 +17,13 @@ def is_comet_available(): return importlib.util.find_spec("comet_ml") is not None +def is_opentelemetry_available(): + return ( + importlib.util.find_spec("opentelemetry") is not None + and importlib.util.find_spec("prometheus_client") is not None + ) + + def get_pytorch_version() -> tuple[int, int, int]: """ Get Pytorch version as a tuple of (major, minor, patch). diff --git a/src/axolotl/utils/callbacks/opentelemetry.py b/src/axolotl/utils/callbacks/opentelemetry.py new file mode 100644 index 000000000..3f7e56b78 --- /dev/null +++ b/src/axolotl/utils/callbacks/opentelemetry.py @@ -0,0 +1,238 @@ +"""OpenTelemetry metrics callback for Axolotl training""" + +import threading +from typing import Dict, Optional + +from transformers import ( + TrainerCallback, + TrainerControl, + TrainerState, + TrainingArguments, +) + +from axolotl.utils.logging import get_logger + +LOG = get_logger(__name__) + +try: + from opentelemetry import metrics + from opentelemetry.exporter.prometheus import PrometheusMetricReader + from opentelemetry.metrics import set_meter_provider + from opentelemetry.sdk.metrics import MeterProvider as SDKMeterProvider + from prometheus_client import start_http_server + + OPENTELEMETRY_AVAILABLE = True +except ImportError: + LOG.warning("OpenTelemetry not available. 
pip install axolotl[opentelemetry]") + OPENTELEMETRY_AVAILABLE = False + + +class OpenTelemetryMetricsCallback(TrainerCallback): + """ + TrainerCallback that exports training metrics to OpenTelemetry/Prometheus. + + This callback automatically tracks key training metrics including: + - Training loss + - Evaluation loss + - Learning rate + - Epoch progress + - Global step count + - Gradient norm + + Metrics are exposed via HTTP endpoint for Prometheus scraping. + """ + + def __init__(self, cfg): + if not OPENTELEMETRY_AVAILABLE: + LOG.warning("OpenTelemetry not available, metrics will not be collected") + self.metrics_enabled = False + return + + self.cfg = cfg + self.metrics_host = getattr(cfg, "otel_metrics_host", "localhost") + self.metrics_port = getattr(cfg, "otel_metrics_port", 8000) + self.metrics_enabled = True + self.server_started = False + self.metrics_lock = threading.Lock() + + try: + # Create Prometheus metrics reader + prometheus_reader = PrometheusMetricReader() + + # Create meter provider with Prometheus exporter + provider = SDKMeterProvider(metric_readers=[prometheus_reader]) + set_meter_provider(provider) + + # Get meter for creating metrics + self.meter = metrics.get_meter("axolotl.training") + + # Create metrics + self._create_metrics() + + except Exception as e: + LOG.warning(f"Failed to initialize OpenTelemetry metrics: {e}") + self.metrics_enabled = False + + def _create_metrics(self): + """Create all metrics that will be tracked""" + self.train_loss_gauge = self.meter.create_gauge( + name="axolotl_train_loss", + description="Current training loss", + unit="1", + ) + + self.eval_loss_gauge = self.meter.create_gauge( + name="axolotl_eval_loss", + description="Current evaluation loss", + unit="1", + ) + + self.learning_rate_gauge = self.meter.create_gauge( + name="axolotl_learning_rate", + description="Current learning rate", + unit="1", + ) + + self.epoch_gauge = self.meter.create_gauge( + name="axolotl_epoch", + description="Current training epoch", + unit="1", + ) + + self.global_step_counter = self.meter.create_counter( + name="axolotl_global_steps", + description="Total training steps completed", + unit="1", + ) + + self.grad_norm_gauge = self.meter.create_gauge( + name="axolotl_gradient_norm", + description="Gradient norm", + unit="1", + ) + + self.memory_usage_gauge = self.meter.create_gauge( + name="axolotl_memory_usage", + description="Current memory usage in MB", + unit="MB", + ) + + def _start_metrics_server(self): + """Start the HTTP server for metrics exposure""" + if self.server_started: + return + + try: + start_http_server(self.metrics_port, addr=self.metrics_host) + self.server_started = True + LOG.info( + f"OpenTelemetry metrics server started on http://{self.metrics_host}:{self.metrics_port}/metrics" + ) + + except Exception as e: + LOG.error(f"Failed to start OpenTelemetry metrics server: {e}") + + def on_train_begin( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """Called at the beginning of training""" + if not self.metrics_enabled: + return + + self._start_metrics_server() + LOG.info("OpenTelemetry metrics collection started") + + def on_log( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + logs: Optional[Dict[str, float]] = None, + **kwargs, + ): + """Called when logging occurs""" + if not self.metrics_enabled or not logs: + return + + if "loss" in logs: + self.train_loss_gauge.set(logs["loss"]) + + if "eval_loss" in logs: + 
self.eval_loss_gauge.set(logs["eval_loss"]) + + if "learning_rate" in logs: + self.learning_rate_gauge.set(logs["learning_rate"]) + + if "epoch" in logs: + self.epoch_gauge.set(logs["epoch"]) + + if "grad_norm" in logs: + self.grad_norm_gauge.set(logs["grad_norm"]) + if "memory_usage" in logs: + self.memory_usage_gauge.set(logs["memory_usage"]) + + def on_step_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """Called at the end of each training step""" + if not self.metrics_enabled: + return + + # Update step counter and epoch + self.global_step_counter.add(1) + if state.epoch is not None: + self.epoch_gauge.set(state.epoch) + + def on_evaluate( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + metrics: Optional[Dict[str, float]] = None, + **kwargs, + ): + """Called after evaluation""" + if not self.metrics_enabled or not metrics: + return + + if "eval_loss" in metrics: + self.eval_loss_gauge.set(metrics["eval_loss"]) + + # Record any other eval metrics as gauges + for key, value in metrics.items(): + if key.startswith("eval_") and isinstance(value, (int, float)): + # Create gauge for this metric if it doesn't exist + gauge_name = f"axolotl_{key}" + try: + gauge = self.meter.create_gauge( + name=gauge_name, + description=f"Evaluation metric: {key}", + unit="1", + ) + gauge.set(value) + except Exception as e: + LOG.warning(f"Failed to create/update metric {gauge_name}: {e}") + + def on_train_end( + self, + args: TrainingArguments, + state: TrainerState, + control: TrainerControl, + **kwargs, + ): + """Called at the end of training""" + if not self.metrics_enabled: + return + + LOG.info("Training completed. OpenTelemetry metrics collection finished.") + LOG.info( + f"Metrics are still available at http://{self.metrics_host}:{self.metrics_port}/metrics" + ) diff --git a/src/axolotl/utils/schemas/config.py b/src/axolotl/utils/schemas/config.py index 4d1d0aab2..86b3aa17b 100644 --- a/src/axolotl/utils/schemas/config.py +++ b/src/axolotl/utils/schemas/config.py @@ -30,6 +30,7 @@ from axolotl.utils.schemas.integrations import ( GradioConfig, LISAConfig, MLFlowConfig, + OpenTelemetryConfig, RayConfig, WandbConfig, ) @@ -60,6 +61,7 @@ class AxolotlInputConfig( WandbConfig, MLFlowConfig, CometConfig, + OpenTelemetryConfig, LISAConfig, GradioConfig, RayConfig, diff --git a/src/axolotl/utils/schemas/integrations.py b/src/axolotl/utils/schemas/integrations.py index 7332c7d39..97d675569 100644 --- a/src/axolotl/utils/schemas/integrations.py +++ b/src/axolotl/utils/schemas/integrations.py @@ -176,3 +176,27 @@ class RayConfig(BaseModel): "help": "The resources per worker for Ray training. Default is to use 1 GPU per worker." 
}, ) + + +class OpenTelemetryConfig(BaseModel): + """OpenTelemetry configuration subset""" + + use_otel_metrics: bool | None = Field( + default=False, + json_schema_extra={ + "description": "Enable OpenTelemetry metrics collection and Prometheus export" + }, + ) + otel_metrics_host: str | None = Field( + default="localhost", + json_schema_extra={ + "title": "OpenTelemetry Metrics Host", + "description": "Host to bind the OpenTelemetry metrics server to", + }, + ) + otel_metrics_port: int | None = Field( + default=8000, + json_schema_extra={ + "description": "Port for the Prometheus metrics HTTP server" + }, + ) diff --git a/tests/test_opentelemetry_callback.py b/tests/test_opentelemetry_callback.py new file mode 100644 index 000000000..294ff6585 --- /dev/null +++ b/tests/test_opentelemetry_callback.py @@ -0,0 +1,349 @@ +"""Tests for OpenTelemetry metrics callback functionality.""" + +import time + +import pytest + +from axolotl.utils.dict import DictDefault + + +@pytest.fixture +def mock_otel_config(): + """Mock configuration for OpenTelemetry callback.""" + return DictDefault( + { + "use_otel_metrics": True, + "otel_metrics_host": "localhost", + "otel_metrics_port": 8003, # Use unique port for tests + } + ) + + +@pytest.fixture +def mock_trainer_state(): + """Mock trainer state for callback testing.""" + from transformers import TrainerState + + state = TrainerState() + state.epoch = 1.0 + state.global_step = 100 + return state + + +@pytest.fixture +def mock_training_args(): + """Mock training arguments for callback testing.""" + from transformers import TrainingArguments + + return TrainingArguments(output_dir="/tmp/test") + + +@pytest.fixture +def mock_trainer_control(): + """Mock trainer control for callback testing.""" + from transformers.trainer_callback import TrainerControl + + return TrainerControl() + + +class TestOpenTelemetryConfig: + """Test OpenTelemetry configuration schema.""" + + def test_config_schema_valid(self): + """Test OpenTelemetry configuration schema validation.""" + from axolotl.utils.schemas.integrations import OpenTelemetryConfig + + # Test valid config + valid_config = { + "use_otel_metrics": True, + "otel_metrics_host": "localhost", + "otel_metrics_port": 8000, + } + + otel_config = OpenTelemetryConfig(**valid_config) + assert otel_config.use_otel_metrics is True + assert otel_config.otel_metrics_host == "localhost" + assert otel_config.otel_metrics_port == 8000 + + def test_config_defaults(self): + """Test OpenTelemetry configuration default values.""" + from axolotl.utils.schemas.integrations import OpenTelemetryConfig + + # Test minimal config with defaults + minimal_config = {"use_otel_metrics": True} + + otel_config = OpenTelemetryConfig(**minimal_config) + assert otel_config.use_otel_metrics is True + assert otel_config.otel_metrics_host == "localhost" # default + assert otel_config.otel_metrics_port == 8000 # default + + def test_config_disabled_by_default(self): + """Test that OpenTelemetry is disabled by default.""" + from axolotl.utils.schemas.integrations import OpenTelemetryConfig + + # Test default config + default_config = OpenTelemetryConfig() + assert default_config.use_otel_metrics is False + + +class TestOpenTelemetryCallback: + """Test OpenTelemetry callback functionality.""" + + def test_callback_import(self): + """Test that OpenTelemetry callback can be imported.""" + from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback + + assert OpenTelemetryMetricsCallback is not None + + def 
test_callback_graceful_fallback(self, mock_otel_config): + """Test callback gracefully handles missing dependencies.""" + from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback + + # This should not raise an exception even if dependencies are missing + callback = OpenTelemetryMetricsCallback(mock_otel_config) + + # Callback should exist but may have metrics disabled + assert callback is not None + assert hasattr(callback, "metrics_enabled") + + def test_callback_initialization_enabled(self, mock_otel_config): + """Test callback initialization when OpenTelemetry is available.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + + if OPENTELEMETRY_AVAILABLE: + assert callback.metrics_enabled is True + assert callback.cfg == mock_otel_config + assert callback.metrics_host == "localhost" + assert callback.metrics_port == 8003 + else: + assert callback.metrics_enabled is False + + def test_metrics_server_lifecycle( + self, + mock_otel_config, + mock_trainer_state, + mock_training_args, + mock_trainer_control, + ): + """Test metrics server starts and stops correctly.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + + # Start server + callback.on_train_begin( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + assert callback.server_started is True + + # End training + callback.on_train_end( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + def test_metrics_recording( + self, + mock_otel_config, + mock_trainer_state, + mock_training_args, + mock_trainer_control, + ): + """Test that metrics are recorded during training.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + callback.on_train_begin( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + # Test logging metrics + test_logs = { + "loss": 0.5, + "learning_rate": 1e-4, + "grad_norm": 0.8, + } + + # This should not raise an exception + callback.on_log( + mock_training_args, mock_trainer_state, mock_trainer_control, logs=test_logs + ) + assert callback.metrics_enabled is True + + def test_evaluation_metrics( + self, + mock_otel_config, + mock_trainer_state, + mock_training_args, + mock_trainer_control, + ): + """Test evaluation metrics recording.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + callback.on_train_begin( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + # Test evaluation metrics + eval_logs = { + "eval_loss": 0.3, + "eval_accuracy": 0.95, + } + + # This should not raise an exception + callback.on_evaluate( + mock_training_args, mock_trainer_state, mock_trainer_control, eval_logs + ) + assert callback.metrics_enabled is True + + def test_thread_safety(self, mock_otel_config): + """Test that callback has thread safety mechanisms.""" 
+ from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + assert hasattr(callback, "metrics_lock") + # Check it's a lock-like object + assert hasattr(callback.metrics_lock, "__enter__") + assert hasattr(callback.metrics_lock, "__exit__") + + +class TestOpenTelemetryIntegration: + """Integration tests for OpenTelemetry.""" + + def test_availability_check(self): + """Test availability check function.""" + from axolotl.utils import is_opentelemetry_available + + result = is_opentelemetry_available() + assert isinstance(result, bool) + + def test_prometheus_endpoint_basic( + self, + mock_otel_config, + mock_trainer_state, + mock_training_args, + mock_trainer_control, + ): + """Test basic Prometheus endpoint functionality.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + try: + import requests + except ImportError: + pytest.skip("requests library not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + callback.on_train_begin( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + if not callback.server_started: + pytest.skip("Metrics server failed to start") + + # Give server time to start + time.sleep(1) + + # Try to access metrics endpoint + try: + response = requests.get( + f"http://{callback.metrics_host}:{callback.metrics_port}/metrics", + timeout=2, + ) + assert response.status_code == 200 + # Check for Prometheus format + assert "# TYPE" in response.text or "# HELP" in response.text + except requests.exceptions.RequestException: + pytest.skip( + "Could not connect to metrics endpoint - this is expected in some environments" + ) + + +class TestOpenTelemetryCallbackMethods: + """Test specific callback methods.""" + + def test_step_end_callback( + self, + mock_otel_config, + mock_trainer_state, + mock_training_args, + mock_trainer_control, + ): + """Test step end callback method.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + callback.on_train_begin( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + # Should not raise an exception + callback.on_step_end( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + def test_epoch_end_callback( + self, + mock_otel_config, + mock_trainer_state, + mock_training_args, + mock_trainer_control, + ): + """Test epoch end callback method.""" + from axolotl.utils.callbacks.opentelemetry import ( + OPENTELEMETRY_AVAILABLE, + OpenTelemetryMetricsCallback, + ) + + if not OPENTELEMETRY_AVAILABLE: + pytest.skip("OpenTelemetry dependencies not available") + + callback = OpenTelemetryMetricsCallback(mock_otel_config) + callback.on_train_begin( + mock_training_args, mock_trainer_state, mock_trainer_control + ) + + # Should not raise an exception + callback.on_epoch_end( + mock_training_args, mock_trainer_state, mock_trainer_control + ) From bb33fda44d8cc889230698539b8df5a7ba114b67 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 22 Oct 2025 21:24:52 -0700 Subject: [PATCH 098/115] 
install flash attention in 2.9.0 base images (#3224) --- docker/Dockerfile-base | 6 ++++-- docker/Dockerfile-uv-base | 6 ++++++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index 87918cc41..cc209f304 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -47,6 +47,8 @@ RUN git lfs install --skip-repo && \ pip3 install -U --no-cache-dir pydantic==1.10.10 && \ pip3 cache purge -RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \ - FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \ +RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \ + wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \ + pip3 install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \ + rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \ fi diff --git a/docker/Dockerfile-uv-base b/docker/Dockerfile-uv-base index eaa49b9e9..2ca272c6e 100644 --- a/docker/Dockerfile-uv-base +++ b/docker/Dockerfile-uv-base @@ -34,3 +34,9 @@ RUN uv pip install packaging setuptools wheel psutil \ && uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \ && uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \ && uv pip install awscli pydantic + +RUN if [ "$PYTORCH_VERSION" = "2.9.0" ] && [ "$CUDA" = "128" ] ; then \ + wget https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/v0.4.17/flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \ + uv pip install --no-cache-dir flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \ + rm flash_attn-2.8.3+cu128torch2.9-cp311-cp311-linux_x86_64.whl; \ + fi From 9d4d39e939b3e44298f0c5e1f1b05c7b515fc7a6 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Mon, 27 Oct 2025 03:42:01 -0400 Subject: [PATCH 099/115] Diffusion trainer fix: shift logits to align with input tokens (#3191) * shift logits for diffusion generate * delete unused * diffusion trainer: token shift --- src/axolotl/integrations/diffusion/generation.py | 4 ++-- src/axolotl/integrations/diffusion/trainer.py | 4 ++-- src/axolotl/integrations/diffusion/utils.py | 7 +++++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/src/axolotl/integrations/diffusion/generation.py b/src/axolotl/integrations/diffusion/generation.py index 49e3cdfae..ec517fd23 100644 --- a/src/axolotl/integrations/diffusion/generation.py +++ b/src/axolotl/integrations/diffusion/generation.py @@ -7,7 +7,7 @@ import torch from axolotl.utils.logging import get_logger -from .utils import create_bidirectional_attention_mask +from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions LOG = get_logger(__name__) @@ -360,7 +360,7 @@ def _diffusion_step( # Forward pass outputs = model(input_ids=sequence, attention_mask=attention_mask) - logits = outputs.logits + logits = shift_logits_to_input_positions(outputs.logits) # Only sample at currently masked positions if current_mask.any(): diff --git a/src/axolotl/integrations/diffusion/trainer.py b/src/axolotl/integrations/diffusion/trainer.py index 42b2468f4..dfaef2a48 100644 --- a/src/axolotl/integrations/diffusion/trainer.py +++ b/src/axolotl/integrations/diffusion/trainer.py @@ -11,7 +11,7 @@ from axolotl.utils.dict import DictDefault from axolotl.utils.logging import get_logger 
from .callbacks import DiffusionGenerationCallback -from .utils import create_bidirectional_attention_mask +from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions LOG = get_logger(__name__) @@ -207,7 +207,7 @@ class DiffusionTrainer(AxolotlTrainer): input_ids=noisy_batch.long(), attention_mask=bidirectional_mask, ) - logits = outputs.logits + logits = shift_logits_to_input_positions(outputs.logits) if masked_indices.sum() > 0: valid_indices = torch.where(masked_indices) diff --git a/src/axolotl/integrations/diffusion/utils.py b/src/axolotl/integrations/diffusion/utils.py index 47abf6fec..b6f71c07b 100644 --- a/src/axolotl/integrations/diffusion/utils.py +++ b/src/axolotl/integrations/diffusion/utils.py @@ -157,3 +157,10 @@ def create_bidirectional_attention_mask( # Add head dimension: [batch_size, 1, seq_len, seq_len] return bidirectional_mask.unsqueeze(1) + + +def shift_logits_to_input_positions(logits: torch.Tensor) -> torch.Tensor: + """Align next-token logits with their input token positions for diffusion.""" + if logits.size(1) <= 1: + return logits + return torch.cat([logits[:, :1], logits[:, :-1]], dim=1) From 98333e639a35bd36a108786a6daaa42f03488aca Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 29 Oct 2025 18:02:16 -0400 Subject: [PATCH 100/115] upgrade trl to 0.24.0 and liger to 0.6.3 (#3230) * upgrade trl to 0.24.0 * fix reward collator init * use newer DataCollatorForPreference instead * DataCollatorForPreference doesn't use padding kwarg * fix input id labels * fix fbgemm-gpu version for pytorch versions * tweak pinned deps * transformers doesn't support hub 1.0 yet * upgrade liger dep to 0.6.3 * set TORCH_CUDA_ARCH_LIST correctly --- cicd/Dockerfile.jinja | 2 +- requirements.txt | 12 ++++++------ setup.py | 8 ++++++-- src/axolotl/core/builders/causal.py | 9 ++++++--- .../prompt_strategies/bradley_terry/chat_template.py | 4 ++-- 5 files changed, 21 insertions(+), 14 deletions(-) diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index 6a1ddb66d..c3a613ecc 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -1,6 +1,6 @@ FROM axolotlai/axolotl-base:{{ BASE_TAG }} -ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX" +ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}" ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}" ENV CUDA="{{ CUDA }}" diff --git a/requirements.txt b/requirements.txt index e1f1b10a5..5621d94b1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,27 +5,27 @@ bitsandbytes==0.47.0 triton>=3.0.0 mamba-ssm==1.2.0.post1 xformers>=0.0.23.post1 -liger-kernel==0.6.1 +liger-kernel==0.6.3 # END section packaging==23.2 -huggingface_hub>=0.33.0 +huggingface_hub>=0.36.0 peft>=0.17.1 tokenizers>=0.21.1 transformers==4.57.1 accelerate==1.10.1 datasets==4.0.0 deepspeed>=0.17.0 -trl==0.23.1 -hf_xet==1.1.5 -kernels==0.9.0 +trl==0.24.0 +hf_xet==1.2.0 +kernels>=0.9.0 trackio optimum==1.16.2 hf_transfer sentencepiece -gradio==5.41.1 +gradio==5.49.1 modal==1.0.2 pydantic==2.10.6 diff --git a/setup.py b/setup.py index 9e3de48b5..2845bb151 100644 --- a/setup.py +++ b/setup.py @@ -62,8 +62,12 @@ def parse_requirements(extras_require_map): else: raise ValueError("Invalid version format") - if (major, minor) >= (2, 8): - pass + if (major, minor) >= (2, 9): + extras_require_map.pop("fbgemm-gpu") + extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"] + elif (major, minor) >= (2, 8): + extras_require_map.pop("fbgemm-gpu") + extras_require_map["fbgemm-gpu"] = 
["fbgemm-gpu-genai==1.3.0"] elif (major, minor) >= (2, 7): _install_requires.pop(_install_requires.index(xformers_version)) if patch == 0: diff --git a/src/axolotl/core/builders/causal.py b/src/axolotl/core/builders/causal.py index 820304230..7a06431dc 100644 --- a/src/axolotl/core/builders/causal.py +++ b/src/axolotl/core/builders/causal.py @@ -12,7 +12,7 @@ from transformers import ( EarlyStoppingCallback, Trainer, ) -from trl.trainer.utils import RewardDataCollatorWithPadding +from trl.trainer.reward_trainer import DataCollatorForPreference from axolotl.core.builders.base import TrainerBuilderBase from axolotl.core.trainers import ( @@ -453,7 +453,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase): BatchSamplerDataCollatorForSeq2Seq, DataCollatorForSeq2Seq, DataCollatorWithFlattening, - RewardDataCollatorWithPadding, + DataCollatorForPreference, ] ] collator_args = [self.tokenizer] @@ -470,7 +470,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase): if kwargs and isinstance(kwargs, dict): kwargs.update(collator_cls_and_kwargs[1]) elif self.cfg.reward_model: - collator = RewardDataCollatorWithPadding + collator = DataCollatorForPreference + tokenizer = collator_args.pop(0) + kwargs["pad_token_id"] = tokenizer.pad_token_id + kwargs.pop("padding") elif use_batch_sampler_collator: # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention, # supported multipack models, or non-flash-attention llama diff --git a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py index fd0d76f51..03336b3ef 100644 --- a/src/axolotl/prompt_strategies/bradley_terry/chat_template.py +++ b/src/axolotl/prompt_strategies/bradley_terry/chat_template.py @@ -71,10 +71,10 @@ class BTChatTemplateStrategy(ChatTemplateStrategy): ] return { - "input_ids_chosen": chosen_tokenized["input_ids"], + "chosen_input_ids": chosen_tokenized["input_ids"], "attention_mask_chosen": chosen_tokenized["attention_mask"], "labels_chosen": 1.0, - "input_ids_rejected": rejected_tokenized["input_ids"], + "rejected_input_ids": rejected_tokenized["input_ids"], "attention_mask_rejected": rejected_tokenized["attention_mask"], "labels_rejected": 0.0, } From a4b921135b56abad32f962009686b52089b273c9 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 29 Oct 2025 18:07:29 -0400 Subject: [PATCH 101/115] build cuda 13.0.0 base image with 2.9.0 (#3229) * build cuda 13.0.0 base image with 2.9.0 * upgrade causal-conv1d * 1.5.4 not in pypi yet * pin to 1.3.0 * use github release instead of pypi * split the logic for incompatible packages * fix bash in dockerfile --- .github/workflows/base.yml | 14 ++++++++++++++ docker/Dockerfile-base | 8 ++++++-- setup.py | 2 +- 3 files changed, 21 insertions(+), 3 deletions(-) diff --git a/.github/workflows/base.yml b/.github/workflows/base.yml index b2681bb5d..87d6772dd 100644 --- a/.github/workflows/base.yml +++ b/.github/workflows/base.yml @@ -60,6 +60,13 @@ jobs: pytorch: 2.9.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-base" + - cuda: "130" + cuda_version: 13.0.0 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.9.0 + torch_cuda_arch_list: "9.0+PTX" + dockerfile: "Dockerfile-base" # - cuda: "128" # cuda_version: 12.8.1 # cudnn_version: "" @@ -143,6 +150,13 @@ jobs: pytorch: 2.9.0 torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX" dockerfile: "Dockerfile-uv-base" + - cuda: "130" + cuda_version: 13.0.0 + cudnn_version: "" + python_version: "3.11" + pytorch: 2.9.0 + torch_cuda_arch_list: 
"9.0+PTX" + dockerfile: "Dockerfile-uv-base" steps: - name: Checkout uses: actions/checkout@v4 diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index cc209f304..a08b5cd4f 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -37,10 +37,14 @@ WORKDIR /workspace RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ - CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \ - python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \ python3 -m pip cache purge +RUN if [ "$CUDA" != "130" ] ; then \ + CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \ + python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \ + python3 -m pip cache purge; \ + fi + RUN git lfs install --skip-repo && \ pip3 install awscli && \ # The base image ships with `pydantic==1.8.2` which is not working diff --git a/setup.py b/setup.py index 2845bb151..b16377e92 100644 --- a/setup.py +++ b/setup.py @@ -162,7 +162,7 @@ extras_require = { "llmcompressor": [ "llmcompressor==0.5.1", ], - "fbgemm-gpu": ["fbgemm-gpu-genai>=1.2.0"], + "fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"], "opentelemetry": [ "opentelemetry-api", "opentelemetry-sdk", From 0f7c886b7b28a0a90a8510c58f160f6ee70e9851 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Wed, 29 Oct 2025 18:09:46 -0400 Subject: [PATCH 102/115] chore: update pre-commit hooks (#3222) [skip ci] Co-authored-by: djsaunde <1245942+djsaunde@users.noreply.github.com> --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 0e455f52c..015fb5e6e 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.0 + rev: v0.14.2 hooks: - id: ruff args: [--fix] From 4b1b4fa6d86246cfef1e8b693c011bab2d7db7dd Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 30 Oct 2025 10:03:24 -0400 Subject: [PATCH 103/115] upgrade numpy (#3236) * upgrade numpy to 2.3.4 * bump contribs for numpy * fix vllm versions * bump numba * make sure psutil is installed * add psutil to cicd dockerfile jinja * lower dep versions of numba + numpy for vllm * bump datasets version * resolve pydantic conflict too --- .github/workflows/tests.yml | 2 +- cicd/Dockerfile.jinja | 2 +- docker/Dockerfile-base | 2 +- requirements.txt | 12 ++++++------ setup.py | 4 +++- 5 files changed, 12 insertions(+), 10 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 8f368b517..90bf3234a 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -152,7 +152,7 @@ jobs: - name: upgrade pip run: | pip3 install --upgrade pip - pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel + pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel psutil - name: Install PyTorch run: | diff --git a/cicd/Dockerfile.jinja 
b/cicd/Dockerfile.jinja index c3a613ecc..81ed5453e 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \ sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \ fi -RUN pip install packaging==23.2 setuptools==75.8.0 +RUN pip install packaging==23.2 setuptools==75.8.0 psutil RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \ pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \ else \ diff --git a/docker/Dockerfile-base b/docker/Dockerfile-base index a08b5cd4f..25eae4fde 100644 --- a/docker/Dockerfile-base +++ b/docker/Dockerfile-base @@ -35,7 +35,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}" WORKDIR /workspace -RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \ +RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel psutil && \ python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \ python3 -m pip cache purge diff --git a/requirements.txt b/requirements.txt index 5621d94b1..4d27ee148 100644 --- a/requirements.txt +++ b/requirements.txt @@ -15,7 +15,7 @@ peft>=0.17.1 tokenizers>=0.21.1 transformers==4.57.1 accelerate==1.10.1 -datasets==4.0.0 +datasets==4.3.0 deepspeed>=0.17.0 trl==0.24.0 hf_xet==1.2.0 @@ -28,7 +28,7 @@ sentencepiece gradio==5.49.1 modal==1.0.2 -pydantic==2.10.6 +pydantic>=2.10.6 addict fire PyYAML>=6.0 @@ -36,8 +36,8 @@ requests wandb einops colorama -numba -numpy>=1.24.4,<=2.0.1 +numba>=0.61.2 +numpy>=2.2.6 # qlora things evaluate==0.4.1 @@ -50,7 +50,7 @@ python-dotenv==1.0.1 # remote filesystems s3fs>=2024.5.0 -gcsfs>=2024.5.0 +gcsfs>=2025.3.0 adlfs>=2024.5.0 ocifs==1.3.2 @@ -66,7 +66,7 @@ antlr4-python3-runtime==4.13.2 torchao==0.13.0 schedulefree==1.4.1 -axolotl-contribs-lgpl==0.0.6 +axolotl-contribs-lgpl==0.0.7 axolotl-contribs-mit==0.0.5 mistral-common==1.8.5 diff --git a/setup.py b/setup.py index b16377e92..b046a2fdc 100644 --- a/setup.py +++ b/setup.py @@ -65,9 +65,11 @@ def parse_requirements(extras_require_map): if (major, minor) >= (2, 9): extras_require_map.pop("fbgemm-gpu") extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"] + extras_require_map["vllm"] = ["vllm==0.11.1"] elif (major, minor) >= (2, 8): extras_require_map.pop("fbgemm-gpu") extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"] + extras_require_map["vllm"] = ["vllm==0.11.0"] elif (major, minor) >= (2, 7): _install_requires.pop(_install_requires.index(xformers_version)) if patch == 0: @@ -76,7 +78,7 @@ def parse_requirements(extras_require_map): extras_require_map.pop("vllm") else: _install_requires.append("xformers==0.0.31") - extras_require_map["vllm"] = ["vllm>=0.10.0"] + extras_require_map["vllm"] = ["vllm==0.10.1"] elif (major, minor) >= (2, 6): _install_requires.pop(_install_requires.index(xformers_version)) _install_requires.append("xformers==0.0.29.post3") From 633afffacb21a9d34d0e2d5af1ed12e1802611ca Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 30 Oct 2025 18:50:26 -0400 Subject: [PATCH 104/115] add torch 2.9.0 to ci (#3223) --- .github/workflows/multi-gpu-e2e.yml | 7 +++++++ .github/workflows/tests.yml | 32 +++++++++++++++++------------ setup.py | 1 + 3 files changed, 27 insertions(+), 13 deletions(-) diff --git a/.github/workflows/multi-gpu-e2e.yml 
b/.github/workflows/multi-gpu-e2e.yml index 6a92de352..1682beb31 100644 --- a/.github/workflows/multi-gpu-e2e.yml +++ b/.github/workflows/multi-gpu-e2e.yml @@ -40,6 +40,13 @@ jobs: axolotl_extras: fbgemm-gpu num_gpus: 2 nightly_build: "true" + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.9.0 + axolotl_extras: fbgemm-gpu + num_gpus: 2 + nightly_build: "true" runs-on: [self-hosted, modal] timeout-minutes: 120 steps: diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 90bf3234a..7ad9d1ab4 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -55,7 +55,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.7.1", "2.8.0"] + pytorch_version: ["2.7.1", "2.8.0", "2.9.0"] timeout-minutes: 20 steps: @@ -130,7 +130,7 @@ jobs: fail-fast: false matrix: python_version: ["3.11"] - pytorch_version: ["2.7.1", "2.8.0"] + pytorch_version: ["2.7.1", "2.8.0", "2.9.0"] timeout-minutes: 20 steps: @@ -231,16 +231,10 @@ jobs: fail-fast: false matrix: include: - - cuda: 126 - cuda_version: 12.6.3 + - cuda: 128 + cuda_version: 12.8.1 python_version: "3.11" - pytorch: 2.7.1 - num_gpus: 1 - axolotl_extras: - - cuda: 126 - cuda_version: 12.6.3 - python_version: "3.11" - pytorch: 2.7.1 + pytorch: 2.8.0 num_gpus: 1 axolotl_extras: dockerfile: "Dockerfile-uv.jinja" @@ -286,12 +280,18 @@ jobs: fail-fast: false matrix: include: - - cuda: 128 - cuda_version: 12.8.1 + - cuda: 126 + cuda_version: 12.6.3 python_version: "3.11" pytorch: 2.7.1 num_gpus: 1 axolotl_extras: +# - cuda: 128 +# cuda_version: 12.8.1 +# python_version: "3.11" +# pytorch: 2.7.1 +# num_gpus: 1 +# axolotl_extras: - cuda: 128 cuda_version: 12.8.1 python_version: "3.11" @@ -299,6 +299,12 @@ jobs: num_gpus: 1 gpu_type: "B200" axolotl_extras: fbgemm-gpu + - cuda: 128 + cuda_version: 12.8.1 + python_version: "3.11" + pytorch: 2.9.0 + num_gpus: 1 + axolotl_extras: steps: - name: Checkout uses: actions/checkout@v4 diff --git a/setup.py b/setup.py index b046a2fdc..9c1161642 100644 --- a/setup.py +++ b/setup.py @@ -66,6 +66,7 @@ def parse_requirements(extras_require_map): extras_require_map.pop("fbgemm-gpu") extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"] extras_require_map["vllm"] = ["vllm==0.11.1"] + _install_requires.pop(_install_requires.index(xformers_version)) elif (major, minor) >= (2, 8): extras_require_map.pop("fbgemm-gpu") extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"] From ed58fa8a75e4eb8c5baf62e51e30fdc24c08a778 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:55:40 +0000 Subject: [PATCH 105/115] chore: update pre-commit hooks (#3244) --- .pre-commit-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 015fb5e6e..86d8927d2 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -11,7 +11,7 @@ repos: - id: no-commit-to-branch args: ['--branch', 'main'] - repo: https://github.com/astral-sh/ruff-pre-commit - rev: v0.14.2 + rev: v0.14.3 hooks: - id: ruff args: [--fix] From 26f05b6008195f29d859d8035bce9b0a7f7b2777 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 4 Nov 2025 07:35:07 +0700 Subject: [PATCH 106/115] fix(example): set model_type to load for gemma3 text (#3242) * fix: set model_type to load for gemma3 text * chore: simplify * chore: unify --- examples/gemma3/gemma-3-1b-qlora.yml | 6 +++--- examples/gemma3/gemma-3-270m-qlora.yml | 6 
+++--- examples/gemma3/gemma-3-4b-qlora.yml | 3 +++ 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/examples/gemma3/gemma-3-1b-qlora.yml b/examples/gemma3/gemma-3-1b-qlora.yml index 115717db7..2f998d144 100644 --- a/examples/gemma3/gemma-3-1b-qlora.yml +++ b/examples/gemma3/gemma-3-1b-qlora.yml @@ -1,7 +1,7 @@ base_model: google/gemma-3-1b-it -# optionally might have model_type or tokenizer_type -model_type: AutoModelForCausalLM -tokenizer_type: AutoTokenizer + +model_type: Gemma3ForCausalLM + # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name diff --git a/examples/gemma3/gemma-3-270m-qlora.yml b/examples/gemma3/gemma-3-270m-qlora.yml index 8744fad26..0c60c4a01 100644 --- a/examples/gemma3/gemma-3-270m-qlora.yml +++ b/examples/gemma3/gemma-3-270m-qlora.yml @@ -1,7 +1,7 @@ base_model: google/gemma-3-270m-it -# optionally might have model_type or tokenizer_type -model_type: AutoModelForCausalLM -tokenizer_type: AutoTokenizer + +model_type: Gemma3ForCausalLM + # Automatically upload checkpoint and final model to HF # hub_model_id: username/custom_model_name diff --git a/examples/gemma3/gemma-3-4b-qlora.yml b/examples/gemma3/gemma-3-4b-qlora.yml index 44ba9c879..959521149 100644 --- a/examples/gemma3/gemma-3-4b-qlora.yml +++ b/examples/gemma3/gemma-3-4b-qlora.yml @@ -1,5 +1,8 @@ base_model: google/gemma-3-4b-it +# Need to set else transformers tries to load vision too +model_type: Gemma3ForCausalLM + load_in_4bit: true # gemma3 doesn't seem to play nice with ddp From 01a346d86ac54b4631c3e2dcb04a3855b3db757c Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Tue, 4 Nov 2025 07:39:21 +0700 Subject: [PATCH 107/115] feat(example): add gpt-oss-safeguard docs (#3243) * feat(example): add gpt-oss-safeguard docs * fix: add doc on reasoning_effort --- examples/gpt-oss/README.md | 12 ++++ ...-oss-safeguard-20b-sft-lora-singlegpu.yaml | 67 +++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml diff --git a/examples/gpt-oss/README.md b/examples/gpt-oss/README.md index fb6c67498..9ab02b122 100644 --- a/examples/gpt-oss/README.md +++ b/examples/gpt-oss/README.md @@ -2,6 +2,8 @@ [GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B. +In October 2025, OpenAI released safeguard models built upon GPT-OSS called [GPT-OSS-Safeguard](https://huggingface.co/collections/openai/gpt-oss-safeguard). They use the same architecture, so the same examples below can be re-used. + This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking. ## Getting started @@ -64,6 +66,16 @@ axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offlo mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/ ``` +### How to set reasoning_effort in template? + +The harmony template has a feature to set the `reasoning_effort` during prompt building. The default is `medium`. If you would like to adjust this, you can add the following to your config: + +```yaml +chat_template_kwargs: + reasoning_effort: "high" # low | medium | high +``` + +Currently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss. 
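+
+To double-check that the chosen effort level actually lands in the rendered harmony prompt, it can help to preprocess a handful of samples and inspect the decoded text (the `--debug` flag prints decoded examples; substitute your own config path):
+
+```bash
+axolotl preprocess examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml --debug
+```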
### Inferencing your fine-tuned model diff --git a/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml new file mode 100644 index 000000000..ab026337d --- /dev/null +++ b/examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml @@ -0,0 +1,67 @@ +base_model: openai/gpt-oss-safeguard-20b +use_kernels: true +model_quantization_config: Mxfp4Config +model_quantization_config_kwargs: + dequantize: true + +plugins: + - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin + +experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding + +datasets: + - path: HuggingFaceH4/Multilingual-Thinking + type: chat_template + field_thinking: thinking + template_thinking_key: thinking + +dataset_prepared_path: last_run_prepared +val_set_size: 0 +output_dir: ./outputs/gpt-oss-safeguard-out/ + +sequence_len: 4096 +sample_packing: true + +adapter: lora +lora_r: 8 +lora_alpha: 16 +lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters +lora_target_linear: true + +# TODO: not supported for now, see peft#2710 +#lora_target_parameters: # target the experts in the last two layers +# - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj" +# - "22._checkpoint_wrapped_module.mlp.experts.down_proj" +# - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj" +# - "23._checkpoint_wrapped_module.mlp.experts.down_proj" + +wandb_project: +wandb_entity: +wandb_watch: +wandb_name: +wandb_log_model: + +gradient_accumulation_steps: 8 +micro_batch_size: 1 +num_epochs: 1 + +optimizer: adamw_torch_8bit +lr_scheduler: constant_with_warmup +learning_rate: 2e-4 + +bf16: true +tf32: true + +flash_attention: true +attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 + +gradient_checkpointing: true +activation_offloading: true + +logging_steps: 1 +saves_per_epoch: 1 +warmup_ratio: 0.1 + +special_tokens: +eot_tokens: + - "<|end|>" From c37decb073dcfa3538f96bf8a9f689ca5b76befd Mon Sep 17 00:00:00 2001 From: salman Date: Tue, 4 Nov 2025 13:43:40 +0000 Subject: [PATCH 108/115] update pre-commit cadence (#3245) --- .github/workflows/precommit-autoupdate.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/precommit-autoupdate.yml b/.github/workflows/precommit-autoupdate.yml index 10330f955..4c2e59b6b 100644 --- a/.github/workflows/precommit-autoupdate.yml +++ b/.github/workflows/precommit-autoupdate.yml @@ -2,7 +2,7 @@ name: Pre-commit auto-update on: schedule: - - cron: '0 0 * * 0' # Run weekly + - cron: '0 0 1 * *' # Run monthly workflow_dispatch: # Manual kickoff jobs: From bfdc9a8249cea6c4b6605f42904c268dc7e330ef Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 6 Nov 2025 16:06:03 -0500 Subject: [PATCH 109/115] upgrade trl and other hf deps (#3249) * upgrade trl and other hf deps * skip simpo for now --- requirements.txt | 8 ++++---- tests/core/test_builders.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 4d27ee148..96b185197 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/ # START section of dependencies that don't install on Darwin/MacOS -bitsandbytes==0.47.0 +bitsandbytes==0.48.2 triton>=3.0.0 mamba-ssm==1.2.0.post1 xformers>=0.0.23.post1 @@ -12,12 +12,12 @@ packaging==23.2 huggingface_hub>=0.36.0 peft>=0.17.1 
-tokenizers>=0.21.1 +tokenizers>=0.22.1 transformers==4.57.1 -accelerate==1.10.1 +accelerate==1.11.0 datasets==4.3.0 deepspeed>=0.17.0 -trl==0.24.0 +trl==0.25.0 hf_xet==1.2.0 kernels>=0.9.0 trackio diff --git a/tests/core/test_builders.py b/tests/core/test_builders.py index 67481b2ad..199777896 100644 --- a/tests/core/test_builders.py +++ b/tests/core/test_builders.py @@ -396,10 +396,10 @@ def rand_reward_func(prompts, completions) -> list[float]: ), ("orpo_cfg", None), # don't use fixture for orpo to use smaller split ("kto_cfg", None), # no fixture for kto - ( - "simpo_cfg", - "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff", - ), + # ( + # "simpo_cfg", + # "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff", + # ), ], ) def test_custom_optimizer_cls_and_kwargs( From 80270a92fa5eb9cd9063ffc22e8349d0bb056a41 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=AA=20Nam=20Kh=C3=A1nh?= <55955273+khanhkhanhlele@users.noreply.github.com> Date: Fri, 7 Nov 2025 20:21:20 +0700 Subject: [PATCH 110/115] Fix typos in some files (#3250) [skip ci] --- src/axolotl/loaders/patch_manager.py | 2 +- tests/e2e/multigpu/solo/test_grpo.py | 2 +- tests/e2e/test_preprocess.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py index 1e46f5c34..81e4dd786 100644 --- a/src/axolotl/loaders/patch_manager.py +++ b/src/axolotl/loaders/patch_manager.py @@ -457,7 +457,7 @@ class PatchManager: and self.cfg.flash_attention and not self.inference ): - # TODO(MengqingCao): split these patches seperately + # TODO(MengqingCao): split these patches separately from axolotl.monkeypatch.llama_attn_hijack_flash import ( is_xformers_swiglu_available, replace_llama_mlp_with_swiglu, diff --git a/tests/e2e/multigpu/solo/test_grpo.py b/tests/e2e/multigpu/solo/test_grpo.py index b48eb30e1..257a388d0 100644 --- a/tests/e2e/multigpu/solo/test_grpo.py +++ b/tests/e2e/multigpu/solo/test_grpo.py @@ -144,7 +144,7 @@ def recursive_kill(process: subprocess.Popen): @pytest.mark.skip(reason="flaky vllm tests in modal") class TestGRPO: """ - Test case for GRPO training using multilpe GPUs + Test case for GRPO training using multiple GPUs """ def _utils_write_yaml_and_rewards(self, cfg, temp_dir, suffix=""): diff --git a/tests/e2e/test_preprocess.py b/tests/e2e/test_preprocess.py index 4aa4cb6c2..8f15cbe55 100644 --- a/tests/e2e/test_preprocess.py +++ b/tests/e2e/test_preprocess.py @@ -14,7 +14,7 @@ class TestPreprocess: """test cases for preprocess""" def test_w_deepspeed(self, temp_dir): - """make sure preproces doesn't choke when using deepspeed in the config""" + """make sure preprocess doesn't choke when using deepspeed in the config""" cfg = DictDefault( { From ed2e8cacd6cc7acba38582412311a2d6052bc1cf Mon Sep 17 00:00:00 2001 From: VED <146507396+ved1beta@users.noreply.github.com> Date: Fri, 7 Nov 2025 19:21:40 +0530 Subject: [PATCH 111/115] feat:openenv rollout_func (#3239) [skip ci] * feat:openenv rollout_func * chore lint * docs * add:docs processing_class * tests * lint --- docs/rlhf.qmd | 112 +++++++++++++++++++++ src/axolotl/core/trainers/grpo/__init__.py | 32 ++++++ src/axolotl/utils/schemas/trl.py | 6 ++ tests/utils/test_grpo_rw_fnc.py | 18 ++++ 4 files changed, 168 insertions(+) create mode 100644 tests/utils/test_grpo_rw_fnc.py diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd index 594ebc743..2033649cc 100644 --- a/docs/rlhf.qmd +++ b/docs/rlhf.qmd @@ -597,6 +597,118 @@ To see other examples of custom reward functions, please 
see [TRL GRPO Docs](htt
 
 To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
 
+#### OpenEnv Rollout Functions
+```bash
+pip install openenv-core
+```
+
+GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.
+
+For example, to implement a simple math-solving environment with step-by-step verification:
+
+```python
+# math_env.py
+import re
+
+def math_solver_rollout(model, processing_class, prompts, generation_config=None):
+    """
+    Custom rollout function that generates step-by-step math solutions.
+
+    Args:
+        model: The language model
+        processing_class: The tokenizer/processing class
+        prompts: List of prompt dicts (with 'messages' key for chat format)
+        generation_config: Optional generation configuration
+
+    Returns:
+        List of completion strings
+    """
+    completions = []
+
+    for prompt in prompts:
+        # Apply chat template to prompt
+        messages = prompt.get("messages", [])
+        formatted_prompt = processing_class.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+
+        # Generate step-by-step solution
+        full_response = ""
+        for step in range(5):  # Max 5 reasoning steps
+            current_input = formatted_prompt + full_response + "\nNext step:"
+            inputs = processing_class(current_input, return_tensors="pt").to(model.device)
+
+            outputs = model.generate(
+                **inputs,
+                max_new_tokens=100,
+                generation_config=generation_config,
+            )
+            step_text = processing_class.decode(
+                outputs[0][inputs.input_ids.shape[1]:],
+                skip_special_tokens=True
+            )
+
+            # Check if solution is complete
+            if "FINAL ANSWER:" in step_text:
+                full_response += step_text
+                break
+            full_response += step_text + "\n"
+
+        completions.append(full_response)
+
+    return completions
+
+def math_reward(prompts, completions, answers, **kwargs):
+    """Reward function that checks mathematical correctness"""
+    rewards = []
+    for completion, correct_answer in zip(completions, answers):
+        # Extract predicted answer
+        match = re.search(r"FINAL ANSWER:\s*(.+)", completion)
+        predicted = match.group(1).strip() if match else ""
+
+        # Compare with correct answer
+        reward = 1.0 if predicted == str(correct_answer) else 0.0
+        rewards.append(reward)
+
+    return rewards
+
+def math_transform(cfg, *args, **kwargs):
+    """Transform dataset to GRPO format with answer field"""
+    def transform_fn(example, processing_class=None):
+        return {
+            "prompt": [{"role": "user", "content": example["question"]}],
+            "answer": str(example["answer"]),
+        }
+    return transform_fn, {"remove_columns": ["question"]}
+```
+
+```yaml
+rl: grpo
+
+trl:
+  beta: 0.001
+  max_completion_length: 512
+  num_generations: 4
+  rollout_func: "math_env.math_solver_rollout"  # Custom rollout function
+  reward_funcs: ["math_env.math_reward"]
+  reward_weights: [1.0]
+
+datasets:
+  - path: openai/gsm8k
+    name: main
+    type: math_env.math_transform
+```
+
+The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives:
+
+- `model`: The language model
+- `processing_class`: The tokenizer/processing class
+- `prompts`: List of prompt dictionaries
+- `generation_config` (optional): Generation configuration
+
+And should return a list of completion strings.
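+
+Stripped down to that contract, a rollout function can be quite small. The sketch below is only illustrative (the function and variable names are not a required API); it shows the minimal shape of a valid `rollout_func`:
+
+```python
+def my_rollout(model, processing_class, prompts, generation_config=None):
+    """Return exactly one completion string per prompt."""
+    completions = []
+    for prompt in prompts:
+        # Render the chat-formatted prompt to text
+        text = processing_class.apply_chat_template(
+            prompt.get("messages", []), tokenize=False, add_generation_prompt=True
+        )
+        inputs = processing_class(text, return_tensors="pt").to(model.device)
+        outputs = model.generate(**inputs, generation_config=generation_config)
+        # Decode only the newly generated tokens
+        completions.append(
+            processing_class.decode(
+                outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
+            )
+        )
+    return completions
+```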
+ +For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv). + #### GRPO with DAPO/Dr. GRPO loss The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses. diff --git a/src/axolotl/core/trainers/grpo/__init__.py b/src/axolotl/core/trainers/grpo/__init__.py index bd77489eb..7f28cb8d4 100644 --- a/src/axolotl/core/trainers/grpo/__init__.py +++ b/src/axolotl/core/trainers/grpo/__init__.py @@ -126,6 +126,9 @@ class GRPOStrategy: if trl.use_liger_loss is not None: grpo_args_kwargs["use_liger_loss"] = trl.use_liger_loss + if trl.rollout_func: + grpo_args_kwargs["rollout_func"] = cls.get_rollout_func(trl.rollout_func) + return grpo_args_kwargs @classmethod @@ -201,3 +204,32 @@ class GRPOStrategy: raise ValueError( f"Reward function {reward_func_fqn} not found." ) from exc + + @classmethod + def get_rollout_func(cls, rollout_func_fqn: str): + """ + Returns the rollout function from the given fully qualified name. + + Args: + rollout_func_fqn (str): Fully qualified name of the rollout function + (e.g. my_module.my_rollout_func) + + Returns: + Callable rollout function + """ + try: + rollout_func_module_name = rollout_func_fqn.split(".")[-1] + rollout_func_module = importlib.import_module( + ".".join(rollout_func_fqn.split(".")[:-1]) + ) + rollout_func = getattr(rollout_func_module, rollout_func_module_name) + + if not callable(rollout_func): + raise ValueError( + f"Rollout function {rollout_func_fqn} must be callable" + ) + + return rollout_func + + except ModuleNotFoundError as exc: + raise ValueError(f"Rollout function {rollout_func_fqn} not found.") from exc diff --git a/src/axolotl/utils/schemas/trl.py b/src/axolotl/utils/schemas/trl.py index 624f7663e..d24d6f477 100644 --- a/src/axolotl/utils/schemas/trl.py +++ b/src/axolotl/utils/schemas/trl.py @@ -173,3 +173,9 @@ class TRLConfig(BaseModel): "description": "Enable sleep mode for vLLM to offload VRAM when idle" }, ) + rollout_func: str | None = Field( + default=None, + json_schema_extra={ + "description": "Path to custom rollout function. Must be importable from current dir." 
+        },
+    )
diff --git a/tests/utils/test_grpo_rw_fnc.py b/tests/utils/test_grpo_rw_fnc.py
new file mode 100644
index 000000000..507de277b
--- /dev/null
+++ b/tests/utils/test_grpo_rw_fnc.py
@@ -0,0 +1,18 @@
+import os
+
+import pytest
+
+from axolotl.core.trainers.grpo import GRPOStrategy
+
+
+def test_get_rollout_func_loads_successfully():
+    """Test that a valid rollout function can be loaded"""
+    rollout_func = GRPOStrategy.get_rollout_func("os.path.join")
+    assert callable(rollout_func)
+    assert rollout_func == os.path.join
+
+
+def test_get_rollout_func_invalid_module_raises_error():
+    """Test that invalid module path raises clear ValueError"""
+    with pytest.raises(ValueError, match="Rollout function .* not found"):
+        GRPOStrategy.get_rollout_func("nonexistent_module.my_func")

From b62eed88095462160da956f21ff33efb6585eb7b Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 7 Nov 2025 12:17:27 -0500
Subject: [PATCH 112/115] add openenv-core to requirements (#3251)

---
 docs/rlhf.qmd    | 3 ---
 requirements.txt | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd
index 2033649cc..1eea42036 100644
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -598,9 +598,6 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt
 To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).
 
 #### OpenEnv Rollout Functions
-```bash
-pip install openenv-core
-```
 
 GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.
 
diff --git a/requirements.txt b/requirements.txt
index 96b185197..a12a3941b 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -64,6 +64,7 @@ immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
 
 torchao==0.13.0
+openenv-core==0.1.0
 schedulefree==1.4.1
 
 axolotl-contribs-lgpl==0.0.7

From b5fcc2f14be77f1cddc7a9b439e17ebb7c761e7a Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Fri, 7 Nov 2025 16:04:00 -0500
Subject: [PATCH 113/115] log cumulative total trained tokens (#3252)

* log cumulative total trained tokens

* use is_distributed helper
---
 src/axolotl/core/trainers/base.py | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/axolotl/core/trainers/base.py b/src/axolotl/core/trainers/base.py
index 7d7420fb8..7896c6088 100644
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -43,7 +43,7 @@ from axolotl.core.trainers.utils import (
 from axolotl.utils import get_not_null
 from axolotl.utils.bench import get_gpu_memory_usage
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.distributed import is_main_process
+from axolotl.utils.distributed import is_distributed, is_main_process
 from axolotl.utils.logging import get_logger
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 
@@ -350,6 +350,11 @@ class AxolotlTrainer(
         # track number of tokens for tokens per second calculation
         if self.args.include_tkps:
             inputs_key = "labels" if "labels" in inputs else "input_ids"
+            num_tokens = (inputs[inputs_key] != -100).sum()
+            if is_distributed():
+                torch.distributed.all_reduce(
+                    num_tokens, op=torch.distributed.ReduceOp.SUM
+                )
             if hasattr(self.state, "num_tokens"):
                 self.state.num_tokens = (
                     self.state.num_tokens + (inputs[inputs_key] != -100).sum().cpu()
                 )
             else:
self.state.num_tokens = (inputs[inputs_key] != -100).sum().cpu() + if hasattr(self.state, "total_tokens"): + self.state.total_tokens += num_tokens + else: + self.state.total_tokens = num_tokens + if self.args.orpo_alpha: return self.orpo_compute_loss( model, @@ -621,6 +631,7 @@ class AxolotlTrainer( logs["tokens_per_second_per_gpu"] = round( self.state.last_tokens_per_second.item() / self.args.logging_steps, 2 ) + logs["total_tokens"] = int(self.state.total_tokens.item()) del self._stored_metrics[train_eval] From d0c846fc5e589aeb7150e4946a5110033ebaf92a Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 10 Nov 2025 21:35:45 +0700 Subject: [PATCH 114/115] feat: add granitemoeshared and granitemoehybrid (#3158) --- src/axolotl/monkeypatch/multipack.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/axolotl/monkeypatch/multipack.py b/src/axolotl/monkeypatch/multipack.py index 48b4ea10e..5d34f1935 100644 --- a/src/axolotl/monkeypatch/multipack.py +++ b/src/axolotl/monkeypatch/multipack.py @@ -40,6 +40,8 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [ "smollm3", "granite", "granitemoe", + "granitemoeshared", + "granitemoehybrid", "hunyuan_v1_dense", "hunyuan_v1_moe", "gpt_oss", From 11eb36585a210786ece0f94315662525b9f375f5 Mon Sep 17 00:00:00 2001 From: NanoCode012 Date: Mon, 10 Nov 2025 21:37:47 +0700 Subject: [PATCH 115/115] feat: add arg to enable dft in liger (#3125) * feat: add arg to enable dft in liger * feat: add tests use_token_scaling * fix: test * fix: move check to args --- src/axolotl/integrations/liger/README.md | 3 +++ src/axolotl/integrations/liger/args.py | 23 +++++++++++++++++++- src/axolotl/integrations/liger/plugin.py | 27 ++++++++++++++++++++++++ tests/e2e/integrations/test_liger.py | 8 ++++++- tests/integrations/test_liger.py | 16 ++++++++++++++ 5 files changed, 75 insertions(+), 2 deletions(-) diff --git a/src/axolotl/integrations/liger/README.md b/src/axolotl/integrations/liger/README.md index c5cce8282..3a2d4bd04 100644 --- a/src/axolotl/integrations/liger/README.md +++ b/src/axolotl/integrations/liger/README.md @@ -18,6 +18,9 @@ liger_rms_norm: true liger_glu_activation: true liger_layer_norm: true liger_fused_linear_cross_entropy: true + +# FLCE-specific +liger_use_token_scaling: true ``` ## Supported Models diff --git a/src/axolotl/integrations/liger/args.py b/src/axolotl/integrations/liger/args.py index d5bb10cfd..eb7a6c59b 100644 --- a/src/axolotl/integrations/liger/args.py +++ b/src/axolotl/integrations/liger/args.py @@ -16,7 +16,7 @@ Module for handling LIGER input arguments. """ -from pydantic import BaseModel, model_validator +from pydantic import BaseModel, Field, model_validator from axolotl.utils.logging import get_logger @@ -35,6 +35,15 @@ class LigerArgs(BaseModel): liger_glu_activation: bool | None = None liger_cross_entropy: bool | None = None liger_fused_linear_cross_entropy: bool | None = None + liger_use_token_scaling: bool | None = Field( + default=None, + json_schema_extra={ + "description": ( + "Enables use_token_scaling in fused_linear_cross_entropy. " + "When True, each token's loss is multiplied by its predicted probability (detached from gradients)." 
+ ) + }, + ) @model_validator(mode="before") @classmethod @@ -75,6 +84,18 @@ class LigerArgs(BaseModel): ) return data + @model_validator(mode="before") + @classmethod + def check_liger_use_token_scaling_flce(cls, data): + if data.get("liger_use_token_scaling") and not data.get( + "liger_fused_linear_cross_entropy" + ): + raise ValueError( + "`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled." + ) + + return data + @model_validator(mode="after") def check_tensor_parallel_size_liger_fused_linear_cross_entropy(self): # TODO @SalmanMohammadi this is a larger fix - investigate diff --git a/src/axolotl/integrations/liger/plugin.py b/src/axolotl/integrations/liger/plugin.py index 89f7c37b7..ac796c2c9 100644 --- a/src/axolotl/integrations/liger/plugin.py +++ b/src/axolotl/integrations/liger/plugin.py @@ -48,6 +48,33 @@ class LigerPlugin(BasePlugin): "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set." ) + if cfg.liger_use_token_scaling: + # Patch FLCE to set token_scaling=True for function and class API + from liger_kernel.transformers import functional + from liger_kernel.transformers.fused_linear_cross_entropy import ( + LigerFusedLinearCrossEntropyLoss, + ) + + old_liger_fused_linear_cross_entropy = ( + functional.liger_fused_linear_cross_entropy + ) + + def patched_liger_fused_linear_cross_entropy(*args, **kwargs): + kwargs["use_token_scaling"] = True + return old_liger_fused_linear_cross_entropy(*args, **kwargs) + + functional.liger_fused_linear_cross_entropy = ( + patched_liger_fused_linear_cross_entropy + ) + + old_init = LigerFusedLinearCrossEntropyLoss.__init__ + + def patched_init(self, *args, **kwargs): + kwargs["use_token_scaling"] = True + return old_init(self, *args, **kwargs) + + LigerFusedLinearCrossEntropyLoss.__init__ = patched_init + if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN: apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type] liger_fn_sig = inspect.signature(apply_liger_fn) diff --git a/tests/e2e/integrations/test_liger.py b/tests/e2e/integrations/test_liger.py index 285969963..55317151e 100644 --- a/tests/e2e/integrations/test_liger.py +++ b/tests/e2e/integrations/test_liger.py @@ -2,6 +2,7 @@ Simple end-to-end test for Liger integration """ +import pytest from axolotl.common.datasets import load_datasets from axolotl.train import train from axolotl.utils.config import normalize_config, prepare_plugins, validate_config @@ -62,7 +63,11 @@ class LigerIntegrationTestCase: check_model_output_exists(temp_dir, cfg) @require_torch_2_4_1 - def test_llama_w_flce(self, temp_dir): + @pytest.mark.parametrize( + "liger_use_token_scaling", + [True, False], + ) + def test_llama_w_flce(self, temp_dir, liger_use_token_scaling): cfg = DictDefault( { "base_model": "HuggingFaceTB/SmolLM2-135M", @@ -74,6 +79,7 @@ class LigerIntegrationTestCase: "liger_glu_activation": True, "liger_cross_entropy": False, "liger_fused_linear_cross_entropy": True, + "liger_use_token_scaling": liger_use_token_scaling, "sequence_len": 1024, "val_set_size": 0.05, "special_tokens": { diff --git a/tests/integrations/test_liger.py b/tests/integrations/test_liger.py index d7b171ec2..6865306c9 100644 --- a/tests/integrations/test_liger.py +++ b/tests/integrations/test_liger.py @@ -75,3 +75,19 @@ class TestValidation: ): prepare_plugins(test_cfg) validate_config(test_cfg) + + def test_use_token_scaling_require_flce(self, minimal_liger_cfg): + test_cfg = DictDefault( + { + "liger_fused_linear_cross_entropy": False, + 
"liger_use_token_scaling": True, + } + | minimal_liger_cfg + ) + + with pytest.raises( + ValueError, + match=r"`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled.", + ): + prepare_plugins(test_cfg) + validate_config(test_cfg)