Compare commits
13 Commits: `streaming-...` → `diffusion-...`
| Author | SHA1 | Date |
|---|---|---|
|  | 64f349b7bb |  |
|  | 260ebe4c93 |  |
|  | 63d2280999 |  |
|  | b210db2d15 |  |
|  | 556a69118f |  |
|  | 8569675b26 |  |
|  | 077b5a4358 |  |
|  | 234b7b3126 |  |
|  | e19be0c2d9 |  |
|  | 479a454ae3 |  |
|  | 0a9341acde |  |
|  | d8b63804bc |  |
|  | 3156c605d4 |  |
.bandit (2 changes)
@@ -1,3 +1,3 @@
 [bandit]
 exclude = tests
-skips = B101,B615,B102,B110
+skips = B101,B615
@@ -12,6 +12,5 @@ reviews:
   auto_review:
     enabled: true
     drafts: false
-    auto_incremental_review: true
 chat:
   auto_reply: true
.flake8 (new file, 5 additions)
@@ -0,0 +1,5 @@
+[flake8]
+max-line-length = 88
+
+select = C,E,F,W,B,B950
+extend-ignore = E203, E501, W503
.isort.cfg (new file, 4 additions)
@@ -0,0 +1,4 @@
+[settings]
+profile=black
+known_third_party=wandb,comet_ml
+known_local_folder=src,tests
@@ -10,12 +10,22 @@ repos:
       - id: trailing-whitespace
       - id: no-commit-to-branch
         args: ['--branch', 'main']
-  - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.12.9
+  - repo: https://github.com/psf/black
+    rev: 25.1.0
     hooks:
-      - id: ruff
-        args: [--fix]
-      - id: ruff-format
+      - id: black
+  - repo: https://github.com/pycqa/isort
+    rev: 6.0.1
+    hooks:
+      - id: isort
+  - repo: https://github.com/PyCQA/flake8
+    rev: 7.3.0
+    hooks:
+      - id: flake8
+  - repo: https://github.com/pylint-dev/pylint
+    rev: v3.3.8
+    hooks:
+      - id: pylint
   - repo: https://github.com/pre-commit/mirrors-mypy
     rev: v1.17.1
     hooks:
.pylintrc (new file, 15 additions)
@@ -0,0 +1,15 @@
+[MASTER]
+init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"
+
+[TYPECHECK]
+
+# List of members which are set dynamically and missed by Pylint inference
+# system, and so shouldn't trigger E1101 when accessed.
+generated-members=numpy.*, torch.*
+
+
+[pylint.messages_control]
+disable=missing-function-docstring, line-too-long, import-error,
+    too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
+    too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
+    too-many-positional-arguments, possibly-used-before-assignment
@@ -2,6 +2,8 @@
 modal application to run axolotl gpu tests in Modal
 """
 
+# pylint: disable=duplicate-code
+
 import os
 import pathlib
 import tempfile

@@ -61,7 +63,7 @@ def run_cmd(cmd: str, run_folder: str):
 
     # Propagate errors from subprocess.
     if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
 
 
 @app.function(
@@ -1,5 +1,7 @@
 """Modal app to run axolotl GPU tests"""
 
+# pylint: disable=duplicate-code
+
 import os
 import pathlib
 import tempfile

@@ -68,4 +70,4 @@ def run_cmd(cmd: str, run_folder: str):
 
     # Propagate errors from subprocess.
     if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env):  # nosec
-        exit(exit_code)
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
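Both CI runner scripts above suppress pylint's `consider-using-sys-exit` on the bare `exit(exit_code)` call. The builtin `exit()` comes from the `site` module and is intended for interactive sessions; `sys.exit()` is the conventional spelling in scripts. A minimal sketch of what the lint suggests (a hypothetical standalone version of `run_cmd`, not the repo's exact code):

```python
import subprocess
import sys


def run_cmd(cmd: str, run_folder: str) -> None:
    # Propagate a non-zero subprocess exit status by terminating the
    # process with the same code; sys.exit raises SystemExit and does
    # not depend on the interactive-only `site` builtins.
    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
        sys.exit(exit_code)
```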
@@ -47,6 +47,7 @@ class QuartoGenerator:
         """Check if a type is a Pydantic BaseModel."""
         return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)
 
+    # pylint: disable=too-many-return-statements
     def _extract_nested_type(self, field_type) -> Any:
         """Extract the actual type from complex type annotations."""
         # Handle Annotated types (Python 3.9+)

@@ -123,6 +124,7 @@ class QuartoGenerator:
 
         return field_type
 
+    # pylint: disable=too-many-return-statements
     def _extract_all_pydantic_models_from_type(
         self, field_type
     ) -> list[type[BaseModel]]:

@@ -316,6 +318,7 @@ class QuartoGenerator:
 
         return all_groups
 
+    # pylint: disable=too-many-return-statements
     def _extract_field_groups_from_source(
         self, model_class: type[BaseModel]
     ) -> list[dict]:

@@ -500,7 +503,7 @@ class QuartoGenerator:
             nested_schema = nested_model.model_json_schema()
             nested_properties = nested_schema.get("properties", {})
             nested_required = nested_schema.get("required", [])
-        except Exception:
+        except Exception:  # pylint: disable=broad-exception-caught
             # Fallback: use model fields directly
             nested_properties = {}
             nested_required = []

@@ -604,7 +607,7 @@ class QuartoGenerator:
             schema = model_class.model_json_schema()
             properties = schema.get("properties", {})
             required = schema.get("required", [])
-        except Exception as e:
+        except Exception as e:  # pylint: disable=broad-exception-caught
             print(
                 f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
             )
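The two `broad-exception-caught` suppressions above guard a fallback path: if Pydantic's `model_json_schema()` raises for any reason, the generator falls back to reading field definitions directly. A minimal sketch of that pattern on a hypothetical model (the class and field names here are illustrative, not from the repo):

```python
from pydantic import BaseModel


class ExampleConfig(BaseModel):  # hypothetical stand-in for a config model
    learning_rate: float = 3e-4
    warmup_steps: int


try:
    schema = ExampleConfig.model_json_schema()
    properties = schema.get("properties", {})
    required = schema.get("required", [])
except Exception:  # broad by design: any schema failure takes the fallback
    # Fallback: use the FieldInfo entries Pydantic keeps on the class.
    properties = {name: {} for name in ExampleConfig.model_fields}
    required = [
        name
        for name, field in ExampleConfig.model_fields.items()
        if field.is_required()
    ]
```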
@@ -89,7 +89,6 @@
    "outputs": [],
    "source": [
     "import os\n",
-    "\n",
     "# Optionally, upload your own JSONL to your Google Drive\n",
     "GOOGLE_DRIVE_PATH = \"\" # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n",
     "\n",

@@ -97,7 +96,6 @@
     "# \"MessageError: Error: credential propagation was unsuccessful\"\n",
     "if GOOGLE_DRIVE_PATH:\n",
     "    from google.colab import drive\n",
-    "\n",
     "    # Mount your Google Drive\n",
     "    GOOGLE_DRIVE_MNT = \"/content/drive/\"\n",
     "    drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n",

@@ -105,7 +103,7 @@
     "    # make sure file exists\n",
     "    if not os.path.isfile(tmp_path):\n",
     "        raise ValueError(f\"File {tmp_path} does not exist\")\n",
-    "    dataset_id = tmp_path"
+    "    dataset_id = tmp_path\n"
    ]
   },
   {

@@ -187,13 +185,8 @@
     "    lora_r = 32,\n",
     "    lora_alpha = 64,\n",
     "    lora_target_modules = [\n",
-    "        \"q_proj\",\n",
-    "        \"k_proj\",\n",
-    "        \"v_proj\",\n",
-    "        \"o_proj\", # train self_attn linear modules\n",
-    "        \"gate_proj\",\n",
-    "        \"down_proj\",\n",
-    "        \"up_proj\", # train MLP linear modules\n",
+    "        \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", # train self_attn linear modules\n",
+    "        \"gate_proj\", \"down_proj\", \"up_proj\", # train MLP linear modules\n",
     "    ],\n",
     "    lora_qkv_kernel = True, # optimized triton kernels for LoRA\n",
     "    lora_o_kernel = True,\n",

@@ -252,7 +245,6 @@
    "outputs": [],
    "source": [
     "from axolotl.utils import patch_optimized_env\n",
-    "\n",
     "# speedup downloads from HF 🤗 and set \"PYTORCH_CUDA_ALLOC_CONF\" env to save memory\n",
     "patch_optimized_env()"
    ]

@@ -1244,6 +1236,7 @@
    }
   ],
   "source": [
+   "import torch\n",
    "from transformers import TextStreamer\n",
    "\n",
    "messages = [\n",

@@ -1263,11 +1256,9 @@
    "outputs = model.generate(\n",
    "    **tokenizer(prompt, return_tensors = \"pt\").to(\"cuda\"),\n",
    "    max_new_tokens = 192,\n",
-   "    temperature=1.0,\n",
-   "    top_p=0.8,\n",
-   "    top_k=32,\n",
+   "    temperature = 1.0, top_p = 0.8, top_k = 32,\n",
    "    streamer = TextStreamer(tokenizer, skip_prompt = True),\n",
-   ")"
+   ")\n"
   ]
  },
  {

@@ -1440,7 +1431,6 @@
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
-   "\n",
    "# remove the partial epoch checkpoints\n",
    "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n",
    "\n",
@@ -41,12 +41,6 @@ model, and final model output, you may need at least 3TB of free disk space to k
 axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
 ```
 
-To simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we've partnered with [Baseten](https://baseten.co) to showcase multi-node
-training of the 120B model using Baseten Truss. You can read more about this recipe on
-[Baseten's blog](https://www.baseten.co/blog/how-to-fine-tune-gpt-oss-120b-with-baseten-and-axolotl/). The recipe can
-be found on their
-[GitHub](https://github.com/basetenlabs/ml-cookbook/tree/main/examples/oss-gpt-120b-axolotl/training).
-
 ERRATA: Transformers saves the model Architecture prefixed with `FSDP` which needs to be manually renamed in `config.json`.
 See https://github.com/huggingface/transformers/pull/40207 for the status of this issue.
 

@@ -67,23 +61,9 @@ mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/
 
 ### Inferencing your fine-tuned model
 
-#### vLLM
-
 GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425
 for more information about using a special vllm-openai docker image for inferencing with vLLM.
 
-Optionally, vLLM can be installed from nightly:
-
-```bash
-pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
-```
-and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
-```bash
-vllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888 --tensor-parallel-size 8
-```
-
-#### SGLang
-
 SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for infomation on installing
 SGLang from source. Once you've installed SGLang, run the following command to launch a SGLang server:
 
@@ -44,7 +44,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
+attn_implementation: kernels-community/vllm-flash-attn3
 
 gradient_checkpointing: true
 activation_offloading: true

@@ -40,7 +40,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
+attn_implementation: kernels-community/vllm-flash-attn3
 
 gradient_checkpointing: true
 activation_offloading: true

@@ -15,7 +15,7 @@ datasets:
     field_thinking: thinking
     template_thinking_key: thinking
 
-dataset_prepared_path: ./outputs/last_run_prepared
+dataset_prepared_path: last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/gpt-oss-out/
 

@@ -41,7 +41,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
+attn_implementation: kernels-community/vllm-flash-attn3
 
 gradient_checkpointing: true
 activation_offloading: true

@@ -15,7 +15,7 @@ datasets:
     field_thinking: thinking
     template_thinking_key: thinking
 
-dataset_prepared_path: ./outputs/last_run_prepared
+dataset_prepared_path: last_run_prepared
 val_set_size: 0
 output_dir: ./outputs/gpt-oss-out/
 

@@ -40,7 +40,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
+attn_implementation: kernels-community/vllm-flash-attn3
 
 gradient_checkpointing: true
 activation_offloading: true

@@ -53,7 +53,7 @@ bf16: true
 tf32: true
 
 flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
+attn_implementation: kernels-community/vllm-flash-attn3
 
 gradient_checkpointing: true
 activation_offloading: true
examples/llama-3/diffusion-3.2-1b-pretrain.yaml (new file, 57 additions)
@@ -0,0 +1,57 @@
+base_model: meta-llama/Llama-3.2-1B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+pretraining_dataset:
+  - path: wikitext
+    name: wikitext-103-raw-v1
+    type: completion
+    field: text
+
+plugins:
+  - diffusion.DiffusionPlugin
+noise_schedule: cosine
+min_mask_ratio: 0.15
+max_mask_ratio: 0.85
+eps: 5e-4
+importance_weighting: true
+mask_token_id: 128002
+generate_samples: true
+generation_interval: 10
+
+output_dir: ./outputs/model-out
+
+sequence_len: 512
+sample_packing: true
+
+gradient_accumulation_steps: 8
+micro_batch_size: 4
+max_steps: 10000
+
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 3e-4
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+sdp_attention: true
+
+warmup_steps: 1000
+
+save_strategy: steps
+save_steps: 1000
+
+special_tokens:
+  pad_token: "<|end_of_text|>"
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
examples/llama-3/diffusion-3.2-1b-sft.yaml (new file, 58 additions)
@@ -0,0 +1,58 @@
+base_model: meta-llama/Llama-3.2-1B
+# Automatically upload checkpoint and final model to HF
+# hub_model_id: username/custom_model_name
+
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+val_set_size: 0.05
+
+plugins:
+  - diffusion.DiffusionPlugin
+noise_schedule: cosine
+min_mask_ratio: 0.1
+max_mask_ratio: 0.9
+num_diffusion_steps: 128
+eps: 1e-3
+importance_weighting: true
+mask_token_id: 128002
+
+output_dir: ./outputs/model-out
+
+sequence_len: 512
+sample_packing: true
+eval_sample_packing: true
+
+gradient_accumulation_steps: 4
+micro_batch_size: 4
+num_epochs: 1
+
+optimizer: adamw_8bit
+lr_scheduler: cosine
+learning_rate: 1e-5
+
+bf16: auto
+tf32: true
+
+gradient_checkpointing: true
+resume_from_checkpoint:
+logging_steps: 1
+sdp_attention: true
+
+warmup_steps: 1000
+
+save_strategy: steps
+eval_strategy: steps
+save_steps: 500
+eval_steps: 500
+
+special_tokens:
+  pad_token: "<|end_of_text|>"
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+# save_first_step: true # uncomment this to validate checkpoint saving works with your config
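Both new diffusion examples above can be launched the usual way (e.g. `axolotl train examples/llama-3/diffusion-3.2-1b-sft.yaml`). As rough intuition for the `noise_schedule: cosine`, `min_mask_ratio`, and `max_mask_ratio` keys: a cosine schedule maps a sampled diffusion timestep to a masking ratio between the two bounds. The sketch below is illustrative only and may not match the plugin's exact formula:

```python
import math
import random


def sample_mask_ratio(min_ratio: float = 0.1, max_ratio: float = 0.9) -> float:
    """Illustrative cosine masking schedule (not the plugin's exact code)."""
    t = random.random()  # diffusion timestep t ~ U(0, 1)
    curve = 1.0 - math.cos(0.5 * math.pi * t)  # 0 at t=0, rising to 1 at t=1
    return min_ratio + (max_ratio - min_ratio) * curve
```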
@@ -26,34 +26,3 @@ include-package-data = true
 
 [tool.setuptools.cmdclass]
 build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
-
-[tool.ruff]
-line-length = 88
-target-version = "py310"
-
-[tool.ruff.lint]
-select = ["E", "F", "W", "C90", "B"]
-ignore = [
-    "E203",  # Whitespace before ':'
-    "E501",  # Line too long
-    "C901",  # Too complex
-    "B019",  # Use of functools.cache on methods
-    "E722",  # Bare except
-    "F821",  # Undefined name (for dynamic exec)
-]
-
-[tool.ruff.lint.isort]
-known-third-party = ["wandb", "comet_ml"]
-known-local-folder = ["src", "tests"]
-# Black-compatible isort settings
-force-single-line = false
-combine-as-imports = true
-split-on-trailing-comma = true
-
-[tool.ruff.format]
-# Use black's formatting style exactly
-quote-style = "double"
-indent-style = "space"
-skip-magic-trailing-comma = false
-line-ending = "auto"
-docstring-code-format = false
@@ -13,8 +13,8 @@ liger-kernel==0.6.1
 packaging==23.2
 
 huggingface_hub>=0.33.0
-peft>=0.17.0
-transformers==4.55.3
+peft==0.17.0
+transformers==4.55.2
 tokenizers>=0.21.1
 accelerate==1.10.0
 datasets==4.0.0
@@ -27,7 +27,7 @@ def parse_dataset(dataset=None, split="train"):
             break
     if not field_messages:
         raise ValueError(
-            f"No conversation field found in dataset: {', '.join(feature_keys)}"
+            f'No conversation field found in dataset: {", ".join(feature_keys)}'
         )
     ds_cfg["field_messages"] = field_messages
 

@@ -40,7 +40,7 @@ def parse_dataset(dataset=None, split="train"):
             break
     if not message_property_mappings["role"]:
         raise ValueError(
-            f"No role field found in messages: {', '.join(message_fields)}"
+            f'No role field found in messages: {", ".join(message_fields)}'
         )
 
     for key in ["content", "text", "value"]:

@@ -49,7 +49,7 @@ def parse_dataset(dataset=None, split="train"):
             break
     if not message_property_mappings["content"]:
         raise ValueError(
-            f"No content field found in messages: {', '.join(message_fields)}"
+            f'No content field found in messages: {", ".join(message_fields)}'
        )
     ds_cfg["message_property_mappings"] = message_property_mappings
 
@@ -1,10 +1,11 @@
 # noqa
+# pylint: skip-file
 import sys
 
 try:
     import torch
-except ImportError as error:
-    raise ImportError("Install torch via `pip install torch`") from error
+except ImportError:
+    raise ImportError("Install torch via `pip install torch`")
 from packaging.version import Version as V
 
 use_uv = "--uv" in sys.argv[1:]
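The head side here drops PEP 3134 exception chaining (`raise ... from error`). Chaining records the original exception as `__cause__`, so the traceback shows both errors; the sketch below shows the chained form the base branch used:

```python
try:
    import torch  # noqa: F401
except ImportError as error:
    # `from error` sets __cause__, so the traceback prints the original
    # ImportError followed by "The above exception was the direct cause
    # of the following exception:".
    raise ImportError("Install torch via `pip install torch`") from error
```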
setup.py (4 changes)
@@ -118,9 +118,9 @@ def get_package_version():
 
 
 extras_require = {
-    "flash-attn": ["flash-attn==2.8.3"],
+    "flash-attn": ["flash-attn==2.8.2"],
     "ring-flash-attn": [
-        "flash-attn==2.8.3",
+        "flash-attn==2.8.2",
         "ring-flash-attn>=0.1.7",
         "yunchang==0.6.0",
     ],
@@ -22,7 +22,7 @@ HAS_PRINTED_LOGO = False
 def print_axolotl_text_art():
     """Prints axolotl ASCII art."""
 
-    global HAS_PRINTED_LOGO
+    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
     if HAS_PRINTED_LOGO:
         return
     if is_main_process():
@@ -41,7 +41,7 @@ def run_cmd(cmd: str, run_folder: str, volumes=None):
     if exit_code := subprocess.call(  # nosec B603
         cmd.split(), cwd=run_folder, env=new_env
     ):
-        exit(exit_code)
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
 
     # Commit writes to volume.
     if volumes:

@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
         return res
 
     def get_image(self):
-        docker_tag = "main-py3.11-cu126-2.7.1"
+        docker_tag = "main-py3.11-cu124-2.6.0"
         if self.config.docker_tag:
             docker_tag = self.config.docker_tag
         docker_image = f"axolotlai/axolotl:{docker_tag}"

@@ -130,6 +130,7 @@ class ModalCloud(Cloud):
         res = []
         if self.config.secrets:
             for key in self.config.get("secrets", []):
+                # pylint: disable=duplicate-code
                 if isinstance(key, str):
                     if val := os.environ.get(key, ""):
                         res.append(modal.Secret.from_dict({key: val}))

@@ -176,8 +177,8 @@ class ModalCloud(Cloud):
         with self.app.run(detach=True):
             modal_fn.remote(
                 config_yaml,
-                *args,
                 volumes={k: v[0] for k, v in self.volumes.items()},
+                *args,
                 **kwargs,
             )
 

@@ -186,7 +187,7 @@ class ModalCloud(Cloud):
             return int(self.config.timeout)
         return 60 * 60 * 24  # 24 hours
 
-    def get_train_gpu(self):
+    def get_train_gpu(self):  # pylint: disable=too-many-return-statements
         count = self.config.gpu_count or 1
         family = self.config.gpu.lower() or "l40s"
 

@@ -199,7 +200,7 @@ class ModalCloud(Cloud):
         if family in ["a10", "a10g"]:
             return modal.gpu.A10G(count=count)
         if family == "h100":
-            return f"H100:{count}"
+            return modal.gpu.H100(count=count)
         if family == "t4":
             return modal.gpu.T4(count=count)
         if family == "l4":

@@ -276,7 +277,7 @@ def _train(
     launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
     launcher_args: list[str] | None = None,
     volumes=None,
-    **kwargs,
+    **kwargs,  # pylint: disable=unused-argument
 ):
     Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
     with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
@@ -210,7 +210,7 @@ def load_cfg(
     try:
         device_props = torch.cuda.get_device_properties("cuda")
         gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
-    except:
+    except:  # pylint: disable=bare-except # noqa: E722
         gpu_version = None
 
     prepare_plugins(cfg)
@@ -28,7 +28,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
         cfg: Dictionary mapping `axolotl` config keys to values.
         cli_args: CLI arguments.
     """
+    # pylint: disable=duplicate-code
     check_accelerate_default_config()
     if int(os.getenv("LOCAL_RANK", "0")) == 0:
         check_user_token()

@@ -49,7 +49,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
     """
+    # pylint: disable=duplicate-code
     parsed_cfg = load_cfg(config, **kwargs)
     parser = HfArgumentParser(TrainerCliArgs)
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
@@ -35,7 +35,7 @@ def get_multi_line_input() -> str:
 
     instruction = ""
     for line in sys.stdin:
-        instruction += line
+        instruction += line  # pylint: disable=consider-using-join
 
     return instruction
 

@@ -64,7 +64,7 @@ def do_inference(
             importlib.import_module("axolotl.prompters"), prompter
         )
     elif cfg.chat_template:
-        chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer)
+        chat_template_str = get_chat_template(cfg.chat_template)
     elif cfg.datasets[0].type == "chat_template":
         chat_template_str = get_chat_template_from_config(
             cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer

@@ -167,6 +167,7 @@ def do_inference_gradio(
     if not instruction:
         return
     if prompter_module:
+        # pylint: disable=stop-iteration-return
         prompt: str = next(
             prompter_module().build_prompt(instruction=instruction.strip("\n"))
         )

@@ -251,7 +252,7 @@ def do_cli(
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
     """
+    # pylint: disable=duplicate-code
     parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs)
     parsed_cfg.sample_packing = False
     parser = transformers.HfArgumentParser(InferenceCliArgs)
@@ -1,5 +1,7 @@
 """Click CLI definitions for various axolotl commands."""
 
+# pylint: disable=redefined-outer-name
+
 import os
 import subprocess  # nosec B404
 from typing import Literal, Optional
@@ -32,7 +32,7 @@ LOG = get_logger(__name__)
 class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
     """A custom planner to cast tensors to bfloat16 on the fly during loading."""
 
-    def commit_tensor(self, read_item, tensor):
+    def commit_tensor(self, read_item, tensor):  # pylint: disable=unused-argument
         tensor.copy_(tensor.to(torch.bfloat16))
 
 

@@ -59,10 +59,10 @@ def _distributed_checkpoint_to_merged_weights(
     state_dict: Dict = {}
     save_path_ = Path(save_path)
     save_path_.mkdir(exist_ok=True)
-    dist_cp_format_utils._load_state_dict(
+    dist_cp_format_utils._load_state_dict(  # pylint: disable=protected-access
         state_dict,
         storage_reader=dist_cp.FileSystemReader(checkpoint_dir),
-        planner=BFloat16CastPlanner(),
+        planner=BFloat16CastPlanner(),  # pylint: disable=protected-access
         no_dist=True,
     )
 

@@ -191,7 +191,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
     """
+    # pylint: disable=duplicate-code
     parsed_cfg = load_cfg(config, **kwargs)
 
     fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0"
@@ -73,7 +73,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
             AutoModelForCausalLM.from_pretrained(
                 model_name, trust_remote_code=True
             )
-        except Exception:  # nosec B110
+        except Exception as exc:  # pylint: disable=broad-exception-caught,unused-variable # nosec B110 # noqa F841
             pass
         # fmt: on
 

@@ -95,10 +95,9 @@ def do_cli(
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
     """
+    # pylint: disable=duplicate-code
     os.environ["AXOLOTL_IS_PREPROCESS"] = "1"
-    is_preprocess = kwargs.pop("is_preprocess", True)
-    parsed_cfg = load_cfg(config, is_preprocess=is_preprocess, **kwargs)
+    parsed_cfg = load_cfg(config, **kwargs)
     parsed_cfg.is_preprocess = True
     parser = transformers.HfArgumentParser(PreprocessCliArgs)
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
@@ -59,7 +59,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
         config: Path to `axolotl` config YAML file.
         kwargs: Additional keyword arguments to override config file values.
     """
+    # pylint: disable=duplicate-code
     parsed_cfg = load_cfg(config, **kwargs)
     parser = HfArgumentParser(TrainerCliArgs)
     parsed_cli_args, _ = parser.parse_args_into_dataclasses(
@@ -65,7 +65,7 @@ def add_options_from_dataclass(config_class: Type[Any]) -> Callable:
     for field in reversed(dataclasses.fields(config_class)):
         field_type = _strip_optional_type(field.type)
 
-        if field_type is bool:
+        if field_type == bool:
             field_name = field.name.replace("_", "-")
             option_name = f"--{field_name}/--no-{field_name}"
             function = click.option(

@@ -103,7 +103,7 @@ def add_options_from_config(config_class: Type[BaseModel]) -> Callable:
     for name, field in reversed(config_class.model_fields.items()):
         field_type = _strip_optional_type(field.annotation)
 
-        if field_type is bool:
+        if field_type == bool:
             field_name = name.replace("_", "-")
             option_name = f"--{field_name}/--no-{field_name}"
             function = click.option(
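On this side of the compare, the bool check is written `field_type == bool` rather than the base branch's `field_type is bool`. For plain classes both return the same result, but the identity form skips `__eq__` entirely and is what flake8's E721 check recommends. A tiny sketch:

```python
field_type = bool

print(field_type is bool)  # True: exact class identity
print(field_type == bool)  # True here too, but routed through __eq__, which
                           # a metaclass could override; flake8 E721 flags it
print(field_type is int)   # False
```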
@@ -3,12 +3,11 @@
 import random
 from copy import deepcopy
 from itertools import product
-from typing import Any
 
 
 def generate_sweep_configs(
     base_config: dict[str, list], sweeps_config: dict[str, list]
-) -> list[dict[str, Any]]:
+) -> list[dict[str, list]]:
     """
     Recursively generates all possible configurations by applying sweeps to the base config.
 

@@ -49,10 +48,7 @@ def generate_sweep_configs(
             new_config = {}
             # new_config = deepcopy(base_config)
             # Combine regular parameters with paired parameters
-            full_combo = {
-                **dict(zip(param_names, reg_combo, strict=False)),
-                **paired_set,
-            }
+            full_combo = {**dict(zip(param_names, reg_combo)), **paired_set}
             for param_name, param_value in full_combo.items():
                 new_config[param_name] = param_value
             print(new_config)

@@ -61,7 +57,7 @@ def generate_sweep_configs(
         # If no paired values, just use regular combinations
         # new_config = deepcopy(base_config)
         new_config = {}
-        for param_name, param_value in zip(param_names, reg_combo, strict=False):
+        for param_name, param_value in zip(param_names, reg_combo):
             new_config[param_name] = param_value
         print(new_config)
         all_combinations.append(new_config)
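The sweep generator's core move, visible in the hunks above, is a Cartesian product over sweep values merged into a config dict. A condensed, self-contained sketch of the idea (the real function also handles paired parameter sets):

```python
from itertools import product

base_config = {"learning_rate": 1e-5, "micro_batch_size": 4}
sweeps_config = {"learning_rate": [1e-5, 3e-5], "lora_r": [16, 32]}

param_names = list(sweeps_config)
all_combinations = [
    # dict merge: later keys win, so sweep values override the base config
    {**base_config, **dict(zip(param_names, combo))}
    for combo in product(*sweeps_config.values())
]
print(len(all_combinations))  # 4 = 2 learning rates x 2 lora_r values
```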
@@ -4,7 +4,6 @@ import os
 import subprocess  # nosec
 import sys
 import tempfile
-from pathlib import Path
 from typing import Any, Iterator, Literal
 
 import yaml

@@ -89,12 +88,8 @@ def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str,
     # Generate all possible configurations
     permutations = generate_sweep_configs(base_config, sweep_config)
     is_group = len(permutations) > 1
-    base_output_dir = base_config.get("output_dir", "./model-out")
-    for idx, permutation in enumerate(permutations, start=1):
-        permutation_dir = Path(permutation.get("output_dir", base_output_dir))
-        permutation_id = f"sweep{idx:04d}"
-        permutation["output_dir"] = str(permutation_dir / permutation_id)
-
+    for permutation in permutations:
+        # pylint: disable=consider-using-with
         temp_file = tempfile.NamedTemporaryFile(
             mode="w",
             suffix=".yaml",
@@ -39,7 +39,7 @@ def do_vllm_serve(
     model = cfg.base_model
 
     serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
-    vllm_serve_main = __import__(serve_module, fromlist=["main"]).main
+    vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
     tensor_parallel_size = 1
     data_parallel_size = 1
 

@@ -68,6 +68,7 @@ def do_vllm_serve(
         cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False
     )
 
+    # pylint: disable=unexpected-keyword-arg
     vllm_script_args = AxolotlScriptArguments(
         model=model,
         tensor_parallel_size=tensor_parallel_size,
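The two spellings in the first hunk are equivalent ways to pull `main` out of a module named at runtime; `importlib.import_module` is the commonly recommended alternative because it returns the leaf module directly. A sketch (assumes `trl` is installed):

```python
import importlib

serve_module = "trl.scripts.vllm_serve"  # the default used above

# import_module returns the trl.scripts.vllm_serve module itself, so no
# fromlist trick is needed to reach attributes on the leaf module.
vllm_serve_main = getattr(importlib.import_module(serve_module), "main")
```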
@@ -6,7 +6,7 @@ from dataclasses import dataclass
 
 from datasets import Dataset
 
-import axolotl.monkeypatch.data.batch_dataset_fetcher  # noqa: F401
+import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import # noqa: F401
 from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
 from axolotl.loaders import load_processor, load_tokenizer
 from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
@@ -67,7 +67,9 @@ class JsonToJsonlConverter:
         self.json_parser = json_parser
         self.jsonl_serializer = jsonl_serializer
 
-    def convert(self, input_file_path, output_file_path):
+    def convert(
+        self, input_file_path, output_file_path
+    ):  # pylint: disable=unused-argument
         content = self.file_reader.read(input_file_path)
         data = self.json_parser.parse(content)
         # data = [r for r in data if r["conversations"]]  # vicuna cleaned has rows with empty conversations
@@ -84,7 +84,9 @@ def create_causal_mask(
     batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
     if attention_mask is not None:
 
-        def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
+        def causal_doc_mask_mod(
+            batch_idx, head_idx, q_idx, kv_idx
+        ):  # pylint: disable=unused-argument
             """
             Defines the logic of a block causal mask by combining both a standard causal mask
             and a block diagonal document mask.

@@ -101,7 +103,9 @@ def create_causal_mask(
         mask_factory_function = causal_doc_mask_mod
     else:
         mask_factory_function = causal_mask_function
-    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation]
+    mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[
+        config._attn_implementation  # pylint: disable=protected-access
+    ]
 
     # Do not allow skip if we are compiling (this is to match BC)
     allow_is_causal_skip = (
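For context on `causal_doc_mask_mod`: mask functions in this style receive `(batch_idx, head_idx, q_idx, kv_idx)` indices and return whether that query/key pair may attend. An illustrative block-causal document mask, assuming a hypothetical `document_ids` tensor that maps each packed position to its source sample:

```python
import torch

# Hypothetical packed sequence: three documents of lengths 3, 2, and 1.
document_ids = torch.tensor([0, 0, 0, 1, 1, 2])


def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
    causal = q_idx >= kv_idx  # standard causal constraint
    same_doc = document_ids[q_idx] == document_ids[kv_idx]  # block-diagonal
    return causal & same_doc
```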
@@ -44,7 +44,7 @@ from axolotl.utils.schemas.enums import CustomSupportedOptimizers
 LOG = logging.getLogger(__name__)
 
 with suppress(ImportError):
-    import torch._dynamo
+    import torch._dynamo  # pylint: disable=ungrouped-imports
 
 
 class TrainerBuilderBase(abc.ABC):

@@ -260,14 +260,14 @@ class TrainerBuilderBase(abc.ABC):
             adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
 
         if self.cfg.optimizer == "muon":
-            from axolotl.contribs.mit.muon import (
+            from axolotl.contribs.mit.muon import (  # pylint: disable=no-name-in-module
                 MuonOptimizerFactory,
             )
 
             optimizer_cls = MuonOptimizerFactory
             optimizer_kwargs.update(adam_kwargs)
         elif self.cfg.optimizer == "dion":
-            from axolotl.contribs.mit.dion import (
+            from axolotl.contribs.mit.dion import (  # pylint: disable=no-name-in-module
                 DionOptimizerFactory,
             )
 

@@ -414,8 +414,12 @@ class TrainerBuilderBase(abc.ABC):
 
     def _configure_torch_compile(self, training_args_kwargs: dict):
         if self.cfg.torch_compile and getattr(torch, "_dynamo", None):
-            torch._dynamo.config.suppress_errors = True
-            torch._dynamo.config.accumulated_cache_size_limit = 256
+            torch._dynamo.config.suppress_errors = (  # pylint: disable=protected-access
+                True
+            )
+            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
+                256
+            )
         training_args_kwargs["torch_compile"] = self.cfg.torch_compile
         if self.cfg.torch_compile_backend:
             training_args_kwargs["torch_compile_backend"] = (
@@ -10,6 +10,7 @@ import transformers
 from transformers import (
     DataCollatorWithFlattening,
     EarlyStoppingCallback,
+    Trainer,
 )
 from trl.trainer.utils import RewardDataCollatorWithPadding
 

@@ -344,14 +345,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             training_args_cls = AxolotlPRMConfig
         else:
             training_args_cls = AxolotlTrainingArguments
-        training_args = training_args_cls(
+        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
            **training_arguments_kwargs,
         )
         training_args = self.hook_post_create_training_args(training_args)
 
         # unset run_name so wandb sets up experiment names
         if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = None
+            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
+                None
+            )
 
         data_collator_kwargs = {
             "padding": True,  # True/"longest" is the default

@@ -383,10 +386,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             **data_collator_kwargs,
         )
         sig = inspect.signature(trainer_cls)
-        if "processing_class" in sig.parameters:
+        if "processing_class" in sig.parameters or issubclass(trainer_cls, Trainer):
             trainer_kwargs["processing_class"] = self.tokenizer
         elif "tokenizer" in sig.parameters:
             trainer_kwargs["tokenizer"] = self.tokenizer
 
         if (
             trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
             and self.cfg.datasets is not None
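The `processing_class`/`tokenizer` branching exists because newer `transformers` releases renamed the `Trainer` constructor's `tokenizer` argument to `processing_class`. A sketch of the signature-sniffing pattern (a hypothetical helper, not the repo's exact code):

```python
import inspect


def tokenizer_kwarg_for(trainer_cls, tokenizer) -> dict:
    # inspect.signature on a class resolves to its __init__, so this works
    # for any Trainer subclass regardless of the transformers version.
    sig = inspect.signature(trainer_cls)
    if "processing_class" in sig.parameters:
        return {"processing_class": tokenizer}
    if "tokenizer" in sig.parameters:
        return {"tokenizer": tokenizer}
    return {}
```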
@@ -168,14 +168,16 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         if plugin_training_args:
             training_args_kwargs.update(plugin_training_args)
 
-        training_args = training_args_cls(
+        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
             logging_first_step=True,
             **training_args_kwargs,
         )
 
         # unset run_name so wandb sets up experiment names
         if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
-            training_args.run_name = None
+            training_args.run_name = (  # pylint: disable=attribute-defined-outside-init
+                None
+            )
 
         return training_args, trainer_kwargs
 
@@ -10,7 +10,7 @@ from .shared import wrap_tools
 
 def format_message(
     message: Messages,
-    message_index: Optional[int] = None,
+    message_index: Optional[int] = None,  # pylint: disable=unused-argument
 ) -> Messages:
     if message.is_chat_formatted:
         return message
@@ -15,11 +15,11 @@ class MessageRoles(str, Enum):
     Message roles for the system, user, assistant, and tools
     """
 
-    system = "system"
-    user = "user"
-    assistant = "assistant"
-    tool = "tool"
-    ipython = (
+    system = "system"  # pylint: disable=invalid-name
+    user = "user"  # pylint: disable=invalid-name
+    assistant = "assistant"  # pylint: disable=invalid-name
+    tool = "tool"  # pylint: disable=invalid-name
+    ipython = (  # pylint: disable=invalid-name
         # for responses from builtin tools
         "ipython"
     )

@@ -30,12 +30,12 @@ class MessageContentTypes(str, Enum):
     Message content types for text, image, audio, tool calls, and tool responses
     """
 
-    special_token = "special_token"  # nosec B105
-    text = "text"
-    image = "image"
-    audio = "audio"
-    tool_call = "tool_call"
-    tool_response = "tool_response"
+    special_token = "special_token"  # pylint: disable=invalid-name # nosec B105
+    text = "text"  # pylint: disable=invalid-name
+    image = "image"  # pylint: disable=invalid-name
+    audio = "audio"  # pylint: disable=invalid-name
+    tool_call = "tool_call"  # pylint: disable=invalid-name # to differentiate regular responses from tool calls from the assistant
+    tool_response = "tool_response"  # pylint: disable=invalid-name
 
 
 class SpecialToken(str, Enum):

@@ -43,8 +43,8 @@ class SpecialToken(str, Enum):
     Special tokens for beginning of string and end of string
     """
 
-    bos_token = "bos_token"  # nosec B105
-    eos_token = "eos_token"  # nosec B105
+    bos_token = "bos_token"  # pylint: disable=invalid-name # nosec B105
+    eos_token = "eos_token"  # pylint: disable=invalid-name # nosec B105
 
 
 class ToolCallFunction(BaseModel):

@@ -73,7 +73,7 @@ class ToolCallContents(BaseModel):
 
     name: str
     arguments: dict[str, Union[str, int]]
-    id: Optional[str] = None
+    id: Optional[str] = None  # pylint: disable=invalid-name
 
     def __str__(self) -> str:
         data = {"name": self.name, "arguments": self.arguments}

@@ -89,7 +89,7 @@ class ToolResponseContents(BaseModel):
 
     name: str
     content: Union[str, dict[str, Union[str, int, float]]]
-    id: Optional[str] = None
+    id: Optional[str] = None  # pylint: disable=invalid-name
 
     def __str__(self) -> str:
         data = {"name": self.name, "content": self.content}
@@ -1,17 +1,23 @@
 """
-This module contains a function that builds a transform that takes a row from the
-dataset and converts it to a Chat.
+This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
 """

-from typing import Any, Mapping
+from typing import Any, Mapping, Union


-def chat_message_transform_builder(
+def chat_message_transform_builder(  # pylint: disable=dangerous-default-value
     train_on_inputs=False,
     conversations_field: str = "conversations",
-    message_field_role: str | list[str] | None = None,  # commonly "role"
-    message_field_content: str | list[str] | None = None,  # commonly "content"
-    message_field_training: str | list[str] | None = None,  # commonly "weight"
+    message_field_role: Union[str, list[str]] = ["role", "from"],  # commonly "role"
+    message_field_content: Union[str, list[str]] = [
+        "value",
+        "text",
+        "content",
+    ],  # commonly "content"
+    message_field_training: Union[str, list[str]] = [
+        "train",
+        "weight",
+    ],  # commonly "weight"
 ):
     """Builds a transform that takes a row from the dataset and converts it to a Chat

@@ -33,12 +39,6 @@ def chat_message_transform_builder(
         A function that takes a list of conversations and returns a list of messages.
     """

-    if message_field_training is None:
-        message_field_training = ["train", "weight"]
-    if message_field_content is None:
-        message_field_content = ["value", "text", "content"]
-    if message_field_role is None:
-        message_field_role = ["role", "from"]
     message_field_role = (
         [message_field_role]
         if isinstance(message_field_role, str)
@@ -1,5 +1,6 @@
 """Init for axolotl.core.trainers"""

+# pylint: disable=unused-import
 # flake8: noqa

 from .base import AxolotlTrainer
@@ -1,5 +1,7 @@
 """Module for customized trainers"""

+# pylint: disable=too-many-lines
+
 from __future__ import annotations

 import os
@@ -80,7 +82,9 @@ class AxolotlTrainer(
         super().__init__(*_args, **kwargs)

         self.train_data_collator = self.data_collator
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+        self._stored_metrics = defaultdict(
+            lambda: defaultdict(lambda: {"values": [], "reduction": "mean"})
+        )
         if self.args.orpo_alpha:
             self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

@@ -283,9 +287,9 @@ class AxolotlTrainer(
         # fmt: off
         if dataloader_key is not None and self.args.dataloader_persistent_workers:
             if hasattr(self, "_eval_dataloaders"):
-                self._eval_dataloaders[dataloader_key] = dataloader  # type: ignore
+                self._eval_dataloaders[dataloader_key] = dataloader  # type: ignore # pylint: disable=access-member-before-definition
             else:
-                self._eval_dataloaders = {dataloader_key: dataloader}
+                self._eval_dataloaders = {dataloader_key: dataloader}  # pylint: disable=attribute-defined-outside-init
         # fmt: on

         return self.accelerator.prepare(dataloader)
@@ -441,7 +445,7 @@ class AxolotlTrainer(
         model,
         inputs,
         return_outputs=False,
-        num_items_in_batch=None,
+        num_items_in_batch=None,  # pylint: disable=unused-argument
     ):
         concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
             inputs,
@@ -522,7 +526,9 @@ class AxolotlTrainer(
         accelerator_config = self.args.accelerator_config.to_dict()
         use_configured_state = accelerator_config.get("use_configured_state", False)
         if not use_configured_state:
-            AcceleratorState._reset_state(reset_partial_state=True)
+            AcceleratorState._reset_state(  # pylint: disable=protected-access
+                reset_partial_state=True
+            )

         super().create_accelerator_and_postprocess()

@@ -536,6 +542,7 @@ class AxolotlTrainer(
         ):
             self.accelerator.state.fsdp_plugin.limit_all_gathers = True

+    # pylint: disable=unused-argument
     def additional_accelerator_args(
         self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs
     ) -> dict[str, Any]:
@@ -568,9 +575,26 @@ class AxolotlTrainer(
         """
         # logs either has 'loss' or 'eval_loss'
         train_eval = "train" if "loss" in logs else "eval"
-        # Add averaged stored metrics to logs
-        for key, metrics in self._stored_metrics[train_eval].items():
-            logs[key] = torch.tensor(metrics).mean().item()
+        # Add reduced stored metrics to logs
+        for key, metric_data in self._stored_metrics[train_eval].items():
+            values = torch.tensor(metric_data["values"])
+            reduction_type = metric_data["reduction"]
+
+            if reduction_type == "mean":
+                logs[key] = values.mean().item()
+            elif reduction_type == "min":
+                logs[key] = values.min().item()
+            elif reduction_type == "max":
+                logs[key] = values.max().item()
+            elif reduction_type == "sum":
+                logs[key] = values.sum().item()
+            else:
+                raise NotImplementedError(
+                    "Metric reduction must be one of [mean, min, max, sum]"
+                )
+
+            logs[key] = round(logs[key], 4)
+
         if is_main_process():
             # Add memory usage
@@ -587,10 +611,27 @@ class AxolotlTrainer(
         return super().log(logs, start_time)

     def store_metrics(
-        self, metrics: dict[str, float], train_eval: Literal["train", "eval"] = "train"
+        self,
+        metrics: dict[str, float] | dict[str, tuple[int | float, str]],
+        train_eval: Literal["train", "eval"] = "train",
+        reduction: Literal["mean", "min", "max", "sum"] = "mean",
     ) -> None:
+        """
+        Store metrics with specified reduction type.
+
+        Args:
+            metrics: Dictionary of metric names to values, or metric names to (value,
+                reduction_type) tuples.
+            train_eval: Whether this is for training or evaluation.
+        """
         for key, value in metrics.items():
-            self._stored_metrics[train_eval][key].append(value)
+            if isinstance(value, tuple):
+                metric_value, metric_reduction = value
+            else:
+                metric_value, metric_reduction = value, reduction
+
+            self._stored_metrics[train_eval][key]["values"].append(metric_value)
+            self._stored_metrics[train_eval][key]["reduction"] = metric_reduction

     def _save_checkpoint(self, model, trial, **kwargs):
        # make sure the checkpoint dir exists, since trainer is flakey
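The two hunks above change the metric buffer contract: values accumulate under a per-key reduction instead of always being averaged. A small usage sketch (metric names, values, and the `trainer` instance are illustrative, not from this diff):

```python
# Hypothetical usage of the reworked store_metrics API shown above.
# Plain values use the default reduction; (value, reduction) tuples override it per key.
trainer.store_metrics({"rewards/margin": 0.42}, train_eval="train")
trainer.store_metrics({"rewards/margin": 0.58}, train_eval="train")
trainer.store_metrics({"completions/max_len": (512, "max")}, train_eval="train")

# At the next log step, log() reduces the buffered values:
#   rewards/margin      -> mean([0.42, 0.58]) = 0.5
#   completions/max_len -> max([512])         = 512
```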
@@ -101,11 +101,11 @@ class AxolotlDPOTrainer(
     ) -> dict[str, torch.Tensor]:
         if self.args.dpo_norm_loss:
             # fmt: off
-            loss_type: str = self.loss_type  # type: ignore[has-type]
+            loss_type: str = self.loss_type  # type: ignore[has-type] # pylint: disable=access-member-before-definition
             # fmt: on
             # concatenated_forward handles avg token logprob for ipo case already
-            self.loss_type = "ipo"
+            self.loss_type = "ipo"  # pylint: disable=attribute-defined-outside-init
             res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
-            self.loss_type = loss_type
+            self.loss_type = loss_type  # pylint: disable=attribute-defined-outside-init
             return res
         return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
@@ -128,7 +128,9 @@ class GRPOStrategy:
         return grpo_args_kwargs

     @classmethod
-    def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
+    def set_trainer_args(
+        cls, cfg: DictDefault
+    ) -> list[Any]:  # pylint: disable=unused-argument
         trainer_args = []
         if cfg.trl and cfg.trl.reward_funcs:
             reward_funcs = []
@@ -149,7 +151,7 @@ class GRPOStrategy:
         return trainer_kwargs

     @classmethod
-    def get_collator(cls, *args, **kwargs):
+    def get_collator(cls, *args, **kwargs):  # pylint: disable=unused-argument
         # No data collation is needed in GRPO, handled by trl's trainer __init__
         return None

@@ -1,5 +1,7 @@
 """Axolotl GRPO trainers (with and without sequence parallelism handling)"""

+# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
+
 import warnings
 from functools import partial
 from typing import Any
@@ -50,6 +52,7 @@ from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, Optimizer
 from axolotl.monkeypatch.ring_attn import get_ring_attn_group

 if is_peft_available():
+    # pylint: disable=unused-import
     from peft import PeftConfig


@@ -250,7 +253,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
     def get_train_dataloader(self) -> DataLoader:
         """Get dataloader for training"""
         train_dataset = self.train_dataset
+        # pylint: disable=access-member-before-definition
         data_collator = self.data_collator  # type: ignore

         # Handle dataset preprocessing
@@ -263,7 +266,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
                 train_dataset, description="training"
             )
         else:
-            self.data_collator = self._get_collator_with_removed_columns(
+            self.data_collator = self._get_collator_with_removed_columns(  # pylint: disable=attribute-defined-outside-init
                 data_collator,
                 description="training",
             )
@@ -305,10 +308,10 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
         # Generate completions using either vLLM or regular generation
         if self.args.use_vllm:
             # First, have main process load weights if needed
+            # pylint: disable=access-member-before-definition
             if self.state.global_step != self._last_loaded_step:  # type: ignore[has-type]
                 self._move_model_to_vllm()
+                # pylint: disable=attribute-defined-outside-init
                 self._last_loaded_step = self.state.global_step

             # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
@@ -330,9 +333,8 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
             # Extract prompts from this SP group, accounting for num_generations duplicates
             # We only need prompts from one rank in each SP group (see the sketch after this file's hunks)
             group_prompts = all_prompts_text[
-                group_leader_rank * len(prompts_text) : (
-                    group_leader_rank + 1
-                )
+                group_leader_rank
+                * len(prompts_text) : (group_leader_rank + 1)
                 * len(prompts_text) : self.num_generations
             ]

@@ -483,7 +485,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
             )
             if is_conversational(inputs[0]):
                 completions = []
-                for prompt, completion in zip(prompts, completions_text, strict=False):
+                for prompt, completion in zip(prompts, completions_text):
                     bootstrap = (
                         prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
                     )
@@ -501,7 +503,6 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
                 self.reward_funcs,
                 self.reward_processing_classes,
                 self.reward_func_names,
-                strict=False,
             )
         ):
             with profiling_context(self, reward_func_name):
@@ -510,17 +511,14 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
             ):  # Module instead of PretrainedModel for compat with compiled models
                 if is_conversational(inputs[0]):
                     messages = [
-                        {"messages": p + c}
-                        for p, c in zip(prompts, completions, strict=False)
+                        {"messages": p + c} for p, c in zip(prompts, completions)
                     ]
                     texts = [
                         apply_chat_template(x, reward_processing_class)["text"]
                         for x in messages
                     ]
                 else:
-                    texts = [
-                        p + c for p, c in zip(prompts, completions, strict=False)
-                    ]
+                    texts = [p + c for p, c in zip(prompts, completions)]
                 reward_inputs = reward_processing_class(
                     text=texts,
                     return_tensors="pt",
@@ -566,8 +564,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
                 row_reward_kwargs["completion"] = completions[nan_row_idx]
                 warnings.warn(
                     f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
-                    "Please ensure that at least one reward function returns a valid reward.",
-                    stacklevel=2,
+                    "Please ensure that at least one reward function returns a valid reward."
                 )

             # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
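The reshaped `group_prompts` slice above is easier to follow with concrete numbers; this standalone sketch uses illustrative values (not from the PR) to show which prompts a sequence-parallel group keeps:

```python
# Each rank holds its prompts duplicated num_generations times; after gathering,
# the slice takes the group leader's block and strides past the duplicates.
prompts_text = ["p2", "p2", "p3", "p3"]       # this rank's prompts, num_generations=2
all_prompts_text = ["p0", "p0", "p1", "p1",   # gathered from rank 0
                    "p2", "p2", "p3", "p3"]   # gathered from rank 1 (the leader here)
group_leader_rank = 1
num_generations = 2

group_prompts = all_prompts_text[
    group_leader_rank
    * len(prompts_text) : (group_leader_rank + 1)
    * len(prompts_text) : num_generations
]
assert group_prompts == ["p2", "p3"]  # leader's block, de-duplicated
```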
@@ -5,6 +5,7 @@ import torch
 from axolotl.core.trainers.base import AxolotlTrainer


+# pylint: disable=too-many-ancestors
 class AxolotlMambaTrainer(AxolotlTrainer):
     """Mamba specific trainer to handle loss calculation"""

@@ -14,8 +15,8 @@ class AxolotlMambaTrainer(AxolotlTrainer):
         self,
         model,
         inputs,
-        return_outputs=False,
-        num_items_in_batch=None,
+        return_outputs=False,  # pylint: disable=unused-argument
+        num_items_in_batch=None,  # pylint: disable=unused-argument
     ):
         input_ids = inputs.pop("input_ids")
         lm_logits = model(input_ids).logits
@@ -1,5 +1,6 @@
 """Init for axolotl.core.trainers.mixins"""

+# pylint: disable=unused-import
 # flake8: noqa

 from .activation_checkpointing import ActivationOffloadingMixin
@@ -92,7 +92,7 @@ def get_lora_act_offloading_ctx_manager(
         `contextlib.ContextDecorator`:
             Activation offloading context manager for the model.
     """
+    # pylint: disable=unnecessary-dunder-call
     activations_handling_ctx = OffloadActivations(
         use_pin_memory=use_pin_memory,
         use_streams=use_streams,
@@ -26,6 +26,7 @@ class DistributedParallelMixin(Trainer):
             self.accelerator.distributed_type == "FSDP"
             and self.accelerator.state.fsdp_plugin is None
         ):
+            # pylint: disable=protected-access
             # handle Context Parallelism without FSDP
             self.accelerator.state.distributed_type = "MULTI_GPU"
             self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU"
@@ -70,11 +70,11 @@ class OptimizerMixin(Trainer):
             }
         )
         if params["embeddings"]:
-            lr = optimizer_kwargs["lr"]
+            lr = optimizer_kwargs["lr"]  # pylint: disable=invalid-name
             if self.args.embedding_lr_scale:
-                lr *= self.args.embedding_lr_scale
+                lr *= self.args.embedding_lr_scale  # pylint: disable=invalid-name
             elif self.args.embedding_lr:
-                lr = self.args.embedding_lr
+                lr = self.args.embedding_lr  # pylint: disable=invalid-name
             optimizer_grouped_parameters.append(
                 {
                     "params": list(params["embeddings"].values()),
@@ -143,7 +143,7 @@ class OptimizerMixin(Trainer):
             loraplus_lr_embedding = getattr(
                 self.args, "loraplus_lr_embedding", 1e-6
             )
-            self.optimizer = create_loraplus_optimizer(
+            self.optimizer = create_loraplus_optimizer(  # pylint: disable=attribute-defined-outside-init
                 opt_model,
                 optimizer_cls,
                 loraplus_lr_ratio=loraplus_lr_ratio,
@@ -193,7 +193,9 @@ class OptimizerMixin(Trainer):
         LOG.info(f"skipped: {skipped/2**20}M params")

         if is_sagemaker_mp_enabled():
-            self.optimizer = smp.DistributedOptimizer(self.optimizer)
+            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
+                self.optimizer
+            )

         return self.optimizer

@@ -46,7 +46,7 @@ class SchedulerMixin(Trainer):
         )

         # fmt: off
-        if self.lr_scheduler is None:  # type: ignore
+        if self.lr_scheduler is None:  # type: ignore # pylint: disable=access-member-before-definition
             # fmt: on
             plugin_manager = PluginManager.get_instance()
             lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler(
@@ -90,7 +90,7 @@ class SchedulerMixin(Trainer):
                 LOG.warning(
                     "Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")

-                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(
+                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
@@ -98,7 +98,7 @@ class SchedulerMixin(Trainer):
             elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
                 assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                 assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
-                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
+                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
@@ -107,7 +107,7 @@ class SchedulerMixin(Trainer):
                 )
             elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
                 assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
-                self.lr_scheduler = get_cosine_schedule_with_min_lr(
+                self.lr_scheduler = get_cosine_schedule_with_min_lr(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
@@ -133,7 +133,7 @@ class SchedulerMixin(Trainer):
             )
         if not self.lr_scheduler:
             super().create_scheduler(num_training_steps, optimizer)
-            self.lr_scheduler = JaggedLRRestartScheduler(
+            self.lr_scheduler = JaggedLRRestartScheduler(  # pylint: disable=attribute-defined-outside-init
                 optimizer,
                 self.lr_scheduler,
                 self.args.jagged_restart_steps,
@@ -14,6 +14,7 @@ class AxolotlTrainingMixins:
     Mixin class for the Axolotl training args.
     """

+    # pylint: disable=duplicate-code
     model_type: Optional[str] = field(
         default=None, metadata={"help": "HF model configuration model_type."}
     )
@@ -26,7 +26,7 @@ class TokenizedPromptDataset(Dataset):
         keep_in_memory: Whether to keep the tokenized dataset in memory.
     """

-    def __init__(
+    def __init__(  # pylint: disable=super-init-not-called
         self,
         prompt_tokenizer: PromptTokenizingStrategy,
         dataset: Dataset,
@@ -99,7 +99,7 @@ class ConstantLengthDataset(IterableDataset):
         seq_length: Length of token sequences to return.
     """

-    def __init__(
+    def __init__(  # pylint: disable=super-init-not-called
         self,
         tokenizer,
         datasets,
@@ -79,7 +79,7 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
     model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)

     # Get datasets
+    # pylint: disable=duplicate-code
     train_dataset = dataset_meta.train_dataset
     eval_dataset = dataset_meta.eval_dataset
     total_num_steps = dataset_meta.total_num_steps
@@ -76,7 +76,7 @@ class BasePlugin:
     def __init__(self):
         """Initializes the BasePlugin."""

-    def register(self, cfg: dict):
+    def register(self, cfg: dict):  # pylint: disable=unused-argument
         """Registers the plugin with the given configuration as an unparsed dict.

         Args:
@@ -104,13 +104,14 @@ class BasePlugin:
             dataset_meta: The metadata for the training dataset.
         """

-    def pre_model_load(self, cfg: DictDefault):
+    def pre_model_load(self, cfg: DictDefault):  # pylint: disable=unused-argument
         """Performs actions before the model is loaded.

         Args:
             cfg: The configuration for the plugin.
         """

+    # pylint: disable=unused-argument
     def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
         """Performs actions after the model is built/loaded, but before any adapters are applied.

@@ -118,6 +119,7 @@ class BasePlugin:
             cfg: The configuration for the plugin.
         """

+    # pylint: disable=unused-argument
     def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
         """Performs actions before LoRA weights are loaded.

@@ -126,6 +128,7 @@ class BasePlugin:
             model: The loaded model.
         """

+    # pylint: disable=unused-argument
     def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
         """Performs actions after LoRA weights are loaded.

@@ -134,6 +137,7 @@ class BasePlugin:
             model: The loaded model.
         """

+    # pylint: disable=unused-argument
     def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
         """Performs actions after the model is loaded.

@@ -142,7 +146,8 @@ class BasePlugin:
             model: The loaded model.
         """

-    def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
+    # pylint: disable=unused-argument
+    def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
         """Returns a custom class for the trainer.

         Args:
@@ -152,6 +157,7 @@ class BasePlugin:
             The first non-`None` trainer class returned by a plugin.
         """

+    # pylint: disable=unused-argument
     def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
         """Performs actions after the trainer is created.

@@ -160,7 +166,7 @@ class BasePlugin:
             trainer: The trainer object for training.
         """

-    def get_training_args(self, cfg: DictDefault):
+    def get_training_args(self, cfg: DictDefault):  # pylint: disable=unused-argument
         """
         Returns custom training arguments to set on TrainingArgs.

@@ -171,7 +177,9 @@ class BasePlugin:
             object: dict containing the training arguments.
         """

-    def get_collator_cls_and_kwargs(self, cfg: DictDefault, is_eval: bool = False):
+    def get_collator_cls_and_kwargs(
+        self, cfg: DictDefault, is_eval: bool = False
+    ):  # pylint: disable=unused-argument
         """
         Returns a custom class for the collator.

@@ -183,6 +191,7 @@ class BasePlugin:
             class: The class for the collator.
         """

+    # pylint: disable=unused-argument
     def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
         """Creates and returns an optimizer for training.

@@ -194,6 +203,7 @@ class BasePlugin:
             The created optimizer.
         """

+    # pylint: disable=unused-argument
     def create_lr_scheduler(
         self,
         cfg: DictDefault,
@@ -213,6 +223,7 @@ class BasePlugin:
             The created learning rate scheduler.
         """

+    # pylint: disable=unused-argument
     def add_callbacks_pre_trainer(
         self, cfg: DictDefault, model: PreTrainedModel
     ) -> list[Callable]:
@@ -227,6 +238,7 @@ class BasePlugin:
         """
         return []

+    # pylint: disable=unused-argument
     def add_callbacks_post_trainer(
         self, cfg: DictDefault, trainer: Trainer
     ) -> list[Callable]:
@@ -242,6 +254,7 @@ class BasePlugin:
         """
         return []

+    # pylint: disable=unused-argument
     def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
         """Performs actions after training is complete.

@@ -250,7 +263,7 @@ class BasePlugin:
             model: The loaded model.
         """

-    def post_train_unload(self, cfg: DictDefault):
+    def post_train_unload(self, cfg: DictDefault):  # pylint: disable=unused-argument
         """Performs actions after training is complete and the model is unloaded.

         Args:
@@ -298,7 +311,7 @@ def load_plugin(plugin_name: str) -> BasePlugin:
     return plugin


-class PluginManager:
+class PluginManager:  # pylint: disable=too-many-public-methods
     """The `PluginManager` class is responsible for loading and managing plugins. It
     should be a singleton so it can be accessed from anywhere in the codebase.

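Since the hunks above touch nearly every lifecycle hook on `BasePlugin`, a minimal plugin sketch may help orient readers. The class below is illustrative only (it is not part of this PR) and assumes the usual `axolotl.integrations.base` import path; it overrides only hooks visible in the diff:

```python
# Hypothetical no-op plugin wired through the BasePlugin hooks shown above.
from axolotl.integrations.base import BasePlugin


class LoggingPlugin(BasePlugin):
    """Illustrative plugin: each overridden hook appears in the hunks above."""

    def register(self, cfg: dict):
        # Called with the raw, unparsed config dict.
        print("plugin registered")

    def post_model_load(self, cfg, model):
        # Runs after the (possibly PEFT-wrapped) model is fully loaded.
        print(f"model loaded: {type(model).__name__}")

    def add_callbacks_post_trainer(self, cfg, trainer):
        # Return extra TrainerCallback instances; none for this sketch.
        return []
```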
@@ -50,9 +50,15 @@ def merge_input_args():
         dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n    pass\n"

         namespace: Dict[Any, Any] = {}
-        exec(dynamic_input, globals(), namespace)  # nosec B102
-        AxolotlInputConfig = namespace["AxolotlInputConfig"]
-        AxolotlConfigWCapabilities = namespace["AxolotlConfigWCapabilities"]
+        exec(  # pylint: disable=exec-used # nosec B102
+            dynamic_input, globals(), namespace
+        )
+        AxolotlInputConfig = namespace[  # pylint: disable=invalid-name
+            "AxolotlInputConfig"
+        ]
+        AxolotlConfigWCapabilities = namespace[  # pylint: disable=invalid-name
+            "AxolotlConfigWCapabilities"
+        ]
         return AxolotlConfigWCapabilities, AxolotlInputConfig
     return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase

@@ -68,7 +74,7 @@ def merge_training_args() -> Type:
     Returns:
         tuple: A tuple containing the newly created classes, AxolotlTrainingMixins.
     """
+    # pylint: disable=duplicate-code
    from axolotl.core.training_args_base import (
        AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
    )
@@ -87,7 +93,11 @@ def merge_training_args() -> Type:

         namespace: Dict[Any, Any] = {}
         local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
-        exec(dynamic_input, {**globals(), **local_vars}, namespace)  # nosec B102
-        AxolotlTrainingMixins = namespace["AxolotlTrainingMixins"]
+        exec(  # pylint: disable=exec-used # nosec B102
+            dynamic_input, {**globals(), **local_vars}, namespace
+        )
+        AxolotlTrainingMixins = namespace[  # pylint: disable=invalid-name
+            "AxolotlTrainingMixins"
+        ]
         return AxolotlTrainingMixins
     return AxolotlTrainingMixinsBase
@@ -18,7 +18,6 @@ Module for the Plugin for Cut Cross Entropy integration with Axolotl.
 Cut Cross Entropy is an optimized implementation of cross entropy loss
 from Apple's ML team.
 """
-
 import importlib
 from functools import partial

@@ -29,7 +28,7 @@ from axolotl.utils import get_pytorch_version
 from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
 from axolotl.utils.logging import get_logger

-from .args import CutCrossEntropyArgs as CutCrossEntropyArgs
+from .args import CutCrossEntropyArgs  # pylint: disable=unused-import  # noqa: F401

 LOG = get_logger(__name__)

@@ -107,7 +106,9 @@ class CutCrossEntropyPlugin(BasePlugin):
         """
         from cut_cross_entropy.transformers.patch import PATCH_FNS

-        def patch_generic(maybe_model, patch_options, model_type: str):
+        def patch_generic(
+            maybe_model, patch_options, model_type: str
+        ):  # pylint: disable=unused-argument
             import cut_cross_entropy.transformers.llama
             from cut_cross_entropy.transformers.llama import cce_forward

@@ -120,10 +121,12 @@ class CutCrossEntropyPlugin(BasePlugin):
                 )
                 model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")

-                cut_cross_entropy.transformers.llama._PATCH_OPTS = patch_options
+                cut_cross_entropy.transformers.llama._PATCH_OPTS = (  # pylint: disable=protected-access
+                    patch_options
+                )

                 model_cls.forward = cce_forward
+                # pylint: disable=duplicate-code
             except (ImportError, AttributeError) as e:
                 raise RuntimeError(
                     f"Could not import ForCausalLM class for model_type: {model_type}. "
@@ -15,7 +15,6 @@
 """
 Module for handling Cut Cross Entropy input arguments.
 """
-
 from typing import Optional

 from pydantic import BaseModel, model_validator
125 src/axolotl/integrations/diffusion/README.md Normal file
@@ -0,0 +1,125 @@
+# Diffusion LM Training Plugin for Axolotl
+
+This plugin enables diffusion language model training using the LLaDA (Large Language
+And Diffusion Assistant) approach within the Axolotl framework.
+
+## Overview
+
+LLaDA is a diffusion-based approach to language model training that uses:
+- **Random token masking** during training instead of next-token prediction
+- **Bidirectional attention** to allow the model to see the full context
+- **Importance weighting** based on masking probabilities for stable training
+
+This approach can lead to more robust language models with better understanding of
+bidirectional context.
+
+## Installation
+
+The plugin is included with Axolotl. To use it, simply add the plugin configuration to
+your training config.
+
+## Quickstart
+
+### Basic Configuration
+
+Add the following to your Axolotl configuration YAML:
+
+```yaml
+# Enable diffusion LM training plugin
+plugins:
+  - axolotl.integrations.diffusion.DiffusionPlugin
+
+# Diffusion-specific configuration
+noise_schedule: linear  # or "cosine"
+min_mask_ratio: 0.1
+max_mask_ratio: 0.9
+num_diffusion_steps: 128
+eps: 1e-3
+importance_weighting: true
+mask_token_id: 128002
+
+# Sample generation (optional)
+generate_samples: true
+generation_interval: 100
+num_generation_samples: 3
+generation_steps: 128
+generation_temperature: 0.0
+generation_max_length: 100
+
+# Model configuration
+base_model: meta-llama/Llama-3.2-1B
+model_type: llama
+
+# Standard Axolotl configuration
+datasets:
+  - path: your_dataset
+    ...
+
+# Other config
+sequence_len: 1024
+micro_batch_size: 8
+gradient_accumulation_steps: 4
+learning_rate: 3e-4
+```
+
+## Supported Models
+
+Any models that support 4D attention masks should work out of the box. If not, please
+create an [issue](https://github.com/axolotl-ai-cloud/axolotl/issues)!
+
+## How It Works
+
+### Random Masking
+During training, tokens are randomly masked based on a sampled timestep:
+- Sample timestep `t` uniformly from [0, 1]
+- Calculate masking probability: `p = (1 - eps) * t + eps`
+- Randomly mask tokens with probability `p`
+
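As an editorial aside (not part of the new README), the masking step above is small enough to sketch end to end. The helper below assumes a Llama-3.2-style `mask_token_id` and is illustrative rather than the plugin's actual implementation:

```python
import torch


def forward_mask(input_ids: torch.Tensor, mask_token_id: int = 128002, eps: float = 1e-3):
    """Illustrative LLaDA-style forward process: mask each token with probability p."""
    batch_size, seq_len = input_ids.shape
    # Sample one timestep per sequence, t ~ U[0, 1]
    t = torch.rand(batch_size, 1)
    # Masking probability p = (1 - eps) * t + eps, so p is never exactly 0
    p_mask = (1 - eps) * t + eps
    # Mask each position independently with probability p
    is_masked = torch.rand(batch_size, seq_len) < p_mask
    noisy_ids = torch.where(is_masked, torch.full_like(input_ids, mask_token_id), input_ids)
    return noisy_ids, is_masked, p_mask
```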
+### Bidirectional Attention
+The plugin uses native 4D attention masks to:
+- Enable bidirectional attention without patches
+- Allow all tokens to attend to all other tokens
+- Maintain proper padding masks
+- Work with modern `transformers` models out of the box
+
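Again as an illustrative aside: a bidirectional mask in the 4D shape modern `transformers` models accept (`batch, 1, query_len, key_len`) can be built from a 2D padding mask roughly like this (a sketch, not the plugin's code):

```python
import torch


def bidirectional_4d_mask(padding_mask: torch.Tensor) -> torch.Tensor:
    """Illustrative: expand a (batch, seq) padding mask into the (batch, 1, seq, seq)
    additive form."""
    # allowed[b, 0, q, k] is True when both query q and key k are real tokens
    allowed = (padding_mask[:, None, :, None] * padding_mask[:, None, None, :]).bool()
    mask = torch.zeros(allowed.shape, dtype=torch.float)
    # Additive convention: 0.0 = attend, -inf = blocked (padding)
    return mask.masked_fill(~allowed, float("-inf"))
```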
+### Diffusion Loss
+
+Loss is computed only on masked tokens with (optional) importance weighting:
+
+```python
+loss = sum(cross_entropy(pred, target) / p_mask) / total_tokens
+```
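Spelled out in PyTorch (again an illustrative sketch reusing the tensors from the masking helper above, not the plugin's code), the importance-weighted objective looks like:

```python
import torch.nn.functional as F


def diffusion_loss(logits, labels, is_masked, p_mask):
    """Illustrative importance-weighted loss over masked positions only."""
    # Per-token cross entropy, reshaped back to (batch, seq)
    ce = F.cross_entropy(
        logits.view(-1, logits.size(-1)), labels.view(-1), reduction="none"
    ).view(labels.shape)
    # Keep only masked positions, weight each by 1 / p_mask (broadcast over seq),
    # then normalize by the total token count as in the README formula
    weighted = (ce * is_masked) / p_mask
    return weighted.sum() / labels.numel()
```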
+
+## Sample Generation
+
+When `generate_samples: true`, the plugin generates samples during training:
+
+```
+Sample 1:
+Original (45 tokens): The quick brown fox jumps over the lazy dog...
+Masked (18/45 tokens, 40.0%): The [MASK] [MASK] fox [MASK] over [MASK] lazy [MASK]...
+Generated: The quick brown fox jumps over the lazy dog...
+```
+
+Samples are logged to console and wandb (if enabled).
+
+## Metrics and Monitoring
+
+The plugin adds several metrics to track diffusion training:
+
+- `train/loss`: Weighted diffusion loss
+- `train/accuracy`: Accuracy on masked tokens
+- `train/mask_ratio`: Average fraction of tokens masked
+- `train/num_masked_tokens`: Number of tokens masked
+- `train/avg_p_mask`: Average masking probability
+- `train/ce_loss`: Unweighted cross-entropy loss
+- `train/importance_weight_avg`: Average importance weight
+
+## Limitations
+
+- No flash attention support
+
+## References
+
+- [LLaDA Paper](https://arxiv.org/abs/2404.10406)
+- [Axolotl Documentation](https://docs.axolotl.ai/)
6 src/axolotl/integrations/diffusion/__init__.py Normal file
@@ -0,0 +1,6 @@
+"""Diffusion LM training plugin init."""
+
+from .args import DiffusionArgs
+from .plugin import DiffusionPlugin
+
+__all__ = ["DiffusionArgs", "DiffusionPlugin"]
70 src/axolotl/integrations/diffusion/args.py Normal file
@@ -0,0 +1,70 @@
+"""Config args for diffusion LM training."""
+
+from typing import Literal
+
+from pydantic import BaseModel, Field
+
+
+class DiffusionArgs(BaseModel):
+    """Arguments for diffusion LM training plugin."""
+
+    # Noise schedule config
+    noise_schedule: Literal["linear", "cosine"] = Field(
+        default="linear", description="Type of noise schedule for diffusion training"
+    )
+    min_mask_ratio: float = Field(
+        default=0.1,
+        ge=0.0,
+        le=1.0,
+        description="Minimum masking ratio for diffusion noise schedule",
+    )
+    max_mask_ratio: float = Field(
+        default=0.9,
+        ge=0.0,
+        le=1.0,
+        description="Maximum masking ratio for diffusion noise schedule",
+    )
+    num_diffusion_steps: int = Field(
+        default=128, ge=1, description="Number of diffusion timesteps"
+    )
+    eps: float = Field(
+        default=1e-3,
+        ge=0.0,
+        le=1.0,
+        description="Epsilon value for minimum masking probability in forward process",
+    )
+
+    # Training config
+    importance_weighting: bool = Field(
+        default=True,
+        description="Apply importance weighting to loss based on masking probability",
+    )
+    mask_token_id: int = Field(
+        default=128002,
+        description=(
+            "Token ID to use for masking. Default is 128002 "
+            "(<|reserved_special_token_0|> for Llama 3.2)"
+        ),
+    )
+
+    # Sample generation config
+    generate_samples: bool = Field(
+        default=True, description="Enable sample generation during training"
+    )
+    generation_interval: int = Field(
+        default=100, ge=1, description="Generate samples every N steps"
+    )
+    num_generation_samples: int = Field(
+        default=3, ge=1, description="Number of samples to generate each time"
+    )
+    generation_steps: int = Field(
+        default=128, ge=1, description="Number of diffusion steps for generation"
+    )
+    generation_temperature: float = Field(
+        default=0.0,
+        ge=0.0,
+        description="Temperature for generation sampling (0.0 = deterministic)",
+    )
+    generation_max_length: int = Field(
+        default=100, ge=1, description="Maximum sequence length for generation"
+    )
113 src/axolotl/integrations/diffusion/callbacks.py Normal file
@@ -0,0 +1,113 @@
+"""Callbacks for diffusion training."""
+
+import wandb
+from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
+from transformers.training_args import TrainingArguments
+
+from axolotl.utils.logging import get_logger
+
+from .generation import generate_samples
+
+LOG = get_logger(__name__)
+
+
+class DiffusionGenerationCallback(TrainerCallback):
+    """Callback for generating samples during diffusion training."""
+
+    def __init__(self, trainer):
+        self.trainer = trainer
+
+    # pylint: disable=unused-argument
+    def on_step_end(
+        self,
+        args: TrainingArguments,
+        state: TrainerState,
+        control: TrainerControl,
+        **kwargs,
+    ):
+        """Generate samples at specified intervals."""
+        if (
+            state.global_step > 0
+            and state.global_step % self.trainer.config.generation_interval == 0
+        ):
+            # Use eval dataloader if available, otherwise use train dataloader
+            if (
+                hasattr(self.trainer, "eval_dataset")
+                and self.trainer.eval_dataset is not None
+            ):
+                dataloader = self.trainer.callback_handler.eval_dataloader
+            else:
+                dataloader = self.trainer.callback_handler.train_dataloader
+
+            # Generate samples
+            samples = generate_samples(
+                model=self.trainer.model,
+                tokenizer=self.trainer.tokenizer,
+                dataloader=dataloader,
+                num_generation_samples=self.trainer.config.num_generation_samples,
+                max_length=self.trainer.config.generation_max_length,
+                num_diffusion_steps=self.trainer.config.generation_steps,
+                temperature=self.trainer.config.generation_temperature,
+                mask_token_id=self.trainer.config.mask_token_id,
+            )
+
+            # Log samples
+            self._log_samples(samples, state.global_step)
+
+    def _log_samples(self, samples: list, step: int):
+        """Log generated samples."""
+        if not samples:
+            return
+
+        LOG.info("=" * 60)
+        LOG.info("GENERATED SAMPLES")
+        LOG.info("=" * 60)
+
+        for i, sample_data in enumerate(samples, 1):
+            original = sample_data["original"]
+            masked = sample_data["masked"]
+            generated = sample_data["generated"]
+            mask_ratio = sample_data["mask_ratio"]
+            masked_tokens = sample_data["masked_tokens"]
+            total_tokens = sample_data["total_tokens"]
+
+            LOG.info(f"\nSample {i}:")
+            LOG.info(f"\tOriginal ({total_tokens} tokens): {original}")
+            LOG.info(
+                f"\tMasked ({masked_tokens}/{total_tokens} tokens, "
+                f"{mask_ratio:.1%}): {masked}"
+            )
+            LOG.info(f"\tGenerated: {generated}")
+
+        LOG.info("=" * 60)
+
+        if self.trainer.config.use_wandb and self.trainer.state.is_world_process_zero:
+            if wandb.run is not None:
+                wandb.log(
+                    {
+                        "generated_samples": wandb.Table(
+                            columns=[
+                                "step",
+                                "original",
+                                "masked",
+                                "generated",
+                                "mask_ratio",
+                                "masked_tokens",
+                                "total_tokens",
+                            ],
+                            data=[
+                                [
+                                    step,
+                                    sample["original"],
+                                    sample["masked"],
+                                    sample["generated"],
+                                    f"{sample['mask_ratio']:.1%}",
+                                    sample["masked_tokens"],
+                                    sample["total_tokens"],
+                                ]
+                                for sample in samples
+                            ],
+                        )
+                    },
+                    step=step,
+                )
269
src/axolotl/integrations/diffusion/generation.py
Normal file
@@ -0,0 +1,269 @@
"""Sample generation utilities for diffusion training."""

import logging
from typing import Any, List, Optional

import torch

logger = logging.getLogger(__name__)


def generate_samples(
    model: torch.nn.Module,
    tokenizer: Any,
    dataloader: Optional[Any] = None,
    num_generation_samples: int = 3,
    max_length: int = 100,
    num_diffusion_steps: int = 128,
    temperature: float = 0.0,
    mask_token_id: int = 32000,
) -> List[dict]:
    """
    Generate text samples using the diffusion model by randomly masking sequences from
    the given dataset and running the reverse diffusion process.

    Args:
        model: The wrapped or unwrapped model
        tokenizer: Tokenizer for encoding/decoding
        dataloader: Validation dataloader (for sampling sequences)
        num_generation_samples: Number of samples to generate
        max_length: Maximum length of sequences to use
        num_diffusion_steps: Number of diffusion steps for generation
        temperature: Temperature for sampling (0.0 = deterministic)
        mask_token_id: Token ID used for masking

    Returns:
        List of dictionaries with original text, masked text, and generated text
    """
    if dataloader is None:
        logger.warning("No validation dataloader provided, cannot generate samples")
        return []

    # Get the actual model (unwrap if needed)
    unwrapped_model = model.module if hasattr(model, "module") else model
    unwrapped_model.eval()
    generations = []

    # Sample sequences from validation dataset
    sampled_sequences = _sample_sequences_from_dataloader(
        dataloader, num_generation_samples, max_length, unwrapped_model.device
    )
    logger.info(f"Sampled {len(sampled_sequences)} sequences from validation dataset")

    # Generate samples using reverse diffusion process
    with torch.no_grad():
        for original_sequence in sampled_sequences:
            generation_result = _generate(
                unwrapped_model,
                tokenizer,
                original_sequence,
                num_diffusion_steps,
                temperature,
                mask_token_id,
            )
            generations.append(generation_result)

    unwrapped_model.train()
    return generations


def _sample_sequences_from_dataloader(
    dataloader: Any, num_samples: int, max_length: int, device: torch.device
) -> List[torch.Tensor]:
    """Sample sequences from validation dataloader."""
    sampled_sequences = []
    sample_count = 0

    # Add randomness by skipping a random number of batches
    skip_batches = torch.randint(0, 6, (1,)).item()
    batch_count = 0

    for batch in dataloader:
        # Skip some batches for variety
        if batch_count < skip_batches:
            batch_count += 1
            continue

        if sample_count >= num_samples:
            break

        batch_count += 1
        input_ids = batch["input_ids"]
        attention_mask = batch.get("attention_mask")

        # Randomly sample from sequences in this batch
        batch_indices = torch.randperm(input_ids.size(0)).tolist()

        for i in batch_indices:
            if sample_count >= num_samples:
                break

            # Get actual sequence length (non-padded)
            if attention_mask is not None:
                seq_len = attention_mask[i].sum().item()
            else:
                seq_len = input_ids.size(1)

            # Limit sequence length to max_length
            actual_length = min(seq_len, max_length)
            if actual_length < 10:  # Skip very short sequences
                continue

            # Extract the sequence
            sequence = input_ids[i][:actual_length].unsqueeze(0).to(device)
            sampled_sequences.append(sequence)
            sample_count += 1

    return sampled_sequences


def _generate(
    model: torch.nn.Module,
    tokenizer: Any,
    original_sequence: torch.Tensor,
    num_diffusion_steps: int,
    temperature: float,
    mask_token_id: int,
) -> dict:
    """Generate a single sample using reverse diffusion."""
    # Get original text for comparison
    original_text = tokenizer.decode(
        original_sequence[0].cpu(), skip_special_tokens=True
    )

    # Apply custom masking with random ratio (10% to 70%)
    total_tokens = original_sequence.size(1)
    min_ratio, max_ratio = 0.1, 0.7
    target_mask_ratio = torch.rand(1).item() * (max_ratio - min_ratio) + min_ratio
    target_masked_tokens = int(total_tokens * target_mask_ratio)

    # Create random mask indices
    mask_positions = torch.randperm(total_tokens)[:target_masked_tokens]
    masked_indices = torch.zeros(
        1, total_tokens, dtype=torch.bool, device=original_sequence.device
    )
    masked_indices[0, mask_positions] = True

    # Create masked sequence
    masked_sequence = original_sequence.clone()
    masked_sequence[masked_indices] = mask_token_id

    # Calculate actual mask ratio
    masked_tokens = masked_indices.sum().item()
    mask_ratio = masked_tokens / total_tokens

    # Get masked text for comparison
    masked_text = tokenizer.decode(masked_sequence[0].cpu(), skip_special_tokens=False)
    # Clean up mask token representation
    masked_text = _clean_masked_text(masked_text, tokenizer, mask_token_id)

    # Run reverse diffusion process
    sequence = masked_sequence.clone()
    for step in range(num_diffusion_steps):
        sequence = _diffusion_step(
            model, sequence, step, num_diffusion_steps, temperature, mask_token_id
        )

    # Get final generated text
    generated_text = tokenizer.decode(sequence[0].cpu(), skip_special_tokens=True)

    return {
        "original": original_text,
        "masked": masked_text,
        "generated": generated_text,
        "mask_ratio": mask_ratio,
        "masked_tokens": masked_tokens,
        "total_tokens": total_tokens,
        "formatted": (
            f"Original: '{original_text}' → Masked: '{masked_text}' "
            f"({mask_ratio:.1%}) → Generated: '{generated_text}'"
        ),
    }


def _clean_masked_text(masked_text: str, tokenizer: Any, mask_token_id: int) -> str:
    """Clean up masked text for display."""
    mask_token_repr = tokenizer.decode([mask_token_id], skip_special_tokens=False)
    cleaned = masked_text.replace(mask_token_repr, "[MASK]")

    if hasattr(tokenizer, "special_tokens_map"):
        for token_value in tokenizer.special_tokens_map.values():
            if token_value and isinstance(token_value, str):
                cleaned = cleaned.replace(token_value, "")

    cleaned = " ".join(cleaned.split()).strip()

    return cleaned


def _diffusion_step(
    model: torch.nn.Module,
    sequence: torch.Tensor,
    step: int,
    num_diffusion_steps: int,
    temperature: float,
    mask_token_id: int,
) -> torch.Tensor:
    """Perform a single diffusion step with remasking."""
    # Only process if there are masked tokens remaining
    current_mask = sequence == mask_token_id
    if not current_mask.any():
        return sequence

    # Create bidirectional attention mask for diffusion
    batch_size, seq_len = sequence.shape
    attention_mask = torch.ones(
        batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=sequence.device
    )

    # Forward pass
    outputs = model(input_ids=sequence, attention_mask=attention_mask)
    logits = outputs.logits

    # Only sample at currently masked positions
    if current_mask.any():
        masked_logits = logits[current_mask]

        # Apply temperature scaling
        if temperature > 0:
            scaled_logits = masked_logits / temperature
        else:
            scaled_logits = masked_logits

        # Suppress mask token in outputs
        scaled_logits[:, mask_token_id] = -float("inf")

        # Sample predictions
        if temperature > 0:
            # Add Gumbel noise for sampling
            gumbel_noise = -torch.log(
                -torch.log(torch.rand_like(scaled_logits, dtype=torch.float32))
            )
            gumbel_logits = scaled_logits + gumbel_noise
            predicted_tokens = torch.argmax(gumbel_logits, dim=-1)
        else:
            # Deterministic sampling when temperature is 0
            predicted_tokens = torch.argmax(scaled_logits, dim=-1)

        # Calculate probabilities for confidence scoring
        probs = torch.softmax(scaled_logits, dim=-1)
        predicted_token_probs = probs[range(len(predicted_tokens)), predicted_tokens]

        # Determine how many tokens to unmask this step
        remaining_masked = current_mask.sum().item()
        if step == num_diffusion_steps - 1:
            num_to_unmask = remaining_masked
        else:
            unmask_ratio = 1.0 / (num_diffusion_steps - step)
            num_to_unmask = max(1, int(remaining_masked * unmask_ratio))

        # Select highest confidence predictions to unmask
        if num_to_unmask >= remaining_masked:
            sequence[current_mask] = predicted_tokens
        else:
            _, top_indices = predicted_token_probs.topk(num_to_unmask)
            mask_positions = torch.where(current_mask)[1]
            positions_to_unmask = mask_positions[top_indices]
            sequence[0, positions_to_unmask] = predicted_tokens[top_indices]

    return sequence
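
Aside on the file above: the remasking schedule in `_diffusion_step` unmasks roughly `remaining / (num_diffusion_steps - step)` tokens per step, keeping the highest-confidence predictions, so the masked set shrinks to zero by the final step. Below is a minimal smoke-test sketch of `generate_samples`; the checkpoint name and the list-based "dataloader" are illustrative assumptions rather than part of this diff, and the model is assumed to be one patched by this plugin (the helper feeds it a 4D bidirectional attention mask).

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from axolotl.integrations.diffusion.generation import generate_samples

# Hypothetical checkpoint name; any causal LM trained with this plugin works.
model = AutoModelForCausalLM.from_pretrained("my-diffusion-checkpoint")
tokenizer = AutoTokenizer.from_pretrained("my-diffusion-checkpoint")

# generate_samples only iterates the dataloader, so a plain list of batch
# dicts is enough here. The batch is repeated because the sampler may skip
# up to five batches for variety, and sequences under 10 tokens are skipped.
batch = tokenizer(
    ["The quick brown fox jumps over the lazy dog."],
    return_tensors="pt",
    padding=True,
)
eval_loader = [dict(batch)] * 6

samples = generate_samples(
    model,
    tokenizer,
    dataloader=eval_loader,
    num_generation_samples=1,
    num_diffusion_steps=16,
    temperature=0.0,  # deterministic argmax decoding
    mask_token_id=128002,
)
for sample in samples:
    print(sample["formatted"])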
115
src/axolotl/integrations/diffusion/loss.py
Normal file
@@ -0,0 +1,115 @@
"""Diffusion LM loss function for integration with transformers LOSS_MAPPING."""

from typing import Optional

import torch
import torch.nn.functional as F


def ForDiffusionLMLoss(
    logits: torch.Tensor,
    labels: torch.Tensor,
    vocab_size: int,
    config: Optional[dict] = None,
    inputs: Optional[dict] = None,
    model: Optional[torch.nn.Module] = None,
    **kwargs,
) -> torch.Tensor:
    """
    Diffusion Language Modeling loss function.

    This function computes cross-entropy loss only on masked tokens using
    diffusion info stored by the model patch during forward pass.

    Args:
        logits: Model predictions [batch_size, seq_len, vocab_size]
        labels: Ground truth tokens [batch_size, seq_len]
        vocab_size: Size of vocabulary
        config: Model configuration (contains diffusion parameters)
        inputs: Input batch dictionary (contains input_ids, attention_mask)
        model: The model instance (to access stored diffusion info)
        **kwargs: Additional arguments

    Returns:
        loss: Computed diffusion loss
    """
    # Get diffusion info stored by model patch
    if model is None or not hasattr(model, "_diffusion_info"):
        # Fallback to regular causal LM loss if no diffusion info
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()
        loss_fct = torch.nn.CrossEntropyLoss()
        return loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
        )

    diffusion_info = model._diffusion_info
    original_input_ids = diffusion_info["original_input_ids"]
    masked_indices = diffusion_info["masked_indices"]
    p_mask = diffusion_info["p_mask"]

    # Get diffusion config parameters
    diffusion_config = getattr(config, "diffusion_config", {})
    importance_weighting = diffusion_config.get("importance_weighting", True)

    # Check if we have any masked tokens
    if not masked_indices.any():
        return torch.tensor(0.0, device=logits.device, requires_grad=True)

    # Get predictions and targets for masked positions only
    masked_logits = logits[masked_indices]
    masked_targets = original_input_ids[masked_indices]  # Original unmasked tokens

    # Compute cross-entropy loss without reduction
    token_loss = F.cross_entropy(
        masked_logits.float(), masked_targets, reduction="none"
    )

    if importance_weighting:
        # Apply importance weighting: 1 / p_mask
        masked_p_mask = p_mask.expand_as(masked_indices)[masked_indices]
        weighted_loss = token_loss / masked_p_mask

        if labels is not None:
            # For SFT data: normalize by answer length per sample
            answer_mask = labels != -100
            answer_lengths = answer_mask.sum(dim=1).float()

            # Group losses by batch sample
            batch_indices = torch.arange(
                original_input_ids.shape[0], device=original_input_ids.device
            )
            batch_indices = batch_indices.unsqueeze(1).expand_as(masked_indices)
            masked_batch_indices = batch_indices[masked_indices]

            # Sum losses per sample and normalize by answer length
            loss_per_sample = torch.zeros(
                original_input_ids.shape[0], device=original_input_ids.device
            )
            for i in range(original_input_ids.shape[0]):
                sample_mask = masked_batch_indices == i
                if sample_mask.any():
                    sample_loss = weighted_loss[sample_mask].sum()
                    loss_per_sample[i] = sample_loss / max(answer_lengths[i], 1)

            loss = loss_per_sample.mean()
        else:
            # For completion data: simple average
            loss = weighted_loss.mean()
    else:
        # No importance weighting
        loss = token_loss.mean()

    return loss


def register_diffusion_loss():
    """Register the diffusion loss function in transformers LOSS_MAPPING."""
    try:
        from transformers.loss.loss_utils import LOSS_MAPPING

        LOSS_MAPPING["ForDiffusionLM"] = ForDiffusionLMLoss
        return True
    except ImportError:
        # Fallback for older transformers versions
        return False
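
The `1 / p_mask` importance weighting above is the standard masked-diffusion estimator: a token masked with probability p contributes its cross-entropy scaled by 1 / p, so the expectation over random maskings matches the loss summed over every token. A toy numeric sketch (made-up tensors, not from this diff):

import torch
import torch.nn.functional as F

# Two masked positions over a vocab of 5. p_mask holds the probability
# with which the forward process masked each position.
logits = torch.randn(2, 5)
targets = torch.tensor([1, 3])
p_mask = torch.tensor([0.2, 0.8])

token_loss = F.cross_entropy(logits, targets, reduction="none")
weighted = token_loss / p_mask  # rarely-masked positions count more

# Each position enters the masked sum with probability p_mask, so dividing
# by p_mask makes the expected weighted sum equal the all-token CE sum.
print(weighted)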
149
src/axolotl/integrations/diffusion/model_patch.py
Normal file
@@ -0,0 +1,149 @@
"""Model patches for diffusion training."""

import torch


def patch_model_for_bidirectional_attention(model):
    """
    Patch model to handle diffusion training with forward process and bidirectional
    attention.

    This monkey-patches the model's forward method to:
    - Apply forward diffusion process (masking) during training
    - Use bidirectional attention masks
    - Store info for loss computation
    """
    original_forward = model.forward

    def diffusion_forward(
        self,
        input_ids: torch.Tensor | None = None,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        **kwargs,
    ):
        # Check if this is diffusion training
        if (
            hasattr(self.config, "loss_type")
            and self.config.loss_type == "ForDiffusionLM"
            and self.training
        ):
            # Store original input_ids for loss computation
            original_input_ids = input_ids.clone()

            # Apply forward diffusion process (masking)
            diffusion_config = getattr(self.config, "diffusion_config", {})
            noisy_input_ids, masked_indices, p_mask = _forward_process(
                input_ids, attention_mask, labels, diffusion_config
            )

            # Use noisy input for model forward
            input_ids = noisy_input_ids

            # Convert attention mask to bidirectional
            if attention_mask is not None:
                attention_mask = _create_bidirectional_attention_mask(
                    input_ids, attention_mask
                )

            # Store diffusion info in the model for loss computation
            self._diffusion_info = {
                "original_input_ids": original_input_ids,
                "masked_indices": masked_indices,
                "p_mask": p_mask,
            }

        return original_forward(
            input_ids=input_ids, attention_mask=attention_mask, labels=labels, **kwargs
        )

    # Replace the forward method
    model.forward = diffusion_forward.__get__(model, model.__class__)


def _create_bidirectional_attention_mask(
    input_ids: torch.Tensor, attention_mask: torch.Tensor
) -> torch.Tensor:
    """
    Create bidirectional attention mask from 2D attention mask.

    Args:
        input_ids: Input token IDs [batch_size, seq_len]
        attention_mask: 2D attention mask [batch_size, seq_len]

    Returns:
        bidirectional_mask: 4D attention mask [batch_size, 1, seq_len, seq_len]
    """
    batch_size, seq_len = input_ids.shape

    # Simple bidirectional mask - all tokens can attend to all valid tokens
    # Expand 2D mask to 4D: [batch_size, seq_len] -> [batch_size, 1, seq_len, seq_len]
    bidirectional_mask = attention_mask.unsqueeze(1).unsqueeze(2)  # [B, 1, 1, S]
    bidirectional_mask = bidirectional_mask.expand(batch_size, 1, seq_len, seq_len)

    # Apply row-wise masking (padded tokens can't attend to anything)
    row_mask = attention_mask.unsqueeze(1).unsqueeze(3)  # [B, 1, S, 1]
    bidirectional_mask = bidirectional_mask & row_mask

    return bidirectional_mask


def _forward_process(
    input_ids: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    labels: torch.Tensor | None = None,
    diffusion_config: dict | None = None,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Apply forward diffusion process (random masking).

    Args:
        input_ids: Input token IDs [batch_size, seq_len]
        attention_mask: Attention mask [batch_size, seq_len]
        labels: Labels for SFT training [batch_size, seq_len]
        diffusion_config: Diffusion configuration dict

    Returns:
        noisy_input_ids: Input with masked tokens
        masked_indices: Boolean mask of which tokens were masked
        p_mask: Masking probabilities used
    """
    if diffusion_config is None:
        diffusion_config = {}

    batch_size, seq_len = input_ids.shape
    device = input_ids.device

    eps = diffusion_config.get("eps", 1e-3)
    mask_token_id = diffusion_config.get("mask_token_id", 128002)

    # Sample random timesteps for each sample
    t = torch.rand(batch_size, device=device)

    # Calculate masking probability with epsilon
    p_mask = (1 - eps) * t + eps  # [batch_size]
    p_mask = p_mask.unsqueeze(1).expand(-1, seq_len)  # [batch_size, seq_len]

    # Don't mask padding tokens
    if attention_mask is not None:
        p_mask = p_mask * attention_mask.float()

    # Create random mask based on p_mask
    random_values = torch.rand_like(p_mask)
    masked_indices = random_values < p_mask

    # Apply attention mask constraints
    if attention_mask is not None:
        masked_indices = masked_indices & attention_mask.bool()

    # For SFT data, only mask answer tokens (where labels != -100)
    if labels is not None:
        answer_mask = labels != -100
        masked_indices = masked_indices & answer_mask

    # Create noisy input by replacing masked tokens
    noisy_input_ids = input_ids.clone()
    noisy_input_ids[masked_indices] = mask_token_id

    return noisy_input_ids, masked_indices, p_mask
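
A quick sketch of `_forward_process` on a toy batch (invented tensors, for illustration only). Each row draws its own timestep t, so its masking rate lands anywhere in [eps, 1], and padded positions are never masked:

import torch

from axolotl.integrations.diffusion.model_patch import _forward_process

# Toy batch: 2 sequences of 8 tokens, the second right-padded by 2.
input_ids = torch.randint(5, 100, (2, 8))
attention_mask = torch.ones(2, 8, dtype=torch.long)
attention_mask[1, 6:] = 0

noisy, masked, p_mask = _forward_process(
    input_ids,
    attention_mask,
    labels=None,
    diffusion_config={"eps": 1e-3, "mask_token_id": 0},
)
print(masked.float().mean(dim=1))  # per-sample fraction of masked tokens
assert not masked[1, 6:].any()  # padding stays unmasked
assert (noisy[masked] == 0).all()  # masked positions hold the mask token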
96
src/axolotl/integrations/diffusion/plugin.py
Normal file
@@ -0,0 +1,96 @@
"""Diffusion LM training plugin for Axolotl."""

from peft import PeftModel
from transformers import PreTrainedModel

from axolotl.integrations.base import BasePlugin
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

from .args import DiffusionArgs
from .callbacks import DiffusionGenerationCallback
from .loss import register_diffusion_loss
from .model_patch import patch_model_for_bidirectional_attention

LOG = get_logger(__name__)


class DiffusionPlugin(BasePlugin):
    """
    Plugin for diffusion language model training.

    This plugin enables diffusion-based training using the LLaDA approach, which uses
    random masking and bidirectional attention to train language models.
    """

    def __init__(self):
        super().__init__()
        self.cfg = None

        if register_diffusion_loss():
            LOG.info("Registered ForDiffusionLM loss function")
        else:
            LOG.warning(
                "Failed to register diffusion loss - older transformers version"
            )

    def get_input_args(self) -> str:
        """Returns the pydantic model for LLaDA plugin arguments."""
        return "axolotl.integrations.diffusion.DiffusionArgs"

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Configure model for diffusion training after loading."""
        self.cfg = cfg

        # Set loss type for diffusion training
        if hasattr(model, "config"):
            model.config.loss_type = "ForDiffusionLM"

            # Store diffusion config in model config
            model.config.diffusion_config = {
                "eps": getattr(cfg, "eps", 1e-3),
                "importance_weighting": getattr(cfg, "importance_weighting", True),
                "mask_token_id": getattr(cfg, "mask_token_id", 128002),
            }

            LOG.info("Configured model for diffusion training with ForDiffusionLM loss")

        # Patch model for bidirectional attention during training
        patch_model_for_bidirectional_attention(model)
        LOG.info("Applied bidirectional attention patch to model")

        return model

    def post_trainer_create(self, cfg: DictDefault, trainer):
        """Configure trainer after creation."""
        # Create diffusion config from cfg
        diffusion_config = DiffusionArgs(
            noise_schedule=getattr(cfg, "noise_schedule", "linear"),
            min_mask_ratio=getattr(cfg, "min_mask_ratio", 0.1),
            max_mask_ratio=getattr(cfg, "max_mask_ratio", 0.9),
            num_diffusion_steps=getattr(cfg, "num_diffusion_steps", 128),
            eps=getattr(cfg, "eps", 1e-3),
            importance_weighting=getattr(cfg, "importance_weighting", True),
            mask_token_id=getattr(cfg, "mask_token_id", 128002),
            generate_samples=getattr(cfg, "generate_samples", True),
            generation_interval=getattr(cfg, "generation_interval", 100),
            num_generation_samples=getattr(cfg, "num_generation_samples", 3),
            generation_steps=getattr(cfg, "generation_steps", 128),
            generation_temperature=getattr(cfg, "generation_temperature", 0.0),
            generation_max_length=getattr(cfg, "generation_max_length", 100),
        )

        # Store diffusion config on trainer for callbacks to access
        trainer.diffusion_config = diffusion_config
        LOG.info("Stored diffusion config on trainer")

    def add_callbacks_post_trainer(self, cfg: DictDefault, trainer):
        """Add diffusion generation callback if enabled."""
        if (
            hasattr(trainer, "diffusion_config")
            and trainer.diffusion_config.generate_samples
        ):
            generation_callback = DiffusionGenerationCallback(trainer)
            LOG.info("Added diffusion generation callback")
            return [generation_callback]
        return []
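
For orientation, a sketch of the hook order this plugin relies on. Axolotl's plugin manager drives these calls during a normal run; hand-wiring them as below is purely illustrative, and cfg, model, and trainer are assumed to come from that run:

from axolotl.integrations.diffusion.plugin import DiffusionPlugin


def wire_diffusion_plugin(cfg, model, trainer):
    """Hand-drive the hooks the plugin manager normally calls, in order."""
    plugin = DiffusionPlugin()  # __init__ registers the ForDiffusionLM loss
    model = plugin.post_model_load(cfg, model)  # loss_type + attention patch
    plugin.post_trainer_create(cfg, trainer)  # attaches trainer.diffusion_config
    for callback in plugin.add_callbacks_post_trainer(cfg, trainer):
        trainer.add_callback(callback)
    return model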
@@ -7,7 +7,7 @@ from transformers.trainer_callback import TrainerCallback
 from axolotl.utils.logging import get_logger
 
 from ..base import BasePlugin
-from .args import GrokfastArgs as GrokfastArgs
+from .args import GrokfastArgs  # pylint: disable=unused-import. # noqa: F401
 from .optimizer import gradfilter_ema
 
 LOG = get_logger(__name__)
@@ -24,10 +24,12 @@ class GrokfastCallbackHandler(TrainerCallback):
         self.alpha = alpha
         self.lamb = lamb
 
-    def on_train_begin(self, *args_, **kwargs):
+    def on_train_begin(self, *args_, **kwargs):  # pylint: disable=unused-argument
         self.grads = None
 
-    def on_pre_optimizer_step(self, args_, state, control, **kwargs):
+    def on_pre_optimizer_step(
+        self, args_, state, control, **kwargs
+    ):  # pylint: disable=unused-argument
         model = kwargs.pop("model")
         self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
         return control
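
Context for gradfilter_ema, which lives in the referenced grokfast repo rather than this diff: it keeps an exponential moving average of each parameter's gradient and adds the scaled EMA back before the optimizer step, amplifying the slow-varying gradient component. A sketch of that idea, under the assumption that the upstream implementation follows the paper's EMA variant:

import torch


def gradfilter_ema_sketch(model, grads, alpha=0.98, lamb=2.0):
    """EMA-filter gradients in place; returns the EMA state to carry over."""
    if grads is None:
        grads = {
            name: param.grad.detach().clone()
            for name, param in model.named_parameters()
            if param.requires_grad and param.grad is not None
        }
    for name, param in model.named_parameters():
        if param.requires_grad and param.grad is not None:
            grads[name] = grads[name] * alpha + param.grad.detach() * (1 - alpha)
            param.grad = param.grad + grads[name] * lamb
    return grads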
@@ -1,6 +1,7 @@
 # Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
 # Reference: https://github.com/ironjr/grokfast
 
+# pylint: skip-file
 from collections import deque
 from typing import Dict, Literal, Optional

@@ -15,7 +15,6 @@
 """
 Plugin init to add KD support to Axolotl.
 """
-
 from typing import Any
 
 from transformers import Trainer
@@ -23,7 +22,7 @@ from transformers import Trainer
 from axolotl.integrations.base import BasePlugin
 from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback
 
-from .args import KDArgs as KDArgs
+from .args import KDArgs  # pylint: disable=unused-import. # noqa: F401
 
 
 class KDPlugin(BasePlugin):

@@ -15,7 +15,6 @@
 """
 Plugin args for KD support.
 """
-
 from dataclasses import dataclass
 from enum import Enum
@@ -27,8 +26,8 @@ class InferenceServerType(str, Enum):
     Online inferences server types to handle different request args
     """
 
-    vllm = "vllm"
-    sglang = "sglang"
+    vllm = "vllm"  # pylint: disable=invalid-name
+    sglang = "sglang"  # pylint: disable=invalid-name
 
 
 class KDArgs(BaseModel):

@@ -19,7 +19,9 @@ class KDTemperatureSchedulerCallback(TrainerCallback):
 
         self.trainer = trainer
 
-    def on_step_end(self, args, state, control, **kwargs):
+    def on_step_end(
+        self, args, state, control, **kwargs
+    ):  # pylint: disable=unused-argument
         # cosine decay temperature over the max steps
 
         progress = state.global_step / state.max_steps

@@ -15,7 +15,6 @@
 """
 Chat template prompt strategy loader with KD support
 """
-
 import logging
 from typing import Any, Dict
@@ -193,6 +192,7 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
         """
         Transform logprobs to target format for KD training
         """
+        # pylint: disable=duplicate-code
        logprobs = sample.pop(self.logprobs_field)
        target_seq_len = len(logprobs)
@@ -240,7 +240,7 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
             target_mask.append([1] * top_k)
 
         for token_pos_logprobs, pos_target_token_ids in zip(
-            logprobs, sample["target_token_ids"], strict=False
+            logprobs, sample["target_token_ids"]
         ):
             # Convert to a tensor for easier manipulation
             position_logprobs_tensor = torch.tensor(
@@ -299,7 +299,7 @@ class KDStrategyLoader(StrategyLoader):
     Load ChatTemplateStrategy with KD support using StrategyLoader.
     """
 
-    def _get_strategy_cls(self, cfg):
+    def _get_strategy_cls(self, cfg):  # pylint: disable=unused-argument
         return ChatTemplateStrategyWithKD
 
     def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
@@ -319,7 +319,7 @@ class KDStrategyLoaderV2(KDStrategyLoader):
     Load KD chat template datasets with pre-tokenized logprob data
     """
 
-    def _get_strategy_cls(self, cfg):
+    def _get_strategy_cls(self, cfg):  # pylint: disable=unused-argument
         return ChatTemplateStrategyWithKDv2
 
 

@@ -37,6 +37,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
     target_logprobs. It also creates a teacher_mask to indicate which entries are valid.
     """
 
+    # pylint: disable=duplicate-code
    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
@@ -71,7 +72,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
                         // self.pad_to_multiple_of
                     ) * self.pad_to_multiple_of
 
-            for f in features:
+            for f in features:  # pylint: disable=invalid-name
                 remainder = [pad_token_id] * (max_len - len(f[feature_name]))
                 if isinstance(f[feature_name], list):
                     f[feature_name] = (
@@ -100,7 +101,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
 
         if has_teacher_data:
             # Extract and remove from features
-            for f in features:
+            for f in features:  # pylint: disable=invalid-name
                 target_logprobs_list.append(f.pop("target_logprobs"))
                 target_token_ids_list.append(f.pop("target_token_ids"))
                 target_mask_list.append(f.pop("target_mask"))
@@ -116,25 +117,24 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
             padded_teacher_mask_list = []
 
             for t_logprobs, t_ids, t_mask in zip(
-                target_logprobs_list,
-                target_token_ids_list,
-                target_mask_list,
-                strict=False,
+                target_logprobs_list, target_token_ids_list, target_mask_list
             ):
                 t_logprobs_padded = []
                 t_ids_padded = []
                 t_mask_padded = []
 
-                for lp, ids, mask in zip(t_logprobs, t_ids, t_mask, strict=False):
+                for lp, ids, mask in zip(  # pylint: disable=invalid-name
+                    t_logprobs, t_ids, t_mask
+                ):
                     lp_len = len(lp)
                     if lp_len < max_k:
                         # Use -1e9 for padding logprobs and 0 for token_ids
                         pad_len = max_k - lp_len
-                        lp = lp + [-1e9] * pad_len
+                        lp = lp + [-1e9] * pad_len  # pylint: disable=invalid-name
                         ids = ids + [0] * pad_len
                         mask = mask + [0] * pad_len
                     else:
-                        lp = lp[:max_k]
+                        lp = lp[:max_k]  # pylint: disable=invalid-name
                         ids = ids[:max_k]
                         mask = mask[:max_k]
 
@@ -216,7 +216,9 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
         # We want to produce a single "merged" feature dict for each sub-batch.
         out_features = [{} for _ in features]
 
-        for i, sub_features in enumerate(features):
+        for i, sub_features in enumerate(  # pylint: disable=too-many-nested-blocks
+            features
+        ):
             # sub_features is a list of dicts, each dict = one sequence’s features
             # We'll merge them into out_features[i].
             #
@@ -253,7 +255,9 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
                 if field_name in feat and isinstance(
                     feat[field_name], (list, torch.Tensor)
                 ):
-                    if isinstance(feat[field_name][0], (dict, str)):
+                    if isinstance(
+                        feat[field_name][0], (dict, str)
+                    ):  # pylint: disable=too-many-nested-blocks
                         continue
                     arr = np.array(feat[field_name])
                     arrays.append(arr)

@@ -144,7 +144,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
         }
 
         for sequence_data, seq_input_ids, seq_labels in zip(
-            api_data, batch_input_ids, labels, strict=False
+            api_data, batch_input_ids, labels
         ):
             current_target_logprobs = []
             current_target_token_ids = []
@@ -165,7 +165,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
             assert len(seq_input_ids) == len(input_top_logprobs)
 
             for i, _, label in zip(
-                range(len(seq_input_ids)), seq_input_ids, seq_labels, strict=False
+                range(len(seq_input_ids)), seq_input_ids, seq_labels
             ):
                 if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                     # this is always the case for the first token.
@@ -202,8 +202,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
 
                 # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
                 pos_logprobs_raw, pos_token_ids, _ = [
-                    list(row)
-                    for row in zip(*pos_top_logprobs_data, strict=False)
+                    list(row) for row in zip(*pos_top_logprobs_data)
                 ]
 
                 # Ensure correct length (top_k)
@@ -318,7 +317,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
         }
 
         for sequence_data, seq_input_ids, seq_labels in zip(
-            choices, batch_input_ids, labels, strict=False
+            choices, batch_input_ids, labels
         ):
             # seq_input_ids: List[int]
             # seq_labels: List[int]
@@ -343,9 +342,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
 
             seq_len = len(seq_input_ids)
 
-            for i, _, label in zip(
-                range(seq_len), seq_input_ids, seq_labels, strict=False
-            ):
+            for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels):
                 if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                     # this is always the case for the first token.
                     # there is never logprob data for the first token since that's a true input
@@ -427,7 +424,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
                         list(range(self.kd_online_topk))
                     )
                     current_target_mask.append([0] * self.kd_online_topk)
-                for _ in range(max(0, seq_len - len(current_target_logprobs))):
+                for i in range(max(0, seq_len - len(current_target_logprobs))):
                     current_target_logprobs.append(
                         [-float("inf")] * self.kd_online_topk
                     )

@@ -197,7 +197,7 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
         compute_ce_loss: bool = True,
         normalize_topk: bool = True,
     ):
-        CHUNK_SIZE = chunk_size
+        CHUNK_SIZE = chunk_size  # pylint: disable=invalid-name
         grad_weight_acc = torch.zeros_like(student_lm_head_weight)
         grad_inputs_list = []
         grad_bias_acc = (
@@ -298,8 +298,8 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
         accumulate_chunk_grads_compiled = accumulate_chunk_grads
 
         # Use the same chunking logic as LigerFusedLinearDistillationBase.forward
-        B, N, D = student_input.shape
-        K = target_token_ids.shape[-1]
+        B, N, D = student_input.shape  # pylint: disable=invalid-name
+        K = target_token_ids.shape[-1]  # pylint: disable=invalid-name
 
         student_input_flat = student_input.reshape(-1, student_input.shape[-1])
         target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])

@@ -40,9 +40,10 @@ def kldiv_forward_llama_like(
     output_attentions: Optional[bool] = None,
     output_hidden_states: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
+    logits_to_keep: Union[int, torch.Tensor] = 0,  # pylint: disable=unused-argument
     **kwargs: Unpack[TransformersKwargs],  # type: ignore[misc]
 ) -> CausalLMOutputWithPast:
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None

@@ -15,7 +15,6 @@
 """
 loss for top_k KL divergence
 """
-
 import torch
 from torch import nn
 
@@ -118,6 +117,7 @@ class ChunkedTopKKDLoss(nn.Module):
         target_mask: torch.Tensor,  # [B, seq_len, K]
         num_items_in_batch: int = -1,  # optional batch size for normalization
     ) -> torch.Tensor:
+
         # 1. Split along the "token" dimension (dim=1).
         student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
         token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
@@ -131,11 +131,7 @@ class ChunkedTopKKDLoss(nn.Module):
 
         # 2. Loop over each chunk and compute a chunk-specific loss.
         for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
-            student_logits_chunks,
-            token_ids_chunks,
-            logprobs_chunks,
-            mask_chunks,
-            strict=False,
+            student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks
         ):
             # We pass num_items_in_batch=-1 so that the kd_loss
             # will average over *this chunk's* valid tokens only.
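
For readers of the ChunkedTopKKDLoss hunks above: each chunk's kd_loss compares student log-probabilities, gathered at the teacher's top-k token ids, against the teacher's stored log-probabilities. A rough sketch of one chunk's forward-KL term; the actual module adds masking, normalization, and top-k renormalization options, so treat this as an approximation:

import torch
import torch.nn.functional as F


def topk_kd_chunk_sketch(student_logits, target_token_ids, target_logprobs, target_mask):
    """Approximate per-chunk top-k KD term on [B, seq, K]-shaped teacher data."""
    student_logprobs = F.log_softmax(student_logits.float(), dim=-1)
    # Student log-prob of each teacher top-k token: [B, seq, K]
    student_topk = torch.gather(student_logprobs, -1, target_token_ids)
    teacher_probs = target_logprobs.exp()
    kl = teacher_probs * (target_logprobs - student_topk)  # forward KL terms
    kl = kl * target_mask  # drop padded top-k entries
    return kl.sum() / target_mask.sum().clamp(min=1)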
@@ -21,6 +21,7 @@ from axolotl.core.trainers.base import AxolotlTrainer
 from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
 
 
+# pylint: disable=too-many-ancestors
 class AxolotlKDTrainer(AxolotlTrainer):
     """
     Custom trainer subclass for Knowledge Distillation (KD)

@@ -18,7 +18,6 @@ Module for the Plugin for LIGER integraton with Axolotl.
 Liger Kernel is the collection of Triton-native kernels for LLM Training.
 It is designed to be performant, correct, and light-weight.
 """
-
 from .args import LigerArgs
 from .plugin import LigerPlugin
 

@@ -41,6 +41,7 @@ def lce_forward(
     This is useful when using packed tensor format (single dimension for batch and sequence length).
     """
 
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None
@@ -180,7 +181,7 @@ def patch_lce_forward(
         model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
 
         model_cls.forward = lce_forward
-
+        # pylint: disable=duplicate-code
     except (ImportError, AttributeError) as e:
         raise RuntimeError(
             f"Could not import ForCausalLM class for model_type: {model_type}. "

@@ -2,6 +2,8 @@
 DeepseekV2 model with LigerFusedLinearCrossEntropyLoss
 """
 
+# pylint: disable=duplicate-code
+
 from typing import List, Optional, Tuple, Union
 
 import torch

@@ -2,6 +2,8 @@
 Jamba model with LigerFusedLinearCrossEntropyLoss
 """
 
+# pylint: disable=duplicate-code
+
 from typing import Optional, Tuple, Union
 
 import torch

@@ -46,6 +46,7 @@ def lce_forward(
     Returns:
     """
 
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None
@@ -77,7 +78,9 @@ def lce_forward(
     hidden_states = outputs[0]
 
     if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1:
-        raise Exception("Liger Kernel does not support pretraining_tp!!")
+        raise Exception(  # pylint: disable=broad-exception-raised
+            "Liger Kernel does not support pretraining_tp!!"
+        )
 
     logits = None
     loss = None
@@ -125,7 +128,7 @@ def apply_liger_kernel_to_llama4(
     rms_norm: bool = False,
     glu_activation: bool = False,
     layer_norm: bool = False,
-    **kwargs,
+    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
     """
     Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
@@ -141,15 +144,15 @@ def apply_liger_kernel_to_llama4(
         layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
     """
 
-    import transformers.models.llama4.modeling_llama4  # noqa: F401
+    import transformers.models.llama4.modeling_llama4  # noqa: F401 # pylint: disable=unused-import
     from liger_kernel.transformers.functional import liger_cross_entropy
     from liger_kernel.transformers.layer_norm import LigerLayerNorm
     from liger_kernel.transformers.rms_norm import LigerRMSNorm
     from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
 
-    assert not (cross_entropy and fused_linear_cross_entropy), (
-        "cross_entropy and fused_linear_cross_entropy cannot both be True."
-    )
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
 
     modeling_llama4 = sys.modules["transformers.models.llama4.modeling_llama4"]
 
@@ -162,7 +165,7 @@ def apply_liger_kernel_to_llama4(
         # clone config to avoid modifying the original
         config = deepcopy(config)
         if intermediate_size:
-            config.intermediate_size = intermediate_size
+            setattr(config, "intermediate_size", intermediate_size)
         return LigerSwiGLUMLP(config, **kwargs)
 
     modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper
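
The _liger_swiglu_mlp_wrapper hunks swap the stock MLP for LigerSwiGLUMLP, which fuses the SwiGLU computation into Triton kernels. The math it implements, written out as plain PyTorch for reference:

import torch
import torch.nn.functional as F


def swiglu_mlp_reference(x, w_gate, w_up, w_down):
    """SwiGLU(x) = W_down(silu(W_gate x) * (W_up x)); Liger fuses this."""
    return (F.silu(x @ w_gate.T) * (x @ w_up.T)) @ w_down.T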
@@ -43,6 +43,7 @@ def lce_forward(
|
|||||||
Returns:
|
Returns:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
output_attentions = (
|
output_attentions = (
|
||||||
output_attentions
|
output_attentions
|
||||||
if output_attentions is not None
|
if output_attentions is not None
|
||||||
@@ -112,8 +113,9 @@ def apply_liger_kernel_to_qwen3(
|
|||||||
rms_norm: bool = False,
|
rms_norm: bool = False,
|
||||||
glu_activation: bool = False,
|
glu_activation: bool = False,
|
||||||
layer_norm: bool = False,
|
layer_norm: bool = False,
|
||||||
**kwargs,
|
**kwargs, # pylint: disable=unused-argument
|
||||||
) -> None:
|
) -> None:
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
"""
|
"""
|
||||||
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
|
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
|
||||||
|
|
||||||
@@ -128,15 +130,15 @@ def apply_liger_kernel_to_qwen3(
|
|||||||
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
|
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import transformers.models.qwen3.modeling_qwen3 # noqa: F401
|
import transformers.models.qwen3.modeling_qwen3 # noqa: F401 # pylint: disable=unused-import
|
||||||
from liger_kernel.transformers.functional import liger_cross_entropy
|
from liger_kernel.transformers.functional import liger_cross_entropy
|
||||||
from liger_kernel.transformers.layer_norm import LigerLayerNorm
|
from liger_kernel.transformers.layer_norm import LigerLayerNorm
|
||||||
from liger_kernel.transformers.rms_norm import LigerRMSNorm
|
from liger_kernel.transformers.rms_norm import LigerRMSNorm
|
||||||
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
|
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
|
||||||
|
|
||||||
assert not (cross_entropy and fused_linear_cross_entropy), (
|
assert not (
|
||||||
"cross_entropy and fused_linear_cross_entropy cannot both be True."
|
cross_entropy and fused_linear_cross_entropy
|
||||||
)
|
), "cross_entropy and fused_linear_cross_entropy cannot both be True."
|
||||||
|
|
||||||
modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
|
modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
|
||||||
|
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ def lce_forward(
|
|||||||
Returns:
|
Returns:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
output_attentions = (
|
output_attentions = (
|
||||||
output_attentions
|
output_attentions
|
||||||
if output_attentions is not None
|
if output_attentions is not None
|
||||||
@@ -134,8 +135,9 @@ def apply_liger_kernel_to_qwen3_moe(
|
|||||||
rms_norm: bool = False,
|
rms_norm: bool = False,
|
||||||
glu_activation: bool = False,
|
glu_activation: bool = False,
|
||||||
layer_norm: bool = False,
|
layer_norm: bool = False,
|
||||||
**kwargs,
|
**kwargs, # pylint: disable=unused-argument
|
||||||
) -> None:
|
) -> None:
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
"""
|
"""
|
||||||
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
|
Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
|
||||||
|
|
||||||
@@ -150,15 +152,15 @@ def apply_liger_kernel_to_qwen3_moe(
|
|||||||
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
|
layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401
|
import transformers.models.qwen3_moe.modeling_qwen3_moe # noqa: F401 # pylint: disable=unused-import
|
||||||
from liger_kernel.transformers.functional import liger_cross_entropy
|
from liger_kernel.transformers.functional import liger_cross_entropy
|
||||||
from liger_kernel.transformers.layer_norm import LigerLayerNorm
|
from liger_kernel.transformers.layer_norm import LigerLayerNorm
|
||||||
from liger_kernel.transformers.rms_norm import LigerRMSNorm
|
from liger_kernel.transformers.rms_norm import LigerRMSNorm
|
||||||
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
|
from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
|
||||||
|
|
||||||
assert not (cross_entropy and fused_linear_cross_entropy), (
|
assert not (
|
||||||
"cross_entropy and fused_linear_cross_entropy cannot both be True."
|
cross_entropy and fused_linear_cross_entropy
|
||||||
)
|
), "cross_entropy and fused_linear_cross_entropy cannot both be True."
|
||||||
|
|
||||||
modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
|
modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
|
||||||
|
|
||||||
@@ -172,7 +174,7 @@ def apply_liger_kernel_to_qwen3_moe(
|
|||||||
# clone config to avoid modifying the original
|
# clone config to avoid modifying the original
|
||||||
config = deepcopy(config)
|
config = deepcopy(config)
|
||||||
if intermediate_size:
|
if intermediate_size:
|
||||||
config.intermediate_size = intermediate_size
|
setattr(config, "intermediate_size", intermediate_size)
|
||||||
return LigerSwiGLUMLP(config, **kwargs)
|
return LigerSwiGLUMLP(config, **kwargs)
|
||||||
|
|
||||||
modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
|
modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ import subprocess # nosec
|
|||||||
from axolotl.integrations.base import BasePlugin
|
from axolotl.integrations.base import BasePlugin
|
||||||
from axolotl.integrations.lm_eval.cli import build_lm_eval_command
|
from axolotl.integrations.lm_eval.cli import build_lm_eval_command
|
||||||
|
|
||||||
from .args import LMEvalArgs as LMEvalArgs
|
from .args import LMEvalArgs # pylint: disable=unused-import. # noqa: F401
|
||||||
|
|
||||||
|
|
||||||
class LMEvalPlugin(BasePlugin):
|
class LMEvalPlugin(BasePlugin):
|
||||||
@@ -20,6 +20,7 @@ class LMEvalPlugin(BasePlugin):
|
|||||||
|
|
||||||
def post_train_unload(self, cfg):
|
def post_train_unload(self, cfg):
|
||||||
if cfg.lm_eval_post_train:
|
if cfg.lm_eval_post_train:
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
for lm_eval_args in build_lm_eval_command(
|
for lm_eval_args in build_lm_eval_command(
|
||||||
cfg.lm_eval_tasks,
|
cfg.lm_eval_tasks,
|
||||||
bfloat16=cfg.bfloat16 or cfg.bf16,
|
bfloat16=cfg.bfloat16 or cfg.bf16,
|
||||||
|
|||||||
@@ -99,6 +99,7 @@ def lm_eval(config: str, cloud: Optional[str] = None):
     with open(config, encoding="utf-8") as file:
         cfg: DictDefault = DictDefault(yaml.safe_load(file))
 
+    # pylint: disable=duplicate-code
    for lm_eval_args in build_lm_eval_command(
         cfg.lm_eval_tasks,
         bfloat16=cfg.bfloat16 or cfg.bf16,
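Both call sites above iterate over commands produced by `build_lm_eval_command` with the same arguments, which is why each needs a `duplicate-code` disable. The sketch below shows the shape being assumed, a generator yielding one argv list per task, consumed with `subprocess`; the generator body here is hypothetical, only the call shape comes from the diff:

```python
import subprocess  # nosec
from typing import Iterator


def build_eval_commands(tasks: list[str], bfloat16: bool = False) -> Iterator[list[str]]:
    """Hypothetical stand-in for build_lm_eval_command: one argv per task."""
    dtype = "bfloat16" if bfloat16 else "float16"
    for task in tasks:
        yield ["lm_eval", "--tasks", task, "--model_args", f"dtype={dtype}"]


# Consume the generator the same way both call sites in the diff do:
for argv in build_eval_commands(["hellaswag", "arc_easy"], bfloat16=True):
    subprocess.run(argv, check=True)  # nosec
```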
@@ -23,7 +23,7 @@ import requests
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.logging import get_logger
 
-from .args import SpectrumArgs as SpectrumArgs
+from .args import SpectrumArgs  # pylint: disable=unused-import  # noqa: F401
 
 LOG = get_logger(__name__)
 
@@ -46,7 +46,7 @@ def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5):
         "^lm_head.weight$",
         "^model.embed_tokens.weight$",
     ]
-    for _, layer_names in top_layers_by_type.items():
+    for layer_type, layer_names in top_layers_by_type.items():
         for layer_name in layer_names:
             unfrozen_parameters.append(layer_name)
     return unfrozen_parameters
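`_generate_unfrozen_params_yaml` emits a list of parameter-name regexes (the output head, the embeddings, plus the top-SNR layers) that should remain trainable. A minimal sketch of how such a list is typically applied, assuming the patterns are matched against `named_parameters()`; the `freeze_except` helper is hypothetical, not part of the diff:

```python
import re

import torch.nn as nn


def freeze_except(model: nn.Module, unfrozen_patterns: list[str]) -> None:
    """Freeze every parameter whose name matches none of the given regexes."""
    compiled = [re.compile(pattern) for pattern in unfrozen_patterns]
    for name, param in model.named_parameters():
        param.requires_grad = any(rx.search(name) for rx in compiled)


# e.g. keep only the output head and embeddings trainable:
# freeze_except(model, ["^lm_head.weight$", "^model.embed_tokens.weight$"])
```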
@@ -84,7 +84,7 @@ class SpectrumPlugin(BasePlugin):
                 snr_data = json.load(fin)
         except FileNotFoundError:
             pass
-        except Exception as exc:
+        except Exception as exc:  # pylint: disable=broad-exception-caught
             LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}")
 
         if not snr_data:
@@ -15,7 +15,6 @@
 """
 Module for handling Spectrum input arguments.
 """
-
 from typing import Optional
 
 from pydantic import BaseModel, model_validator
@@ -5,6 +5,8 @@ See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).
 Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
 """
 
+# pylint: disable=invalid-name,unnecessary-lambda-assignment,duplicate-code
+
 import torch
 import triton
 import triton.language as tl
@@ -7,6 +7,8 @@ See "LoRA: Low-Rank Adaptation of Large Language Models"
 Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
 """
 
+# pylint: disable=invalid-name
+
 from typing import Callable
 
 import torch
@@ -1,5 +1,7 @@
 """Dequantization utilities for `bitsandbytes` integration."""
 
+# pylint: disable=invalid-name,global-statement
+
 import ctypes
 
 import bitsandbytes as bnb
@@ -99,6 +99,7 @@ def _swiglu_bwd_kernel(
     tl.store(up_ptr + offsets, grad_up, mask=mask)  # grad wrt up
 
 
+# pylint: disable=unnecessary-lambda-assignment
 def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
     """
     SwiGLU forward pass. Computes SwiGLU activation: `x * sigmoid(x) * up`, where
@@ -127,6 +128,7 @@ def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
     return out
 
 
+# pylint: disable=unnecessary-lambda-assignment
 def swiglu_backward(
     grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
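The docstring above pins down the forward computation: `out = gate * sigmoid(gate) * up`, i.e. `silu(gate) * up`. Differentiating `silu(g) = g * sigmoid(g)` gives `sigmoid(g) * (1 + g * (1 - sigmoid(g)))`, which is what the backward kernel must produce for the gate gradient. A plain-PyTorch reference, useful for checking the Triton kernels against autograd; note the real `swiglu_backward` above returns three tensors, while this sketch only covers the two input gradients:

```python
import torch
import torch.nn.functional as F


def swiglu_forward_ref(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    # silu(gate) * up == gate * sigmoid(gate) * up
    return F.silu(gate) * up


def swiglu_backward_ref(grad_out, gate, up):
    sig = torch.sigmoid(gate)
    grad_gate = grad_out * up * sig * (1 + gate * (1 - sig))  # d(silu)/d(gate)
    grad_up = grad_out * gate * sig                           # silu(gate)
    return grad_gate, grad_up


# quick sanity check against autograd, in float64 for tight tolerances:
gate = torch.randn(16, dtype=torch.float64, requires_grad=True)
up = torch.randn(16, dtype=torch.float64, requires_grad=True)
swiglu_forward_ref(gate, up).sum().backward()
dg, du = swiglu_backward_ref(torch.ones(16, dtype=torch.float64), gate.detach(), up.detach())
assert torch.allclose(gate.grad, dg) and torch.allclose(up.grad, du)
```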
@@ -1,5 +1,6 @@
 """Init for axolotl.loaders module"""
 
+# pylint: disable=unused-import
 # flake8: noqa
 
 from .adapter import load_adapter, load_lora
@@ -28,12 +28,14 @@ LOG = get_logger(__name__)
 def setup_quantized_meta_for_peft(model: torch.nn.Module):
     """Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""
 
-    def temp_to_method(self, *args, **kwargs):
+    def temp_to_method(self, *args, **kwargs):  # pylint: disable=unused-argument
         return self
 
     for param in model.parameters():
         if isinstance(param, Params4bit):
-            param.quant_state._orig_to = param.quant_state.to
+            param.quant_state._orig_to = (  # pylint: disable=protected-access
+                param.quant_state.to
+            )
             param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)
 
 
@@ -41,8 +43,10 @@ def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
     """Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
     for param in model.parameters():
         if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
-            param.quant_state.to = param.quant_state._orig_to
-            param.quant_state._orig_to = None
+            param.quant_state.to = (
+                param.quant_state._orig_to  # pylint: disable=protected-access
+            )
+            param.quant_state._orig_to = None  # pylint: disable=protected-access
 
 
 def find_all_linear_names(model):
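The helper pair above temporarily replaces a bound `to` method with a no-op so PEFT's device moves cannot touch the quant state, then restores the original afterwards. A self-contained sketch of the same swap/restore pattern; `QuantState` below is a hypothetical stand-in for the bitsandbytes object:

```python
import types


class QuantState:
    """Hypothetical stand-in for bitsandbytes' quant state object."""

    def to(self, device):
        print(f"moving to {device}")
        return self


def neuter(obj):
    # Stash the original bound method, then bind a no-op in its place.
    obj._orig_to = obj.to
    obj.to = types.MethodType(lambda self, *a, **k: self, obj)


def restore(obj):
    # Put the original bound method back and drop the stash.
    if hasattr(obj, "_orig_to") and obj._orig_to is not None:
        obj.to = obj._orig_to
        obj._orig_to = None


state = QuantState()
neuter(state)
state.to("meta")    # no-op: device moves can't touch the state
restore(state)
state.to("cuda:0")  # original behavior is back
```

Binding the replacement with `types.MethodType` (as the diff does) makes the instance attribute shadow the class method for that one object only, so other quant states are unaffected until they are patched in the same loop.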
Some files were not shown because too many files have changed in this diff.