Compare commits


2 Commits

Author SHA1 Message Date
Wing Lian
bb65157dcf fix conditional for None values 2025-08-17 12:49:48 -04:00
Wing Lian
7fd3d8abc4 handle batch size correctly when using split and dispatch batches 2025-08-16 22:05:31 -04:00
300 changed files with 11502 additions and 11471 deletions

View File

@@ -1,3 +1,3 @@
[bandit] [bandit]
exclude = tests exclude = tests
skips = B101,B615,B102,B110 skips = B101,B615

View File

@@ -12,6 +12,5 @@ reviews:
auto_review: auto_review:
enabled: true enabled: true
drafts: false drafts: false
auto_incremental_review: true
chat: chat:
auto_reply: true auto_reply: true

.flake8 Normal file (+5 lines)
View File

@@ -0,0 +1,5 @@
[flake8]
max-line-length = 88
select = C,E,F,W,B,B950
extend-ignore = E203, E501, W503

.isort.cfg Normal file (+4 lines)
View File

@@ -0,0 +1,4 @@
[settings]
profile=black
known_third_party=wandb,comet_ml
known_local_folder=src,tests

View File

@@ -10,12 +10,22 @@ repos:
- id: trailing-whitespace - id: trailing-whitespace
- id: no-commit-to-branch - id: no-commit-to-branch
args: ['--branch', 'main'] args: ['--branch', 'main']
- repo: https://github.com/astral-sh/ruff-pre-commit - repo: https://github.com/psf/black
rev: v0.12.9 rev: 25.1.0
hooks: hooks:
- id: ruff - id: black
args: [--fix] - repo: https://github.com/pycqa/isort
- id: ruff-format rev: 6.0.1
hooks:
- id: isort
- repo: https://github.com/PyCQA/flake8
rev: 7.3.0
hooks:
- id: flake8
- repo: https://github.com/pylint-dev/pylint
rev: v3.3.8
hooks:
- id: pylint
- repo: https://github.com/pre-commit/mirrors-mypy - repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.17.1 rev: v1.17.1
hooks: hooks:

.pylintrc Normal file (+15 lines)
View File

@@ -0,0 +1,15 @@
[MASTER]
init-hook="from pylint.config import find_default_config_files; import sys; sys.path.append(next(find_default_config_files()).parent.as_posix())"
[TYPECHECK]
# List of members which are set dynamically and missed by Pylint inference
# system, and so shouldn't trigger E1101 when accessed.
generated-members=numpy.*, torch.*
[pylint.messages_control]
disable=missing-function-docstring, line-too-long, import-error,
too-many-arguments, too-many-locals, too-many-statements, too-many-branches, too-few-public-methods,
too-many-instance-attributes, fixme, import-outside-toplevel, logging-fstring-interpolation,
too-many-positional-arguments, possibly-used-before-assignment

View File

@@ -2,6 +2,8 @@
modal application to run axolotl gpu tests in Modal modal application to run axolotl gpu tests in Modal
""" """
# pylint: disable=duplicate-code
import os import os
import pathlib import pathlib
import tempfile import tempfile
@@ -61,7 +63,7 @@ def run_cmd(cmd: str, run_folder: str):
# Propagate errors from subprocess. # Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
exit(exit_code) exit(exit_code) # pylint: disable=consider-using-sys-exit
@app.function( @app.function(

View File

@@ -1,5 +1,7 @@
"""Modal app to run axolotl GPU tests""" """Modal app to run axolotl GPU tests"""
# pylint: disable=duplicate-code
import os import os
import pathlib import pathlib
import tempfile import tempfile
@@ -68,4 +70,4 @@ def run_cmd(cmd: str, run_folder: str):
# Propagate errors from subprocess. # Propagate errors from subprocess.
if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec if exit_code := subprocess.call(cmd.split(), cwd=run_folder, env=sp_env): # nosec
exit(exit_code) exit(exit_code) # pylint: disable=consider-using-sys-exit
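
Both Modal test runners propagate subprocess failures with the walrus operator, and the right-hand side waives pylint's `consider-using-sys-exit`. A minimal standalone sketch of the same pattern, spelled with the pylint-preferred `sys.exit` (the command is illustrative):

```python
import subprocess  # nosec
import sys

# Run a command and propagate any non-zero exit code to the caller;
# sys.exit is the spelling pylint prefers over the exit() builtin.
if exit_code := subprocess.call(["pytest", "-q"]):  # nosec
    sys.exit(exit_code)
```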

View File

@@ -47,6 +47,7 @@ class QuartoGenerator:
"""Check if a type is a Pydantic BaseModel.""" """Check if a type is a Pydantic BaseModel."""
return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel) return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)
# pylint: disable=too-many-return-statements
def _extract_nested_type(self, field_type) -> Any: def _extract_nested_type(self, field_type) -> Any:
"""Extract the actual type from complex type annotations.""" """Extract the actual type from complex type annotations."""
# Handle Annotated types (Python 3.9+) # Handle Annotated types (Python 3.9+)
@@ -123,6 +124,7 @@ class QuartoGenerator:
return field_type return field_type
# pylint: disable=too-many-return-statements
def _extract_all_pydantic_models_from_type( def _extract_all_pydantic_models_from_type(
self, field_type self, field_type
) -> list[type[BaseModel]]: ) -> list[type[BaseModel]]:
@@ -316,6 +318,7 @@ class QuartoGenerator:
return all_groups return all_groups
# pylint: disable=too-many-return-statements
def _extract_field_groups_from_source( def _extract_field_groups_from_source(
self, model_class: type[BaseModel] self, model_class: type[BaseModel]
) -> list[dict]: ) -> list[dict]:
@@ -500,7 +503,7 @@ class QuartoGenerator:
nested_schema = nested_model.model_json_schema() nested_schema = nested_model.model_json_schema()
nested_properties = nested_schema.get("properties", {}) nested_properties = nested_schema.get("properties", {})
nested_required = nested_schema.get("required", []) nested_required = nested_schema.get("required", [])
except Exception: except Exception: # pylint: disable=broad-exception-caught
# Fallback: use model fields directly # Fallback: use model fields directly
nested_properties = {} nested_properties = {}
nested_required = [] nested_required = []
@@ -604,7 +607,7 @@ class QuartoGenerator:
schema = model_class.model_json_schema() schema = model_class.model_json_schema()
properties = schema.get("properties", {}) properties = schema.get("properties", {})
required = schema.get("required", []) required = schema.get("required", [])
except Exception as e: except Exception as e: # pylint: disable=broad-exception-caught
print( print(
f"Warning: Could not generate JSON schema ({e}). Using model fields instead." f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
) )

File diff suppressed because it is too large

View File

@@ -41,12 +41,6 @@ model, and final model output, you may need at least 3TB of free disk space to k
axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
``` ```
To simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we've partnered with [Baseten](https://baseten.co) to showcase multi-node
training of the 120B model using Baseten Truss. You can read more about this recipe on
[Baseten's blog](https://www.baseten.co/blog/how-to-fine-tune-gpt-oss-120b-with-baseten-and-axolotl/). The recipe can
be found on their
[GitHub](https://github.com/basetenlabs/ml-cookbook/tree/main/examples/oss-gpt-120b-axolotl/training).
ERRATA: Transformers saves the model Architecture prefixed with `FSDP` which needs to be manually renamed in `config.json`. ERRATA: Transformers saves the model Architecture prefixed with `FSDP` which needs to be manually renamed in `config.json`.
See https://github.com/huggingface/transformers/pull/40207 for the status of this issue. See https://github.com/huggingface/transformers/pull/40207 for the status of this issue.
@@ -67,23 +61,9 @@ mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/
### Inferencing your fine-tuned model ### Inferencing your fine-tuned model
#### vLLM
GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425 GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425
for more information about using a special vllm-openai docker image for inferencing with vLLM. for more information about using a special vllm-openai docker image for inferencing with vLLM.
Optionally, vLLM can be installed from nightly:
```bash
pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
```
and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
```bash
vllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888 --tensor-parallel-size 8
```
#### SGLang
SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for information on installing SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for information on installing
SGLang from source. Once you've installed SGLang, run the following command to launch a SGLang server: SGLang from source. Once you've installed SGLang, run the following command to launch a SGLang server:

View File

@@ -44,7 +44,7 @@ bf16: true
tf32: true tf32: true
flash_attention: true flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true gradient_checkpointing: true
activation_offloading: true activation_offloading: true

View File

@@ -40,7 +40,7 @@ bf16: true
tf32: true tf32: true
flash_attention: true flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true gradient_checkpointing: true
activation_offloading: true activation_offloading: true

View File

@@ -15,7 +15,7 @@ datasets:
field_thinking: thinking field_thinking: thinking
template_thinking_key: thinking template_thinking_key: thinking
dataset_prepared_path: ./outputs/last_run_prepared dataset_prepared_path: last_run_prepared
val_set_size: 0 val_set_size: 0
output_dir: ./outputs/gpt-oss-out/ output_dir: ./outputs/gpt-oss-out/
@@ -41,7 +41,7 @@ bf16: true
tf32: true tf32: true
flash_attention: true flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true gradient_checkpointing: true
activation_offloading: true activation_offloading: true

View File

@@ -15,7 +15,7 @@ datasets:
field_thinking: thinking field_thinking: thinking
template_thinking_key: thinking template_thinking_key: thinking
dataset_prepared_path: ./outputs/last_run_prepared dataset_prepared_path: last_run_prepared
val_set_size: 0 val_set_size: 0
output_dir: ./outputs/gpt-oss-out/ output_dir: ./outputs/gpt-oss-out/
@@ -40,7 +40,7 @@ bf16: true
tf32: true tf32: true
flash_attention: true flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true gradient_checkpointing: true
activation_offloading: true activation_offloading: true

View File

@@ -53,7 +53,7 @@ bf16: true
tf32: true tf32: true
flash_attention: true flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3 attn_implementation: kernels-community/vllm-flash-attn3
gradient_checkpointing: true gradient_checkpointing: true
activation_offloading: true activation_offloading: true

View File

@@ -26,34 +26,3 @@ include-package-data = true
[tool.setuptools.cmdclass] [tool.setuptools.cmdclass]
build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand" build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"
[tool.ruff]
line-length = 88
target-version = "py310"
[tool.ruff.lint]
select = ["E", "F", "W", "C90", "B"]
ignore = [
"E203", # Whitespace before ':'
"E501", # Line too long
"C901", # Too complex
"B019", # Use of functools.cache on methods
"E722", # Bare except
"F821", # Undefined name (for dynamic exec)
]
[tool.ruff.lint.isort]
known-third-party = ["wandb", "comet_ml"]
known-local-folder = ["src", "tests"]
# Black-compatible isort settings
force-single-line = false
combine-as-imports = true
split-on-trailing-comma = true
[tool.ruff.format]
# Use black's formatting style exactly
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
docstring-code-format = false

View File

@@ -13,8 +13,8 @@ liger-kernel==0.6.1
packaging==23.2 packaging==23.2
huggingface_hub>=0.33.0 huggingface_hub>=0.33.0
peft>=0.17.0 peft==0.17.0
transformers==4.55.3 transformers==4.55.2
tokenizers>=0.21.1 tokenizers>=0.21.1
accelerate==1.10.0 accelerate==1.10.0
datasets==4.0.0 datasets==4.0.0

View File

@@ -27,7 +27,7 @@ def parse_dataset(dataset=None, split="train"):
break break
if not field_messages: if not field_messages:
raise ValueError( raise ValueError(
f"No conversation field found in dataset: {', '.join(feature_keys)}" f'No conversation field found in dataset: {", ".join(feature_keys)}'
) )
ds_cfg["field_messages"] = field_messages ds_cfg["field_messages"] = field_messages
@@ -40,7 +40,7 @@ def parse_dataset(dataset=None, split="train"):
break break
if not message_property_mappings["role"]: if not message_property_mappings["role"]:
raise ValueError( raise ValueError(
f"No role field found in messages: {', '.join(message_fields)}" f'No role field found in messages: {", ".join(message_fields)}'
) )
for key in ["content", "text", "value"]: for key in ["content", "text", "value"]:
@@ -49,7 +49,7 @@ def parse_dataset(dataset=None, split="train"):
break break
if not message_property_mappings["content"]: if not message_property_mappings["content"]:
raise ValueError( raise ValueError(
f"No content field found in messages: {', '.join(message_fields)}" f'No content field found in messages: {", ".join(message_fields)}'
) )
ds_cfg["message_property_mappings"] = message_property_mappings ds_cfg["message_property_mappings"] = message_property_mappings

View File

@@ -1,10 +1,11 @@
# noqa # noqa
# pylint: skip-file
import sys import sys
try: try:
import torch import torch
except ImportError as error: except ImportError:
raise ImportError("Install torch via `pip install torch`") from error raise ImportError("Install torch via `pip install torch`")
from packaging.version import Version as V from packaging.version import Version as V
use_uv = "--uv" in sys.argv[1:] use_uv = "--uv" in sys.argv[1:]

View File

@@ -118,9 +118,9 @@ def get_package_version():
extras_require = { extras_require = {
"flash-attn": ["flash-attn==2.8.3"], "flash-attn": ["flash-attn==2.8.2"],
"ring-flash-attn": [ "ring-flash-attn": [
"flash-attn==2.8.3", "flash-attn==2.8.2",
"ring-flash-attn>=0.1.7", "ring-flash-attn>=0.1.7",
"yunchang==0.6.0", "yunchang==0.6.0",
], ],

View File

@@ -40,12 +40,6 @@ class VllmServeCliArgs:
default=None, default=None,
metadata={"help": "Number of tensor parallel workers to use."}, metadata={"help": "Number of tensor parallel workers to use."},
) )
data_parallel_size: Optional[int] = field(
default=None,
metadata={
"help": "Number of data parallel workers to use for vLLM serving. This controls how many model replicas are used for parallel inference."
},
)
host: Optional[str] = field( host: Optional[str] = field(
default=None, # nosec B104 default=None, # nosec B104
metadata={"help": "Host address to run the server on."}, metadata={"help": "Host address to run the server on."},

View File

@@ -22,7 +22,7 @@ HAS_PRINTED_LOGO = False
def print_axolotl_text_art(): def print_axolotl_text_art():
"""Prints axolotl ASCII art.""" """Prints axolotl ASCII art."""
global HAS_PRINTED_LOGO global HAS_PRINTED_LOGO # pylint: disable=global-statement
if HAS_PRINTED_LOGO: if HAS_PRINTED_LOGO:
return return
if is_main_process(): if is_main_process():

View File

@@ -41,7 +41,7 @@ def run_cmd(cmd: str, run_folder: str, volumes=None):
if exit_code := subprocess.call( # nosec B603 if exit_code := subprocess.call( # nosec B603
cmd.split(), cwd=run_folder, env=new_env cmd.split(), cwd=run_folder, env=new_env
): ):
exit(exit_code) exit(exit_code) # pylint: disable=consider-using-sys-exit
# Commit writes to volume. # Commit writes to volume.
if volumes: if volumes:
@@ -82,7 +82,7 @@ class ModalCloud(Cloud):
return res return res
def get_image(self): def get_image(self):
docker_tag = "main-py3.11-cu126-2.7.1" docker_tag = "main-py3.11-cu124-2.6.0"
if self.config.docker_tag: if self.config.docker_tag:
docker_tag = self.config.docker_tag docker_tag = self.config.docker_tag
docker_image = f"axolotlai/axolotl:{docker_tag}" docker_image = f"axolotlai/axolotl:{docker_tag}"
@@ -130,6 +130,7 @@ class ModalCloud(Cloud):
res = [] res = []
if self.config.secrets: if self.config.secrets:
for key in self.config.get("secrets", []): for key in self.config.get("secrets", []):
# pylint: disable=duplicate-code
if isinstance(key, str): if isinstance(key, str):
if val := os.environ.get(key, ""): if val := os.environ.get(key, ""):
res.append(modal.Secret.from_dict({key: val})) res.append(modal.Secret.from_dict({key: val}))
@@ -176,8 +177,8 @@ class ModalCloud(Cloud):
with self.app.run(detach=True): with self.app.run(detach=True):
modal_fn.remote( modal_fn.remote(
config_yaml, config_yaml,
*args,
volumes={k: v[0] for k, v in self.volumes.items()}, volumes={k: v[0] for k, v in self.volumes.items()},
*args,
**kwargs, **kwargs,
) )
@@ -186,7 +187,7 @@ class ModalCloud(Cloud):
return int(self.config.timeout) return int(self.config.timeout)
return 60 * 60 * 24 # 24 hours return 60 * 60 * 24 # 24 hours
def get_train_gpu(self): def get_train_gpu(self): # pylint: disable=too-many-return-statements
count = self.config.gpu_count or 1 count = self.config.gpu_count or 1
family = self.config.gpu.lower() or "l40s" family = self.config.gpu.lower() or "l40s"
@@ -199,7 +200,7 @@ class ModalCloud(Cloud):
if family in ["a10", "a10g"]: if family in ["a10", "a10g"]:
return modal.gpu.A10G(count=count) return modal.gpu.A10G(count=count)
if family == "h100": if family == "h100":
return f"H100:{count}" return modal.gpu.H100(count=count)
if family == "t4": if family == "t4":
return modal.gpu.T4(count=count) return modal.gpu.T4(count=count)
if family == "l4": if family == "l4":
@@ -276,7 +277,7 @@ def _train(
launcher: Literal["accelerate", "torchrun", "python"] = "accelerate", launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
launcher_args: list[str] | None = None, launcher_args: list[str] | None = None,
volumes=None, volumes=None,
**kwargs, **kwargs, # pylint: disable=unused-argument
): ):
Path("/workspace/mounts").mkdir(parents=True, exist_ok=True) Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out: with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:

View File

@@ -210,7 +210,7 @@ def load_cfg(
try: try:
device_props = torch.cuda.get_device_properties("cuda") device_props = torch.cuda.get_device_properties("cuda")
gpu_version = "sm_" + str(device_props.major) + str(device_props.minor) gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
except: except: # pylint: disable=bare-except # noqa: E722
gpu_version = None gpu_version = None
prepare_plugins(cfg) prepare_plugins(cfg)

View File

@@ -28,7 +28,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
cfg: Dictionary mapping `axolotl` config keys to values. cfg: Dictionary mapping `axolotl` config keys to values.
cli_args: CLI arguments. cli_args: CLI arguments.
""" """
# pylint: disable=duplicate-code
check_accelerate_default_config() check_accelerate_default_config()
if int(os.getenv("LOCAL_RANK", "0")) == 0: if int(os.getenv("LOCAL_RANK", "0")) == 0:
check_user_token() check_user_token()
@@ -49,7 +49,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
config: Path to `axolotl` config YAML file. config: Path to `axolotl` config YAML file.
kwargs: Additional keyword arguments to override config file values. kwargs: Additional keyword arguments to override config file values.
""" """
# pylint: disable=duplicate-code
parsed_cfg = load_cfg(config, **kwargs) parsed_cfg = load_cfg(config, **kwargs)
parser = HfArgumentParser(TrainerCliArgs) parser = HfArgumentParser(TrainerCliArgs)
parsed_cli_args, _ = parser.parse_args_into_dataclasses( parsed_cli_args, _ = parser.parse_args_into_dataclasses(

View File

@@ -35,7 +35,7 @@ def get_multi_line_input() -> str:
instruction = "" instruction = ""
for line in sys.stdin: for line in sys.stdin:
instruction += line instruction += line # pylint: disable=consider-using-join
return instruction return instruction
@@ -64,7 +64,7 @@ def do_inference(
importlib.import_module("axolotl.prompters"), prompter importlib.import_module("axolotl.prompters"), prompter
) )
elif cfg.chat_template: elif cfg.chat_template:
chat_template_str = get_chat_template(cfg.chat_template, tokenizer=tokenizer) chat_template_str = get_chat_template(cfg.chat_template)
elif cfg.datasets[0].type == "chat_template": elif cfg.datasets[0].type == "chat_template":
chat_template_str = get_chat_template_from_config( chat_template_str = get_chat_template_from_config(
cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
@@ -167,6 +167,7 @@ def do_inference_gradio(
if not instruction: if not instruction:
return return
if prompter_module: if prompter_module:
# pylint: disable=stop-iteration-return
prompt: str = next( prompt: str = next(
prompter_module().build_prompt(instruction=instruction.strip("\n")) prompter_module().build_prompt(instruction=instruction.strip("\n"))
) )
@@ -251,7 +252,7 @@ def do_cli(
config: Path to `axolotl` config YAML file. config: Path to `axolotl` config YAML file.
kwargs: Additional keyword arguments to override config file values. kwargs: Additional keyword arguments to override config file values.
""" """
# pylint: disable=duplicate-code
parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs) parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs)
parsed_cfg.sample_packing = False parsed_cfg.sample_packing = False
parser = transformers.HfArgumentParser(InferenceCliArgs) parser = transformers.HfArgumentParser(InferenceCliArgs)

View File

@@ -1,5 +1,7 @@
"""Click CLI definitions for various axolotl commands.""" """Click CLI definitions for various axolotl commands."""
# pylint: disable=redefined-outer-name
import os import os
import subprocess # nosec B404 import subprocess # nosec B404
from typing import Literal, Optional from typing import Literal, Optional

View File

@@ -32,7 +32,7 @@ LOG = get_logger(__name__)
class BFloat16CastPlanner(_EmptyStateDictLoadPlanner): class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
"""A custom planner to cast tensors to bfloat16 on the fly during loading.""" """A custom planner to cast tensors to bfloat16 on the fly during loading."""
def commit_tensor(self, read_item, tensor): def commit_tensor(self, read_item, tensor): # pylint: disable=unused-argument
tensor.copy_(tensor.to(torch.bfloat16)) tensor.copy_(tensor.to(torch.bfloat16))
@@ -59,10 +59,10 @@ def _distributed_checkpoint_to_merged_weights(
state_dict: Dict = {} state_dict: Dict = {}
save_path_ = Path(save_path) save_path_ = Path(save_path)
save_path_.mkdir(exist_ok=True) save_path_.mkdir(exist_ok=True)
dist_cp_format_utils._load_state_dict( dist_cp_format_utils._load_state_dict( # pylint: disable=protected-access
state_dict, state_dict,
storage_reader=dist_cp.FileSystemReader(checkpoint_dir), storage_reader=dist_cp.FileSystemReader(checkpoint_dir),
planner=BFloat16CastPlanner(), planner=BFloat16CastPlanner(), # pylint: disable=protected-access
no_dist=True, no_dist=True,
) )
@@ -191,7 +191,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
config: Path to `axolotl` config YAML file. config: Path to `axolotl` config YAML file.
kwargs: Additional keyword arguments to override config file values. kwargs: Additional keyword arguments to override config file values.
""" """
# pylint: disable=duplicate-code
parsed_cfg = load_cfg(config, **kwargs) parsed_cfg = load_cfg(config, **kwargs)
fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0" fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0"

View File

@@ -73,7 +73,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
AutoModelForCausalLM.from_pretrained( AutoModelForCausalLM.from_pretrained(
model_name, trust_remote_code=True model_name, trust_remote_code=True
) )
except Exception: # nosec B110 except Exception as exc: # pylint: disable=broad-exception-caught,unused-variable # nosec B110 # noqa F841
pass pass
# fmt: on # fmt: on
@@ -95,10 +95,9 @@ def do_cli(
config: Path to `axolotl` config YAML file. config: Path to `axolotl` config YAML file.
kwargs: Additional keyword arguments to override config file values. kwargs: Additional keyword arguments to override config file values.
""" """
# pylint: disable=duplicate-code
os.environ["AXOLOTL_IS_PREPROCESS"] = "1" os.environ["AXOLOTL_IS_PREPROCESS"] = "1"
is_preprocess = kwargs.pop("is_preprocess", True) parsed_cfg = load_cfg(config, **kwargs)
parsed_cfg = load_cfg(config, is_preprocess=is_preprocess, **kwargs)
parsed_cfg.is_preprocess = True parsed_cfg.is_preprocess = True
parser = transformers.HfArgumentParser(PreprocessCliArgs) parser = transformers.HfArgumentParser(PreprocessCliArgs)
parsed_cli_args, _ = parser.parse_args_into_dataclasses( parsed_cli_args, _ = parser.parse_args_into_dataclasses(

View File

@@ -59,7 +59,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
config: Path to `axolotl` config YAML file. config: Path to `axolotl` config YAML file.
kwargs: Additional keyword arguments to override config file values. kwargs: Additional keyword arguments to override config file values.
""" """
# pylint: disable=duplicate-code
parsed_cfg = load_cfg(config, **kwargs) parsed_cfg = load_cfg(config, **kwargs)
parser = HfArgumentParser(TrainerCliArgs) parser = HfArgumentParser(TrainerCliArgs)
parsed_cli_args, _ = parser.parse_args_into_dataclasses( parsed_cli_args, _ = parser.parse_args_into_dataclasses(

View File

@@ -65,7 +65,7 @@ def add_options_from_dataclass(config_class: Type[Any]) -> Callable:
for field in reversed(dataclasses.fields(config_class)): for field in reversed(dataclasses.fields(config_class)):
field_type = _strip_optional_type(field.type) field_type = _strip_optional_type(field.type)
if field_type is bool: if field_type == bool:
field_name = field.name.replace("_", "-") field_name = field.name.replace("_", "-")
option_name = f"--{field_name}/--no-{field_name}" option_name = f"--{field_name}/--no-{field_name}"
function = click.option( function = click.option(
@@ -103,7 +103,7 @@ def add_options_from_config(config_class: Type[BaseModel]) -> Callable:
for name, field in reversed(config_class.model_fields.items()): for name, field in reversed(config_class.model_fields.items()):
field_type = _strip_optional_type(field.annotation) field_type = _strip_optional_type(field.annotation)
if field_type is bool: if field_type == bool:
field_name = name.replace("_", "-") field_name = name.replace("_", "-")
option_name = f"--{field_name}/--no-{field_name}" option_name = f"--{field_name}/--no-{field_name}"
function = click.option( function = click.option(
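
`field_type is bool` and `field_type == bool` agree for plain classes; `is` is simply the conventional identity check once `Optional[...]` has been stripped. A sketch of what a helper like `_strip_optional_type` plausibly does (this implementation is an assumption, not the repo's code):

```python
from typing import Optional, Union, get_args, get_origin

def strip_optional(tp):
    # Unwrap Optional[X] (i.e. Union[X, None]) down to X; pass other types through.
    if get_origin(tp) is Union:
        args = [a for a in get_args(tp) if a is not type(None)]
        if len(args) == 1:
            return args[0]
    return tp

assert strip_optional(Optional[bool]) is bool
assert strip_optional(int) is int
```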

View File

@@ -3,12 +3,11 @@
import random import random
from copy import deepcopy from copy import deepcopy
from itertools import product from itertools import product
from typing import Any
def generate_sweep_configs( def generate_sweep_configs(
base_config: dict[str, list], sweeps_config: dict[str, list] base_config: dict[str, list], sweeps_config: dict[str, list]
) -> list[dict[str, Any]]: ) -> list[dict[str, list]]:
""" """
Recursively generates all possible configurations by applying sweeps to the base config. Recursively generates all possible configurations by applying sweeps to the base config.
@@ -49,10 +48,7 @@ def generate_sweep_configs(
new_config = {} new_config = {}
# new_config = deepcopy(base_config) # new_config = deepcopy(base_config)
# Combine regular parameters with paired parameters # Combine regular parameters with paired parameters
full_combo = { full_combo = {**dict(zip(param_names, reg_combo)), **paired_set}
**dict(zip(param_names, reg_combo, strict=False)),
**paired_set,
}
for param_name, param_value in full_combo.items(): for param_name, param_value in full_combo.items():
new_config[param_name] = param_value new_config[param_name] = param_value
print(new_config) print(new_config)
@@ -61,7 +57,7 @@ def generate_sweep_configs(
# If no paired values, just use regular combinations # If no paired values, just use regular combinations
# new_config = deepcopy(base_config) # new_config = deepcopy(base_config)
new_config = {} new_config = {}
for param_name, param_value in zip(param_names, reg_combo, strict=False): for param_name, param_value in zip(param_names, reg_combo):
new_config[param_name] = param_value new_config[param_name] = param_value
print(new_config) print(new_config)
all_combinations.append(new_config) all_combinations.append(new_config)
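
Stripped of the typing and `zip` strictness changes, the core of `generate_sweep_configs` is a cartesian product over the sweep values. A self-contained sketch with hypothetical parameter names:

```python
from itertools import product

def sweep_grid(sweeps: dict[str, list]) -> list[dict]:
    # One config dict per element of the cartesian product of all value lists.
    names = list(sweeps)
    return [dict(zip(names, combo)) for combo in product(*sweeps.values())]

print(sweep_grid({"learning_rate": [1e-4, 2e-4], "micro_batch_size": [1, 2]}))
# four configurations
```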

View File

@@ -4,7 +4,6 @@ import os
import subprocess # nosec import subprocess # nosec
import sys import sys
import tempfile import tempfile
from pathlib import Path
from typing import Any, Iterator, Literal from typing import Any, Iterator, Literal
import yaml import yaml
@@ -89,12 +88,8 @@ def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str,
# Generate all possible configurations # Generate all possible configurations
permutations = generate_sweep_configs(base_config, sweep_config) permutations = generate_sweep_configs(base_config, sweep_config)
is_group = len(permutations) > 1 is_group = len(permutations) > 1
base_output_dir = base_config.get("output_dir", "./model-out") for permutation in permutations:
for idx, permutation in enumerate(permutations, start=1): # pylint: disable=consider-using-with
permutation_dir = Path(permutation.get("output_dir", base_output_dir))
permutation_id = f"sweep{idx:04d}"
permutation["output_dir"] = str(permutation_dir / permutation_id)
temp_file = tempfile.NamedTemporaryFile( temp_file = tempfile.NamedTemporaryFile(
mode="w", mode="w",
suffix=".yaml", suffix=".yaml",

View File

@@ -39,7 +39,7 @@ def do_vllm_serve(
model = cfg.base_model model = cfg.base_model
serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve") serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
vllm_serve_main = __import__(serve_module, fromlist=["main"]).main vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
tensor_parallel_size = 1 tensor_parallel_size = 1
data_parallel_size = 1 data_parallel_size = 1
@@ -68,6 +68,7 @@ def do_vllm_serve(
cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False
) )
# pylint: disable=unexpected-keyword-arg
vllm_script_args = AxolotlScriptArguments( vllm_script_args = AxolotlScriptArguments(
model=model, model=model,
tensor_parallel_size=tensor_parallel_size, tensor_parallel_size=tensor_parallel_size,
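
Both sides load the serve module dynamically; `getattr(__import__(mod, fromlist=["main"]), "main")` and `__import__(mod, fromlist=["main"]).main` are equivalent. The idiomatic spelling uses `importlib`, which returns the leaf module directly:

```python
import importlib

def load_entrypoint(module_path: str, attr: str = "main"):
    # Avoids __import__'s fromlist quirk entirely.
    return getattr(importlib.import_module(module_path), attr)

# e.g. load_entrypoint("trl.scripts.vllm_serve"), assuming trl is installed
```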

View File

@@ -6,7 +6,7 @@ from dataclasses import dataclass
from datasets import Dataset from datasets import Dataset
import axolotl.monkeypatch.data.batch_dataset_fetcher # noqa: F401 import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.loaders import load_processor, load_tokenizer from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets from axolotl.utils.data import prepare_datasets, prepare_preference_datasets

View File

@@ -67,7 +67,9 @@ class JsonToJsonlConverter:
self.json_parser = json_parser self.json_parser = json_parser
self.jsonl_serializer = jsonl_serializer self.jsonl_serializer = jsonl_serializer
def convert(self, input_file_path, output_file_path): def convert(
self, input_file_path, output_file_path
): # pylint: disable=unused-argument
content = self.file_reader.read(input_file_path) content = self.file_reader.read(input_file_path)
data = self.json_parser.parse(content) data = self.json_parser.parse(content)
# data = [r for r in data if r["conversations"]] # vicuna cleaned has rows with empty conversations # data = [r for r in data if r["conversations"]] # vicuna cleaned has rows with empty conversations
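
Setting aside the injected reader/parser/serializer, the converter's job reduces to reading a JSON array and writing one record per line. A minimal sketch:

```python
import json

def json_to_jsonl(input_path: str, output_path: str) -> None:
    # Read a JSON array of records; emit one JSON object per line (JSONL).
    with open(input_path, encoding="utf-8") as f_in:
        data = json.load(f_in)
    with open(output_path, "w", encoding="utf-8") as f_out:
        for record in data:
            f_out.write(json.dumps(record) + "\n")
```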

View File

@@ -84,7 +84,9 @@ def create_causal_mask(
batch_size, dtype = input_embeds.shape[0], input_embeds.dtype batch_size, dtype = input_embeds.shape[0], input_embeds.dtype
if attention_mask is not None: if attention_mask is not None:
def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx): def causal_doc_mask_mod(
batch_idx, head_idx, q_idx, kv_idx
): # pylint: disable=unused-argument
""" """
Defines the logic of a block causal mask by combining both a standard causal mask Defines the logic of a block causal mask by combining both a standard causal mask
and a block diagonal document mask. and a block diagonal document mask.
@@ -101,7 +103,9 @@ def create_causal_mask(
mask_factory_function = causal_doc_mask_mod mask_factory_function = causal_doc_mask_mod
else: else:
mask_factory_function = causal_mask_function mask_factory_function = causal_mask_function
mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[config._attn_implementation] mask_interface = ALL_MASK_ATTENTION_FUNCTIONS[
config._attn_implementation # pylint: disable=protected-access
]
# Do not allow skip if we are compiling (this is to match BC) # Do not allow skip if we are compiling (this is to match BC)
allow_is_causal_skip = ( allow_is_causal_skip = (
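
`causal_doc_mask_mod` fuses a standard causal mask with a block-diagonal document mask so packed samples cannot attend across document boundaries. A hedged sketch of the predicate in flex-attention's `mask_mod` style, assuming a `document_ids` tensor that maps each position to its packed-document index:

```python
import torch

# (batch, seq_len): equal values mark tokens of the same packed document (assumed layout)
document_ids = torch.tensor([[0, 0, 0, 1, 1, 2]])

def causal_doc_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
    causal = q_idx >= kv_idx                       # standard causal constraint
    same_doc = document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
    return causal & same_doc                       # both must hold
```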

View File

@@ -44,7 +44,7 @@ from axolotl.utils.schemas.enums import CustomSupportedOptimizers
LOG = logging.getLogger(__name__) LOG = logging.getLogger(__name__)
with suppress(ImportError): with suppress(ImportError):
import torch._dynamo import torch._dynamo # pylint: disable=ungrouped-imports
class TrainerBuilderBase(abc.ABC): class TrainerBuilderBase(abc.ABC):
@@ -260,14 +260,14 @@ class TrainerBuilderBase(abc.ABC):
adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon") adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")
if self.cfg.optimizer == "muon": if self.cfg.optimizer == "muon":
from axolotl.contribs.mit.muon import ( from axolotl.contribs.mit.muon import ( # pylint: disable=no-name-in-module
MuonOptimizerFactory, MuonOptimizerFactory,
) )
optimizer_cls = MuonOptimizerFactory optimizer_cls = MuonOptimizerFactory
optimizer_kwargs.update(adam_kwargs) optimizer_kwargs.update(adam_kwargs)
elif self.cfg.optimizer == "dion": elif self.cfg.optimizer == "dion":
from axolotl.contribs.mit.dion import ( from axolotl.contribs.mit.dion import ( # pylint: disable=no-name-in-module
DionOptimizerFactory, DionOptimizerFactory,
) )
@@ -414,8 +414,12 @@ class TrainerBuilderBase(abc.ABC):
def _configure_torch_compile(self, training_args_kwargs: dict): def _configure_torch_compile(self, training_args_kwargs: dict):
if self.cfg.torch_compile and getattr(torch, "_dynamo", None): if self.cfg.torch_compile and getattr(torch, "_dynamo", None):
torch._dynamo.config.suppress_errors = True torch._dynamo.config.suppress_errors = ( # pylint: disable=protected-access
torch._dynamo.config.accumulated_cache_size_limit = 256 True
)
torch._dynamo.config.accumulated_cache_size_limit = ( # pylint: disable=protected-access
256
)
training_args_kwargs["torch_compile"] = self.cfg.torch_compile training_args_kwargs["torch_compile"] = self.cfg.torch_compile
if self.cfg.torch_compile_backend: if self.cfg.torch_compile_backend:
training_args_kwargs["torch_compile_backend"] = ( training_args_kwargs["torch_compile_backend"] = (

View File

@@ -344,14 +344,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_args_cls = AxolotlPRMConfig training_args_cls = AxolotlPRMConfig
else: else:
training_args_cls = AxolotlTrainingArguments training_args_cls = AxolotlTrainingArguments
training_args = training_args_cls( training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
**training_arguments_kwargs, **training_arguments_kwargs,
) )
training_args = self.hook_post_create_training_args(training_args) training_args = self.hook_post_create_training_args(training_args)
# unset run_name so wandb sets up experiment names # unset run_name so wandb sets up experiment names
if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
training_args.run_name = None training_args.run_name = ( # pylint: disable=attribute-defined-outside-init
None
)
data_collator_kwargs = { data_collator_kwargs = {
"padding": True, # True/"longest" is the default "padding": True, # True/"longest" is the default
@@ -422,7 +424,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
): ):
if training_args.pretraining: if training_args.pretraining:
if ( if (
self.cfg.pretraining_sample_concatenation is False not self.cfg.pretraining_sample_concatenation
or self.cfg.micro_batch_size > 1 or self.cfg.micro_batch_size > 1
): ):
return DataCollatorForSeq2Seq(self.tokenizer, **kwargs) return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
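
This is the conditional touched by commit bb65157dcf ("fix conditional for None values"): `pretraining_sample_concatenation` is tri-state, and `not flag` conflates an unset `None` with an explicit `False`, while `flag is False` only fires on the explicit opt-out:

```python
for flag in (True, False, None):
    print(flag, "| not flag:", not flag, "| flag is False:", flag is False)
# True  | not flag: False | flag is False: False
# False | not flag: True  | flag is False: True
# None  | not flag: True  | flag is False: False   <- the divergent case
```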

View File

@@ -168,14 +168,16 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if plugin_training_args: if plugin_training_args:
training_args_kwargs.update(plugin_training_args) training_args_kwargs.update(plugin_training_args)
training_args = training_args_cls( training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
logging_first_step=True, logging_first_step=True,
**training_args_kwargs, **training_args_kwargs,
) )
# unset run_name so wandb sets up experiment names # unset run_name so wandb sets up experiment names
if self.cfg.use_wandb and training_args.run_name == training_args.output_dir: if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
training_args.run_name = None training_args.run_name = ( # pylint: disable=attribute-defined-outside-init
None
)
return training_args, trainer_kwargs return training_args, trainer_kwargs

View File

@@ -10,7 +10,7 @@ from .shared import wrap_tools
def format_message( def format_message(
message: Messages, message: Messages,
message_index: Optional[int] = None, message_index: Optional[int] = None, # pylint: disable=unused-argument
) -> Messages: ) -> Messages:
if message.is_chat_formatted: if message.is_chat_formatted:
return message return message

View File

@@ -15,11 +15,11 @@ class MessageRoles(str, Enum):
Message roles for the system, user, assistant, and tools Message roles for the system, user, assistant, and tools
""" """
system = "system" system = "system" # pylint: disable=invalid-name
user = "user" user = "user" # pylint: disable=invalid-name
assistant = "assistant" assistant = "assistant" # pylint: disable=invalid-name
tool = "tool" tool = "tool" # pylint: disable=invalid-name
ipython = ( ipython = ( # pylint: disable=invalid-name
# for responses from builtin tools # for responses from builtin tools
"ipython" "ipython"
) )
@@ -30,12 +30,12 @@ class MessageContentTypes(str, Enum):
Message content types for text, image, audio, tool calls, and tool responses Message content types for text, image, audio, tool calls, and tool responses
""" """
special_token = "special_token" # nosec B105 special_token = "special_token" # pylint: disable=invalid-name # nosec B105
text = "text" text = "text" # pylint: disable=invalid-name
image = "image" image = "image" # pylint: disable=invalid-name
audio = "audio" audio = "audio" # pylint: disable=invalid-name
tool_call = "tool_call" tool_call = "tool_call" # pylint: disable=invalid-name # to differentiate regular responses from tool calls from the assistant
tool_response = "tool_response" tool_response = "tool_response" # pylint: disable=invalid-name
class SpecialToken(str, Enum): class SpecialToken(str, Enum):
@@ -43,8 +43,8 @@ class SpecialToken(str, Enum):
Special tokens for beginning of string and end of string Special tokens for beginning of string and end of string
""" """
bos_token = "bos_token" # nosec B105 bos_token = "bos_token" # pylint: disable=invalid-name # nosec B105
eos_token = "eos_token" # nosec B105 eos_token = "eos_token" # pylint: disable=invalid-name # nosec B105
class ToolCallFunction(BaseModel): class ToolCallFunction(BaseModel):
@@ -73,7 +73,7 @@ class ToolCallContents(BaseModel):
name: str name: str
arguments: dict[str, Union[str, int]] arguments: dict[str, Union[str, int]]
id: Optional[str] = None id: Optional[str] = None # pylint: disable=invalid-name
def __str__(self) -> str: def __str__(self) -> str:
data = {"name": self.name, "arguments": self.arguments} data = {"name": self.name, "arguments": self.arguments}
@@ -89,7 +89,7 @@ class ToolResponseContents(BaseModel):
name: str name: str
content: Union[str, dict[str, Union[str, int, float]]] content: Union[str, dict[str, Union[str, int, float]]]
id: Optional[str] = None id: Optional[str] = None # pylint: disable=invalid-name
def __str__(self) -> str: def __str__(self) -> str:
data = {"name": self.name, "content": self.content} data = {"name": self.name, "content": self.content}

View File

@@ -1,17 +1,23 @@
""" """
This module contains a function that builds a transform that takes a row from the This module contains a function that builds a transform that takes a row from the dataset and converts it to a Chat.
dataset and converts it to a Chat.
""" """
from typing import Any, Mapping from typing import Any, Mapping, Union
def chat_message_transform_builder( def chat_message_transform_builder( # pylint: disable=dangerous-default-value
train_on_inputs=False, train_on_inputs=False,
conversations_field: str = "conversations", conversations_field: str = "conversations",
message_field_role: str | list[str] | None = None, # commonly "role" message_field_role: Union[str, list[str]] = ["role", "from"], # commonly "role"
message_field_content: str | list[str] | None = None, # commonly "content" message_field_content: Union[str, list[str]] = [
message_field_training: str | list[str] | None = None, # commonly "weight" "value",
"text",
"content",
], # commonly "content"
message_field_training: Union[str, list[str]] = [
"train",
"weight",
], # commonly "weight"
): ):
"""Builds a transform that takes a row from the dataset and converts it to a Chat """Builds a transform that takes a row from the dataset and converts it to a Chat
@@ -33,12 +39,6 @@ def chat_message_transform_builder(
A function that takes a list of conversations and returns a list of messages. A function that takes a list of conversations and returns a list of messages.
""" """
if message_field_training is None:
message_field_training = ["train", "weight"]
if message_field_content is None:
message_field_content = ["value", "text", "content"]
if message_field_role is None:
message_field_role = ["role", "from"]
message_field_role = ( message_field_role = (
[message_field_role] [message_field_role]
if isinstance(message_field_role, str) if isinstance(message_field_role, str)
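
The right-hand side's list defaults are what pylint's `dangerous-default-value` targets: a single list object is shared across every call, so mutations leak between calls. The left-hand side's None-sentinel idiom avoids that:

```python
def risky(fields=["role"]):        # one shared list for every call
    fields.append("from")
    return fields

print(risky())  # ['role', 'from']
print(risky())  # ['role', 'from', 'from']  <- state leaked between calls

def safe(fields=None):
    fields = ["role", "from"] if fields is None else list(fields)
    return fields
```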

View File

@@ -1,5 +1,6 @@
"""Init for axolotl.core.trainers""" """Init for axolotl.core.trainers"""
# pylint: disable=unused-import
# flake8: noqa # flake8: noqa
from .base import AxolotlTrainer from .base import AxolotlTrainer

View File

@@ -1,5 +1,7 @@
"""Module for customized trainers""" """Module for customized trainers"""
# pylint: disable=too-many-lines
from __future__ import annotations from __future__ import annotations
import os import os
@@ -270,6 +272,20 @@ class AxolotlTrainer(
num_workers=self.args.dataloader_num_workers, num_workers=self.args.dataloader_num_workers,
rank=self.args.process_index, rank=self.args.process_index,
) )
if (
self.args.accelerator_config is not None
and self.args.accelerator_config.split_batches
and self.args.accelerator_config.dispatch_batches
):
if self.args.sample_packing and self.args.pretraining:
if not self.args.eval_sample_packing and not is_training:
dataloader_params["batch_size"] *= self.accelerator.num_processes
else:
dataloader_params["batch_size"] = self.accelerator.num_processes
elif not self.args.sample_packing and self.args.pretraining:
dataloader_params["batch_size"] *= self.accelerator.num_processes
if self.args.sample_packing and ( if self.args.sample_packing and (
(is_training and not self.args.pretraining) (is_training and not self.args.pretraining)
or (not is_training and self.args.eval_sample_packing is not False) or (not is_training and self.args.eval_sample_packing is not False)
@@ -283,9 +299,9 @@ class AxolotlTrainer(
# fmt: off # fmt: off
if dataloader_key is not None and self.args.dataloader_persistent_workers: if dataloader_key is not None and self.args.dataloader_persistent_workers:
if hasattr(self, "_eval_dataloaders"): if hasattr(self, "_eval_dataloaders"):
self._eval_dataloaders[dataloader_key] = dataloader # type: ignore self._eval_dataloaders[dataloader_key] = dataloader # type: ignore # pylint: disable=access-member-before-definition
else: else:
self._eval_dataloaders = {dataloader_key: dataloader} self._eval_dataloaders = {dataloader_key: dataloader} # pylint: disable=attribute-defined-outside-init
# fmt: on # fmt: on
return self.accelerator.prepare(dataloader) return self.accelerator.prepare(dataloader)
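
The block added on the right is commit 7fd3d8abc4's fix: when Accelerate runs with both `split_batches` and `dispatch_batches`, one process builds a global batch that is then split across ranks, so the dataloader must be sized for the whole world rather than per device. A simplified sketch of the arithmetic (names mirror the diff; the helper itself is hypothetical):

```python
def dispatched_batch_size(
    per_device_bs: int,
    num_processes: int,
    sample_packing: bool,
    pretraining: bool,
    eval_sample_packing: bool = True,
    is_training: bool = True,
) -> int:
    # With split_batches + dispatch_batches, one rank loads the global batch.
    if pretraining and sample_packing:
        if not eval_sample_packing and not is_training:
            return per_device_bs * num_processes
        return num_processes  # packed streams: one packed sample per rank
    if pretraining and not sample_packing:
        return per_device_bs * num_processes
    return per_device_bs
```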
@@ -441,7 +457,7 @@ class AxolotlTrainer(
model, model,
inputs, inputs,
return_outputs=False, return_outputs=False,
num_items_in_batch=None, num_items_in_batch=None, # pylint: disable=unused-argument
): ):
concat_inputs = AxolotlTrainer.orpo_concatenate_inputs( concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
inputs, inputs,
@@ -522,7 +538,9 @@ class AxolotlTrainer(
accelerator_config = self.args.accelerator_config.to_dict() accelerator_config = self.args.accelerator_config.to_dict()
use_configured_state = accelerator_config.get("use_configured_state", False) use_configured_state = accelerator_config.get("use_configured_state", False)
if not use_configured_state: if not use_configured_state:
AcceleratorState._reset_state(reset_partial_state=True) AcceleratorState._reset_state( # pylint: disable=protected-access
reset_partial_state=True
)
super().create_accelerator_and_postprocess() super().create_accelerator_and_postprocess()
@@ -536,6 +554,7 @@ class AxolotlTrainer(
): ):
self.accelerator.state.fsdp_plugin.limit_all_gathers = True self.accelerator.state.fsdp_plugin.limit_all_gathers = True
# pylint: disable=unused-argument
def additional_accelerator_args( def additional_accelerator_args(
self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs
) -> dict[str, Any]: ) -> dict[str, Any]:

View File

@@ -101,11 +101,11 @@ class AxolotlDPOTrainer(
) -> dict[str, torch.Tensor]: ) -> dict[str, torch.Tensor]:
if self.args.dpo_norm_loss: if self.args.dpo_norm_loss:
# fmt: off # fmt: off
loss_type: str = self.loss_type # type: ignore[has-type] loss_type: str = self.loss_type # type: ignore[has-type] # pylint: disable=access-member-before-definition
# fmt: on # fmt: on
# concatenated_forward handles avg token logprob for ipo case already # concatenated_forward handles avg token logprob for ipo case already
self.loss_type = "ipo" self.loss_type = "ipo" # pylint: disable=attribute-defined-outside-init
res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model) res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
self.loss_type = loss_type self.loss_type = loss_type # pylint: disable=attribute-defined-outside-init
return res return res
return super().concatenated_forward(model, batch, is_ref_model=is_ref_model) return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)

View File

@@ -128,7 +128,9 @@ class GRPOStrategy:
return grpo_args_kwargs return grpo_args_kwargs
@classmethod @classmethod
def set_trainer_args(cls, cfg: DictDefault) -> list[Any]: def set_trainer_args(
cls, cfg: DictDefault
) -> list[Any]: # pylint: disable=unused-argument
trainer_args = [] trainer_args = []
if cfg.trl and cfg.trl.reward_funcs: if cfg.trl and cfg.trl.reward_funcs:
reward_funcs = [] reward_funcs = []
@@ -149,7 +151,7 @@ class GRPOStrategy:
return trainer_kwargs return trainer_kwargs
@classmethod @classmethod
def get_collator(cls, *args, **kwargs): def get_collator(cls, *args, **kwargs): # pylint: disable=unused-argument
# No data collation is needed in GRPO, handled by trl's trainer __init__ # No data collation is needed in GRPO, handled by trl's trainer __init__
return None return None

View File

@@ -1,5 +1,7 @@
"""Axolotl GRPO trainers (with and without sequence parallelism handling)""" """Axolotl GRPO trainers (with and without sequence parallelism handling)"""
# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
import warnings import warnings
from functools import partial from functools import partial
from typing import Any from typing import Any
@@ -50,6 +52,7 @@ from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, Optimizer
from axolotl.monkeypatch.ring_attn import get_ring_attn_group from axolotl.monkeypatch.ring_attn import get_ring_attn_group
if is_peft_available(): if is_peft_available():
# pylint: disable=unused-import
from peft import PeftConfig from peft import PeftConfig
@@ -250,7 +253,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
def get_train_dataloader(self) -> DataLoader: def get_train_dataloader(self) -> DataLoader:
"""Get dataloader for training""" """Get dataloader for training"""
train_dataset = self.train_dataset train_dataset = self.train_dataset
# pylint: disable=access-member-before-definition
data_collator = self.data_collator # type: ignore data_collator = self.data_collator # type: ignore
# Handle dataset preprocessing # Handle dataset preprocessing
@@ -263,7 +266,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
train_dataset, description="training" train_dataset, description="training"
) )
else: else:
self.data_collator = self._get_collator_with_removed_columns( self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init
data_collator, data_collator,
description="training", description="training",
) )
@@ -305,10 +308,10 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
# Generate completions using either vLLM or regular generation # Generate completions using either vLLM or regular generation
if self.args.use_vllm: if self.args.use_vllm:
# First, have main process load weights if needed # First, have main process load weights if needed
# pylint: disable=access-member-before-definition
if self.state.global_step != self._last_loaded_step: # type: ignore[has-type] if self.state.global_step != self._last_loaded_step: # type: ignore[has-type]
self._move_model_to_vllm() self._move_model_to_vllm()
# pylint: disable=attribute-defined-outside-init
self._last_loaded_step = self.state.global_step self._last_loaded_step = self.state.global_step
# Generate completions using vLLM: gather all prompts and use them in a single call in the main process # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
@@ -330,9 +333,8 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
# Extract prompts from this SP group, accounting for num_generations duplicates # Extract prompts from this SP group, accounting for num_generations duplicates
# We only need prompts from one rank in each SP group # We only need prompts from one rank in each SP group
group_prompts = all_prompts_text[ group_prompts = all_prompts_text[
group_leader_rank * len(prompts_text) : ( group_leader_rank
group_leader_rank + 1 * len(prompts_text) : (group_leader_rank + 1)
)
* len(prompts_text) : self.num_generations * len(prompts_text) : self.num_generations
] ]
@@ -483,7 +485,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
) )
if is_conversational(inputs[0]): if is_conversational(inputs[0]):
completions = [] completions = []
for prompt, completion in zip(prompts, completions_text, strict=False): for prompt, completion in zip(prompts, completions_text):
bootstrap = ( bootstrap = (
prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else "" prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
) )
@@ -501,7 +503,6 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
self.reward_funcs, self.reward_funcs,
self.reward_processing_classes, self.reward_processing_classes,
self.reward_func_names, self.reward_func_names,
strict=False,
) )
): ):
with profiling_context(self, reward_func_name): with profiling_context(self, reward_func_name):
@@ -510,17 +511,14 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
): # Module instead of PretrainedModel for compat with compiled models ): # Module instead of PretrainedModel for compat with compiled models
if is_conversational(inputs[0]): if is_conversational(inputs[0]):
messages = [ messages = [
{"messages": p + c} {"messages": p + c} for p, c in zip(prompts, completions)
for p, c in zip(prompts, completions, strict=False)
] ]
texts = [ texts = [
apply_chat_template(x, reward_processing_class)["text"] apply_chat_template(x, reward_processing_class)["text"]
for x in messages for x in messages
] ]
else: else:
texts = [ texts = [p + c for p, c in zip(prompts, completions)]
p + c for p, c in zip(prompts, completions, strict=False)
]
reward_inputs = reward_processing_class( reward_inputs = reward_processing_class(
text=texts, text=texts,
return_tensors="pt", return_tensors="pt",
@@ -566,8 +564,7 @@ class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
row_reward_kwargs["completion"] = completions[nan_row_idx] row_reward_kwargs["completion"] = completions[nan_row_idx]
warnings.warn( warnings.warn(
f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. " f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
"Please ensure that at least one reward function returns a valid reward.", "Please ensure that at least one reward function returns a valid reward."
stacklevel=2,
) )
# Gather the reward per function: this part is crucial, because the rewards are normalized per group and the # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the

View File

@@ -5,6 +5,7 @@ import torch
from axolotl.core.trainers.base import AxolotlTrainer from axolotl.core.trainers.base import AxolotlTrainer
# pylint: disable=too-many-ancestors
class AxolotlMambaTrainer(AxolotlTrainer): class AxolotlMambaTrainer(AxolotlTrainer):
"""Mamba specific trainer to handle loss calculation""" """Mamba specific trainer to handle loss calculation"""
@@ -14,8 +15,8 @@ class AxolotlMambaTrainer(AxolotlTrainer):
self, self,
model, model,
inputs, inputs,
return_outputs=False, return_outputs=False, # pylint: disable=unused-argument
num_items_in_batch=None, num_items_in_batch=None, # pylint: disable=unused-argument
): ):
input_ids = inputs.pop("input_ids") input_ids = inputs.pop("input_ids")
lm_logits = model(input_ids).logits lm_logits = model(input_ids).logits

View File

@@ -1,5 +1,6 @@
"""Init for axolotl.core.trainers.mixins""" """Init for axolotl.core.trainers.mixins"""
# pylint: disable=unused-import
# flake8: noqa # flake8: noqa
from .activation_checkpointing import ActivationOffloadingMixin from .activation_checkpointing import ActivationOffloadingMixin

View File

@@ -92,7 +92,7 @@ def get_lora_act_offloading_ctx_manager(
`contextlib.ContextDecorator`: `contextlib.ContextDecorator`:
Activation offloading context manager for the model. Activation offloading context manager for the model.
""" """
# pylint: disable=unnecessary-dunder-call
activations_handling_ctx = OffloadActivations( activations_handling_ctx = OffloadActivations(
use_pin_memory=use_pin_memory, use_pin_memory=use_pin_memory,
use_streams=use_streams, use_streams=use_streams,

View File

@@ -26,6 +26,7 @@ class DistributedParallelMixin(Trainer):
self.accelerator.distributed_type == "FSDP" self.accelerator.distributed_type == "FSDP"
and self.accelerator.state.fsdp_plugin is None and self.accelerator.state.fsdp_plugin is None
): ):
# pylint: disable=protected-access
# handle Context Parallelism without FSDP # handle Context Parallelism without FSDP
self.accelerator.state.distributed_type = "MULTI_GPU" self.accelerator.state.distributed_type = "MULTI_GPU"
self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU" self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU"

View File

@@ -70,11 +70,11 @@ class OptimizerMixin(Trainer):
                 }
             )
             if params["embeddings"]:
-                lr = optimizer_kwargs["lr"]
+                lr = optimizer_kwargs["lr"]  # pylint: disable=invalid-name
                 if self.args.embedding_lr_scale:
-                    lr *= self.args.embedding_lr_scale
+                    lr *= self.args.embedding_lr_scale  # pylint: disable=invalid-name
                 elif self.args.embedding_lr:
-                    lr = self.args.embedding_lr
+                    lr = self.args.embedding_lr  # pylint: disable=invalid-name
                 optimizer_grouped_parameters.append(
                     {
                         "params": list(params["embeddings"].values()),
@@ -143,7 +143,7 @@ class OptimizerMixin(Trainer):
                 loraplus_lr_embedding = getattr(
                     self.args, "loraplus_lr_embedding", 1e-6
                 )
-                self.optimizer = create_loraplus_optimizer(
+                self.optimizer = create_loraplus_optimizer(  # pylint: disable=attribute-defined-outside-init
                     opt_model,
                     optimizer_cls,
                     loraplus_lr_ratio=loraplus_lr_ratio,
@@ -185,15 +185,17 @@ class OptimizerMixin(Trainer):
                                     p.data_ptr(): p.numel() for p in module.parameters()
                                 }.values()
                             )
-                            LOG.info(f"skipped {module}: {skipped / 2**20}M params")
+                            LOG.info(f"skipped {module}: {skipped/2**20}M params")
                             manager.register_module_override(
                                 module, "weight", {"optim_bits": 32}
                             )
                             LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
-                    LOG.info(f"skipped: {skipped / 2**20}M params")
+                    LOG.info(f"skipped: {skipped/2**20}M params")
         if is_sagemaker_mp_enabled():
-            self.optimizer = smp.DistributedOptimizer(self.optimizer)
+            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
+                self.optimizer
+            )
         return self.optimizer
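
The first hunk above feeds a separately scaled learning rate for embedding parameters into the optimizer's parameter groups. The underlying `torch.optim` mechanism, sketched with a toy model (the name `embedding_lr_scale` mirrors the config key above; the rest is illustrative):

    import torch
    from torch import nn

    model = nn.Sequential(nn.Embedding(10, 4), nn.Linear(4, 2))
    base_lr, embedding_lr_scale = 1e-3, 0.1

    embed_params = [
        p for m in model.modules() if isinstance(m, nn.Embedding) for p in m.parameters()
    ]
    embed_ids = {id(p) for p in embed_params}
    other_params = [p for p in model.parameters() if id(p) not in embed_ids]

    optimizer = torch.optim.AdamW(
        [
            {"params": other_params, "lr": base_lr},
            # embeddings train with a scaled-down learning rate
            {"params": embed_params, "lr": base_lr * embedding_lr_scale},
        ]
    )
    assert optimizer.param_groups[1]["lr"] == base_lr * embedding_lr_scale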

View File

@@ -46,7 +46,7 @@ class SchedulerMixin(Trainer):
         )
         # fmt: off
-        if self.lr_scheduler is None:  # type: ignore
+        if self.lr_scheduler is None:  # type: ignore # pylint: disable=access-member-before-definition
         # fmt: on
             plugin_manager = PluginManager.get_instance()
             lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler(
@@ -90,7 +90,7 @@ class SchedulerMixin(Trainer):
                 LOG.warning(
                     "Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
-                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(
+                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
@@ -98,7 +98,7 @@ class SchedulerMixin(Trainer):
             elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
                 assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                 assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
-                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
+                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
@@ -107,7 +107,7 @@ class SchedulerMixin(Trainer):
                 )
             elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
                 assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
-                self.lr_scheduler = get_cosine_schedule_with_min_lr(
+                self.lr_scheduler = get_cosine_schedule_with_min_lr(  # pylint: disable=attribute-defined-outside-init
                     optimizer,
                     num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                     num_training_steps=num_training_steps,
@@ -133,7 +133,7 @@ class SchedulerMixin(Trainer):
             )
             if not self.lr_scheduler:
                 super().create_scheduler(num_training_steps, optimizer)
-            self.lr_scheduler = JaggedLRRestartScheduler(
+            self.lr_scheduler = JaggedLRRestartScheduler(  # pylint: disable=attribute-defined-outside-init
                 optimizer,
                 self.lr_scheduler,
                 self.args.jagged_restart_steps,
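
The `cosine_min_lr_ratio` branch above selects a cosine decay that bottoms out at a floor instead of zero. A generic sketch of such a schedule with `LambdaLR`, under the assumption that the ratio is the floor expressed as a fraction of the peak LR (the actual `get_cosine_schedule_with_min_lr` may differ in details):

    import math
    import torch

    def cosine_with_min_lr(optimizer, num_warmup_steps, num_training_steps, min_lr_ratio):
        def lr_lambda(step):
            if step < num_warmup_steps:
                return step / max(1, num_warmup_steps)
            progress = (step - num_warmup_steps) / max(1, num_training_steps - num_warmup_steps)
            cosine = 0.5 * (1.0 + math.cos(math.pi * min(1.0, progress)))
            # interpolate between the peak (1.0) and the floor instead of decaying to zero
            return min_lr_ratio + (1.0 - min_lr_ratio) * cosine

        return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)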

View File

@@ -14,6 +14,7 @@ class AxolotlTrainingMixins:
     Mixin class for the Axolotl training args.
     """
+    # pylint: disable=duplicate-code
     model_type: Optional[str] = field(
         default=None, metadata={"help": "HF model configuration model_type."}
     )

View File

@@ -26,7 +26,7 @@ class TokenizedPromptDataset(Dataset):
         keep_in_memory: Whether to keep the tokenized dataset in memory.
     """
-    def __init__(
+    def __init__(  # pylint: disable=super-init-not-called
         self,
         prompt_tokenizer: PromptTokenizingStrategy,
         dataset: Dataset,
@@ -99,7 +99,7 @@ class ConstantLengthDataset(IterableDataset):
         seq_length: Length of token sequences to return.
     """
-    def __init__(
+    def __init__(  # pylint: disable=super-init-not-called
         self,
         tokenizer,
        datasets,

View File

@@ -79,7 +79,7 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
     model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)
 
     # Get datasets
-
+    # pylint: disable=duplicate-code
     train_dataset = dataset_meta.train_dataset
     eval_dataset = dataset_meta.eval_dataset
     total_num_steps = dataset_meta.total_num_steps

View File

@@ -76,7 +76,7 @@ class BasePlugin:
     def __init__(self):
         """Initializes the BasePlugin."""
-    def register(self, cfg: dict):
+    def register(self, cfg: dict):  # pylint: disable=unused-argument
         """Registers the plugin with the given configuration as an unparsed dict.
 
         Args:
@@ -104,13 +104,14 @@ class BasePlugin:
             dataset_meta: The metadata for the training dataset.
         """
-    def pre_model_load(self, cfg: DictDefault):
+    def pre_model_load(self, cfg: DictDefault):  # pylint: disable=unused-argument
         """Performs actions before the model is loaded.
 
         Args:
             cfg: The configuration for the plugin.
         """
+    # pylint: disable=unused-argument
     def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
         """Performs actions after the model is built/loaded, but before any adapters are applied.
@@ -118,6 +119,7 @@ class BasePlugin:
             cfg: The configuration for the plugin.
         """
+    # pylint: disable=unused-argument
     def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
         """Performs actions before LoRA weights are loaded.
@@ -126,6 +128,7 @@ class BasePlugin:
             model: The loaded model.
         """
+    # pylint: disable=unused-argument
     def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
         """Performs actions after LoRA weights are loaded.
@@ -134,6 +137,7 @@ class BasePlugin:
             model: The loaded model.
         """
+    # pylint: disable=unused-argument
     def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
         """Performs actions after the model is loaded.
@@ -142,6 +146,7 @@ class BasePlugin:
             model: The loaded model.
         """
+    # pylint: disable=unused-argument
     def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
         """Returns a custom class for the trainer.
@@ -152,6 +157,7 @@ class BasePlugin:
             The first non-`None` trainer class returned by a plugin.
         """
+    # pylint: disable=unused-argument
     def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
         """Performs actions after the trainer is created.
@@ -160,7 +166,7 @@ class BasePlugin:
             trainer: The trainer object for training.
         """
-    def get_training_args(self, cfg: DictDefault):
+    def get_training_args(self, cfg: DictDefault):  # pylint: disable=unused-argument):
         """
         Returns custom training arguments to set on TrainingArgs.
@@ -171,7 +177,9 @@ class BasePlugin:
             object: dict containing the training arguments.
         """
-    def get_collator_cls_and_kwargs(self, cfg: DictDefault, is_eval: bool = False):
+    def get_collator_cls_and_kwargs(
+        self, cfg: DictDefault, is_eval: bool = False
+    ):  # pylint: disable=unused-argument):
         """
         Returns a custom class for the collator.
@@ -183,6 +191,7 @@ class BasePlugin:
             class: The class for the collator.
         """
+    # pylint: disable=unused-argument
     def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
         """Creates and returns an optimizer for training.
@@ -194,6 +203,7 @@ class BasePlugin:
             The created optimizer.
         """
+    # pylint: disable=unused-argument
     def create_lr_scheduler(
         self,
         cfg: DictDefault,
@@ -213,6 +223,7 @@ class BasePlugin:
             The created learning rate scheduler.
         """
+    # pylint: disable=unused-argument
     def add_callbacks_pre_trainer(
         self, cfg: DictDefault, model: PreTrainedModel
     ) -> list[Callable]:
@@ -227,6 +238,7 @@ class BasePlugin:
         """
         return []
+    # pylint: disable=unused-argument
     def add_callbacks_post_trainer(
         self, cfg: DictDefault, trainer: Trainer
     ) -> list[Callable]:
@@ -242,6 +254,7 @@ class BasePlugin:
         """
         return []
+    # pylint: disable=unused-argument
     def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
         """Performs actions after training is complete.
@@ -250,7 +263,7 @@ class BasePlugin:
             model: The loaded model.
         """
-    def post_train_unload(self, cfg: DictDefault):
+    def post_train_unload(self, cfg: DictDefault):  # pylint: disable=unused-argument
         """Performs actions after training is complete and the model is unloaded.
 
         Args:
@@ -298,7 +311,7 @@ def load_plugin(plugin_name: str) -> BasePlugin:
     return plugin
 
-class PluginManager:
+class PluginManager:  # pylint: disable=too-many-public-methods
     """The `PluginManager` class is responsible for loading and managing plugins. It
     should be a singleton so it can be accessed from anywhere in the codebase.
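
All of the hooks above are no-op defaults, so a concrete plugin only overrides what it needs. A hypothetical minimal subclass (hook names as defined above; everything else is made up for illustration):

    from axolotl.integrations.base import BasePlugin

    class PrintTimingPlugin(BasePlugin):
        """Toy plugin that reports around model load; overrides two hooks only."""

        def pre_model_load(self, cfg):
            print(f"loading base model: {cfg.base_model}")

        def post_model_load(self, cfg, model):
            print(f"loaded {type(model).__name__}")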

View File

@@ -50,9 +50,15 @@ def merge_input_args():
         dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n    pass\n"
         namespace: Dict[Any, Any] = {}
-        exec(dynamic_input, globals(), namespace)  # nosec B102
-        AxolotlInputConfig = namespace["AxolotlInputConfig"]
-        AxolotlConfigWCapabilities = namespace["AxolotlConfigWCapabilities"]
+        exec(  # pylint: disable=exec-used # nosec B102
+            dynamic_input, globals(), namespace
+        )
+        AxolotlInputConfig = namespace[  # pylint: disable=invalid-name
+            "AxolotlInputConfig"
+        ]
+        AxolotlConfigWCapabilities = namespace[  # pylint: disable=invalid-name
+            "AxolotlConfigWCapabilities"
+        ]
         return AxolotlConfigWCapabilities, AxolotlInputConfig
     return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
@@ -68,7 +74,7 @@ def merge_training_args() -> Type:
     Returns:
         tuple: A tuple containing the newly created classes, AxolotlTrainingMixins.
     """
-
+    # pylint: disable=duplicate-code
     from axolotl.core.training_args_base import (
         AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
     )
@@ -87,7 +93,11 @@ def merge_training_args() -> Type:
         namespace: Dict[Any, Any] = {}
         local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
-        exec(dynamic_input, {**globals(), **local_vars}, namespace)  # nosec B102
-        AxolotlTrainingMixins = namespace["AxolotlTrainingMixins"]
+        exec(  # pylint: disable=exec-used # nosec B102
+            dynamic_input, {**globals(), **local_vars}, namespace
+        )
+        AxolotlTrainingMixins = namespace[  # pylint: disable=invalid-name
+            "AxolotlTrainingMixins"
+        ]
         return AxolotlTrainingMixins
     return AxolotlTrainingMixinsBase
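
Both hunks above use the same trick: build a class statement as a string so the merged config class can inherit from whatever mixins the loaded plugins contribute, then `exec` it into a scratch namespace. The mechanism in isolation (toy classes, not the Axolotl ones):

    class Base:
        pass

    class MixinA:
        pass

    dynamic_src = "class Merged(Base, MixinA):\n    pass\n"
    namespace = {}
    exec(dynamic_src, {"Base": Base, "MixinA": MixinA}, namespace)  # nosec B102

    Merged = namespace["Merged"]
    assert issubclass(Merged, Base) and issubclass(Merged, MixinA)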

View File

@@ -18,7 +18,6 @@ Module for the Plugin for Cut Cross Entropy integration with Axolotl.
 Cut Cross Entropy is an optimized implementation of cross entropy loss
 from Apple's ML team.
 """
-
 import importlib
 from functools import partial
@@ -29,7 +28,7 @@ from axolotl.utils import get_pytorch_version
 from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
 from axolotl.utils.logging import get_logger
-from .args import CutCrossEntropyArgs as CutCrossEntropyArgs
+from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401
 
 LOG = get_logger(__name__)
@@ -107,7 +106,9 @@ class CutCrossEntropyPlugin(BasePlugin):
         """
         from cut_cross_entropy.transformers.patch import PATCH_FNS
 
-        def patch_generic(maybe_model, patch_options, model_type: str):
+        def patch_generic(
+            maybe_model, patch_options, model_type: str
+        ):  # pylint: disable=unused-argument
             import cut_cross_entropy.transformers.llama
             from cut_cross_entropy.transformers.llama import cce_forward
@@ -120,10 +121,12 @@ class CutCrossEntropyPlugin(BasePlugin):
                 )
                 model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
-                cut_cross_entropy.transformers.llama._PATCH_OPTS = patch_options
+                cut_cross_entropy.transformers.llama._PATCH_OPTS = (  # pylint: disable=protected-access
+                    patch_options
+                )
                 model_cls.forward = cce_forward
-
+            # pylint: disable=duplicate-code
             except (ImportError, AttributeError) as e:
                 raise RuntimeError(
                     f"Could not import ForCausalLM class for model_type: {model_type}. "

View File

@@ -15,7 +15,6 @@
 """
 Module for handling Cut Cross Entropy input arguments.
 """
-
 from typing import Optional
 
 from pydantic import BaseModel, model_validator

View File

@@ -7,7 +7,7 @@ from transformers.trainer_callback import TrainerCallback
 from axolotl.utils.logging import get_logger
 
 from ..base import BasePlugin
-from .args import GrokfastArgs as GrokfastArgs
+from .args import GrokfastArgs  # pylint: disable=unused-import. # noqa: F401
 from .optimizer import gradfilter_ema
 
 LOG = get_logger(__name__)
@@ -24,10 +24,12 @@ class GrokfastCallbackHandler(TrainerCallback):
         self.alpha = alpha
         self.lamb = lamb
 
-    def on_train_begin(self, *args_, **kwargs):
+    def on_train_begin(self, *args_, **kwargs):  # pylint: disable=unused-argument
         self.grads = None
 
-    def on_pre_optimizer_step(self, args_, state, control, **kwargs):
+    def on_pre_optimizer_step(
+        self, args_, state, control, **kwargs
+    ):  # pylint: disable=unused-argument
         model = kwargs.pop("model")
         self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
         return control
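
`gradfilter_ema` (imported above) implements the Grokfast idea: keep an exponential moving average of each parameter's gradient and add an amplified copy of that slow component back before the optimizer step. A sketch of the filter following the paper's recipe (variable names are illustrative, not the exact Axolotl code):

    import torch

    def gradfilter_ema_sketch(model, grads, alpha=0.98, lamb=2.0):
        # grads carries the running EMA between steps, keyed by parameter name
        if grads is None:
            grads = {
                n: p.grad.detach().clone()
                for n, p in model.named_parameters()
                if p.grad is not None
            }
        for n, p in model.named_parameters():
            if p.grad is None:
                continue
            grads[n] = grads[n] * alpha + p.grad.detach() * (1 - alpha)
            p.grad = p.grad + grads[n] * lamb  # amplify the slow-varying component
        return grads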

View File

@@ -1,6 +1,7 @@
 # Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
 # Reference: https://github.com/ironjr/grokfast
+# pylint: skip-file
 from collections import deque
 from typing import Dict, Literal, Optional

View File

@@ -15,7 +15,6 @@
 """
 Plugin init to add KD support to Axolotl.
 """
-
 from typing import Any
 
 from transformers import Trainer
@@ -23,7 +22,7 @@ from transformers import Trainer
 from axolotl.integrations.base import BasePlugin
 from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback
-from .args import KDArgs as KDArgs
+from .args import KDArgs  # pylint: disable=unused-import. # noqa: F401
 
 class KDPlugin(BasePlugin):
class KDPlugin(BasePlugin): class KDPlugin(BasePlugin):

View File

@@ -15,7 +15,6 @@
 """
 Plugin args for KD support.
 """
-
 from dataclasses import dataclass
 from enum import Enum
@@ -27,8 +26,8 @@ class InferenceServerType(str, Enum):
     Online inferences server types to handle different request args
     """
-    vllm = "vllm"
-    sglang = "sglang"
+    vllm = "vllm"  # pylint: disable=invalid-name
+    sglang = "sglang"  # pylint: disable=invalid-name
 
 class KDArgs(BaseModel):

View File

@@ -19,7 +19,9 @@ class KDTemperatureSchedulerCallback(TrainerCallback):
         self.trainer = trainer
 
-    def on_step_end(self, args, state, control, **kwargs):
+    def on_step_end(
+        self, args, state, control, **kwargs
+    ):  # pylint: disable=unused-argument
         # cosine decay temperature over the max steps
         progress = state.global_step / state.max_steps

View File

@@ -15,7 +15,6 @@
 """
 Chat template prompt strategy loader with KD support
 """
-
 import logging
 from typing import Any, Dict
@@ -193,6 +192,7 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
         """
         Transform logprobs to target format for KD training
         """
+        # pylint: disable=duplicate-code
         logprobs = sample.pop(self.logprobs_field)
         target_seq_len = len(logprobs)
@@ -240,7 +240,7 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
             target_mask.append([1] * top_k)
 
         for token_pos_logprobs, pos_target_token_ids in zip(
-            logprobs, sample["target_token_ids"], strict=False
+            logprobs, sample["target_token_ids"]
         ):
             # Convert to a tensor for easier manipulation
             position_logprobs_tensor = torch.tensor(
@@ -299,7 +299,7 @@ class KDStrategyLoader(StrategyLoader):
     Load ChatTemplateStrategy with KD support using StrategyLoader.
     """
-    def _get_strategy_cls(self, cfg):
+    def _get_strategy_cls(self, cfg):  # pylint: disable=unused-argument
         return ChatTemplateStrategyWithKD
 
     def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
@@ -319,7 +319,7 @@ class KDStrategyLoaderV2(KDStrategyLoader):
     Load KD chat template datasets with pre-tokenized logprob data
     """
-    def _get_strategy_cls(self, cfg):
+    def _get_strategy_cls(self, cfg):  # pylint: disable=unused-argument
         return ChatTemplateStrategyWithKDv2

View File

@@ -37,6 +37,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
     target_logprobs. It also creates a teacher_mask to indicate which entries are valid.
     """
+    # pylint: disable=duplicate-code
     tokenizer: PreTrainedTokenizerBase
     model: Optional[Any] = None
     padding: Union[bool, str, PaddingStrategy] = True
@@ -71,7 +72,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
                     // self.pad_to_multiple_of
                 ) * self.pad_to_multiple_of
 
-            for f in features:
+            for f in features:  # pylint: disable=invalid-name
                 remainder = [pad_token_id] * (max_len - len(f[feature_name]))
                 if isinstance(f[feature_name], list):
                     f[feature_name] = (
@@ -100,7 +101,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
         if has_teacher_data:
             # Extract and remove from features
-            for f in features:
+            for f in features:  # pylint: disable=invalid-name
                 target_logprobs_list.append(f.pop("target_logprobs"))
                 target_token_ids_list.append(f.pop("target_token_ids"))
                 target_mask_list.append(f.pop("target_mask"))
@@ -116,25 +117,24 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
             padded_teacher_mask_list = []
 
             for t_logprobs, t_ids, t_mask in zip(
-                target_logprobs_list,
-                target_token_ids_list,
-                target_mask_list,
-                strict=False,
+                target_logprobs_list, target_token_ids_list, target_mask_list
             ):
                 t_logprobs_padded = []
                 t_ids_padded = []
                 t_mask_padded = []
-                for lp, ids, mask in zip(t_logprobs, t_ids, t_mask, strict=False):
+                for lp, ids, mask in zip(  # pylint: disable=invalid-name
+                    t_logprobs, t_ids, t_mask
+                ):
                     lp_len = len(lp)
                     if lp_len < max_k:
                         # Use -1e9 for padding logprobs and 0 for token_ids
                         pad_len = max_k - lp_len
-                        lp = lp + [-1e9] * pad_len
+                        lp = lp + [-1e9] * pad_len  # pylint: disable=invalid-name
                         ids = ids + [0] * pad_len
                         mask = mask + [0] * pad_len
                     else:
-                        lp = lp[:max_k]
+                        lp = lp[:max_k]  # pylint: disable=invalid-name
                         ids = ids[:max_k]
                         mask = mask[:max_k]
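
The padding loop above normalizes ragged top-k lists to a fixed width `max_k`: `-1e9` makes padded logprobs vanish after softmax, token id 0 is a placeholder, and the 0 mask keeps padded slots out of the loss. The same step on toy values:

    max_k = 4
    lp, ids, mask = [-0.1, -2.3], [17, 99], [1, 1]

    pad_len = max_k - len(lp)
    lp = lp + [-1e9] * pad_len   # effectively zero probability after softmax
    ids = ids + [0] * pad_len    # placeholder token ids
    mask = mask + [0] * pad_len  # excluded from the loss by the mask

    assert lp == [-0.1, -2.3, -1e9, -1e9]
    assert ids == [17, 99, 0, 0] and mask == [1, 1, 0, 0]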
@@ -216,7 +216,9 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
         # We want to produce a single "merged" feature dict for each sub-batch.
         out_features = [{} for _ in features]
-        for i, sub_features in enumerate(features):
+        for i, sub_features in enumerate(  # pylint: disable=too-many-nested-blocks
+            features
+        ):
             # sub_features is a list of dicts, each dict = one sequences features
             # We'll merge them into out_features[i].
             #
@@ -253,7 +255,9 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
                 if field_name in feat and isinstance(
                     feat[field_name], (list, torch.Tensor)
                 ):
-                    if isinstance(feat[field_name][0], (dict, str)):
+                    if isinstance(
+                        feat[field_name][0], (dict, str)
+                    ):  # pylint: disable=too-many-nested-blocks
                         continue
                     arr = np.array(feat[field_name])
                     arrays.append(arr)

View File

@@ -144,7 +144,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
         }
 
         for sequence_data, seq_input_ids, seq_labels in zip(
-            api_data, batch_input_ids, labels, strict=False
+            api_data, batch_input_ids, labels
         ):
             current_target_logprobs = []
             current_target_token_ids = []
@@ -165,7 +165,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
             assert len(seq_input_ids) == len(input_top_logprobs)
 
             for i, _, label in zip(
-                range(len(seq_input_ids)), seq_input_ids, seq_labels, strict=False
+                range(len(seq_input_ids)), seq_input_ids, seq_labels
             ):
                 if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                     # this is always the case for the first token.
@@ -202,8 +202,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
                 # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
                 pos_logprobs_raw, pos_token_ids, _ = [
-                    list(row)
-                    for row in zip(*pos_top_logprobs_data, strict=False)
+                    list(row) for row in zip(*pos_top_logprobs_data)
                 ]
 
                 # Ensure correct length (top_k)
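
`zip(*rows)` in this hunk is the standard transpose idiom: it turns per-position `(logprob, token_id, ...)` tuples into parallel columns. With toy data:

    pos_top_logprobs_data = [(-0.5, 11, None), (-1.2, 42, None)]
    logprobs, token_ids, _ = [list(col) for col in zip(*pos_top_logprobs_data)]
    assert logprobs == [-0.5, -1.2]
    assert token_ids == [11, 42]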
@@ -318,7 +317,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
         }
 
         for sequence_data, seq_input_ids, seq_labels in zip(
-            choices, batch_input_ids, labels, strict=False
+            choices, batch_input_ids, labels
         ):
             # seq_input_ids: List[int]
             # seq_labels: List[int]
@@ -343,9 +342,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
             seq_len = len(seq_input_ids)
 
-            for i, _, label in zip(
-                range(seq_len), seq_input_ids, seq_labels, strict=False
-            ):
+            for i, _, label in zip(range(seq_len), seq_input_ids, seq_labels):
                 if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                     # this is always the case for the first token.
                     # there is never logprob data for the first token since that's a true input
@@ -427,7 +424,7 @@ class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
                     list(range(self.kd_online_topk))
                 )
                 current_target_mask.append([0] * self.kd_online_topk)
-            for _ in range(max(0, seq_len - len(current_target_logprobs))):
+            for i in range(max(0, seq_len - len(current_target_logprobs))):
                 current_target_logprobs.append(
                     [-float("inf")] * self.kd_online_topk
                 )

View File

@@ -197,7 +197,7 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
         compute_ce_loss: bool = True,
         normalize_topk: bool = True,
     ):
-        CHUNK_SIZE = chunk_size
+        CHUNK_SIZE = chunk_size  # pylint: disable=invalid-name
         grad_weight_acc = torch.zeros_like(student_lm_head_weight)
         grad_inputs_list = []
         grad_bias_acc = (
@@ -298,8 +298,8 @@ class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
             accumulate_chunk_grads_compiled = accumulate_chunk_grads
 
         # Use the same chunking logic as LigerFusedLinearDistillationBase.forward
-        B, N, D = student_input.shape
-        K = target_token_ids.shape[-1]
+        B, N, D = student_input.shape  # pylint: disable=invalid-name
+        K = target_token_ids.shape[-1]  # pylint: disable=invalid-name
         student_input_flat = student_input.reshape(-1, student_input.shape[-1])
         target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])

View File

@@ -40,9 +40,10 @@ def kldiv_forward_llama_like(
     output_attentions: Optional[bool] = None,
     output_hidden_states: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
-    logits_to_keep: Union[int, torch.Tensor] = 0,
+    logits_to_keep: Union[int, torch.Tensor] = 0,  # pylint: disable=unused-argument
     **kwargs: Unpack[TransformersKwargs],  # type: ignore[misc]
 ) -> CausalLMOutputWithPast:
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None

View File

@@ -15,7 +15,6 @@
 """
 loss for top_k KL divergence
 """
-
 import torch
 from torch import nn
@@ -118,6 +117,7 @@ class ChunkedTopKKDLoss(nn.Module):
         target_mask: torch.Tensor,  # [B, seq_len, K]
         num_items_in_batch: int = -1,  # optional batch size for normalization
     ) -> torch.Tensor:
+
         # 1. Split along the "token" dimension (dim=1).
         student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
         token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
@@ -131,11 +131,7 @@ class ChunkedTopKKDLoss(nn.Module):
         # 2. Loop over each chunk and compute a chunk-specific loss.
         for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
-            student_logits_chunks,
-            token_ids_chunks,
-            logprobs_chunks,
-            mask_chunks,
-            strict=False,
+            student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks
         ):
             # We pass num_items_in_batch=-1 so that the kd_loss
             # will average over *this chunk's* valid tokens only.
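
Chunking along the sequence dimension, as above, bounds peak memory: each chunk's loss is computed independently and the results are combined. The pattern in generic form (plain MSE stands in for the KD loss):

    import torch

    def chunked_mse(pred, target, num_chunks=4):
        total, count = torch.tensor(0.0), 0
        for p_chunk, t_chunk in zip(
            pred.chunk(num_chunks, dim=1), target.chunk(num_chunks, dim=1)
        ):
            total = total + ((p_chunk - t_chunk) ** 2).sum()
            count += p_chunk.numel()
        return total / count

    pred, target = torch.randn(2, 8, 3), torch.randn(2, 8, 3)
    assert torch.allclose(
        chunked_mse(pred, target), ((pred - target) ** 2).mean(), atol=1e-6
    )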

View File

@@ -21,6 +21,7 @@ from axolotl.core.trainers.base import AxolotlTrainer
 from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
 
+# pylint: disable=too-many-ancestors
 class AxolotlKDTrainer(AxolotlTrainer):
     """
     Custom trainer subclass for Knowledge Distillation (KD)

View File

@@ -18,7 +18,6 @@ Module for the Plugin for LIGER integraton with Axolotl.
 Liger Kernel is the collection of Triton-native kernels for LLM Training.
 It is designed to be performant, correct, and light-weight.
 """
-
 from .args import LigerArgs
 from .plugin import LigerPlugin

View File

@@ -41,6 +41,7 @@ def lce_forward(
             This is useful when using packed tensor format (single dimension for batch and sequence length).
     """
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None
@@ -180,7 +181,7 @@ def patch_lce_forward(
         model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
         model_cls.forward = lce_forward
-
+    # pylint: disable=duplicate-code
     except (ImportError, AttributeError) as e:
         raise RuntimeError(
             f"Could not import ForCausalLM class for model_type: {model_type}. "

View File

@@ -2,6 +2,8 @@
 DeepseekV2 model with LigerFusedLinearCrossEntropyLoss
 """
+
+# pylint: disable=duplicate-code
 from typing import List, Optional, Tuple, Union
 
 import torch

View File

@@ -2,6 +2,8 @@
 Jamba model with LigerFusedLinearCrossEntropyLoss
 """
+
+# pylint: disable=duplicate-code
 from typing import Optional, Tuple, Union
 
 import torch

View File

@@ -46,6 +46,7 @@ def lce_forward(
     Returns:
 
     """
+    # pylint: disable=duplicate-code
    output_attentions = (
        output_attentions
        if output_attentions is not None
@@ -77,7 +78,9 @@ def lce_forward(
     hidden_states = outputs[0]
 
     if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1:
-        raise Exception("Liger Kernel does not support pretraining_tp!!")
+        raise Exception(  # pylint: disable=broad-exception-raised
+            "Liger Kernel does not support pretraining_tp!!"
+        )
 
     logits = None
     loss = None
@@ -125,7 +128,7 @@ def apply_liger_kernel_to_llama4(
     rms_norm: bool = False,
     glu_activation: bool = False,
     layer_norm: bool = False,
-    **kwargs,
+    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
     """
     Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
@@ -141,15 +144,15 @@ def apply_liger_kernel_to_llama4(
         layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
     """
-    import transformers.models.llama4.modeling_llama4  # noqa: F401
+    import transformers.models.llama4.modeling_llama4  # noqa: F401 # pylint: disable=unused-import
     from liger_kernel.transformers.functional import liger_cross_entropy
     from liger_kernel.transformers.layer_norm import LigerLayerNorm
     from liger_kernel.transformers.rms_norm import LigerRMSNorm
     from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
 
-    assert not (cross_entropy and fused_linear_cross_entropy), (
-        "cross_entropy and fused_linear_cross_entropy cannot both be True."
-    )
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
 
     modeling_llama4 = sys.modules["transformers.models.llama4.modeling_llama4"]
@@ -162,7 +165,7 @@ def apply_liger_kernel_to_llama4(
         # clone config to avoid modifying the original
         config = deepcopy(config)
         if intermediate_size:
-            config.intermediate_size = intermediate_size
+            setattr(config, "intermediate_size", intermediate_size)
         return LigerSwiGLUMLP(config, **kwargs)
 
     modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper

View File

@@ -43,6 +43,7 @@ def lce_forward(
     Returns:
 
     """
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None
@@ -112,8 +113,9 @@ def apply_liger_kernel_to_qwen3(
     rms_norm: bool = False,
     glu_activation: bool = False,
     layer_norm: bool = False,
-    **kwargs,
+    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
+    # pylint: disable=duplicate-code
     """
     Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
@@ -128,15 +130,15 @@ def apply_liger_kernel_to_qwen3(
         layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
     """
-    import transformers.models.qwen3.modeling_qwen3  # noqa: F401
+    import transformers.models.qwen3.modeling_qwen3  # noqa: F401 # pylint: disable=unused-import
     from liger_kernel.transformers.functional import liger_cross_entropy
     from liger_kernel.transformers.layer_norm import LigerLayerNorm
     from liger_kernel.transformers.rms_norm import LigerRMSNorm
     from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
 
-    assert not (cross_entropy and fused_linear_cross_entropy), (
-        "cross_entropy and fused_linear_cross_entropy cannot both be True."
-    )
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
 
     modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]

View File

@@ -45,6 +45,7 @@ def lce_forward(
     Returns:
 
     """
+    # pylint: disable=duplicate-code
     output_attentions = (
         output_attentions
         if output_attentions is not None
@@ -134,8 +135,9 @@ def apply_liger_kernel_to_qwen3_moe(
     rms_norm: bool = False,
     glu_activation: bool = False,
     layer_norm: bool = False,
-    **kwargs,
+    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
+    # pylint: disable=duplicate-code
     """
     Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)
@@ -150,15 +152,15 @@ def apply_liger_kernel_to_qwen3_moe(
         layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
     """
-    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401
+    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401 # pylint: disable=unused-import
     from liger_kernel.transformers.functional import liger_cross_entropy
     from liger_kernel.transformers.layer_norm import LigerLayerNorm
     from liger_kernel.transformers.rms_norm import LigerRMSNorm
     from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
 
-    assert not (cross_entropy and fused_linear_cross_entropy), (
-        "cross_entropy and fused_linear_cross_entropy cannot both be True."
-    )
+    assert not (
+        cross_entropy and fused_linear_cross_entropy
+    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
 
     modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
@@ -172,7 +174,7 @@ def apply_liger_kernel_to_qwen3_moe(
         # clone config to avoid modifying the original
         config = deepcopy(config)
         if intermediate_size:
-            config.intermediate_size = intermediate_size
+            setattr(config, "intermediate_size", intermediate_size)
         return LigerSwiGLUMLP(config, **kwargs)
 
     modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper

View File

@@ -7,7 +7,7 @@ import subprocess  # nosec
 from axolotl.integrations.base import BasePlugin
 from axolotl.integrations.lm_eval.cli import build_lm_eval_command
 
-from .args import LMEvalArgs as LMEvalArgs
+from .args import LMEvalArgs  # pylint: disable=unused-import. # noqa: F401
 
 class LMEvalPlugin(BasePlugin):
@@ -20,6 +20,7 @@ class LMEvalPlugin(BasePlugin):
     def post_train_unload(self, cfg):
         if cfg.lm_eval_post_train:
+            # pylint: disable=duplicate-code
             for lm_eval_args in build_lm_eval_command(
                 cfg.lm_eval_tasks,
                 bfloat16=cfg.bfloat16 or cfg.bf16,

View File

@@ -99,6 +99,7 @@ def lm_eval(config: str, cloud: Optional[str] = None):
     with open(config, encoding="utf-8") as file:
         cfg: DictDefault = DictDefault(yaml.safe_load(file))
 
+    # pylint: disable=duplicate-code
     for lm_eval_args in build_lm_eval_command(
         cfg.lm_eval_tasks,
         bfloat16=cfg.bfloat16 or cfg.bf16,

View File

@@ -23,7 +23,7 @@ import requests
 from axolotl.integrations.base import BasePlugin
 from axolotl.utils.logging import get_logger
 
-from .args import SpectrumArgs as SpectrumArgs
+from .args import SpectrumArgs  # pylint: disable=unused-import. # noqa: F401
 
 LOG = get_logger(__name__)
@@ -46,7 +46,7 @@ def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5):
         "^lm_head.weight$",
         "^model.embed_tokens.weight$",
     ]
-    for _, layer_names in top_layers_by_type.items():
+    for layer_type, layer_names in top_layers_by_type.items():
         for layer_name in layer_names:
             unfrozen_parameters.append(layer_name)
     return unfrozen_parameters
@@ -84,7 +84,7 @@ class SpectrumPlugin(BasePlugin):
                 snr_data = json.load(fin)
         except FileNotFoundError:
             pass
-        except Exception as exc:
+        except Exception as exc:  # pylint: disable=broad-exception-caught
             LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}")
 
         if not snr_data:

View File

@@ -15,7 +15,6 @@
 """
 Module for handling Spectrum input arguments.
 """
-
 from typing import Optional
 
 from pydantic import BaseModel, model_validator

View File

@@ -5,6 +5,8 @@ See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).
 Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
 """
+
+# pylint: disable=invalid-name,unnecessary-lambda-assignment,duplicate-code
 import torch
 import triton
 import triton.language as tl

View File

@@ -7,6 +7,8 @@ See "LoRA: Low-Rank Adaptation of Large Language Models"
 Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
 """
+
+# pylint: disable=invalid-name
 from typing import Callable
 
 import torch

View File

@@ -1,5 +1,7 @@
 """Dequantization utilities for `bitsandbytes` integration."""
+
+# pylint: disable=invalid-name,global-statement
 import ctypes
 
 import bitsandbytes as bnb

View File

@@ -99,6 +99,7 @@ def _swiglu_bwd_kernel(
     tl.store(up_ptr + offsets, grad_up, mask=mask)  # grad wrt up
 
+# pylint: disable=unnecessary-lambda-assignment
 def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
     """
     SwiGLU forward pass. Computes SwiGLU activation: `x * sigmoid(x) * up`, where
@@ -127,6 +128,7 @@ def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
     return out
 
+# pylint: disable=unnecessary-lambda-assignment
 def swiglu_backward(
     grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor
 ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
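
For reference, the activation these Triton kernels fuse is SiLU-gated: `out = gate * sigmoid(gate) * up`. A plain PyTorch equivalent, handy for checking the fused kernel's numerics:

    import torch

    def swiglu_reference(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
        # x * sigmoid(x) is SiLU (a.k.a. swish); up gates it elementwise
        return gate * torch.sigmoid(gate) * up

    gate, up = torch.randn(4, 8), torch.randn(4, 8)
    assert torch.allclose(swiglu_reference(gate, up), torch.nn.functional.silu(gate) * up)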

View File

@@ -1,5 +1,6 @@
 """Init for axolotl.loaders module"""
+# pylint: disable=unused-import
 # flake8: noqa
 from .adapter import load_adapter, load_lora

View File

@@ -28,12 +28,14 @@ LOG = get_logger(__name__)
 def setup_quantized_meta_for_peft(model: torch.nn.Module):
     """Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""
 
-    def temp_to_method(self, *args, **kwargs):
+    def temp_to_method(self, *args, **kwargs):  # pylint: disable=unused-argument
         return self
 
     for param in model.parameters():
         if isinstance(param, Params4bit):
-            param.quant_state._orig_to = param.quant_state.to
+            param.quant_state._orig_to = (  # pylint: disable=protected-access
+                param.quant_state.to
+            )
             param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)
@@ -41,8 +43,10 @@ def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
     """Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
     for param in model.parameters():
         if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
-            param.quant_state.to = param.quant_state._orig_to
-            param.quant_state._orig_to = None
+            param.quant_state.to = (
+                param.quant_state._orig_to  # pylint: disable=protected-access
+            )
+            param.quant_state._orig_to = None  # pylint: disable=protected-access
 
 def find_all_linear_names(model):
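
The pair of functions above swaps a bound method at the instance level so PEFT's `.to(...)` call becomes a no-op, then restores it afterwards. The core `types.MethodType` mechanism on a toy class (hypothetical names):

    import types

    class QuantState:
        def to(self, device):
            return f"moved to {device}"

    def _noop_to(self, *args, **kwargs):  # ignores the requested device
        return self

    qs = QuantState()
    qs._orig_to = qs.to                     # stash the real bound method
    qs.to = types.MethodType(_noop_to, qs)  # instance-level override
    assert qs.to("meta") is qs
    qs.to = qs._orig_to                     # restore the original behavior
    assert qs.to("cpu") == "moved to cpu"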

View File

@@ -102,7 +102,7 @@ class ModelLoader:
         *,
         inference: bool = False,
         reference_model: bool = False,
-        **kwargs,
+        **kwargs,  # pylint: disable=unused-argument
     ):
         """Initializes the ModelLoader.
@@ -134,7 +134,7 @@ class ModelLoader:
         # Init model config
         self.model_config = load_model_config(cfg)
-        self.auto_model_loader = AutoModelForCausalLM
+        self.auto_model_loader = AutoModelForCausalLM  # pylint: disable=invalid-name
 
         # Initialize the patch manager
         self.patch_manager = PatchManager(
@@ -607,19 +607,27 @@ class ModelLoader:
             self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation
         elif self.cfg.flex_attention:
             self.model_kwargs["attn_implementation"] = "flex_attention"
-            self.model_config._attn_implementation = "flex_attention"
+            self.model_config._attn_implementation = (  # pylint: disable=protected-access
+                "flex_attention"
+            )
         elif self.cfg.flash_attention:
             if not self.cfg.sample_packing and self.cfg.s2_attention:
                 pass
             self.model_kwargs["attn_implementation"] = "flash_attention_2"
-            self.model_config._attn_implementation = "flash_attention_2"
+            self.model_config._attn_implementation = (  # pylint: disable=protected-access
+                "flash_attention_2"
+            )
         elif self.cfg.sdp_attention:
             self.model_kwargs["attn_implementation"] = "sdpa"
-            self.model_config._attn_implementation = "sdpa"
+            self.model_config._attn_implementation = (  # pylint: disable=protected-access
+                "sdpa"
+            )
         elif self.cfg.eager_attention:
             self.model_kwargs["attn_implementation"] = "eager"
-            self.model_config._attn_implementation = "eager"
+            self.model_config._attn_implementation = (  # pylint: disable=protected-access
+                "eager"
+            )
 
         if self.cfg.low_cpu_mem_usage:
             self.model_kwargs["low_cpu_mem_usage"] = True
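
The elif chain above resolves config flags to the single `attn_implementation` string that HF Transformers understands. Condensed to its precedence logic (hypothetical `cfg` object, same flag names as above):

    def resolve_attn_implementation(cfg):
        # first truthy flag wins, mirroring the elif chain
        if getattr(cfg, "flex_attention", False):
            return "flex_attention"
        if getattr(cfg, "flash_attention", False):
            return "flash_attention_2"
        if getattr(cfg, "sdp_attention", False):
            return "sdpa"
        if getattr(cfg, "eager_attention", False):
            return "eager"
        return None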
@@ -759,7 +767,7 @@ class ModelLoader:
             )
         elif self.model_type == "MambaLMHeadModel":
             # FIXME this is janky at best and hacked together to make it work
-            MambaLMHeadModel = fix_mamba_attn_for_loss()
+            MambaLMHeadModel = fix_mamba_attn_for_loss()  # pylint: disable=invalid-name
 
             self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
             self.model_kwargs["device"] = torch.cuda.current_device()
@@ -808,6 +816,7 @@ class ModelLoader:
         if is_deepspeed_zero3_enabled():
             skip_move_to_device = True
 
+        # pylint: disable=protected-access
         if self.cfg.tensor_parallel_size > 1:
             # workaround for upstream 4.54.0 not setting _tp_size or _device_mesh
             # TODO(wing): remove once 4.54.1 is released

View File

@@ -277,14 +277,6 @@ class PatchManager:
             has_remote_code=has_remote_code,
         )
 
-        if self.cfg.sample_packing:
-            from axolotl.monkeypatch.data.batch_dataset_fetcher import (
-                apply_multipack_dataloader_patch,
-            )
-
-            LOG.info("Applying multipack dataloader patch for sample packing...")
-            apply_multipack_dataloader_patch()
-
     def _apply_fsdp2_bnb_patches(self):
         """Apply FSDP2 BNB patches."""
         if (

View File

@@ -50,7 +50,7 @@ def modify_tokenizer_files(
tokenizer_dir = os.path.join(output_dir, "tokenizer") tokenizer_dir = os.path.join(output_dir, "tokenizer")
os.makedirs(tokenizer_dir, exist_ok=True) os.makedirs(tokenizer_dir, exist_ok=True)
if is_local_main_process(): if is_local_main_process(): # pylint: disable=too-many-nested-blocks
# Load the tokenizer # Load the tokenizer
temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True) temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)
@@ -73,9 +73,9 @@ def modify_tokenizer_files(
for token_id, new_value in token_id_mappings.items(): for token_id, new_value in token_id_mappings.items():
token_id_str = str(token_id) token_id_str = str(token_id)
if token_id_str in config_data["added_tokens_decoder"]: if token_id_str in config_data["added_tokens_decoder"]:
config_data["added_tokens_decoder"][token_id_str]["content"] = ( config_data["added_tokens_decoder"][token_id_str][
new_value "content"
) ] = new_value
else: else:
raise ValueError( raise ValueError(
f"Token ID {token_id_str} not found in added_tokens_decoder" f"Token ID {token_id_str} not found in added_tokens_decoder"
@@ -215,7 +215,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
     for k, val in special_tokens.items():
         # check if new special token is not already in tokenizer and
         # is adapter training to make sure lora_modules_to_save is set
+        # pylint: disable=too-many-boolean-expressions
         if (
             (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
             and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
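The encode-length condition above treats a value as genuinely new when it does not already map to a single token id. A standalone illustration with a placeholder model id:

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")  # placeholder model id
    ids = tok.encode("<|im_start|>", add_special_tokens=False)
    print(len(ids))  # well above 2 here: gpt2 has no such single special token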

View File

@@ -21,4 +21,4 @@ def fix_mamba_attn_for_loss():
     from .modeling_mamba import MambaLMHeadModel as MambaLMHeadModelFixed

     mixer_seq_simple.MambaLMHeadModel = MambaLMHeadModelFixed
-    return mixer_seq_simple.MambaLMHeadModel
+    return mixer_seq_simple.MambaLMHeadModel  # pylint: disable=invalid-name

View File

@@ -1,3 +1,4 @@
+# pylint: skip-file
 import os
 from collections import namedtuple
 from functools import partial
@@ -111,7 +112,7 @@ class MambaLMHeadModel(nn.Module, GenerationMixin):
         self,
         save_directory: Union[str, os.PathLike],
         state_dict: Optional[dict] = None,
-        safe_serialization: Optional[bool] = None,
+        safe_serialization: Optional[bool] = None,  # pylint: disable=unused-argument
     ):
         if state_dict is None:
             state_dict = self.state_dict()

View File

@@ -130,9 +130,9 @@ def get_state_dict(self, model, unwrap=True):
"Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`." "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`."
) )
state_dict = ( state_dict = (
model._consolidated_16bit_state_dict() model._consolidated_16bit_state_dict() # pylint: disable=protected-access
if tp_sharding if tp_sharding
else model._zero3_consolidated_16bit_state_dict() else model._zero3_consolidated_16bit_state_dict() # pylint: disable=protected-access
) )
else: else:
raise ValueError( raise ValueError(
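The error string above implies a runtime version gate. A hedged sketch of such a check, not the actual implementation:

    from packaging import version

    import deepspeed  # assumed installed

    if version.parse(deepspeed.__version__) < version.parse("0.16.4"):
        raise RuntimeError("Deepspeed TP requires deepspeed >= 0.16.4")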
@@ -187,7 +187,7 @@ def _process_lora_module_for_fsdp(module, fsdp2_kwargs):

     # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to
     # wrap this. Therefore we must ensure the bias has the same dtype as the weight
-    if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None:
+    if module.base_layer.bias is not None:
         if module.base_layer.weight.dtype != module.base_layer.bias.dtype:
             log_bias_dtype_mismatch = True
             module.base_layer.bias.data = module.base_layer.bias.data.to(
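A standalone sketch of the dtype alignment this hunk performs, with a plain nn.Linear standing in for the PEFT base_layer:

    import torch
    import torch.nn as nn

    layer = nn.Linear(8, 8, bias=True, dtype=torch.bfloat16)
    layer.bias.data = layer.bias.data.to(torch.float32)  # simulate an fp32 bias

    if layer.bias is not None and layer.bias.dtype != layer.weight.dtype:
        # align bias dtype with the weight so the module can be wrapped
        layer.bias.data = layer.bias.data.to(layer.weight.dtype)

    assert layer.bias.dtype == layer.weight.dtype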
@@ -231,7 +231,8 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
     )

     is_type_fsdp = isinstance(model, FSDPModule) or (
-        is_compiled_module(model) and isinstance(model._orig_mod, FSDPModule)
+        is_compiled_module(model)
+        and isinstance(model._orig_mod, FSDPModule)  # pylint: disable=protected-access
     )
     if is_type_fsdp:
         return model
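The _orig_mod unwrap above relies on torch.compile keeping the original module reachable on its wrapper. A tiny standalone check (a private attribute, so subject to change upstream):

    import torch

    base = torch.nn.Linear(4, 4)
    compiled = torch.compile(base)
    assert compiled._orig_mod is base  # compiled wrapper keeps the original module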

View File

@@ -2,6 +2,7 @@
 workaround to allow parallelism config for pure CP
 """

+# pylint: disable=protected-access
 import os
 import warnings
@@ -29,7 +30,7 @@ def _validate_accelerator(self, accelerator):
     allow_parallelism_config = False

     if (
-        self.cp_size > 1
+        self.cp_size > 1  # pylint: disable=chained-comparison
         and self.dp_shard_size <= 1
         and os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true"
     ):
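The standalone-CP opt-in read above is gated on an environment variable; setting it programmatically here is purely for illustration (the variable name comes from the diff itself):

    import os

    os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
    assert os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true"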
@@ -54,7 +55,6 @@ def _validate_accelerator(self, accelerator):
         warnings.warn(
             "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings),
             UserWarning,
-            stacklevel=2,
         )
) )

View File

@@ -65,9 +65,11 @@ def patch_flex_wrapper(**flex_attn_compile_kwargs):
             return self._compiled_flex_attention

     transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention
-    sys.modules[
-        "transformers.integrations.flex_attention"
-    ].WrappedFlexAttention = WrappedFlexAttention
+    setattr(
+        sys.modules["transformers.integrations.flex_attention"],
+        "WrappedFlexAttention",
+        WrappedFlexAttention,
+    )


 def patch_flex_make_mask():
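Both the direct attribute assignment and the setattr call in this hunk target the same module object; a tiny standalone demonstration of that equivalence (the demo module name is made up):

    import sys
    import types

    mod = types.ModuleType("demo_mod")
    sys.modules["demo_mod"] = mod
    setattr(sys.modules["demo_mod"], "flag", 1)
    assert mod.flag == 1  # the attribute is visible on the module object itself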
@@ -142,7 +144,9 @@ def patch_flex_make_mask():
     # computation prior to the softmax. For sample packing, we need both the
     # logic for both causal mask and document mask. See PyTorch's official
     # blog post for more details: https://pytorch.org/blog/flexattention/#mask-mods
-    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
+    def causal_mask_mod(
+        batch_idx, head_idx, q_idx, kv_idx
+    ):  # pylint: disable=unused-argument
         """
         Defines the logic of a block causal mask by combining both a standard causal mask
         and a block diagonal document mask.
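A hedged, standalone sketch of a block-causal mask_mod in the style this docstring describes; document_ids is an assumed per-position tensor marking which packed sample each token belongs to:

    import torch

    document_ids = torch.tensor([0, 0, 0, 1, 1, 2])  # toy packed sequence

    def causal_mask_mod(batch_idx, head_idx, q_idx, kv_idx):
        causal = q_idx >= kv_idx  # standard causal constraint
        same_doc = document_ids[q_idx] == document_ids[kv_idx]  # document mask
        return causal & same_doc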
@@ -194,12 +198,14 @@ def patch_flex_make_mask():
     for n in tuple(sys.modules):
         if ".modeling_" in n:
             if hasattr(sys.modules[n], "make_flex_block_causal_mask"):
-                sys.modules[
-                    n
-                ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask
-                sys.modules[
-                    n
-                ].make_flex_block_causal_mask = patched_make_flex_block_causal_mask
+                sys.modules[n].make_flex_block_causal_mask = (
+                    patched_make_flex_block_causal_mask
+                )
+                setattr(
+                    sys.modules[n],
+                    "make_flex_block_causal_mask",
+                    patched_make_flex_block_causal_mask,
+                )

     transformers.integrations.flex_attention.make_flex_block_causal_mask = (
         patched_make_flex_block_causal_mask

Some files were not shown because too many files have changed in this diff.