models.py -> loaders/ module refactor (#2680)
* models.py -> loaders/ module refactor
* refactor ModelLoader class
* plugin manager changes
* circular import fix
* pytest
* pytest
* minor improvements
* fix
* minor changes
* fix test
* remove dead code
* coderabbit comments
* lint
* fix
* coderabbit suggestion I liked
* more coderabbit
* review comments, yak shaving
* lint
* updating in light of SP ctx manager changes
* review comment
* review comment 2
@@ -20,8 +20,9 @@ from transformers import (
    ProcessorMixin,
)

from axolotl.loaders import load_processor, load_tokenizer
from axolotl.loaders.model import ModelLoader
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_model, load_processor, load_tokenizer

LOG = logging.getLogger(__name__)

@@ -318,7 +319,8 @@ def load_model_and_tokenizer(
    tokenizer = load_tokenizer(cfg)

    LOG.info("loading model...")
    model, _ = load_model(cfg, tokenizer, inference=inference)
    model_loader = ModelLoader(cfg, tokenizer, inference=inference)
    model, _ = model_loader.load()

    processor = None
    if cfg.is_multimodal:
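For reference, a minimal sketch of the call-site change this hunk makes (`cfg` is the usual `DictDefault` config and `inference` a boolean already in scope; names are illustrative):

```python
from axolotl.loaders import load_tokenizer
from axolotl.loaders.model import ModelLoader

# Previously: model, _ = load_model(cfg, tokenizer, inference=inference)
# (from axolotl.utils.models). The loader is now a class with a load() method.
tokenizer = load_tokenizer(cfg)
model_loader = ModelLoader(cfg, tokenizer, inference=inference)
model, lora_config = model_loader.load()  # second element is the PEFT config, or None
```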
@@ -10,10 +10,10 @@ from datasets import Dataset

import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.tokenization import check_dataset_labels
@@ -59,6 +59,7 @@ from axolotl.core.training_args import (
    AxolotlTrainingArguments,
)
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import ensure_dtype
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
from axolotl.monkeypatch.relora import ReLoRACallback
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
@@ -86,7 +87,6 @@ from axolotl.utils.collators import (
    V2BatchSamplerDataCollatorForSeq2Seq,
)
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
from axolotl.utils.models import ensure_dtype
from axolotl.utils.schemas.enums import CustomSupportedOptimizers, RLType

try:
@@ -43,7 +43,7 @@ from trl.trainer.utils import pad

from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
from axolotl.monkeypatch.ring_attn.patch import get_ring_attn_group
from axolotl.monkeypatch.ring_attn import get_ring_attn_group

if is_peft_available():
    # pylint: disable=unused-import
@@ -10,71 +10,73 @@
# License for the specific language governing permissions and limitations under
# the License.

"""
Base class for all plugins.
"""Base class for all plugins.

A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
Plugins can be used to integrate third-party models, modify the training process, or add new features.

To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import collections
|
||||
import importlib
|
||||
import logging
|
||||
from typing import OrderedDict
|
||||
from typing import TYPE_CHECKING, Callable, OrderedDict, Union
|
||||
|
||||
import torch
|
||||
from peft import PeftModel
|
||||
from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import LRScheduler
|
||||
from transformers import PreTrainedModel, Trainer
|
||||
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from axolotl.common.datasets import TrainDatasetMeta
|
||||
|
||||
|
||||
class BasePlugin:
|
||||
"""
|
||||
Base class for all plugins. Defines the interface for plugin methods.
|
||||
|
||||
Attributes:
|
||||
None
|
||||
"""Base class for all plugins. Defines the interface for plugin methods.
|
||||
|
||||
Methods:
|
||||
register(cfg): Registers the plugin with the given configuration.
|
||||
load_datasets(cfg): Loads and preprocesses the dataset for training.
|
||||
pre_model_load(cfg): Performs actions before the model is loaded.
|
||||
post_model_build(cfg, model): Performs actions after the model is loaded, but before LoRA adapters are applied.
|
||||
pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
|
||||
post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
|
||||
post_model_load(cfg, model): Performs actions after the model is loaded, inclusive of any adapters.
|
||||
post_trainer_create(cfg, trainer): Performs actions after the trainer is created.
|
||||
create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
|
||||
create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and returns a learning rate scheduler.
|
||||
add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before training.
|
||||
add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after training.
|
||||
register(cfg): Registers the plugin with the given configuration.
|
||||
load_datasets(cfg): Loads and preprocesses the dataset for training.
|
||||
pre_model_load(cfg): Performs actions before the model is loaded.
|
||||
post_model_build(cfg, model): Performs actions after the model is loaded, but
|
||||
before LoRA adapters are applied.
|
||||
pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
|
||||
post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
|
||||
post_model_load(cfg, model): Performs actions after the model is loaded,
|
||||
inclusive of any adapters.
|
||||
post_trainer_create(cfg, trainer): Performs actions after the trainer is
|
||||
created.
|
||||
create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
|
||||
create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and
|
||||
returns a learning rate scheduler.
|
||||
add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before
|
||||
training.
|
||||
add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after
|
||||
training.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
"""
|
||||
Initializes the BasePlugin.
|
||||
"""
|
||||
"""Initializes the BasePlugin."""
|
||||
|
||||
def register(self, cfg): # pylint: disable=unused-argument
|
||||
"""
|
||||
Registers the plugin with the given configuration.
|
||||
"""Registers the plugin with the given configuration.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugin.
|
||||
"""
|
||||
|
||||
def get_input_args(self) -> str | None:
|
||||
"""
|
||||
Returns a pydantic model for the plugin's input arguments.
|
||||
"""
|
||||
"""Returns a pydantic model for the plugin's input arguments."""
|
||||
|
||||
def load_datasets(self, cfg: DictDefault, preprocess: bool = False):
|
||||
"""
|
||||
Loads and preprocesses the dataset for training.
|
||||
def load_datasets(
|
||||
self, cfg: DictDefault, preprocess: bool = False
|
||||
) -> Union["TrainDatasetMeta", None]:
|
||||
"""Loads and preprocesses the dataset for training.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugin.
|
||||
@@ -84,181 +86,164 @@ class BasePlugin:
|
||||
dataset_meta: The metadata for the training dataset.
|
||||
"""
|
||||
|
||||
def pre_model_load(self, cfg): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions before the model is loaded.
|
||||
def pre_model_load(self, cfg: DictDefault): # pylint: disable=unused-argument
|
||||
"""Performs actions before the model is loaded.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
cfg: The configuration for the plugin.
|
||||
"""
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
|
||||
"""Performs actions after the model is built/loaded, but before any adapters are applied.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugin.
|
||||
"""
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
|
||||
"""Performs actions before LoRA weights are loaded.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugin.
|
||||
model: The loaded model.
|
||||
"""
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
||||
"""Performs actions after LoRA weights are loaded.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugin.
|
||||
model: The loaded model.
|
||||
"""
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
||||
"""Performs actions after the model is loaded.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugin.
|
||||
model: The loaded model.
|
||||
"""
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
|
||||
"""Returns a custom class for the trainer.
|
||||
|
||||
Args:
|
||||
cfg: The global axolotl configuration.
|
||||
|
||||
Returns:
|
||||
None
|
||||
A custom trainer class to use, or `None` to use the default trainer.
|
||||
"""
|
||||
|
||||
def post_model_build(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after the model is built/loaded, but before any adapters are applied.
|
||||
# pylint: disable=unused-argument
|
||||
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
|
||||
"""Performs actions after the trainer is created.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
cfg: The configuration for the plugin.
|
||||
trainer: The trainer object for training.
|
||||
"""
|
||||
|
||||
def post_model_load(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after the model is loaded.
|
||||
# pylint: disable=unused-argument
|
||||
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
|
||||
"""Creates and returns an optimizer for training.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
model (object): The loaded model.
|
||||
cfg: The configuration for the plugin.
|
||||
trainer: The trainer object for training.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
def pre_lora_load(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions before LoRA weights are loaded.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
def post_lora_load(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after LoRA weights are loaded.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
def get_trainer_cls(self, cfg): # pylint: disable=unused-argument):
|
||||
"""
|
||||
Returns a custom class for the trainer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The global axolotl configuration.
|
||||
|
||||
Returns:
|
||||
class: The class for the trainer.
|
||||
"""
|
||||
|
||||
def post_trainer_create(self, cfg, trainer): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after the trainer is created.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
trainer (object): The trainer object for training.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
def create_optimizer(self, cfg, trainer): # pylint: disable=unused-argument
|
||||
"""
|
||||
Creates and returns an optimizer for training.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
trainer (object): The trainer object for training.
|
||||
|
||||
Returns:
|
||||
object: The created optimizer.
|
||||
The created optimizer.
|
||||
"""
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def create_lr_scheduler(
|
||||
self, cfg, trainer, optimizer, num_training_steps
|
||||
) -> LRScheduler | None: # pylint: disable=unused-argument
|
||||
"""
|
||||
Creates and returns a learning rate scheduler.
|
||||
self,
|
||||
cfg: DictDefault,
|
||||
trainer: Trainer,
|
||||
optimizer: Optimizer,
|
||||
num_training_steps: int,
|
||||
) -> LRScheduler | None:
|
||||
"""Creates and returns a learning rate scheduler.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
trainer (object): The trainer object for training.
|
||||
optimizer (object): The optimizer for training.
|
||||
num_training_steps (int): Total number of training steps
|
||||
cfg: The configuration for the plugin.
|
||||
trainer: The trainer object for training.
|
||||
optimizer: The optimizer for training.
|
||||
num_training_steps: Total number of training steps
|
||||
|
||||
Returns:
|
||||
object (LRScheduler): The created learning rate scheduler.
|
||||
The created learning rate scheduler.
|
||||
"""
|
||||
|
||||
def add_callbacks_pre_trainer(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
setup callbacks before creating the trainer.
|
||||
# pylint: disable=unused-argument
|
||||
def add_callbacks_pre_trainer(
|
||||
self, cfg: DictDefault, model: PreTrainedModel
|
||||
) -> list[Callable]:
|
||||
"""Set up callbacks before creating the trainer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
model (object): The loaded model.
|
||||
cfg: The configuration for the plugin.
|
||||
model: The loaded model.
|
||||
|
||||
Returns:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs
|
||||
A list of callback functions to be added to the `TrainingArgs`.
|
||||
"""
|
||||
return []
|
||||
|
||||
# pylint: disable=unused-argument
|
||||
def add_callbacks_post_trainer(
|
||||
self, cfg, trainer
|
||||
): # pylint: disable=unused-argument
|
||||
"""
|
||||
Adds callbacks to the trainer after creating the trainer.
|
||||
This is useful for callbacks that require access to the model or trainer.
|
||||
self, cfg: DictDefault, trainer: Trainer
|
||||
) -> list[Callable]:
|
||||
"""Adds callbacks to the trainer after creating the trainer. This is useful for
|
||||
callbacks that require access to the model or trainer.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
trainer (object): The trainer object for training.
|
||||
cfg: The configuration for the plugin.
|
||||
trainer: The trainer object for training.
|
||||
|
||||
Returns:
|
||||
List[callable]: A list of callback functions to be added
|
||||
A list of callback functions to be added to the trainer.
|
||||
"""
|
||||
return []
|
||||
|
||||
def post_train(self, cfg, model): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after training is complete.
|
||||
# pylint: disable=unused-argument
|
||||
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
||||
"""Performs actions after training is complete.
|
||||
|
||||
Args:
|
||||
cfg (dict): The axolotl configuration
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
cfg: The axolotl configuration.
|
||||
model: The loaded model.
|
||||
"""
|
||||
|
||||
def post_train_unload(self, cfg): # pylint: disable=unused-argument
|
||||
"""
|
||||
Performs actions after training is complete and the model is unloaded.
|
||||
def post_train_unload(self, cfg: DictDefault): # pylint: disable=unused-argument
|
||||
"""Performs actions after training is complete and the model is unloaded.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
|
||||
Returns:
|
||||
None
|
||||
cfg: The configuration for the plugin.
|
||||
"""
|
||||
|
||||
|
||||
def load_plugin(plugin_name: str) -> BasePlugin:
|
||||
"""
|
||||
Loads a plugin based on the given plugin name.
|
||||
"""Loads a plugin based on the given plugin name.
|
||||
|
||||
The plugin name should be in the format "module_name.class_name".
|
||||
This function splits the plugin name into module and class, imports the module,
|
||||
retrieves the class from the module, and creates an instance of the class.
|
||||
The plugin name should be in the format "module_name.class_name". This function
|
||||
splits the plugin name into module and class, imports the module, retrieves the
|
||||
class from the module, and creates an instance of the class.
|
||||
|
||||
Parameters:
|
||||
plugin_name (str): The name of the plugin to be loaded. The name should be in the format "module_name.class_name".
|
||||
Args:
|
||||
plugin_name: The name of the plugin to be loaded. The name should be in the
|
||||
format "module_name.class_name".
|
||||
|
||||
Returns:
|
||||
BasePlugin: An instance of the loaded plugin.
|
||||
An instance of the loaded plugin.
|
||||
|
||||
Raises:
|
||||
ImportError: If the plugin module cannot be imported.
|
||||
ImportError: If the plugin module cannot be imported.
|
||||
"""
|
||||
# split the plugin name into module and class
|
||||
module_name, class_name = plugin_name.rsplit(".", 1)
|
||||
@@ -284,28 +269,25 @@ def load_plugin(plugin_name: str) -> BasePlugin:
|
||||
|
||||
|
||||
class PluginManager:
|
||||
"""
|
||||
The PluginManager class is responsible for loading and managing plugins.
|
||||
It should be a singleton so it can be accessed from anywhere in the codebase.
|
||||
"""The `PluginManager` class is responsible for loading and managing plugins. It
|
||||
should be a singleton so it can be accessed from anywhere in the codebase.
|
||||
|
||||
Attributes:
|
||||
plugins (List[BasePlugin]): A list of loaded plugins.
|
||||
plugins: A list of loaded plugins.
|
||||
|
||||
Methods:
|
||||
get_instance(): Static method to get the singleton instance of PluginManager.
|
||||
register(plugin_name: str): Registers a new plugin by its name.
|
||||
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
|
||||
get_instance(): Static method to get the singleton instance of `PluginManager`.
|
||||
register(plugin_name: str): Registers a new plugin by its name.
|
||||
pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
|
||||
"""
|
||||
|
||||
plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()
|
||||
|
||||
_instance = None
|
||||
_cfg = None
|
||||
_instance: PluginManager | None = None
|
||||
_cfg: DictDefault | None = None
|
||||
|
||||
def __new__(cls):
|
||||
"""
|
||||
Creates a new instance of PluginManager if it doesn't exist yet.
|
||||
"""
|
||||
"""Creates a new instance of PluginManager if it doesn't exist yet."""
|
||||
if cls._instance is None:
|
||||
cls._instance = super(PluginManager, cls).__new__(cls)
|
||||
cls._instance.plugins: OrderedDict[str, BasePlugin] = (
|
||||
@@ -315,9 +297,8 @@ class PluginManager:
|
||||
|
||||
@staticmethod
|
||||
def get_instance() -> "PluginManager":
|
||||
"""
|
||||
Returns the singleton instance of PluginManager.
|
||||
If the instance doesn't exist, it creates a new one.
|
||||
"""Returns the singleton instance of PluginManager. If the instance doesn't
|
||||
exist, it creates a new one.
|
||||
"""
|
||||
if PluginManager._instance is None:
|
||||
PluginManager()
|
||||
@@ -332,17 +313,13 @@ class PluginManager:
|
||||
self._cfg = cfg
|
||||
|
||||
def register(self, plugin_name: str):
|
||||
"""
|
||||
Registers a new plugin by its name.
|
||||
"""Registers a new plugin by its name.
|
||||
|
||||
Parameters:
|
||||
plugin_name (str): The name of the plugin to be registered.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
plugin_name: The name of the plugin to be registered.
|
||||
|
||||
Raises:
|
||||
ImportError: If the plugin module cannot be imported.
|
||||
ImportError: If the plugin module cannot be imported.
|
||||
"""
|
||||
try:
|
||||
logging.info(f"Attempting to load plugin: {plugin_name}")
|
||||
@@ -352,12 +329,11 @@ class PluginManager:
|
||||
except ImportError:
|
||||
logging.error(f"Failed to load plugin: {plugin_name}")
|
||||
|
||||
def get_input_args(self):
|
||||
"""
|
||||
Returns a list of Pydantic classes for all registered plugins' input arguments.'
|
||||
def get_input_args(self) -> list[str]:
|
||||
"""Returns a list of Pydantic classes for all registered plugins' input arguments.'
|
||||
|
||||
Returns:
|
||||
list[str]: A list of Pydantic classes for all registered plugins' input arguments.'
|
||||
A list of Pydantic classes for all registered plugins' input arguments.
|
||||
"""
|
||||
input_args = []
|
||||
for plugin in self.plugins.values():
|
||||
@@ -366,16 +342,17 @@ class PluginManager:
|
||||
input_args.append(input_args_from_plugin)
|
||||
return input_args
|
||||
|
||||
def load_datasets(self, cfg, preprocess: bool = False):
|
||||
"""
|
||||
Calls the load_datasets method of each registered plugin.
|
||||
def load_datasets(
|
||||
self, cfg: DictDefault, preprocess: bool = False
|
||||
) -> Union["TrainDatasetMeta", None]:
|
||||
"""Calls the load_datasets method of each registered plugin.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
preprocess : Whether this is preprocess step of the datasets.
|
||||
preprocess: Whether this is the dataset preprocessing step.
|
||||
|
||||
Returns:
|
||||
dataset_meta: The dataset metadata loaded from all registered plugins.
|
||||
The dataset metadata loaded from all registered plugins.
|
||||
"""
|
||||
return_ds_meta = None
|
||||
for plugin in self.plugins.values():
|
||||
@@ -387,83 +364,66 @@ class PluginManager:
|
||||
raise RuntimeError("Multiple plugins loaded datasets")
|
||||
return return_ds_meta
|
||||
|
||||
def pre_model_load(self, cfg):
|
||||
"""
|
||||
Calls the pre_model_load method of all registered plugins.
|
||||
def pre_model_load(self, cfg: DictDefault):
|
||||
"""Calls the pre_model_load method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.pre_model_load(cfg)
|
||||
|
||||
def post_model_build(self, cfg, model):
|
||||
"""
|
||||
Calls the post_model_build method of all registered plugins after the model has been built/loaded,
|
||||
but before any adapters have been applied.
|
||||
def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
|
||||
"""Calls the `post_model_build` method of all registered plugins after the
|
||||
model has been built / loaded, but before any adapters have been applied.
|
||||
|
||||
Args:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
cfg: The configuration for the plugins.
|
||||
model: The loaded model.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_model_build(cfg, model)
|
||||
|
||||
def post_model_load(self, cfg, model):
|
||||
"""
|
||||
Calls the post_model_load method of all registered plugins after the model has been loaded
|
||||
inclusive of any adapters
|
||||
def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
|
||||
"""Calls the `pre_lora_load` method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_model_load(cfg, model)
|
||||
|
||||
def pre_lora_load(self, cfg, model):
|
||||
"""
|
||||
Calls the pre_lora_load method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
model: The loaded model.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.pre_lora_load(cfg, model)
|
||||
|
||||
def post_lora_load(self, cfg, model):
|
||||
"""
|
||||
Calls the post_lora_load method of all registered plugins.
|
||||
def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
||||
"""Calls the `post_lora_load` method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
model: The loaded model.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_lora_load(cfg, model)
|
||||
|
||||
def get_trainer_cls(self, cfg):
|
||||
"""
|
||||
Calls the get_trainer_cls method of all registered plugins and returns the first non-None trainer class.
|
||||
def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
||||
"""Calls the `post_model_load` method of all registered plugins after the model
|
||||
has been loaded inclusive of any adapters.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
model: The loaded model.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_model_load(cfg, model)
|
||||
|
||||
def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
|
||||
"""Calls the `get_trainer_cls` method of all registered plugins and returns the
|
||||
first non-`None` trainer class.
|
||||
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
|
||||
Returns:
|
||||
object: The trainer class, or None if none was found.
|
||||
The first non-`None` trainer class returned by a plugin.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
trainer_cls = plugin.get_trainer_cls(cfg)
|
||||
@@ -471,29 +431,25 @@ class PluginManager:
|
||||
return trainer_cls
|
||||
return None
|
||||
|
||||
def post_trainer_create(self, cfg, trainer):
|
||||
"""
|
||||
Calls the post_trainer_create method of all registered plugins.
|
||||
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
|
||||
"""Calls the `post_trainer_create` method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
trainer (object): The trainer object for training.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
trainer: The trainer object for training.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_trainer_create(cfg, trainer)
|
||||
|
||||
def create_optimizer(self, trainer):
|
||||
"""
|
||||
Calls the create_optimizer method of all registered plugins and returns the first non-None optimizer.
|
||||
def create_optimizer(self, trainer: Trainer) -> Optimizer | None:
|
||||
"""Calls the `create_optimizer` method of all registered plugins and returns
|
||||
the first non-`None` optimizer.
|
||||
|
||||
Parameters:
|
||||
trainer (object): The trainer object for training.
|
||||
Args:
|
||||
trainer: The trainer object for training.
|
||||
|
||||
Returns:
|
||||
object: The created optimizer, or None if none was found.
|
||||
The created optimizer, or `None` if none was found.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
optimizer = plugin.create_optimizer(self.cfg, trainer)
|
||||
@@ -502,17 +458,17 @@ class PluginManager:
|
||||
return None
|
||||
|
||||
def create_lr_scheduler(
|
||||
self, trainer, optimizer, num_training_steps
|
||||
self, trainer: Trainer, optimizer: Optimizer, num_training_steps: int
|
||||
) -> LRScheduler | None:
|
||||
"""
|
||||
Calls the create_lr_scheduler method of all registered plugins and returns the first non-None scheduler.
|
||||
"""Calls the `create_lr_scheduler` method of all registered plugins and returns
|
||||
the first non-`None` scheduler.
|
||||
|
||||
Parameters:
|
||||
trainer (object): The trainer object for training.
|
||||
optimizer (object): The optimizer for training.
|
||||
Args:
|
||||
trainer: The trainer object for training.
|
||||
optimizer: The optimizer for training.
|
||||
|
||||
Returns:
|
||||
object: The created learning rate scheduler, or None if none was found.
|
||||
The created learning rate scheduler, or `None` if not found.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
scheduler: LRScheduler | None = plugin.create_lr_scheduler(
|
||||
@@ -525,16 +481,17 @@ class PluginManager:
|
||||
return scheduler
|
||||
return None
|
||||
|
||||
def add_callbacks_pre_trainer(self, cfg, model):
|
||||
"""
|
||||
Calls the add_callbacks_pre_trainer method of all registered plugins.
|
||||
def add_callbacks_pre_trainer(
|
||||
self, cfg: DictDefault, model: PreTrainedModel
|
||||
) -> list[Callable]:
|
||||
"""Calls the add_callbacks_pre_trainer method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
model: The loaded model.
|
||||
|
||||
Returns:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs.
|
||||
A list of callback functions to be added to the `TrainingArgs`.
|
||||
"""
|
||||
callbacks = []
|
||||
for plugin in self.plugins.values():
|
||||
@@ -543,16 +500,17 @@ class PluginManager:
|
||||
callbacks.extend(plugin_callbacks)
|
||||
return callbacks
|
||||
|
||||
def add_callbacks_post_trainer(self, cfg, trainer):
|
||||
"""
|
||||
Calls the add_callbacks_post_trainer method of all registered plugins.
|
||||
def add_callbacks_post_trainer(
|
||||
self, cfg: DictDefault, trainer: Trainer
|
||||
) -> list[Callable]:
|
||||
"""Calls the `add_callbacks_post_trainer` method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
trainer (object): The trainer object for training.
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
trainer: The trainer object for training.
|
||||
|
||||
Returns:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs.
|
||||
A list of callback functions to be added to the `TrainingArgs`.
|
||||
"""
|
||||
callbacks = []
|
||||
for plugin in self.plugins.values():
|
||||
@@ -561,41 +519,31 @@ class PluginManager:
|
||||
callbacks.extend(plugin_callbacks)
|
||||
return callbacks
|
||||
|
||||
def post_train(self, cfg, model):
|
||||
"""
|
||||
Calls the post_train method of all registered plugins.
|
||||
def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
|
||||
"""Calls the post_train method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
model: The loaded model.
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_train(cfg, model)
|
||||
|
||||
def post_train_unload(self, cfg):
|
||||
"""
|
||||
Calls the post_train_unload method of all registered plugins.
|
||||
def post_train_unload(self, cfg: DictDefault):
|
||||
"""Calls the post_train_unload method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
Args:
|
||||
cfg: The configuration for the plugins.
|
||||
|
||||
"""
|
||||
for plugin in self.plugins.values():
|
||||
plugin.post_train_unload(cfg)
|
||||
|
||||
|
||||
class BaseOptimizerFactory:
|
||||
"""
|
||||
Base class for factories to create custom optimizers
|
||||
"""
|
||||
"""Base class for factories to create custom optimizers"""
|
||||
|
||||
def __call__(
|
||||
self, opt_model, training_args, **optimizer_kwargs
|
||||
) -> "torch.optim.Optimizer":
|
||||
) -> Optimizer | None:
|
||||
pass
|
||||
|
||||
src/axolotl/loaders/__init__.py (new file, 10 lines)
@@ -0,0 +1,10 @@
"""Init for axolotl.loaders module"""

# pylint: disable=unused-import
# flake8: noqa

from .adapter import load_adapter, load_lora
from .constants import MULTIMODAL_AUTO_MODEL_MAPPING
from .model import ModelLoader
from .processor import load_processor
from .tokenizer import load_tokenizer
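With the package `__init__` above, the common loaders can now be imported from a single place; a hedged sketch (the `load_processor` signature is assumed from the call sites elsewhere in this diff):

```python
from axolotl.loaders import ModelLoader, load_processor, load_tokenizer

tokenizer = load_tokenizer(cfg)  # cfg: a DictDefault training config
processor = None
if cfg.is_multimodal:
    processor = load_processor(cfg, tokenizer)  # signature assumed
```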
src/axolotl/loaders/adapter.py (new file, 206 lines)
@@ -0,0 +1,206 @@
|
||||
"""Adapter loading functionality, including LoRA / QLoRA and associated utils"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import types
|
||||
from typing import Any
|
||||
|
||||
import bitsandbytes as bnb
|
||||
import torch
|
||||
from bitsandbytes.nn import Params4bit
|
||||
from peft import (
|
||||
AdaptionPromptConfig,
|
||||
LoftQConfig,
|
||||
LoraConfig,
|
||||
PeftConfig,
|
||||
PeftMixedModel,
|
||||
PeftModel,
|
||||
get_peft_model,
|
||||
)
|
||||
from transformers import PreTrainedModel
|
||||
|
||||
from axolotl.loaders.utils import get_linear_embedding_layers
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def setup_quantized_meta_for_peft(model: torch.nn.Module):
|
||||
"""Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""
|
||||
|
||||
def temp_to_method(self, *args, **kwargs): # pylint: disable=unused-argument
|
||||
return self
|
||||
|
||||
for param in model.parameters():
|
||||
if isinstance(param, Params4bit):
|
||||
param.quant_state._orig_to = ( # pylint: disable=protected-access
|
||||
param.quant_state.to
|
||||
)
|
||||
param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)
|
||||
|
||||
|
||||
def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
|
||||
"""Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
|
||||
for param in model.parameters():
|
||||
if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
|
||||
param.quant_state.to = (
|
||||
param.quant_state._orig_to # pylint: disable=protected-access
|
||||
)
|
||||
param.quant_state._orig_to = None # pylint: disable=protected-access
|
||||
|
||||
|
||||
def find_all_linear_names(model):
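    """Collect names of linear modules that can serve as LoRA target modules, excluding
    the output embedding layer reported by `get_linear_embedding_layers`."""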
|
||||
cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
|
||||
lora_module_names = set()
|
||||
for name, module in model.named_modules():
|
||||
if (
|
||||
isinstance(module, cls)
|
||||
or "Linear" in module.__class__.__name__
|
||||
and module.__class__.__name__ not in ("LlamaLinearScalingRotaryEmbedding",)
|
||||
):
|
||||
names = name.split(".")
|
||||
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
|
||||
|
||||
embedding_modules = get_linear_embedding_layers(model.config.model_type)
|
||||
output_embedding = embedding_modules[1]
|
||||
if output_embedding in lora_module_names: # needed for 16-bit
|
||||
lora_module_names.remove(output_embedding)
|
||||
|
||||
return list(lora_module_names)
|
||||
|
||||
|
||||
def load_lora(
|
||||
model: PreTrainedModel,
|
||||
cfg: DictDefault,
|
||||
inference: bool = False,
|
||||
config_only: bool = False,
|
||||
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel | None, PeftConfig | None]:
|
||||
lora_target_modules = cfg.lora_target_modules or []
|
||||
|
||||
if cfg.lora_target_linear:
|
||||
linear_names = find_all_linear_names(model)
|
||||
LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
|
||||
lora_target_modules_as_list = (
|
||||
lora_target_modules
|
||||
if isinstance(lora_target_modules, list)
|
||||
else [lora_target_modules]
|
||||
)
|
||||
lora_target_modules = list(set(lora_target_modules_as_list + linear_names))
|
||||
|
||||
lora_config_kwargs = {}
|
||||
loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
|
||||
if loftq_bits:
|
||||
lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
|
||||
lora_config_kwargs["init_lora_weights"] = "loftq"
|
||||
if cfg.peft_init_lora_weights:
|
||||
lora_config_kwargs["init_lora_weights"] = cfg.peft_init_lora_weights
|
||||
if cfg.peft_use_dora:
|
||||
lora_config_kwargs["use_dora"] = cfg.peft_use_dora
|
||||
LOG.info("Initializing LoRA weights using dora. This might take longer.")
|
||||
if cfg.peft_use_rslora:
|
||||
lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
|
||||
if cfg.peft_layer_replication:
|
||||
lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
|
||||
|
||||
lora_config = LoraConfig(
|
||||
r=cfg.lora_r,
|
||||
lora_alpha=cfg.lora_alpha,
|
||||
target_modules=lora_target_modules,
|
||||
layers_to_transform=cfg.peft_layers_to_transform,
|
||||
layers_pattern=cfg.peft_layers_pattern,
|
||||
lora_dropout=cfg.lora_dropout,
|
||||
fan_in_fan_out=cfg.lora_fan_in_fan_out,
|
||||
modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
|
||||
bias="none",
|
||||
task_type="CAUSAL_LM",
|
||||
**lora_config_kwargs,
|
||||
)
|
||||
|
||||
if config_only:
|
||||
return None, lora_config
|
||||
|
||||
rank = int(os.environ.get("LOCAL_RANK", 0))
|
||||
|
||||
if (
|
||||
cfg.fsdp
|
||||
and cfg.adapter
|
||||
and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
|
||||
and rank != 0
|
||||
):
|
||||
setup_quantized_meta_for_peft(model)
|
||||
|
||||
if cfg.lora_model_dir:
|
||||
LOG.debug("Loading pretrained PEFT - LoRA")
|
||||
model_kwargs: Any = {}
|
||||
if cfg.lora_on_cpu:
|
||||
model_kwargs["max_memory"] = {"cpu": "256GiB"}
|
||||
model_kwargs["device_map"] = {"": "cpu"}
|
||||
model = PeftModel.from_pretrained(
|
||||
model,
|
||||
cfg.lora_model_dir,
|
||||
is_trainable=(not inference),
|
||||
**model_kwargs,
|
||||
)
|
||||
else:
|
||||
model = get_peft_model(model, lora_config)
|
||||
|
||||
if rank == 0:
|
||||
try:
|
||||
model.print_trainable_parameters()
|
||||
except AttributeError as exc:
|
||||
LOG.warning(
|
||||
"Exception caught during model.print_trainable_parameters(): %s", exc
|
||||
)
|
||||
elif (
|
||||
cfg.fsdp
|
||||
and cfg.adapter
|
||||
and cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
|
||||
and rank != 0
|
||||
):
|
||||
setup_quantized_peft_meta_for_training(model)
|
||||
|
||||
return model, lora_config
|
||||
|
||||
|
||||
def load_adapter(
|
||||
model: PreTrainedModel,
|
||||
cfg: DictDefault,
|
||||
adapter: str | None,
|
||||
inference: bool = False,
|
||||
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel, PeftConfig | None]:
|
||||
if adapter is None:
|
||||
return model, None
|
||||
if hasattr(model, "enable_input_require_grads"):
|
||||
model.enable_input_require_grads()
|
||||
if adapter in ["lora", "qlora"]:
|
||||
peft_model, lora_config = load_lora(model, cfg, inference=inference)
|
||||
return peft_model, lora_config
|
||||
if adapter == "llama-adapter":
|
||||
peft_model, lora_config = load_llama_adapter(model, cfg)
|
||||
return peft_model, lora_config
|
||||
|
||||
raise NotImplementedError(f"{adapter} PEFT adapter not available")
|
||||
|
||||
|
||||
def load_llama_adapter(
|
||||
model: PreTrainedModel, cfg: DictDefault
|
||||
) -> tuple[PeftModel | PeftMixedModel, PeftConfig]:
|
||||
peft_config = AdaptionPromptConfig(
|
||||
adapter_layers=cfg.peft_adapter.layers, # layers (L)
|
||||
adapter_len=cfg.peft_adapter.len, # prompt length (K)
|
||||
task_type="CAUSAL_LM",
|
||||
)
|
||||
|
||||
if cfg.lora_model_dir:
|
||||
LOG.debug("Loading pretrained PEFT - llama_adapter")
|
||||
peft_model = PeftModel.from_pretrained(
|
||||
model,
|
||||
cfg.lora_model_dir,
|
||||
torch_dtype=torch.float16,
|
||||
)
|
||||
else:
|
||||
peft_model = get_peft_model(model, peft_config)
|
||||
|
||||
peft_model.print_trainable_parameters()
|
||||
|
||||
return peft_model, peft_config
|
||||
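A hedged sketch of how these adapter helpers compose, assuming a `model` and `cfg` already in scope (this mirrors how `ModelLoader._load_adapters` uses them later in this diff):

```python
from axolotl.loaders.adapter import load_adapter, load_lora

# LoRA / QLoRA path: wraps the base model in a PeftModel and returns its config.
model, lora_config = load_adapter(model, cfg, cfg.adapter, inference=False)

# DPO-style flows only want the LoraConfig, not the wrapped model.
_, lora_config_only = load_lora(model, cfg, inference=False, config_only=True)
```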
src/axolotl/loaders/constants.py (new file, 21 lines)
@@ -0,0 +1,21 @@
"""Shared constants for axolotl.loaders module"""

from transformers import (
    Gemma3ForConditionalGeneration,
    Llama4ForConditionalGeneration,
    LlavaForConditionalGeneration,
    Mistral3ForConditionalGeneration,
    MllamaForConditionalGeneration,
    Qwen2_5_VLForConditionalGeneration,
    Qwen2VLForConditionalGeneration,
)

MULTIMODAL_AUTO_MODEL_MAPPING = {
    "mllama": MllamaForConditionalGeneration,
    "llama4": Llama4ForConditionalGeneration,
    "llava": LlavaForConditionalGeneration,
    "qwen2_vl": Qwen2VLForConditionalGeneration,
    "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
    "mistral3": Mistral3ForConditionalGeneration,
    "gemma3": Gemma3ForConditionalGeneration,
}
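This mapping is consumed by `ModelLoader._set_auto_model_loader` further down; the lookup with its fallback looks like:

```python
from transformers import AutoModelForVision2Seq

from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING

# Unknown multimodal model types fall back to the generic vision-to-seq auto class.
auto_cls = MULTIMODAL_AUTO_MODEL_MAPPING.get("gemma3", AutoModelForVision2Seq)
```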
src/axolotl/loaders/model.py (new file, 754 lines)
@@ -0,0 +1,754 @@
|
||||
"""Model loader class implementation for loading, configuring, and patching various
|
||||
models.
|
||||
"""
|
||||
|
||||
import gc
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from functools import cached_property
|
||||
from importlib.util import find_spec
|
||||
from typing import Any
|
||||
|
||||
import peft
|
||||
import torch
|
||||
import transformers
|
||||
import transformers.modeling_utils
|
||||
from accelerate import init_empty_weights
|
||||
from peft import PeftConfig, PeftMixedModel, PeftModel, prepare_model_for_kbit_training
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForVision2Seq,
|
||||
AwqConfig,
|
||||
BitsAndBytesConfig,
|
||||
GPTQConfig,
|
||||
PreTrainedModel,
|
||||
PreTrainedTokenizerBase,
|
||||
)
|
||||
from transformers.integrations.deepspeed import (
|
||||
HfTrainerDeepSpeedConfig,
|
||||
is_deepspeed_zero3_enabled,
|
||||
)
|
||||
|
||||
from axolotl.common.architectures import MOE_ARCH_BLOCK
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.loaders.adapter import load_adapter, load_lora
|
||||
from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING
|
||||
from axolotl.loaders.patch_manager import PatchManager
|
||||
from axolotl.loaders.utils import (
|
||||
get_linear_embedding_layers,
|
||||
get_module_class_from_name,
|
||||
load_model_config,
|
||||
)
|
||||
from axolotl.models.mamba import fix_mamba_attn_for_loss
|
||||
from axolotl.utils.bench import log_gpu_memory_usage
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import (
|
||||
get_device_count,
|
||||
get_device_type,
|
||||
)
|
||||
from axolotl.utils.model_shard_quant import load_sharded_model_quant
|
||||
from axolotl.utils.schemas.enums import RLType
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
PLUGIN_MANAGER = PluginManager.get_instance()
|
||||
|
||||
|
||||
class ModelLoader:
|
||||
"""Manages model configuration, initialization and application of patches during
|
||||
model loading.
|
||||
|
||||
This class orchestrates the entire process of loading a model from configuration to
|
||||
final preparation. It handles device mapping, quantization, attention mechanisms,
|
||||
adapter integration, and various optimizations.
|
||||
|
||||
The loading process includes:
|
||||
- Loading and validating model configuration
|
||||
- Applying monkey patches for optimizations / fixes
|
||||
- Setting up device mapping (including multi-GPU configurations)
|
||||
- Configuring quantization
|
||||
- Setting attention mechanisms (Flash Attention, SDPA, etc.)
|
||||
- Loading and initializing the model
|
||||
- Applying adapters (LoRA, QLoRA, etc.)
|
||||
|
||||
Attributes:
|
||||
model: The loaded model instance (available after load() is called).
|
||||
model_kwargs: Dictionary of keyword arguments passed to model initialization.
|
||||
base_model: Name or path of the base model to load.
|
||||
model_type: Type of model to load (e.g., `AutoModelForCausalLM`).
|
||||
model_config: Configuration object for the model.
|
||||
auto_model_loader: The class used for loading the model (default:
|
||||
`AutoModelForCausalLM`).
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
cfg: DictDefault,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
*,
|
||||
inference: bool = False,
|
||||
reference_model: bool = False,
|
||||
**kwargs, # pylint: disable=unused-argument
|
||||
):
|
||||
"""Initializes the ModelLoader.
|
||||
|
||||
Args:
|
||||
cfg: Configuration dictionary with model and training settings.
|
||||
tokenizer: Tokenizer instance associated with the model.
|
||||
processor: Optional processor for multimodal models. Defaults to None.
|
||||
inference: Whether the model is being loaded for inference mode. Defaults
|
||||
to False.
|
||||
reference_model: Whether this is a reference model (used in setups like DPO
|
||||
training). Defaults to False.
|
||||
**kwargs: Additional keyword arguments (ignored).
|
||||
"""
|
||||
self.cfg = cfg
|
||||
self.tokenizer = tokenizer
|
||||
self.inference: bool = inference
|
||||
self.reference_model: bool = reference_model
|
||||
|
||||
# Init model kwargs
|
||||
self.model_kwargs: dict[str, Any] = {}
|
||||
if cfg.overrides_of_model_kwargs:
|
||||
for key, val in cfg.overrides_of_model_kwargs.items():
|
||||
self.model_kwargs[key] = val
|
||||
|
||||
# Init model
|
||||
self.model: PreTrainedModel | PeftModel | PeftMixedModel
|
||||
self.base_model = cfg.base_model
|
||||
self.model_type = cfg.type_of_model
|
||||
|
||||
# Init model config
|
||||
self.model_config = load_model_config(cfg)
|
||||
self.auto_model_loader = AutoModelForCausalLM # pylint: disable=invalid-name
|
||||
|
||||
# Initialize the patch manager
|
||||
self.patch_manager = PatchManager(
|
||||
cfg=cfg,
|
||||
model_config=self.model_config,
|
||||
inference=inference,
|
||||
)
|
||||
|
||||
@cached_property
|
||||
def has_flash_attn(self) -> bool:
|
||||
"""Check if flash attention is installed."""
|
||||
return find_spec("flash_attn") is not None
|
||||
|
||||
@cached_property
|
||||
def qlora_fsdp(self):
|
||||
"""Property that determines if FSDP with QLoRA is enabled."""
|
||||
return self.cfg.fsdp and self.cfg.adapter == "qlora"
|
||||
|
||||
def load(self) -> tuple[PreTrainedModel, PeftConfig | None]:
|
||||
"""Load and prepare the model with all configurations and patches.
|
||||
|
||||
Returns:
|
||||
A tuple with the loaded model and its LoRA configuration (if applicable).
|
||||
"""
|
||||
# Initial setup and patches
|
||||
self.patch_manager.apply_pre_model_load_patches()
|
||||
self._apply_pre_model_load_setup()
|
||||
|
||||
# Build the model
|
||||
PLUGIN_MANAGER.pre_model_load(self.cfg)
|
||||
skip_move_to_device = self._build_model()
|
||||
PLUGIN_MANAGER.post_model_build(self.cfg, self.model)
|
||||
|
||||
# Post-build model configuration
|
||||
self._apply_post_model_load_setup()
|
||||
|
||||
# Load adapters (LoRA, etc.)
|
||||
PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
|
||||
lora_config = self._load_adapters()
|
||||
PLUGIN_MANAGER.post_lora_load(self.cfg, self.model)
|
||||
|
||||
# Apply remaining patches and finalize
|
||||
self._apply_post_lora_load_setup(skip_move_to_device)
|
||||
self.patch_manager.apply_post_model_load_patches(self.model)
|
||||
PLUGIN_MANAGER.post_model_load(self.cfg, self.model)
|
||||
|
||||
return self.model, lora_config
|
||||
|
||||
def _apply_pre_model_load_setup(self):
|
||||
"""Apply patches and setup configurations before model loading."""
|
||||
self._set_auto_model_loader()
|
||||
self._set_device_map_config()
|
||||
if self.cfg.revision_of_model:
|
||||
self.model_kwargs["revision"] = self.cfg.revision_of_model
|
||||
self._set_quantization_config()
|
||||
self._set_attention_config()
|
||||
|
||||
def _apply_post_model_load_setup(self):
|
||||
"""Configure the model after it has been loaded."""
|
||||
# Handle PeftModel if needed
|
||||
if (
|
||||
isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
|
||||
and not self.qlora_fsdp
|
||||
):
|
||||
self.model = self.model.merge_and_unload()
|
||||
|
||||
self._resize_token_embeddings()
|
||||
self._adjust_model_config()
|
||||
self._log_memory_usage()
|
||||
self._configure_embedding_dtypes()
|
||||
|
||||
def _resize_token_embeddings(self):
|
||||
"""Resize token embeddings if needed."""
|
||||
embeddings_len = (
|
||||
math.ceil(len(self.tokenizer) / 32) * 32
|
||||
if self.cfg.resize_token_embeddings_to_32x
|
||||
else len(self.tokenizer)
|
||||
)
|
||||
if hasattr(self.model, "get_input_embeddings") and (
|
||||
self.model.get_input_embeddings().num_embeddings < embeddings_len
|
||||
or (
|
||||
self.model.get_input_embeddings().num_embeddings > embeddings_len
|
||||
and self.cfg.shrink_embeddings
|
||||
)
|
||||
):
|
||||
resize_kwargs = {}
|
||||
if self.cfg.mean_resizing_embeddings is not None and (
|
||||
self.model_config.model_type != "llava"
|
||||
):
|
||||
resize_kwargs["mean_resizing"] = self.cfg.mean_resizing_embeddings
|
||||
self.model.resize_token_embeddings(embeddings_len, **resize_kwargs)
|
||||
else:
|
||||
self.model.tie_weights()
|
||||
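A quick worked example of the 32x rounding used above (the vocabulary size is made up):

```python
import math

vocab_size = 128_259  # hypothetical len(tokenizer)
embeddings_len = math.ceil(vocab_size / 32) * 32
print(embeddings_len)  # 128288, the next multiple of 32
```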
|
||||
def _adjust_model_config(self):
|
||||
if (
|
||||
hasattr(self.model, "config")
|
||||
and hasattr(self.model.config, "max_position_embeddings")
|
||||
and self.model.config.max_position_embeddings
|
||||
and self.cfg.sequence_len > self.model.config.max_position_embeddings
|
||||
):
|
||||
LOG.warning(
|
||||
"increasing model.config.max_position_embeddings from "
|
||||
f"{self.model.config.max_position_embeddings} to {self.cfg.sequence_len}"
|
||||
)
|
||||
self.model.config.max_position_embeddings = self.cfg.sequence_len
|
||||
|
||||
if (
|
||||
hasattr(self.model, "config")
|
||||
and hasattr(self.model.config, "bos_token_id")
|
||||
and self.model.config.bos_token_id
|
||||
and self.model.config.bos_token_id != self.tokenizer.bos_token_id
|
||||
):
|
||||
self.model.config.bos_token_id = self.tokenizer.bos_token_id
|
||||
|
||||
if (
|
||||
hasattr(self.model, "config")
|
||||
and hasattr(self.model.config, "eos_token_id")
|
||||
and self.model.config.eos_token_id
|
||||
and self.model.config.eos_token_id != self.tokenizer.eos_token_id
|
||||
):
|
||||
self.model.config.eos_token_id = self.tokenizer.eos_token_id
|
||||
|
||||
def _log_memory_usage(self):
|
||||
"""Log device memory usage after model load."""
|
||||
if hasattr(self.model, "device") and self.model.device.type in (
|
||||
"cuda",
|
||||
"mps",
|
||||
"npu",
|
||||
):
|
||||
log_gpu_memory_usage(LOG, "after model load", self.model.device)
|
||||
|
||||
def _configure_embedding_dtypes(self):
|
||||
"""Configure embedding module dtypes."""
|
||||
# Get embedding modules
|
||||
embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)
|
||||
|
||||
# Initial dtype conversion
|
||||
if not self.cfg.fsdp:
|
||||
# We don't run this during FSDP because this will leave mixed and bfloat16
|
||||
# dtypes in the model which FSDP doesn't like
|
||||
if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast:
|
||||
embedding_modules = []
|
||||
self._convert_embedding_modules_dtype(
|
||||
embedding_modules,
|
||||
dist_dtype=torch.float32,
|
||||
before_kbit_train_or_finetune=True,
|
||||
)
|
||||
|
||||
# Handle DeepSpeed Zero3
|
||||
if is_deepspeed_zero3_enabled():
|
||||
self._set_z3_leaf_modules()
|
||||
|
||||
# Apply gradient checkpointing if needed
|
||||
needs_fa2_dtype = self.cfg.adapter or self.cfg.fsdp
|
||||
if self.cfg.adapter in ["lora", "qlora"]:
|
||||
needs_fa2_dtype = True
|
||||
if self.cfg.gradient_checkpointing:
|
||||
self.model.gradient_checkpointing_enable(
|
||||
gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
|
||||
)
|
||||
|
||||
self._prepare_model_for_quantization()
|
||||
|
||||
# Convert dtypes if needed
|
||||
should_convert = (
|
||||
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so
|
||||
# we need to convert them back to fp16/bf16 for flash-attn compatibility.
|
||||
(
|
||||
(needs_fa2_dtype or self.cfg.flash_attention or self.cfg.flex_attention)
|
||||
and not self.qlora_fsdp
|
||||
)
|
||||
# CCE requires embedding layers to be in fp16/bf16 for backward pass
|
||||
or self.cfg.cut_cross_entropy
|
||||
)
|
||||
|
||||
if should_convert:
|
||||
LOG.info("Converting modules to %s", self.cfg.torch_dtype)
|
||||
        self._convert_embedding_modules_dtype(
            embedding_modules=embedding_modules,
            dist_dtype=self.cfg.torch_dtype,
            before_kbit_train_or_finetune=False,
        )

    def _load_adapters(self) -> PeftConfig | None:
        """Load LoRA or other adapters."""
        # Load LoRA or adapter
        lora_config = None
        if not self.reference_model or self.cfg.lora_model_dir:
            # If we're not loading the reference model, then we're loading the model
            # for training. Then, the DPO trainer doesn't want the PEFT model loaded
            # over it, it just wants the LoRA / PEFT config.
            if (
                self.cfg.adapter
                and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
                and not self.cfg.merge_lora
            ):
                _, lora_config = load_lora(
                    self.model, self.cfg, inference=False, config_only=True
                )
            else:
                self.model, lora_config = load_adapter(
                    self.model, self.cfg, self.cfg.adapter
                )

        return lora_config

    def _apply_post_lora_load_setup(self, skip_move_to_device: bool):
        """Apply final optimizations and patches."""
        # Place model on accelerator
        if (
            self.cfg.ddp
            and not self.cfg.load_in_8bit
            and not (self.cfg.rl and self.cfg.load_in_4bit)
            and not skip_move_to_device
        ):
            # TODO: validate this conditional
            self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}")

        if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
            self.model.is_parallelizable = True
            self.model.model_parallel = True

        if not any(
            param.requires_grad
            for _, param in self.model.named_parameters(recurse=True)
        ):
            LOG.warning("There are no parameters that require gradient updates")

        if self.cfg.flash_optimum:
            from optimum.bettertransformer import BetterTransformer

            self.model = BetterTransformer.transform(self.model)

        if self.cfg.adapter is not None:
            log_gpu_memory_usage(LOG, "after adapters", self.model.device)

        for _ in range(3):
            gc.collect()
            torch.cuda.empty_cache()

    def _set_auto_model_loader(self):
        """Set `self.auto_model_loader`. Defaults to `transformers.AutoModelForCausalLM`
        (set at `__init__`). When using a multimodal model, `self.auto_model_loader`
        should be set according to the type of the model.
        """
        if self.cfg.is_multimodal:
            self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
                self.model_config.model_type, AutoModelForVision2Seq
            )

    def _set_device_map_config(self):
        """Setup `device_map` according to config"""
        device_map = self.cfg.device_map
        max_memory = self.cfg.max_memory

        if self.cfg.gpu_memory_limit:
            gpu_memory_limit = (
                str(self.cfg.gpu_memory_limit) + "GiB"
                if isinstance(self.cfg.gpu_memory_limit, int)
                else self.cfg.gpu_memory_limit
            )

            max_memory = {}
            num_device = get_device_count()
            for i in range(num_device):
                max_memory[i] = gpu_memory_limit
            max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything

        if max_memory is not None:
            # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
            from accelerate import infer_auto_device_map

            with init_empty_weights():
                model_canvas = self.auto_model_loader.from_config(
                    self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                )
            model_canvas.tie_weights()
            device_map = infer_auto_device_map(
                model_canvas,
                max_memory=max_memory,
                dtype=self.cfg.torch_dtype,
            )
            # We can discard max_memory now as we have a device map set up
            max_memory = None

        self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype

        if not is_deepspeed_zero3_enabled():
            self.model_kwargs["device_map"] = device_map

            cur_device = get_device_type()
            if "mps" in str(cur_device):
                self.model_kwargs["device_map"] = "mps:0"
            elif "npu" in str(cur_device):
                self.model_kwargs["device_map"] = "npu:0"

        # TODO: can we put the reference model on its own gpu? I think we have to move
        # logits around to calculate loss
        # if cfg.rl:
        #     if torch.cuda.device_count() > 1:
        #         if reference_model:
        #             model_kwargs["device_map"] = "cuda:" + str(
        #                 torch.cuda.current_device() + 1
        #             )
        #         else:
        #             model_kwargs["device_map"] = "cuda:" + str(torch.cuda.current_device())

    def _set_quantization_config(self):
        """Set up quantization config (bitsandbytes, awq, gptq, etc.)"""
        self.model_kwargs["load_in_8bit"] = self.cfg.load_in_8bit
        self.model_kwargs["load_in_4bit"] = self.cfg.load_in_4bit

        if self.cfg.gptq:
            if not hasattr(self.model_config, "quantization_config"):
                LOG.warning(
                    "model config does not contain quantization_config information"
                )
            else:
                if self.cfg.gptq_disable_exllama is not None:
                    self.model_config.quantization_config["disable_exllama"] = (
                        self.cfg.gptq_disable_exllama
                    )
                self.model_kwargs["quantization_config"] = GPTQConfig(
                    **self.model_config.quantization_config
                )
        if (
            self.cfg.adapter in ["qlora", "lora"]
            and hasattr(self.model_config, "quantization_config")
            and self.model_config.quantization_config["quant_method"]
            in ["gptq", "awq", "bitsandbytes"]
        ):
            if self.model_config.quantization_config["quant_method"] == "gptq":
                self.model_kwargs["quantization_config"] = GPTQConfig(
                    **self.model_config.quantization_config
                )
            elif self.model_config.quantization_config["quant_method"] == "awq":
                self.model_kwargs["quantization_config"] = AwqConfig(
                    **self.model_config.quantization_config
                )
            elif (
                self.model_config.quantization_config["quant_method"] == "bitsandbytes"
            ):
                self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    **self.model_config.quantization_config
                )
        elif self.cfg.adapter == "qlora" and self.model_kwargs["load_in_4bit"]:
            bnb_config = {
                "load_in_4bit": True,
                "llm_int8_threshold": 6.0,
                "llm_int8_has_fp16_weight": False,
                "bnb_4bit_compute_dtype": self.cfg.torch_dtype,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_quant_storage": torch.bfloat16,
            }
            if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
                self.cfg.deepspeed or self.cfg.fsdp
            ):
                # for some reason, this causes the loss to be off by an order of magnitude
                # but deepspeed needs this still in bfloat16
                bnb_config["bnb_4bit_quant_storage"] = torch.float32

            if self.cfg.bnb_config_kwargs:
                bnb_config.update(self.cfg.bnb_config_kwargs)

            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                **bnb_config,
            )
        elif self.cfg.adapter == "lora" and self.model_kwargs["load_in_8bit"]:
            bnb_config = {
                "load_in_8bit": True,
            }
            # Exclude mamba blocks from int8 quantization for jamba
            if self.cfg.model_config_type == "jamba":
                bnb_config["llm_int8_skip_modules"] = ["mamba"]
            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                **bnb_config,
            )

        # no longer needed per https://github.com/huggingface/transformers/pull/26610
        if "quantization_config" in self.model_kwargs or self.cfg.gptq:
            self.model_kwargs.pop("load_in_8bit", None)
            self.model_kwargs.pop("load_in_4bit", None)

    def _set_attention_config(self):
        """Sample packing uses custom FA2 patch"""
        if self.cfg.flex_attention:
            self.model_kwargs["attn_implementation"] = "flex_attention"
            self.model_config._attn_implementation = (  # pylint: disable=protected-access
                "flex_attention"
            )

        elif self.cfg.flash_attention:
            if not self.cfg.sample_packing and self.cfg.s2_attention:
                pass
            self.model_kwargs["attn_implementation"] = "flash_attention_2"
            self.model_config._attn_implementation = (  # pylint: disable=protected-access
                "flash_attention_2"
            )
        elif self.cfg.sdp_attention:
            self.model_kwargs["attn_implementation"] = "sdpa"
            self.model_config._attn_implementation = (  # pylint: disable=protected-access
                "sdpa"
            )
        elif self.cfg.eager_attention:
            self.model_kwargs["attn_implementation"] = "eager"
            self.model_config._attn_implementation = (  # pylint: disable=protected-access
                "eager"
            )

        if self.cfg.low_cpu_mem_usage:
            self.model_kwargs["low_cpu_mem_usage"] = True

    def _configure_zero3_memory_efficient_loading(self):
        """Set the deepspeed config to load the model into RAM first before moving
        to VRAM.

        We need to return `hf_ds_cfg` as it needs to exist before model loading.
        """
        hf_ds_cfg = None

        if os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
            hf_ds_cfg = HfTrainerDeepSpeedConfig(self.cfg.deepspeed)
            hf_ds_cfg.fill_match(
                "train_micro_batch_size_per_gpu", self.cfg.micro_batch_size
            )
            hf_ds_cfg.fill_match(
                "gradient_accumulation_steps", self.cfg.gradient_accumulation_steps
            )
            hf_ds_cfg.fill_match(
                "train_batch_size",
                int(os.getenv("WORLD_SIZE", "1"))
                * self.cfg.micro_batch_size
                * self.cfg.gradient_accumulation_steps,
            )
            if "device_map" in self.model_kwargs:
                del self.model_kwargs["device_map"]

            transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: True
            transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = (
                lambda: True
            )

        return hf_ds_cfg

    def _build_model(self) -> bool:
        """Load model, with load strategy depending on config."""
        skip_move_to_device = False
        if (
            self.qlora_fsdp
            and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
            and (
                self.cfg.model_config_type == "dbrx"
                or self.cfg.qlora_sharded_model_loading
            )
        ):
            quant_storage = self.cfg.torch_dtype
            quantization_config = getattr(
                self.model_config, "quantization_config", None
            )
            quantization_config = (
                quantization_config or self.model_kwargs["quantization_config"]
            )
            self.model = load_sharded_model_quant(
                self.base_model,
                self.model_config,
                self.cfg,
                quant_storage=quant_storage,
                quantization_config=quantization_config,
            )
            skip_move_to_device = True
        elif (
            self.model_config.model_type in ["llama", "llama4"]
            and not self.cfg.trust_remote_code
            and not self.cfg.gptq
        ):
            # TODO: Do we need to open this up for all models?
            if self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading:
                skip_move_to_device = True
                if "device_map" in self.model_kwargs:
                    del self.model_kwargs["device_map"]

            self._configure_zero3_memory_efficient_loading()

            # Load model with random initialization if specified
            if self.cfg.random_init_weights:
                # AutoModel classes support the from_config method
                if self.auto_model_loader in [
                    AutoModelForCausalLM,
                    AutoModelForVision2Seq,
                ]:
                    self.model = self.auto_model_loader.from_config(
                        config=self.model_config,
                    )
                else:
                    self.model = self.auto_model_loader(config=self.model_config)
            else:
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    **self.model_kwargs,
                )
        elif self.model_type == "MambaLMHeadModel":
            # FIXME this is janky at best and hacked together to make it work
            MambaLMHeadModel = fix_mamba_attn_for_loss()  # pylint: disable=invalid-name

            self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
            self.model_kwargs["device"] = torch.cuda.current_device()
            self.model_kwargs.pop("torch_dtype", None)
            self.model_kwargs.pop("device_map", None)

            self.model = MambaLMHeadModel.from_pretrained(
                self.base_model,
                **self.model_kwargs,
            )
        elif (
            self.model_type
            and self.model_type != "AutoModelForCausalLM"
            and not self.cfg.trust_remote_code
        ):
            if self.cfg.gptq:
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                    **self.model_kwargs,
                )
            else:
                self.model = getattr(transformers, self.model_type).from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                    **self.model_kwargs,
                )
        else:
            if self.cfg.gptq:
                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                    **self.model_kwargs,
                )
            else:
                if (
                    self.cfg.fsdp
                    and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading
                ):
                    # disabling either of these two still leads to VRAM spike before setting back down
                    skip_move_to_device = True
                    if "device_map" in self.model_kwargs:
                        del self.model_kwargs["device_map"]

                self._configure_zero3_memory_efficient_loading()

                self.model = self.auto_model_loader.from_pretrained(
                    self.base_model,
                    config=self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                    **self.model_kwargs,
                )
        if is_deepspeed_zero3_enabled():
            skip_move_to_device = True

        return skip_move_to_device

    def _set_z3_leaf_modules(self):
        from deepspeed.utils import set_z3_leaf_modules

        if self.cfg.model_config_type in MOE_ARCH_BLOCK:
            moe_blocks = MOE_ARCH_BLOCK[self.cfg.model_config_type]
            moe_blocks = [moe_blocks] if isinstance(moe_blocks, str) else moe_blocks
            set_z3_leaf_modules(
                self.model,
                [
                    get_module_class_from_name(self.model, module_name)
                    for module_name in moe_blocks
                ],
            )

    def _prepare_model_for_quantization(self):
        """Prepare loaded model for quantization."""
        skip_prepare_model_for_kbit_training = False
        if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
            # Qwen doesn't play nicely with LoRA if this is enabled
            skip_prepare_model_for_kbit_training = True

        loftq_bits = (
            self.cfg.peft
            and self.cfg.peft.loftq_config
            and self.cfg.peft.loftq_config.loftq_bits
        )
        if self.cfg.adapter == "lora" and loftq_bits:
            skip_prepare_model_for_kbit_training = True

        if (
            self.qlora_fsdp
            or (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
            or is_deepspeed_zero3_enabled()
        ):
            # Make sure everything is in the same dtype
            skip_prepare_model_for_kbit_training = True

        if (
            not skip_prepare_model_for_kbit_training
            and self.cfg.adapter in ["lora", "qlora"]
            and (self.cfg.load_in_8bit or self.cfg.load_in_4bit)
        ):
            LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
            self.model = prepare_model_for_kbit_training(
                self.model, use_gradient_checkpointing=self.cfg.gradient_checkpointing
            )

    def _convert_embedding_modules_dtype(
        self,
        embedding_modules: list[str],
        dist_dtype: torch.dtype,
        before_kbit_train_or_finetune: bool,
    ):
        for name, module in self.model.named_modules():
            if "norm" in name:
                module.to(dist_dtype)
            if before_kbit_train_or_finetune:
                if name.endswith(".gate"):
                    module.to(dist_dtype)
                if self.model_config.model_type == "btlm":
                    # don't upcast lm_head for btlm
                    continue
            if any(m in name for m in embedding_modules) and hasattr(module, "weight"):
                module.to(dist_dtype)
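
To make the `gpu_memory_limit` -> `max_memory` -> `infer_auto_device_map` flow in `_set_device_map_config` concrete, here is a minimal standalone sketch; the "gpt2" model id and the 20 GiB limit are placeholders rather than values used by this commit.

# Standalone sketch of the device-map logic above (illustrative only).
# Assumes `accelerate` and `transformers` are installed.
import torch
from accelerate import infer_auto_device_map, init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM

gpu_memory_limit = 20  # analogous to cfg.gpu_memory_limit (int -> "20GiB")
max_memory = {i: f"{gpu_memory_limit}GiB" for i in range(torch.cuda.device_count())}
max_memory["cpu"] = "256GiB"  # large enough to absorb any overflow

config = AutoConfig.from_pretrained("gpt2")  # placeholder model id
with init_empty_weights():
    # build an empty "canvas" model so no real weights are allocated
    model_canvas = AutoModelForCausalLM.from_config(config)
model_canvas.tie_weights()

device_map = infer_auto_device_map(
    model_canvas, max_memory=max_memory, dtype=torch.float16
)
print(device_map)  # e.g. {'': 0} on a single sufficiently large GPU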
src/axolotl/loaders/patch_manager.py (new file)
@@ -0,0 +1,380 @@
"""Patch manager class implementation to complement `axolotl.loaders.ModelLoader`.

Applies pre- and post-model load patches for various fixes and optimizations.
"""

import importlib.util
import logging
from functools import cached_property

import addict
import transformers
from transformers import PretrainedConfig, PreTrainedModel

from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.multipack import (
    SUPPORTED_MULTIPACK_MODEL_TYPES,
    patch_for_multipack,
)
from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()


class PatchManager:
    """Manages the application of patches during the model loading process."""

    def __init__(
        self,
        cfg: DictDefault,
        model_config: PretrainedConfig | addict.Dict,
        inference: bool = False,
    ):
        """Initialize the `PatchManager`.

        Args:
            cfg: Configuration dictionary with model and training settings.
            model_config: Configuration object for the model.
            inference: Whether the model is being loaded for inference mode.
        """
        self.cfg = cfg
        self.model_config = model_config
        self.inference = inference

    @cached_property
    def has_flash_attn(self) -> bool:
        """Check if flash attention is installed."""
        return importlib.util.find_spec("flash_attn") is not None

    def apply_pre_model_load_patches(self):
        """Apply pre-model load patches based on config."""
        self._apply_flash_attention_patches()
        self._apply_fsdp_patches()
        self._apply_adapter_patches()
        self._apply_flex_attention_patches()
        self._apply_model_specific_patches()
        self._apply_fp8_patches()
        self._apply_flash_attention_peft_patches()
        self._apply_gradient_checkpointing_patches()
        self._patch_attention()
        self._apply_multipack_patches()
        self._patch_llama_derived_model()
        self._apply_mistral_cross_entropy_patch()
        self._apply_unsloth_self_attention_patch()

    def apply_post_model_load_patches(self, model: PreTrainedModel):
        """Apply patches that require the model instance."""
        self._apply_llama_flash_attn_patches(model)
        self._apply_unsloth_patches(model)
        self._apply_lora_kernel_patch(model)

    def _apply_flash_attention_patches(self):
        """Apply patches related to Flash Attention."""
        if self.cfg.xformers_attention and self.cfg.sample_packing:
            from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2

            patch_xformers_attn_over_fa2()
            self.cfg.flash_attention = True

    def _apply_fsdp_patches(self):
        """Apply patches for FSDP configurations."""
        if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2":
            from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils

            patch_accelerate_fsdp_utils()

    def _apply_adapter_patches(self):
        """Apply patches for adapter configurations."""
        if self.cfg.adapter and self.cfg.embeddings_skip_upcast:
            from axolotl.monkeypatch.peft.utils import patch_peft_prep_code

            patch_peft_prep_code()

    def _apply_flex_attention_patches(self):
        """Apply patches for flex attention."""
        if self.cfg.flex_attention:
            from axolotl.monkeypatch.attention.flex_attn import (
                patch_flex_make_mask,
                patch_flex_wrapper,
            )

            flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
            patch_flex_wrapper(**flex_attn_compile_kwargs)
            patch_flex_make_mask()

    def _apply_model_specific_patches(self):
        """Apply patches specific to model architectures."""
        if (
            self.cfg.model_config_type == "llama4"
            and self.cfg.llama4_linearized_experts
        ):
            from axolotl.monkeypatch.models.llama4.modeling import (
                patch_llama4_linearized_modeling,
            )

            patch_llama4_linearized_modeling()

        if self.cfg.model_config_type == "gemma3":
            from axolotl.monkeypatch.gemma3 import (
                patch_gemma3conditionalgeneration_forward,
            )

            patch_gemma3conditionalgeneration_forward()

    def _apply_fp8_patches(self):
        """Apply patches for FP8 support."""
        if self.cfg.fp8:
            from axolotl.monkeypatch.trainer_accelerator_args import (
                patch_create_accelerate_code_for_fp8,
            )

            patch_create_accelerate_code_for_fp8()

    def _apply_flash_attention_peft_patches(self):
        """Apply patches for Flash Attention with PEFT."""
        if self.cfg.adapter:
            from axolotl.monkeypatch.transformers_fa_utils import (
                patch_fa_peft_integration,
            )

            patch_fa_peft_integration()

    def _apply_gradient_checkpointing_patches(self):
        """Apply patches for gradient checkpointing."""
        if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
            from axolotl.monkeypatch.gradient_checkpointing import (
                hf_grad_checkpoint_offload_wrapper,
            )

            transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
        if self.cfg.gradient_checkpointing == "offload_disk":
            from axolotl.monkeypatch.gradient_checkpointing import (
                hf_grad_checkpoint_disk_offload_wrapper,
            )

            transformers.modeling_utils.checkpoint = (
                hf_grad_checkpoint_disk_offload_wrapper
            )

    def _apply_mistral_cross_entropy_patch(self):
        """Apply Mistral cross entropy patch if configured."""
        if (
            self.cfg.model_config_type == "mistral"
            and self.cfg.flash_attn_cross_entropy_loss
        ):
            from axolotl.monkeypatch.mistral_attn_hijack_flash import (
                patch_mistral_cross_entropy,
            )

            patch_mistral_cross_entropy()

    def _apply_unsloth_self_attention_patch(self):
        """Apply Unsloth self-attention patches if configured."""
        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
            from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora

            patch_self_attn_lora(self.cfg)

    def _apply_multipack_patches(self):
        """Apply multipack patches if necessary."""
        if (
            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
            and (self.cfg.flash_attention or self.cfg.flex_attention)
            and self.cfg.sample_packing
        ):
            # Get automap config if it exists
            auto_map_config = None
            if isinstance(self.model_config, dict) and "auto_map" in self.model_config:
                auto_map_config = self.model_config["auto_map"]
            elif hasattr(self.model_config, "auto_map"):
                auto_map_config = self.model_config.auto_map

            # Determine if the model has remote code
            if auto_map_config is not None:
                has_remote_code = "AutoModelForCausalLM" in auto_map_config
            else:
                has_remote_code = False

            if has_remote_code and self.cfg.trust_remote_code is False:
                # If explicitly set in YAML, prefer that
                has_remote_code = self.cfg.trust_remote_code

            patch_for_multipack(
                self.cfg.model_config_type,
                model_name=self.cfg.base_model,
                has_remote_code=has_remote_code,
            )

            if self.cfg.is_llama_derived_model:
                self._patch_loss_llama()

    def _patch_attention(self):
        """Apply attention-specific patches based on model type."""
        if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")):
            return

        if self.model_config.model_type == "mllama" and self.cfg.flash_attention:
            from axolotl.monkeypatch.attention.mllama import patch_mllama

            patch_mllama()

        if self.model_config.model_type == "btlm":
            from axolotl.monkeypatch.btlm_attn_hijack_flash import (
                replace_btlm_attn_with_flash_attn,
            )

            replace_btlm_attn_with_flash_attn(self.cfg.base_model)

        if self.model_config.model_type == "stablelm_epoch" and self.cfg.sample_packing:
            from axolotl.monkeypatch.stablelm_attn_hijack_flash import (
                replace_stablelm_attn_with_flash_attn,
            )

            replace_stablelm_attn_with_flash_attn(self.cfg.base_model)

    def _patch_loss_llama(self):
        """Patch loss functions and other optimizations for LLaMA models."""
        if self.cfg.flash_attn_cross_entropy and self.has_flash_attn:
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                patch_fa_llama_cross_entropy,
            )

            patch_fa_llama_cross_entropy()
        elif self.cfg.unsloth_cross_entropy_loss:
            from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch

            integrate_cross_entropy_loss_patch(model_type="llama")

        if self.cfg.flash_attn_rms_norm and self.has_flash_attn:
            from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm

            patch_llama_rms_norm()
        elif self.cfg.unsloth_rms_norm:
            from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm

            patch_unsloth_layernorm()

        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
            from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora

            patch_self_attn_lora()

    def _patch_llama_flash_attention(self, packed=False):
        """Apply Flash Attention patches for LLaMA models."""
        from axolotl.monkeypatch.llama_attn_hijack_flash import (
            replace_llama_attn_with_flash_attn,
        )

        if packed:
            if self.cfg.device not in ["mps", "cpu"] and not self.inference:
                LOG.info("patching with flash attention for sample packing")
                replace_llama_attn_with_flash_attn(
                    packed=True,
                    cross_entropy=self.cfg.flash_attn_cross_entropy,
                    rms_norm=self.cfg.flash_attn_rms_norm,
                )
        elif self.cfg.s2_attention:
            LOG.info("patching w/ flash-enabled, shifted-sparse attention")
            replace_llama_attn_with_flash_attn(
                packed=False,
                cross_entropy=self.cfg.flash_attn_cross_entropy,
                rms_norm=self.cfg.flash_attn_rms_norm,
                use_shifted_sparse_attn=True,
            )
        elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm:
            replace_llama_attn_with_flash_attn(
                packed=False,
                cross_entropy=self.cfg.flash_attn_cross_entropy,
                rms_norm=self.cfg.flash_attn_rms_norm,
            )

    def _patch_llama_xformers_attention(self):
        """Apply xformers attention patches for LLaMA models."""
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_attention,
        )

        LOG.info("Patching with xformers attention...")
        hijack_llama_attention()

    def _patch_llama_sample_packing(self):
        """Apply sample packing patches for LLaMA models."""
        from axolotl.monkeypatch.llama_patch_multipack import (
            hijack_llama_prepare_4d_mask,
        )

        LOG.info("Patching llama _prepare_4d_causal_attention_mask*...")
        hijack_llama_prepare_4d_mask()

    def _patch_llama_derived_model(self):
        """Modify all llama derived models in one block."""
        if self.cfg.is_llama_derived_model and not (
            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
            and (self.cfg.flash_attention or self.cfg.flex_attention)
            and self.cfg.sample_packing
        ):
            self._patch_loss_llama()

            if self.cfg.flash_attention:
                self._patch_llama_flash_attention(packed=self.cfg.sample_packing)
            elif self.cfg.xformers_attention:
                self._patch_llama_xformers_attention()
            elif self.cfg.sample_packing:
                self._patch_llama_sample_packing()
            elif self.cfg.s2_attention:
                raise NotImplementedError(
                    "Shifted-sparse attention not currently implemented without flash attention."
                )

    def _apply_llama_flash_attn_patches(self, model):
        """Apply LLaMA-specific flash attention patches."""
        if (
            self.model_config.model_type in ["llama", "llama4"]
            and not self.cfg.trust_remote_code
            and not self.cfg.gptq
            and self.cfg.flash_attention
            and not self.inference
        ):
            # TODO(MengqingCao): split these patches separately
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                is_xformers_swiglu_available,
                replace_llama_mlp_with_swiglu,
                replace_llama_qkv_with_fused,
            )

            if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
                LOG.info("Patching with SwiGLU...")
                replace_llama_mlp_with_swiglu(model)

            if self.cfg.flash_attn_fuse_qkv:
                LOG.info("Patching with fused QKV...")
                replace_llama_qkv_with_fused(model)

    def _apply_unsloth_patches(self, model):
        """Apply unsloth optimization patches."""
        if self.cfg.unsloth_lora_mlp:
            from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch

            integrate_lora_mlp_patch(peft_model=model)

        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
            from axolotl.monkeypatch.unsloth_ import integrate_lora_patch

            integrate_lora_patch(peft_model=model, cfg=self.cfg)

        if self.cfg.unsloth_rope:
            from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings

            integrate_rope_embeddings()

    def _apply_lora_kernel_patch(self, model):
        """Apply LoRA kernel patches."""
        if (
            self.cfg.lora_mlp_kernel
            or self.cfg.lora_qkv_kernel
            or self.cfg.lora_o_kernel
        ):
            from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches

            apply_lora_kernel_patches(model=model, cfg=self.cfg)
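
For the intended call order, here is a minimal sketch of driving `PatchManager` around a model load; the exact wiring inside `ModelLoader` may differ, and the model id and config values below are placeholders, not part of this commit.

# Illustrative only: apply pre-load patches, load the model, then apply the
# patches that need the instantiated model. Config keys mirror the checks above.
from transformers import AutoConfig, AutoModelForCausalLM

from axolotl.loaders.patch_manager import PatchManager
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "base_model": "my-org/my-base-model",  # placeholder
        "model_config_type": "llama",
        "flash_attention": True,
    }
)
model_config = AutoConfig.from_pretrained(cfg.base_model)

patch_manager = PatchManager(cfg, model_config, inference=False)
patch_manager.apply_pre_model_load_patches()  # monkeypatches applied before weights load

model = AutoModelForCausalLM.from_pretrained(cfg.base_model, config=model_config)

patch_manager.apply_post_model_load_patches(model)  # patches that need the model instance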
src/axolotl/loaders/processor.py (new file)
@@ -0,0 +1,56 @@
"""Processor loading functionality for multi-modal models"""

import logging
from typing import Any

import transformers
from transformers import (
    AutoProcessor,
    PreTrainedTokenizerBase,
)

from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)


def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
    processor_kwargs: dict[str, Any] = {}  # Do we actually need this?

    processor_cls = AutoProcessor
    if cfg.processor_type:
        processor_cls = getattr(transformers, cfg.processor_type)

    processor = processor_cls.from_pretrained(
        cfg.processor_config,
        trust_remote_code=cfg.trust_remote_code or False,
        tokenizer=tokenizer,
        **processor_kwargs,
    )

    # Attempt to load image size from processor if available
    if (
        cfg.image_size is None
        and hasattr(processor, "size")
        and any(dim in processor.size for dim in ["width", "height"])
    ):
        im_width = None
        im_height = None
        if "width" in processor.size:
            im_width = processor.size["width"]
        if "height" in processor.size:
            im_height = processor.size["height"]

        # If both width and height are set, use a tuple
        if im_width is not None and im_height is not None:
            cfg.image_size = (im_width, im_height)
        # If only width is set, use as integer
        elif im_width is not None:
            cfg.image_size = im_width
        # If only height is set, use as integer
        elif im_height is not None:
            cfg.image_size = im_height

        LOG.debug(f"Loaded image size: {cfg.image_size} from processor")

    return processor
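
A minimal usage sketch for `load_processor`, assuming a multimodal checkpoint; the model id and config values are placeholders.

# Illustrative only: load a multimodal processor alongside its tokenizer.
# `processor_config` normally points at the repo or path providing the processor files.
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "tokenizer_config": "my-org/my-vision-model",   # placeholder
        "processor_config": "my-org/my-vision-model",   # placeholder
        "is_multimodal": True,
        "output_dir": "./outputs",
    }
)

tokenizer = load_tokenizer(cfg)
processor = load_processor(cfg, tokenizer)
print(type(processor).__name__, cfg.image_size)  # image_size may be filled in from the processor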
src/axolotl/loaders/tokenizer.py (new file)
@@ -0,0 +1,281 @@
"""Tokenizer loading functionality and associated utils"""

import json
import logging
import os

import transformers
from transformers import (
    AddedToken,
    AutoTokenizer,
)

from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.distributed import (
    barrier,
    is_local_main_process,
    is_main_process,
)

LOG = logging.getLogger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()


def modify_tokenizer_files(
    tokenizer_path: str, token_mappings: dict[int, str], output_dir: str
) -> str:
    """
    Modify tokenizer files to replace added_tokens strings, save to output directory,
    and return the path to the modified tokenizer.

    This only works with reserved tokens that were added to the tokenizer, not tokens
    already part of the vocab.

    Args:
        tokenizer_path: Path or name of the original tokenizer
        token_mappings: Dict mapping {token_id (int): new_token_string}
        output_dir: Directory to save the modified tokenizer

    Returns:
        Path to the modified tokenizer directory

    Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
    """
    # Create the tokenizer directory in output_dir if it doesn't exist
    tokenizer_dir = os.path.join(output_dir, "tokenizer")
    os.makedirs(tokenizer_dir, exist_ok=True)

    if is_local_main_process():  # pylint: disable=too-many-nested-blocks
        # Load the tokenizer
        temp_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, use_fast=True)

        # Save the tokenizer to the output directory
        temp_tokenizer.save_pretrained(tokenizer_dir)

        # Get the token IDs and map them to their new values
        token_id_mappings = {
            int(token_id): new_value for token_id, new_value in token_mappings.items()
        }

        # 1. Update tokenizer_config.json - added_tokens_decoder
        config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                config_data = json.load(f)

            # Update added_tokens_decoder
            if "added_tokens_decoder" in config_data:
                for token_id, new_value in token_id_mappings.items():
                    token_id_str = str(token_id)
                    if token_id_str in config_data["added_tokens_decoder"]:
                        config_data["added_tokens_decoder"][token_id_str][
                            "content"
                        ] = new_value
                    else:
                        raise ValueError(
                            f"Token ID {token_id_str} not found in added_tokens_decoder"
                        )

            # Write the updated config back
            with open(config_path, "w", encoding="utf-8") as f:
                json.dump(config_data, f, indent=2)

        # 2. Update tokenizer.json - added_tokens
        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
        if os.path.exists(tokenizer_path):
            with open(tokenizer_path, "r", encoding="utf-8") as f:
                tokenizer_data = json.load(f)

            # Update added_tokens
            if "added_tokens" in tokenizer_data:
                for token_id, new_value in token_id_mappings.items():
                    for i, token_entry in enumerate(tokenizer_data["added_tokens"]):
                        if token_entry["id"] == token_id:
                            tokenizer_data["added_tokens"][i]["content"] = new_value
                            break
                    else:
                        # Reaching this section means the token_id was not found in tokenizer.json added_tokens
                        raise ValueError(
                            f"Token ID {token_id} not found in added_tokens"
                        )
            if "model" in tokenizer_data and "vocab" in tokenizer_data["model"]:
                for token_id, new_value in token_id_mappings.items():
                    for entry_val, entry_id in tokenizer_data["model"]["vocab"].items():
                        if entry_id == token_id:
                            del tokenizer_data["model"]["vocab"][entry_val]
                            tokenizer_data["model"]["vocab"][new_value] = token_id
                            break

            # Write the updated tokenizer data back
            with open(tokenizer_path, "w", encoding="utf-8") as f:
                json.dump(tokenizer_data, f, indent=2)

    barrier()
    return tokenizer_dir


def load_tokenizer(cfg):
    """Load and configure the tokenizer based on the provided config."""
    model_config = load_model_config(cfg)
    tokenizer_kwargs = {}
    use_fast = True  # this is the default

    if cfg.tokenizer_use_fast is not None:
        use_fast = cfg.tokenizer_use_fast
    if cfg.tokenizer_legacy is not None:
        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy

    tokenizer_cls = AutoTokenizer
    if cfg.tokenizer_type:
        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

    # Set base tokenizer path
    tokenizer_path = cfg.tokenizer_config

    # Apply token string overrides if specified
    if cfg.added_tokens_overrides:
        # Modify tokenizer files and get path to modified tokenizer
        tokenizer_path = modify_tokenizer_files(
            tokenizer_path, cfg.added_tokens_overrides, output_dir=cfg.output_dir
        )

    tokenizer = tokenizer_cls.from_pretrained(
        tokenizer_path,
        trust_remote_code=cfg.trust_remote_code or False,
        use_fast=use_fast,
        **tokenizer_kwargs,
    )

    if (
        tokenizer.__class__.__name__
        in [
            "LlamaTokenizer",
            "LlamaTokenizerFast",
            "CodeLlamaTokenizer",
            "CodeLlamaTokenizerFast",
        ]
        and hasattr(tokenizer, "pad_token")
        and not tokenizer.pad_token
    ):
        # set a pad_token, but use eos_token so we don't add a new token
        tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN

    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Mistral's official FA implementation requires left padding
    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
        tokenizer.padding_side = "left"

    # Qwen base only has a single token, so we need to set the special tokens
    if cfg.is_qwen_derived_model:
        token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
        for attr_name in token_ids:
            if getattr(tokenizer, attr_name) is None:
                setattr(tokenizer, attr_name, tokenizer.eod_id)

        token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
        for attr_name in token_names:
            if getattr(tokenizer, attr_name) is None:
                setattr(tokenizer, attr_name, "<|endoftext|>")

    additional_special_tokens = None
    if cfg.special_tokens:
        special_tokens = cfg.special_tokens.to_dict()
        additional_special_tokens = special_tokens.pop(
            "additional_special_tokens", None
        )
        lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
        for k, val in special_tokens.items():
            # check that the new special token is not already in the tokenizer and,
            # when adapter training is enabled, that lora_modules_to_save is set
            # pylint: disable=too-many-boolean-expressions
            if (
                (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
                and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
                and cfg.adapter
                and (
                    not cfg.lora_modules_to_save
                    or not all(
                        x in cfg.lora_modules_to_save for x in lora_modules_to_save
                    )
                )
                and k != "pad_token"
            ):
                lora_modules_to_save = ", ".join(
                    [f"`{x}`" for x in lora_modules_to_save]
                )
                raise ValueError(
                    f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
                )

            tokenizer.add_special_tokens(
                {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
            )

        # If we add bos_token and eos_token, we need to update the post processor to
        # handle them correctly.
        # https://github.com/huggingface/transformers/pull/24132
        bos_or_eos_in_special_tokens = (
            "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
        )
        if (
            tokenizer.__class__.__name__
            in (
                "LlamaTokenizerFast",
                "CodeLlamaTokenizerFast",
            )
            and bos_or_eos_in_special_tokens
        ):
            tokenizer.update_post_processor()

    if cfg.tokens:
        tokenizer.add_tokens(
            [
                AddedToken(token, rstrip=False, lstrip=False, normalized=False)
                for token in cfg.tokens
            ]
        )

    # Additional special tokens are a List, and need to be treated differently than regular special
    # tokens. We add them after we have called `add_tokens` in case these additional special tokens
    # are new tokens.
    #
    # Usage:
    #
    # ```py
    # special_tokens:
    #   additional_special_tokens: ["<|im_start|>", "<|im_end|>"]
    # ```
    if additional_special_tokens is not None:
        tokenizer.add_special_tokens(
            {"additional_special_tokens": additional_special_tokens}
        )

    if is_main_process(use_environ=True):
        LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
        LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
        LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
        LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if cfg.chat_template:
        chat_template_string = get_chat_template_from_config(
            cfg=cfg,
            tokenizer=tokenizer,
        )
        if cfg.default_system_message and cfg.chat_template == "chatml":
            chat_template_string = chat_template_string.replace(
                "You are a helpful assistant.", cfg.default_system_message
            )

        tokenizer.chat_template = chat_template_string
    else:
        LOG.info(
            "No Chat template selected. Consider adding a chat template for easier inference."
        )
    return tokenizer
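
A minimal sketch of the `added_tokens_overrides` path, calling `modify_tokenizer_files` directly; the token ids, replacement strings, and tokenizer path are hypothetical, and the ids must already exist as added tokens in the source tokenizer.

# Illustrative only: rename two reserved/added tokens by id before training.
from transformers import AutoTokenizer

from axolotl.loaders.tokenizer import modify_tokenizer_files

token_mappings = {
    128011: "<|tool_call_start|>",  # hypothetical replacement strings
    128012: "<|tool_call_end|>",
}
tokenizer_dir = modify_tokenizer_files(
    "my-org/my-base-model",  # placeholder tokenizer path
    token_mappings,
    output_dir="./outputs",
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
print(tokenizer.convert_ids_to_tokens([128011, 128012]))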
src/axolotl/loaders/utils.py (new file)
@@ -0,0 +1,211 @@
"""Utilities for axolotl.loaders module"""

import contextlib
import logging
from typing import Type

import addict
import torch
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel

from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)


def get_module_class_from_name(
    module: torch.nn.Module, name: str
) -> Type[torch.nn.Module] | None:
    """Gets a class from a module by its name. Copied from `accelerate.utils.dataclasses`
    (https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L2805).

    Args:
        module: The module to get the class from.
        name: The name of the class.

    Returns:
        The class type of the matching module, or `None` if no match is found.
    """
    modules_children = list(module.children())
    if module.__class__.__name__ == name:
        return module.__class__

    if len(modules_children) == 0:
        return None

    for child_module in modules_children:
        module_class = get_module_class_from_name(child_module, name)
        if module_class is not None:
            return module_class

    return None


def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
    """Validates and adjusts model config based on `axolotl` config.

    This function performs several important checks and adjustments:
    - Disables model caching for better memory efficiency
    - Handles multimodal model-specific configurations
    - Validates quantization settings
    - Ensures proper LoRA configuration when using adapters with new tokens

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        model_config: The model's configuration object from `transformers`.

    Raises:
        ValueError: If a multimodal model lacks text configuration, if GPTQ settings
            are inconsistent, or if LoRA `modules_to_save` is improperly configured
            with new tokens.
    """
    if hasattr(model_config, "use_cache"):
        model_config.use_cache = False

    if cfg.is_multimodal:
        # For multimodal configs, use_cache is set in the text_config
        if hasattr(model_config, "get_text_config"):
            text_config = model_config.get_text_config()
            if hasattr(text_config, "use_cache"):
                text_config.use_cache = False
        else:
            raise ValueError(
                "No text config found for multimodal model. Please raise an Issue with model details."
            )

        # Check if image_size is not set and load image size from model config if available
        if (
            cfg.image_size is None
            and hasattr(model_config, "vision_config")
            and hasattr(model_config.vision_config, "image_size")
        ):
            cfg.image_size = model_config.vision_config.image_size
            LOG.debug(f"Loaded image size: {cfg.image_size} from model config")

    quant_config_exists = (
        hasattr(model_config, "quantization_config")
        and model_config.quantization_config
    )

    # Detect compressed-tensors config
    is_compressed_tensors_config = (
        quant_config_exists
        and model_config.quantization_config.get("quant_method") == "compressed-tensors"
    )

    if is_compressed_tensors_config:
        if model_config.quantization_config.get("config_groups"):
            LOG.warning(
                "Found `config_groups` in a compressed-tensors config. "
                "QAT integration with llmcompressor is not tested."
            )
        # Skip further quant checks for compressed-tensors
        return

    quant_config_method_is_gptq = (
        quant_config_exists
        and "quant_method" in model_config.quantization_config
        and model_config.quantization_config["quant_method"] == "gptq"
    )

    if cfg.gptq and not quant_config_method_is_gptq:
        raise ValueError(
            "model_config.quantization_config is not set or quant_method is not set to gptq. "
            "Please make sure to point to a GPTQ model."
        )

    lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
    if (
        cfg.adapter
        and cfg.tokens
        and (
            not cfg.lora_modules_to_save
            or not all(x in cfg.lora_modules_to_save for x in lora_modules_to_save)
        )
    ):
        lora_modules_to_save_joined = ", ".join(
            map(lambda x: f"`{x}`", lora_modules_to_save)
        )
        raise ValueError(
            "`lora_modules_to_save` not properly set when adding new tokens. "
            f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`."
        )


def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
    """Loads and configures a model configuration from HuggingFace or local sources.

    This function determines the appropriate model config source, loads it, applies any
    necessary overrides, and validates it for compatibility with the `axolotl` config.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        A configured model configuration object (`AutoConfig` instance), or a simple
        dictionary configuration for special cases like Mamba models.

    Raises:
        ValueError: If configuration loading fails for reasons other than special cases
            that are handled (e.g., Mamba models).
    """
    model_config_name = cfg.base_model_config or cfg.base_model
    if not model_config_name and cfg.tokenizer_config:
        model_config_name = cfg.tokenizer_config
    trust_remote_code = cfg.trust_remote_code is True
    config_kwargs = {}
    if cfg.revision_of_model:
        config_kwargs["revision"] = cfg.revision_of_model
    if cfg.num_labels:
        # num_labels is used to initialize classifier models
        config_kwargs["num_labels"] = cfg.num_labels
    try:
        model_config = AutoConfig.from_pretrained(
            model_config_name,
            trust_remote_code=trust_remote_code,
            **config_kwargs,
        )
    except ValueError as error:
        if "mamba" in model_config_name:
            return addict.Dict(
                {
                    "model_type": "mamba",
                }
            )
        raise error

    if cfg.overrides_of_model_config:
        for key, val in cfg.overrides_of_model_config.items():
            setattr(model_config, key, val)

    check_model_config(cfg, model_config)

    return model_config


def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16):
    """Ensures all modules in the model are converted to the specified data type."""
    for name, module in model.named_modules():
        weight_mismatch = False
        with contextlib.suppress(AttributeError):
            weight_mismatch = module.weight.dtype != dtype

        bias_mismatch = False
        with contextlib.suppress(AttributeError):
            bias_mismatch = module.bias.dtype != dtype

        if weight_mismatch:
            print(f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}")
        if bias_mismatch:
            print(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}")
        if weight_mismatch or bias_mismatch:
            module.to(dtype)


def get_linear_embedding_layers(model_type: str) -> list[str]:
    """Returns layer names of linear embeddings needed for LoRA based on model type."""
    if model_type == "gpt_neox":
        return ["embed_in", "embed_out"]
    if model_type == "falcon":
        return ["word_embeddings", "lm_head"]
    return ["embed_tokens", "lm_head"]
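
A minimal usage sketch for `ensure_dtype`; the model id is a placeholder.

# Illustrative only: force any stray fp32 modules (e.g. norms upcast during
# preparation) back to bfloat16 before sharding or training.
import torch
from transformers import AutoModelForCausalLM

from axolotl.loaders.utils import ensure_dtype

model = AutoModelForCausalLM.from_pretrained(
    "my-org/my-base-model",  # placeholder model id
    torch_dtype=torch.bfloat16,
)
ensure_dtype(model, dtype=torch.bfloat16)  # logs and converts any mismatched modules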
@@ -5,10 +5,10 @@ from functools import partial

from packaging import version

from axolotl.utils.gradient_checkpointing.offload_cpu import (
from axolotl.monkeypatch.gradient_checkpointing.offload_cpu import (
    CPU_Offloaded_Gradient_Checkpointer,
)
from axolotl.utils.gradient_checkpointing.offload_disk import (
from axolotl.monkeypatch.gradient_checkpointing.offload_disk import (
    Disco,
)

@@ -75,4 +75,4 @@ def patch_peft_prep_code():
    exec(prep_code, globals())  # pylint: disable=exec-used # nosec B102
    LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
    peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
    axolotl.utils.models.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821
    axolotl.loaders.model.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access # pylint: disable=undefined-variable # noqa: F821

@@ -28,11 +28,15 @@ from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
)
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
from axolotl.integrations.base import PluginManager
from axolotl.loaders import (
    ModelLoader,
    load_processor,
    load_tokenizer,
)
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.trainer import setup_trainer

@@ -76,7 +80,8 @@ def setup_model_and_tokenizer(
        msg += " and peft_config..."
    LOG.debug(msg)

    model, peft_config = load_model(cfg, tokenizer, processor=processor)
    model_loader = ModelLoader(cfg, tokenizer, processor=processor)
    model, peft_config = model_loader.load()
    if model.generation_config is not None:
        model.generation_config.do_sample = True

@@ -113,7 +118,8 @@ def setup_reference_model(
        model_ref = None  # explicit setting to None
    else:
        # load the model again for model_ref/baseline
        model_ref, _ = load_model(cfg, tokenizer, reference_model=True)
        model_loader = ModelLoader(cfg, tokenizer, reference_model=True)
        model_ref, _ = model_loader.load()
    return model_ref

@@ -11,9 +11,10 @@ from transformers.utils.import_utils import is_torch_npu_available

from axolotl.integrations.base import PluginManager
from axolotl.integrations.config import merge_input_args
from axolotl.loaders import MULTIMODAL_AUTO_MODEL_MAPPING
from axolotl.loaders.utils import load_model_config
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import MULTIMODAL_AUTO_MODEL_MAPPING, load_model_config
from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
)

@@ -10,7 +10,7 @@ from torch.utils.hooks import RemovableHandle
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import ModelOutput

from axolotl.monkeypatch.ring_attn.patch import (
from axolotl.monkeypatch.ring_attn import (
    get_ring_attn_group,
    patch_prepare_data_loader,
    patch_prepare_device_mesh,

@@ -10,6 +10,7 @@ import yaml
from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk

from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.loaders import load_tokenizer
from axolotl.prompt_strategies.dpo import load as load_dpo
from axolotl.prompt_strategies.kto import load as load_kto
from axolotl.prompt_strategies.orpo import load as load_orpo

@@ -17,7 +18,6 @@ from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_
from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_main_process, zero_first
from axolotl.utils.models import load_tokenizer
from axolotl.utils.schemas.enums import RLType

LOG = logging.getLogger(__name__)

@@ -1,14 +0,0 @@
"""
helpers for lora embeddings
"""


def get_linear_embedding_layers(model_type):
    """
    returns the linear embedding layers needed for loras, dependent on the model arch
    """
    if model_type == "gpt_neox":
        return ["embed_in", "embed_out"]
    if model_type == "falcon":
        return ["word_embeddings", "lm_head"]
    return ["embed_tokens", "lm_head"]
@@ -470,6 +470,16 @@ class AxolotlInputConfig(

        return data

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_with_s2attn(cls, data):
        if data.get("sample_packing") and data.get("s2_attention"):
            raise ValueError(
                "Received `sample_packing=true` and `s2_attention=true`; however, \
                shifted-sparse attention does not currently support sample packing."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_batch_flattening_fa(cls, data):