Update __init__.py

nit
auto detect tp_size
2025-02-26 00:21:16 -05:00 · 2025-02-26 00:21:16 -05:00 · 2025-02-26 00:21:16 -05:00 · 2025-02-26 00:21:16 -05:00 · 2025-02-26 00:21:16 -05:00 · 2025-02-26 00:21:16 -05:00
25 changed files with 359 additions and 1041 deletions
--- a/README.md
+++ b/README.md
@@ -19,6 +19,9 @@
    <br/>
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
+    <a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
+    <img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMW
ExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
+  </a>
 </p>

 Axolotl is a tool designed to streamline post-training for various AI models.
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -40,7 +40,6 @@ website:

        - section: "Deployments"
          contents:
-            - docs/docker.qmd
            - docs/multi-gpu.qmd
            - docs/multi-node.qmd
            - docs/ray-integration.qmd
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -78,6 +78,9 @@ tf32: true # require >=ampere
 bfloat16: true # require >=ampere
 float16: true

+# Use tensor parallelism
+tensor_parallel: true # require multi-GPU
+
 # Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset
 gpu_memory_limit: 20GiB
 # Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge
@@ -163,12 +166,6 @@ datasets:
      system: ["system"]
      tool: ["tool"]

-    # Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.
-    # This does not drop the default system message from chat_template if it exists. If you wish to,
-    # we recommend using a custom jinja template with the default system message removed or
-    # adding a system turn with empty content.
-    drop_system_message:
-
    # IMPORTANT: The following fields determine which parts of the conversation to train on.
    # Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
    # See examples at `docs/dataset-formats/conversation.qmd`
@@ -228,8 +225,8 @@ process_reward_model:
 chat_template: tokenizer_default
 # custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
 chat_template_jinja: null
-# Changes the default system message. Currently only supports chatml.
-default_system_message: You are a helpful assistant. Please give a long and detailed answer.
+# Changes the default system message
+default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
 # Axolotl attempts to save the dataset as an arrow after packing the data together so
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
@@ -451,7 +448,7 @@ gradient_checkpointing: false
 early_stopping_patience: 3

 # Specify a scheduler and kwargs to use with the optimizer
-lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
+lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
 lr_scheduler_kwargs:
 cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
 cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
@@ -534,8 +531,6 @@ flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
 sdp_attention:
 # Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
 s2_attention:
-# Optional[bool]. Whether to use low_cpu_mem_usage
-low_cpu_mem_usage:
 # Resume from a specific checkpoint dir
 resume_from_checkpoint:
 # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
--- a/docs/dataset-formats/index.qmd
+++ b/docs/dataset-formats/index.qmd
@@ -129,7 +129,6 @@ You can mix and match within each approach or across approaches to train a model
 We suggest this approach when you want to bring your own tokenized dataset.

 Axolotl expects the dataset to have three keys:
-
 - `input_ids`: from tokenizing formatted prompt
 - `attention_mask`: for masking padding. If you don't add padding, it would be equal to `len(input_ids) * [1]`
 - `labels`: this is the same as `input_ids`, however, if you want to mask certain tokens, you would set those indices to `-100`.
--- a/docs/docker.qmd
+++ b/docs/docker.qmd
@@ -1,140 +0,0 @@
---
-title: "Docker"
-format:
-  html:
-    toc: true
-    toc-depth: 4
---
-
-This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
-
-## Base
-
-The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
-
-#### Image
-
-```
-axolotlai/axolotl-base
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
-
-#### Tags format
-
-```bash
-main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
-```
-
-Tags examples:
-
- `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1`
-
-## Main
-
-The main image is the image that is used to run Axolotl. It is based on the `axolotlai/axolotl-base` image and includes the Axolotl codebase, dependencies, and more.
-
-#### Image
-
-```
-axolotlai/axolotl
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
-
-#### Tags format {#sec-main-tags}
-
-```bash
-# on push to main
-main-py{python_version}-cu{cuda_version}-{pytorch_version}
-
-# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
-main-latest
-
-# nightly build
-{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}
-
-# tagged release
-{version}
-```
-
-:::{.callout-tip}
-
-There may be some extra tags appended to the image, like `-vllm` which installs those packages.
-
-:::
-
-Tags examples:
-
- `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1`
- `main-latest`
- `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu124-2.5.1`
- `main-20250303-py3.11-cu124-2.4.1`
- `0.7.1`
-
-## Cloud
-
-The cloud image is the image that is used to run Axolotl in the cloud. It is based on the `axolotlai/axolotl` image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.
-
-:::{.callout-tip}
-
-Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variables to disable it.
-
-:::
-
-#### Image
-
-```
-axolotlai/axolotl-cloud
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
-
-#### Tags format
-
-This uses the same tags as the [`main` image](#sec-main-tags).
-
-#### Environment variables
-
- `JUPYTER_DISABLE`: Disable Jupyter lab.
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
- `PUBLIC_KEY`: Add a public key for the SSH service.
- `SSH_KEY`: Add a private key for the SSH service.
-
-#### Volume mounts
-
-:::{.callout-tip}
-
-We recommend mounting volumes to `/workspace/data` for data persistence. `/workspace/axolotl` contains the source code and is ephemeral.
-
-:::
-
- `/workspace/data/axolotl-artifacts`: Directory to store Axolotl artifacts.
- `/workspace/data/huggingface-cache`: Directory to store HuggingFace cache.
-
-## Cloud-no-tmux
-
-This is the same as the [`cloud` image](#sec-cloud) but without tmux.
-
-#### Image
-
-```
-axolotlai/axolotl-cloud-term
-```
-
-Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud-term)
-
-:::{.callout-note}
-
-The naming may be a bit confusing as it has `-term` appended to the end.
-
-:::
-
-#### Tags format
-
-This uses the same tags as the [`cloud` image](#sec-cloud-tags).
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -19,9 +19,7 @@ description: Frequently asked questions

 **Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

-**Q: ModuleNotFoundError: No module named 'mpi4py' using single GPU with deepspeed**
-
-> A: You may be using deepspeed with single gpu. Please remove the `deepspeed:` section in the yaml file or `--deepspeed` CLI flag.
+> A: You may be using deepspeed with a single GPU. Please remove the `deepspeed:` section in the yaml file or the `--deepspeed` CLI flag.

 **Q: The codes is stuck on saving preprocessed datasets.**

--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -65,8 +65,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
 ```
 :::

-Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
-
 ## Cloud Environments {#sec-cloud}

 ### Cloud GPU Providers {#sec-cloud-gpu}
--- a/requirements.txt
+++ b/requirements.txt
@@ -63,4 +63,3 @@ torchao==0.7.0
 schedulefree==1.3.0

 axolotl-contribs-lgpl==0.0.3
-axolotl-contribs-mit==0.0.3
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -41,12 +41,11 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-    model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
+    model, tokenizer = train(cfg=cfg, dataset_meta=dataset_meta)
    plugin_manager = PluginManager.get_instance()

    del model
    del tokenizer
-    del trainer

    plugin_manager.post_train_unload(cfg)

--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -24,8 +24,8 @@ class TrainDatasetMeta:
    """Dataclass with fields for training and validation datasets and metadata."""

    train_dataset: Dataset
-    eval_dataset: Dataset | None = None
-    total_num_steps: int | None = None
+    eval_dataset: Optional[Dataset] = None
+    total_num_steps: Optional[int] = None


 def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -35,7 +35,6 @@ from transformers import (
    EarlyStoppingCallback,
    TrainerCallback,
 )
-from transformers.training_args import OptimizerNames
 from trl.trainer.utils import RewardDataCollatorWithPadding

 from axolotl.core.trainers.base import (
@@ -85,7 +84,6 @@ from axolotl.utils.collators import (
    V2BatchSamplerDataCollatorForSeq2Seq,
 )
 from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
-from axolotl.utils.config.models.input.v0_4_1 import CustomSupportedOptimizers
 from axolotl.utils.models import ensure_dtype

 try:
@@ -93,11 +91,13 @@ try:
 except ImportError:
    pass

-LOG = logging.getLogger(__name__)
+LOG = logging.getLogger("axolotl.core.trainer_builder")


 class TrainerBuilderBase(abc.ABC):
-    """Base class for trainer builder."""
+    """
+    Base class for trainer builder
+    """

    _train_dataset = None
    _eval_dataset = None
@@ -110,9 +110,9 @@ class TrainerBuilderBase(abc.ABC):
        self.tokenizer = tokenizer
        self.processor = processor

-        # If the model supports tagging, add the axolotl tag.
+        # in case the model supports tagging, add the axolotl tag.
        # This makes sure the tag is correctly pushed even if a user calls
-        # model.push_to_hub instead of trainer.push_to_hub.
+        # model.push_to_hub instead of trainer.push_to_hub.
        if hasattr(model, "add_model_tags"):
            model.add_model_tags(["axolotl"])

@@ -227,8 +227,8 @@ class TrainerBuilderBase(abc.ABC):

 class HFCausalTrainerBuilder(TrainerBuilderBase):
    """
-    Build the HuggingFace training args/trainer for causal models and reward modeling
-    using TRL.
+    Build the HuggingFace training args/trainer for causal models
+    and reward modelling using TRL.
    """

    def get_callbacks(self):
@@ -551,8 +551,30 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            training_arguments_kwargs["run_name"] = self.cfg.mlflow_run_name
        else:
            training_arguments_kwargs["run_name"] = None
+        training_arguments_kwargs["optim"] = (
+            self.cfg.optimizer if self.cfg.optimizer else "adamw_hf"
+        )
+        if self.cfg.optim_args:
+            if isinstance(self.cfg.optim_args, dict):
+                optim_args = ",".join(
+                    [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
+                )
+            else:
+                optim_args = self.cfg.optim_args
+            training_arguments_kwargs["optim_args"] = optim_args
+        if self.cfg.optim_target_modules:
+            training_arguments_kwargs[
+                "optim_target_modules"
+            ] = self.cfg.optim_target_modules
+        training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio
+        training_arguments_kwargs[
+            "loraplus_lr_embedding"
+        ] = self.cfg.loraplus_lr_embedding
+        training_arguments_kwargs["embedding_lr"] = self.cfg.embedding_lr
+        training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale
+        training_arguments_kwargs["lr_groups"] = self.cfg.lr_groups

-        if self.cfg.lr_scheduler in ["one_cycle", "rex", "log_sweep"]:
+        if self.cfg.lr_scheduler in ["one_cycle", "log_sweep"]:
            training_arguments_kwargs["lr_scheduler_type"] = "cosine"
            training_arguments_kwargs[
                "alternate_lr_scheduler_type"
@@ -636,119 +658,54 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.reward_model:
            training_arguments_kwargs["max_length"] = self.cfg.sequence_len

-        # Handle custom optimizer
-        custom_supported_optimizers = [opt.value for opt in CustomSupportedOptimizers]
-        if self.cfg.optimizer in custom_supported_optimizers:
-            # Common optimizer kwargs
-            optimizer_kwargs = {
-                "lr": training_arguments_kwargs.get("learning_rate"),
-                "weight_decay": training_arguments_kwargs.get("weight_decay"),
-            }
+        # pylint: disable=duplicate-code
+        if self.cfg.optimizer in [
+            "optimi_adamw",
+            "ao_adamw_4bit",
+            "ao_adamw_8bit",
+            "ao_adamw_fp8",
+            "adopt_adamw",
+        ]:
+            # Set default so transformers doesn't throw
+            training_arguments_kwargs["optim"] = "adamw_hf"
+            training_arguments_kwargs["alternate_optimizer"] = self.cfg.optimizer

-            # Adam-specific kwargs
-            adam_kwargs = {}
-            if training_arguments_kwargs.get(
-                "adam_beta1"
-            ) and training_arguments_kwargs.get("adam_beta2"):
-                adam_kwargs["betas"] = (
-                    training_arguments_kwargs.get("adam_beta1"),
-                    training_arguments_kwargs.get("adam_beta2"),
-                )
-            if training_arguments_kwargs.get("adam_epsilon"):
-                adam_kwargs["eps"] = training_arguments_kwargs.get("adam_epsilon")
+        if self.cfg.optimizer == "lion_pytorch":
+            from lion_pytorch import Lion

-            if self.cfg.optimizer == "muon":
-                from axolotl.contribs.mit.muon import (  # pylint: disable=no-name-in-module
-                    MuonOptimizerFactory,
+            lion_kwargs = {"lr": training_arguments_kwargs["learning_rate"]}
+            if "weight_decay" in training_arguments_kwargs:
+                lion_kwargs["weight_decay"] = training_arguments_kwargs["weight_decay"]
+
+            if (
+                "adam_beta1" in training_arguments_kwargs
+                and "adam_beta2" in training_arguments_kwargs
+            ):
+                lion_kwargs["betas"] = (
+                    training_arguments_kwargs["adam_beta1"],
+                    training_arguments_kwargs["adam_beta2"],
                )

-                optimizer_cls = MuonOptimizerFactory
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "optimi_adamw":
-                from optimi import AdamW
-
-                optimizer_kwargs["foreach"] = False
-                optimizer_cls = AdamW
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "ao_adamw_4bit":
-                # TODO remove 20250401
-                from torchao.prototype.low_bit_optim import AdamW4bit
-
-                optimizer_cls = AdamW4bit
-                optimizer_kwargs.update(adam_kwargs)
-
-                LOG.warning(
-                    f"`ao_adamw_4bit` will be deprecated soon. Please use `{OptimizerNames.ADAMW_TORCH_4BIT}` instead."
-                )
-            elif self.cfg.optimizer == "ao_adamw_8bit":
-                from torchao.prototype.low_bit_optim import AdamW8bit
-
-                optimizer_cls = AdamW8bit
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "ao_adamw_fp8":
-                from torchao.prototype.low_bit_optim import AdamWFp8
-
-                optimizer_cls = AdamWFp8
-                optimizer_kwargs.update(adam_kwargs)
-            elif self.cfg.optimizer == "adopt_adamw":
-                from axolotl.utils.optimizers.adopt import ADOPT
-
-                optimizer_cls = ADOPT
-                adam_kwargs["decouple"] = True
-                optimizer_kwargs.update(adam_kwargs)
-
-            # Parse any additional optimizer args from config
-            if self.cfg.optim_args:
-                if isinstance(self.cfg.optim_args, dict):
-                    optimizer_kwargs.update(self.cfg.optim_args)
-                else:
-                    # Parse string format "key1=value1,key2=value2"
-                    for mapping in self.cfg.optim_args.replace(" ", "").split(","):
-                        key, value = mapping.split("=")
-                        optimizer_kwargs[key] = value
-
-            trainer_kwargs["optimizer_cls_and_kwargs"] = (
-                optimizer_cls,
-                optimizer_kwargs,
+            trainer_kwargs["optimizers"] = (
+                Lion(params=self.model.parameters(), **lion_kwargs),
+                None,
            )
-        else:
-            # Use transformers' optimizer
-            training_arguments_kwargs["optim"] = self.cfg.optimizer
-
-            # Parse any additional optimizer args from config
-            if self.cfg.optim_args:
-                if isinstance(self.cfg.optim_args, dict):
-                    optim_args = ",".join(
-                        [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
-                    )
-                else:
-                    optim_args = self.cfg.optim_args
-                training_arguments_kwargs["optim_args"] = optim_args
+            # Set default so transformers doesn't throw
+            training_arguments_kwargs["optim"] = "adamw_hf"

        if self.cfg.optimizer == "adamw_anyprecision":
            if Path(self.cfg.torchdistx_path).exists():
                sys.path.append(self.cfg.torchdistx_path)
                importlib.import_module("torchdistx")

-        if self.cfg.optim_target_modules:
-            training_arguments_kwargs[
-                "optim_target_modules"
-            ] = self.cfg.optim_target_modules
-
-        training_arguments_kwargs["embedding_lr"] = self.cfg.embedding_lr
-        training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale
-
-        training_arguments_kwargs["loraplus_lr_ratio"] = self.cfg.loraplus_lr_ratio
-        training_arguments_kwargs[
-            "loraplus_lr_embedding"
-        ] = self.cfg.loraplus_lr_embedding
-        training_arguments_kwargs["lr_groups"] = self.cfg.lr_groups
-
        if self.cfg.accelerator_config:
            training_arguments_kwargs[
                "accelerator_config"
            ] = self.cfg.accelerator_config

+        if self.cfg.tensor_parallel:
+            training_arguments_kwargs["tp_size"] = torch.cuda.device_count()
+
        if self.cfg.kd_ce_alpha is not None:
            training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
        if self.cfg.kd_alpha is not None:
@@ -918,7 +875,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):


 class HFRLTrainerBuilder(TrainerBuilderBase):
-    """Trainer factory class for TRL-based RLHF trainers (e.g. DPO)"""
+    """
+    Trainer factory class for TRL-based RLHF trainers (e.g. DPO)
+    """

    def get_callbacks(self):
        callbacks = super().get_callbacks()
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -14,7 +14,6 @@ from typing import Dict, Literal, Optional
 import torch
 from datasets import Dataset
 from peft.optimizers import create_loraplus_optimizer
-from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import Trainer
@@ -23,11 +22,9 @@ from transformers.utils import is_sagemaker_mp_enabled
 from trl import CPOTrainer, KTOTrainer, ORPOTrainer, PRMTrainer, RewardTrainer
 from trl.trainer.utils import pad_to_length

-from axolotl.integrations.base import BaseOptimizerFactory
 from axolotl.monkeypatch.relora import ReLoRAScheduler
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 from axolotl.utils.schedulers import (
-    RexLR,
    get_cosine_schedule_with_min_lr,
    get_cosine_schedule_with_quadratic_warmup,
    get_cosine_schedule_with_warmup_decay_constant,
@@ -118,17 +115,6 @@ class SchedulerMixin(Trainer):
                    **extra_lr_kwargs,
                    **self.args.lr_scheduler_kwargs,
                )
-            elif self.args.alternate_lr_scheduler_type == "rex":
-                if use_cosine_min_lr:
-                    assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
-
-                self.lr_scheduler = RexLR(
-                    optimizer=optimizer,
-                    max_lr=self.args.learning_rate,
-                    min_lr=0 if not use_cosine_min_lr else (self.args.learning_rate * self.args.cosine_min_lr_ratio),
-                    total_steps=num_training_steps,
-                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
-                )
            elif use_cosine_quadratic:
                if use_cosine_min_lr:
                    LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
@@ -168,18 +154,47 @@ class SchedulerMixin(Trainer):
        return self.lr_scheduler


-class OptimizerMixin(Trainer):
+class AxolotlTrainer(SchedulerMixin, Trainer):
    """
-    Mixin class for shared handling of building custom optimizers
+    Extend the base Trainer for axolotl helpers
    """

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
+    tag_names = ["axolotl"]

-    def create_optimizer_grouped_parameters(
-        self, opt_model, optimizer_kwargs
-    ) -> list[dict]:
+    def __init__(
+        self,
+        *_args,
+        bench_data_collator=None,
+        eval_data_collator=None,
+        dataset_tags=None,
+        **kwargs,
+    ):
+        self.bench_data_collator = bench_data_collator
+        self.eval_data_collator = eval_data_collator
+        self.dataset_tags = dataset_tags
+        self._signature_columns = None  # workaround for pylint
+        super().__init__(*_args, **kwargs)
+        self.train_data_collator = self.data_collator
+        self._stored_metrics = defaultdict(lambda: defaultdict(list))
+        if self.args.orpo_alpha:
+            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
+
+    def _wrap_model(self, model, training=True, dataloader=None):
+        if self.args.torch_compile:
+            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
+                256
+            )
+            model = torch.compile(
+                model,
+                backend=self.args.torch_compile_backend,
+                mode=self.args.torch_compile_mode,
+            )
+        return super()._wrap_model(model, training=training, dataloader=dataloader)
+
+    def create_optimizer_grouped_parameters(self, opt_model, optimizer_kwargs):
        decay_parameters = self.get_decay_parameter_names(opt_model)
-        params: dict = {
+        params = {
            "to_weight_decay": {},  # LayerNorm and bias
            "embeddings": {},  # lm_head, embed_tokens,
            "no_weight_decay": {},
@@ -266,30 +281,23 @@ class OptimizerMixin(Trainer):
            and self.args.embedding_lr_scale is None
            and self.args.embedding_lr is None
            and self.args.lr_groups is None
-            and self.optimizer_cls_and_kwargs is None
+            and self.args.alternate_optimizer
+            not in [
+                "optimi_adamw",
+                "ao_adamw_8bit",
+                "ao_adamw_4bit",
+                "ao_adamw_fp8",
+                "adopt_adamw",
+            ]
        ):
            return super().create_optimizer()

        opt_model = self.model_wrapped if is_sagemaker_mp_enabled() else self.model
-
-        if (
-            not self.optimizer
-            and self.optimizer_cls_and_kwargs is not None
-            and issubclass(self.optimizer_cls_and_kwargs[0], BaseOptimizerFactory)
-        ):
-            optimizer_factory_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
-            self.optimizer = optimizer_factory_cls()(
-                opt_model, self.args, **optimizer_kwargs
+        if self.optimizer is None:  # pylint: disable=access-member-before-definition
+            optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(
+                self.args,
+                opt_model,
            )
-
-        if not self.optimizer:
-            if self.optimizer_cls_and_kwargs is not None:
-                optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
-            else:
-                optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(
-                    self.args, opt_model
-                )
-
            optimizer_grouped_parameters = self.create_optimizer_grouped_parameters(
                opt_model, optimizer_kwargs
            )
@@ -306,47 +314,50 @@ class OptimizerMixin(Trainer):
                    loraplus_lr_embedding=loraplus_lr_embedding,
                    **optimizer_kwargs,
                )
-            else:
-                # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs`
-                # e.g. for GaLore optimizer.
-                if "params" in optimizer_kwargs:
-                    optimizer_grouped_parameters = optimizer_kwargs.pop("params")
-
-                # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs`
-                # e.g. for LOMO optimizer.
-                if "model" in optimizer_kwargs:
-                    optimizer_grouped_parameters = optimizer_kwargs.pop("model")
-
-                # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict`
-                # to avoid arguments conflicts.
-                if "optimizer_dict" in optimizer_kwargs:
-                    optimizer_grouped_parameters = optimizer_kwargs.pop(
-                        "optimizer_dict"
-                    )
-
-                self.optimizer = optimizer_cls(
-                    optimizer_grouped_parameters, **optimizer_kwargs
+            elif (
+                self.args.embedding_lr_scale is not None
+                or self.args.embedding_lr is not None
+                or self.args.lr_groups is not None
+            ):
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
                )
+            elif self.args.alternate_optimizer == "optimi_adamw":
+                from optimi import AdamW

-            if optimizer_cls.__name__ == "Adam8bit":
-                import bitsandbytes
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    AdamW(
+                        optimizer_grouped_parameters, foreach=False, **optimizer_kwargs
+                    )
+                )
+            elif self.args.alternate_optimizer == "ao_adamw_4bit":
+                from torchao.prototype.low_bit_optim import AdamW4bit

-                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    AdamW4bit(optimizer_grouped_parameters, **optimizer_kwargs)
+                )
+            elif self.args.alternate_optimizer == "ao_adamw_8bit":
+                from torchao.prototype.low_bit_optim import AdamW8bit

-                skipped = 0
-                for module in opt_model.modules():
-                    if isinstance(module, nn.Embedding):
-                        skipped += sum(
-                            {
-                                p.data_ptr(): p.numel() for p in module.parameters()
-                            }.values()
-                        )
-                        LOG.info(f"skipped {module}: {skipped/2**20}M params")
-                        manager.register_module_override(
-                            module, "weight", {"optim_bits": 32}
-                        )
-                        LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
-                LOG.info(f"skipped: {skipped/2**20}M params")
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    AdamW8bit(optimizer_grouped_parameters, **optimizer_kwargs)
+                )
+            elif self.args.alternate_optimizer == "ao_adamw_fp8":
+                from torchao.prototype.low_bit_optim import AdamWFp8
+
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    AdamWFp8(optimizer_grouped_parameters, **optimizer_kwargs)
+                )
+            elif self.args.alternate_optimizer == "adopt_adamw":
+                from axolotl.utils.optimizers.adopt import ADOPT
+
+                self.optimizer = (  # pylint: disable=attribute-defined-outside-init
+                    ADOPT(
+                        optimizer_grouped_parameters,
+                        decouple=True,
+                        **optimizer_kwargs,
+                    )
+                )

        if is_sagemaker_mp_enabled():
            self.optimizer = smp.DistributedOptimizer(  # pylint: disable=attribute-defined-outside-init
@@ -355,45 +366,6 @@ class OptimizerMixin(Trainer):

        return self.optimizer

-
-class AxolotlTrainer(SchedulerMixin, OptimizerMixin, Trainer):
-    """
-    Extend the base Trainer for axolotl helpers
-    """
-
-    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
-    tag_names = ["axolotl"]
-
-    def __init__(
-        self,
-        *_args,
-        bench_data_collator=None,
-        eval_data_collator=None,
-        dataset_tags=None,
-        **kwargs,
-    ):
-        self.bench_data_collator = bench_data_collator
-        self.eval_data_collator = eval_data_collator
-        self.dataset_tags = dataset_tags
-        self._signature_columns = None  # workaround for pylint
-        super().__init__(*_args, **kwargs)
-        self.train_data_collator = self.data_collator
-        self._stored_metrics = defaultdict(lambda: defaultdict(list))
-        if self.args.orpo_alpha:
-            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
-
-    def _wrap_model(self, model, training=True, dataloader=None):
-        if self.args.torch_compile:
-            torch._dynamo.config.accumulated_cache_size_limit = (  # pylint: disable=protected-access
-                256
-            )
-            model = torch.compile(
-                model,
-                backend=self.args.torch_compile_backend,
-                mode=self.args.torch_compile_mode,
-            )
-        return super()._wrap_model(model, training=training, dataloader=dataloader)
-
    def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
        if self.args.sample_packing and not self.args.pretraining:
            if self.args.multipack_real_batches:
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -9,7 +9,6 @@ import logging
 from trl.trainer.grpo_trainer import RewardFunc

 from axolotl.core.trainers.grpo.trainer import AxolotlGRPOTrainer
-from axolotl.utils.config.models.input.v0_4_1.trl import TRLConfig

 LOG = logging.getLogger("axolotl")

@@ -32,44 +31,30 @@ class GRPOStrategy:
    @classmethod
    def set_training_args_kwargs(cls, cfg):
        grpo_args_kwargs = {}
-
-        if not hasattr(cfg, "trl") or not cfg.trl:
-            return grpo_args_kwargs
-
-        trl: TRLConfig = cfg.trl  # type: ignore
-
-        if trl.use_vllm:
-            grpo_args_kwargs["use_vllm"] = trl.use_vllm
-            grpo_args_kwargs["vllm_device"] = (
-                trl.vllm_device if trl.vllm_device else "auto"
-            )
-
-            if trl.vllm_gpu_memory_utilization:
+        if cfg.trl and cfg.trl.use_vllm:
+            grpo_args_kwargs["use_vllm"] = cfg.trl.use_vllm
+            if cfg.trl and cfg.trl.vllm_device:
+                grpo_args_kwargs["vllm_device"] = cfg.trl.vllm_device
+            else:
+                grpo_args_kwargs["vllm_device"] = "auto"
+            if cfg.trl and cfg.trl.vllm_gpu_memory_utilization:
                grpo_args_kwargs[
                    "vllm_gpu_memory_utilization"
-                ] = trl.vllm_gpu_memory_utilization
-
-            if trl.vllm_max_model_len:
-                grpo_args_kwargs["vllm_max_model_len"] = trl.vllm_max_model_len
-
-        if trl.num_generations:
-            grpo_args_kwargs["num_generations"] = trl.num_generations
-
-        if trl.sync_ref_model:
-            grpo_args_kwargs["sync_ref_model"] = trl.sync_ref_model
-
-            if trl.ref_model_mixup_alpha:
-                grpo_args_kwargs["ref_model_mixup_alpha"] = trl.ref_model_mixup_alpha
-
-            if trl.ref_model_sync_steps:
-                grpo_args_kwargs["ref_model_sync_steps"] = trl.ref_model_sync_steps
-
-        grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
-        grpo_args_kwargs["log_completions"] = trl.log_completions
-
-        if trl.reward_weights:
-            grpo_args_kwargs["reward_weights"] = trl.reward_weights
-
+                ] = cfg.trl.vllm_gpu_memory_utilization
+            if cfg.trl and cfg.trl.vllm_max_model_len:
+                grpo_args_kwargs["vllm_max_model_len"] = cfg.trl.vllm_max_model_len
+        if cfg.trl and cfg.trl.num_generations:
+            grpo_args_kwargs["num_generations"] = cfg.trl.num_generations
+        if cfg.trl and cfg.trl.sync_ref_model:
+            grpo_args_kwargs["sync_ref_model"] = cfg.trl.sync_ref_model
+            if cfg.trl and cfg.trl.ref_model_mixup_alpha:
+                grpo_args_kwargs[
+                    "ref_model_mixup_alpha"
+                ] = cfg.trl.ref_model_mixup_alpha
+            if cfg.trl and cfg.trl.ref_model_sync_steps:
+                grpo_args_kwargs["ref_model_sync_steps"] = cfg.trl.ref_model_sync_steps
+        grpo_args_kwargs["max_completion_length"] = cfg.trl.max_completion_length
+        grpo_args_kwargs["log_completions"] = cfg.trl.log_completions
        return grpo_args_kwargs

    @classmethod
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -23,8 +23,6 @@ import importlib
 import logging
 from typing import OrderedDict

-import torch
-

 class BasePlugin:
    """
@@ -471,14 +469,3 @@ class PluginManager:
        """
        for plugin in self.plugins.values():
            plugin.post_train_unload(cfg)
-
-
-class BaseOptimizerFactory:
-    """
-    Base class for factories to create custom optimizers
-    """
-
-    def __call__(
-        self, opt_model, training_args, **optimizer_kwargs
-    ) -> "torch.optim.Optimizer":
-        pass
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -4,22 +4,6 @@ Cut Cross Entropy reduces VRAM usage through optimization on the cross-entropy o

 See https://github.com/apple/ml-cross-entropy

-## Requirements
-
- PyTorch 2.4.0 or higher
-
-## Installation
-
-Run the following command to install `cut_cross_entropy[transformers]` if you don't have it already.
-
-```bash
-# if you are in dev environment
-python scripts/cutcrossentropy_install.py | sh
-
-# if you are not in dev environment
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy @ git+https://github.com/apple/ml-cross-entropy.git@9c297c905f55b73594b5d650722d1e78183b77bd"'
-```
-
 ## Usage

 ```yaml
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -1,29 +1,26 @@
 """Prepare and train a model on a dataset. Can also infer from a model or merge lora"""

-import importlib
 import inspect
 import os
 import signal
 import sys
 import weakref
 from pathlib import Path
-from typing import Any
+from typing import Tuple, Union

 import torch
 import transformers.modelcard
 from accelerate.logging import get_logger
 from accelerate.utils import save_fsdp_model
-from datasets import Dataset
-from peft import PeftConfig, PeftModel
-from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
+from peft import PeftModel
+from pkg_resources import get_distribution  # type: ignore
+from transformers import PreTrainedModel, PreTrainedTokenizer
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
-from transformers.trainer import Trainer

 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl.unsloth import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
 )
-from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.freeze import freeze_layers_except
@@ -35,25 +32,17 @@ try:
 except ImportError:
    BetterTransformer = None

+project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+src_dir = os.path.join(project_root, "src")
+sys.path.insert(0, src_dir)
+
 configure_logging()
 LOG = get_logger(__name__)


-def setup_model_and_tokenizer(
-    cfg: DictDefault,
-) -> tuple[
-    PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
-]:
-    """
-    Load the tokenizer, processor (for multimodal models), and model based on configuration.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-
-    Returns:
-        Tuple containing model, tokenizer, `peft_config` (if LoRA / QLoRA, else
-            `None`), and processor (if multimodal, else `None`).
-    """
+def train(
+    *, cfg: DictDefault, dataset_meta: TrainDatasetMeta
+) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
    # Load tokenizer
    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
@@ -66,58 +55,11 @@ def setup_model_and_tokenizer(
    if cfg.is_multimodal:
        processor = load_processor(cfg, tokenizer)

-    # Load the model and peft_config
-    msg = "loading model"
-    if cfg.adapter:
-        msg += " and peft_config..."
-    LOG.debug(msg)
+    # Get datasets
+    train_dataset = dataset_meta.train_dataset
+    eval_dataset = dataset_meta.eval_dataset
+    total_num_steps = dataset_meta.total_num_steps

-    model, peft_config = load_model(cfg, tokenizer, processor=processor)
-    if model.generation_config is not None:
-        model.generation_config.do_sample = True
-
-    # Apply freezing if specified
-    if cfg.unfrozen_parameters:
-        freeze_layers_except(model, cfg.unfrozen_parameters)
-
-    return model, tokenizer, peft_config, processor
-
-
-def setup_reference_model(
-    cfg: DictDefault, tokenizer: PreTrainedTokenizer
-) -> PreTrainedModel | None:
-    """
-    Set up the reference model for RL training if needed.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        tokenizer: The tokenizer to use for the reference model.
-
-    Returns:
-        Reference model if needed for RL training, `None` otherwise.
-    """
-    model_ref = None
-    if cfg.rl and cfg.rl != "orpo":
-        if cfg.adapter and not cfg.rl_adapter_ref_model:
-            # use built-in trl autounwrap
-            LOG.debug("Passing model_ref: None to RL trainer")
-            model_ref = None  # explicit setting to None
-        else:
-            # load the model again for model_ref/baseline
-            model_ref, _ = load_model(cfg, tokenizer, reference_model=True)
-    return model_ref
-
-
-def determine_resume_checkpoint(cfg: DictDefault) -> str | None:
-    """
-    Determine the checkpoint to resume from based on configuration.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-
-    Returns:
-        Path to the checkpoint to resume from, or `None` if not resuming.
-    """
    if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
        possible_checkpoints = [
            str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
@@ -131,22 +73,77 @@ def determine_resume_checkpoint(cfg: DictDefault) -> str | None:
            LOG.info(
                f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
            )
-    return cfg.resume_from_checkpoint
+    resume_from_checkpoint = cfg.resume_from_checkpoint

+    # Load the model and tokenizer
+    msg = "loading model"
+    if cfg.adapter:
+        msg += " and peft_config..."
+    LOG.debug(msg)
+    model, peft_config = load_model(cfg, tokenizer, processor=processor)
+    if model.generation_config is not None:
+        model.generation_config.do_sample = True

-def setup_signal_handler(
-    cfg: DictDefault, model: PreTrainedModel, safe_serialization: bool
-):
-    """
-    Set up signal handler for graceful termination.
+    model_ref = None
+    if cfg.rl and cfg.rl != "orpo":
+        if cfg.adapter and not cfg.rl_adapter_ref_model:
+            # use built-in trl autounwrap
+            LOG.debug("Passing model_ref: None to RL trainer")
+            model_ref = None  # explicit setting to None
+        else:
+            # load the model again for model_ref/baseline
+            model_ref, _ = load_model(cfg, tokenizer, reference_model=True)

-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        model: The model to save on termination
-        safe_serialization: Whether to use safe serialization when saving
-    """
-    # ray workers don't have access to this signal
-    if cfg.local_rank == 0 and not cfg.use_ray:
+    safe_serialization = cfg.save_safetensors is True
+
+    if cfg.unfrozen_parameters:
+        freeze_layers_except(model, cfg.unfrozen_parameters)
+
+    trainer = setup_trainer(
+        cfg,
+        train_dataset,
+        eval_dataset,
+        (model, model_ref, peft_config),
+        tokenizer,
+        processor,
+        total_num_steps,
+    )
+
+    if cfg.fix_untrained_tokens:
+        # check if the `token_ids_to_fix` kwarg exists in the fix_untrained_tokens args
+        sig = inspect.signature(fix_untrained_tokens)
+        # if the function has the `token_ids_to_fix` arg, and fix_untrained_tokens is a list
+        if "token_ids_to_fix" in sig.parameters and isinstance(
+            cfg.fix_untrained_tokens, list
+        ):
+            fix_untrained_tokens(
+                model,
+                tokenizer,
+                train_dataset,
+                token_ids_to_fix=cfg.fix_untrained_tokens,
+            )
+        else:
+            fix_untrained_tokens(model, tokenizer, train_dataset)
+        if cfg.local_rank == 0:
+            model.save_pretrained(
+                str(Path(cfg.output_dir)), safe_serialization=safe_serialization
+            )
+
+    # go ahead and presave, so we have the adapter config available to inspect
+    if peft_config:
+        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
+        peft_config.save_pretrained(cfg.output_dir)
+    # additionally presave the tokenizer and model configs
+    if not Path(cfg.output_dir).is_dir():
+        os.makedirs(cfg.output_dir, exist_ok=True)
+    tokenizer.save_pretrained(str(Path(cfg.output_dir)))
+    if hasattr(model, "config"):
+        model.config.save_pretrained(str(Path(cfg.output_dir)))
+
+    # In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
+    if (
+        cfg.local_rank == 0 and not cfg.use_ray
+    ):  # ray workers don't have access to this signal

        def terminate_handler(_, __, model_weakref):
            if model_weakref() is not None:
@@ -164,22 +161,21 @@ def setup_signal_handler(
            lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
        )

+    badge_markdown = """[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)"""
+    transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"

-def execute_training(
-    cfg: DictDefault, trainer: Any, resume_from_checkpoint: str | None
-):
-    """
-    Execute the training process with appropriate backend configurations.
+    if getattr(cfg, "axolotl_config_path"):
+        raw_axolotl_cfg = Path(cfg.axolotl_config_path)
+        version = get_distribution("axolotl").version
+        if raw_axolotl_cfg.is_file():
+            transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n<details><summary>See axolotl config</summary>\n\naxolotl version: `{version}`\n```yaml\n{raw_axolotl_cfg.read_text(encoding='utf-8')}\n```\n\n</details><br>\n"

-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        trainer: The configured trainer object.
-        resume_from_checkpoint: Path to checkpoint to resume from, if applicable.
-    """
    LOG.info("Starting trainer...")
    if cfg.group_by_length:
        LOG.info("hang tight... sorting dataset for group_by_length")

+    pretrain_hooks(cfg, trainer)
+
    if cfg.flash_optimum:
        with torch.backends.cuda.sdp_kernel(
            # TODO configure these from the YAML w/ sdp_kernel_kwargs: ...
@@ -191,30 +187,15 @@ def execute_training(
    else:
        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

+    post_train_hooks(cfg, trainer)

-def save_trained_model(
-    cfg: DictDefault,
-    trainer: Any,
-    model: PreTrainedModel,
-    safe_serialization: bool,
-):
-    """
-    Save the trained model according to configuration and training setup.
+    LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")

-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        trainer: The trainer object.
-        model: The trained model to save.
-        safe_serialization: Whether to use safe serialization.
-    """
-    LOG.info(f"Training completed! Saving pre-trained model to {cfg.output_dir}.")
-
-    # Post training module hooks
+    # post training
    for name, module in model.named_modules():
        if hasattr(module, "_post_training"):
            module._post_training(model, name)  # pylint: disable=protected-access

-    # Handle FSDP state dict type
    state_dict_type = "FULL_STATE_DICT"
    if trainer.is_fsdp_enabled:
        if cfg.fsdp_final_state_dict_type:
@@ -222,18 +203,16 @@ def save_trained_model(
        trainer.accelerator.state.fsdp_plugin.set_state_dict_type(state_dict_type)
        LOG.info(f"Set FSDP state dict type to {state_dict_type} for saving.")

-    # Handle ReLoRA early return case
    if cfg.relora_steps:
        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
            model = model.merge_and_unload()
        else:
            # final model weights have already been saved by `ReLoRACallback.on_train_end`
-            return
+            return model, tokenizer

+    # TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
+    # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
    if cfg.fsdp:
-        # TODO: do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
-        # only save on rank 0, otherwise it corrupts output on multi-GPU when multiple
-        # processes attempt to write the same file
        if (
            state_dict_type == "SHARDED_STATE_DICT"
            and cfg.fsdp_config.fsdp_state_dict_type == "SHARDED_STATE_DICT"
@@ -265,6 +244,7 @@ def save_trained_model(
                os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
            except FileNotFoundError:
                pass
+
    elif cfg.local_rank == 0:
        if cfg.flash_optimum and BetterTransformer:
            model = BetterTransformer.reverse(model)
@@ -275,239 +255,58 @@ def save_trained_model(
            )
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)

-
-def create_model_card(cfg: DictDefault, trainer: Trainer):
-    """
-    Create a model card for the trained model if needed.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        trainer: The trainer object with model card creation capabilities.
-    """
    if not cfg.hub_model_id:
-        # Guard since create_model_card may fail if dataset_tags is empty list
        try:
            model_card_kwarg = {
                "model_name": cfg.output_dir.lstrip("./")
                .encode("utf-8")
                .decode("utf-8")
            }
-
-            # We check if we're using a TRL trainer; if so, `dataset_tags` is not consumed.
-            rl = cfg.rl is not None or cfg.reward_model or cfg.process_reward_model
-            if cfg.datasets is not None and not rl:
-                dataset_tags = [
-                    d["path"] for d in cfg.datasets if not Path(d["path"]).is_dir()
-                ]
-                dataset_tags = [d for d in dataset_tags if not d.startswith("https://")]
-
-                if dataset_tags:
-                    model_card_kwarg["dataset_tags"] = dataset_tags
+            if cfg.datasets is not None:
+                if cfg.rl is not None or cfg.reward_model or cfg.process_reward_model:
+                    dataset_tags = [
+                        d["path"] for d in cfg.datasets if not Path(d["path"]).is_dir()
+                    ]
+                    dataset_tags = [
+                        d for d in dataset_tags if not d.startswith("https://")
+                    ]
+                    if dataset_tags:
+                        # guard as create_model_card may fail if dataset_tags is empty list
+                        model_card_kwarg["dataset_name"] = dataset_tags
+                else:
+                    dataset_tags = [
+                        d["path"] for d in cfg.datasets if not Path(d["path"]).is_dir()
+                    ]
+                    dataset_tags = [
+                        d for d in dataset_tags if not d.startswith("https://")
+                    ]
+                    if dataset_tags:
+                        # guard as create_model_card may fail if dataset_tags is empty list
+                        model_card_kwarg["dataset_tags"] = dataset_tags

            trainer.create_model_card(**model_card_kwarg)
        except (AttributeError, UnicodeDecodeError):
            pass
    elif cfg.hub_model_id:
-        # Defensively push to the hub to ensure the model card is updated
+        # defensively push to the hub to ensure the model card is updated
        trainer.push_to_hub()

+    return model, tokenizer

-def save_initial_configs(
-    cfg: DictDefault,
-    tokenizer: PreTrainedTokenizer,
-    model: PreTrainedModel,
-    peft_config: PeftConfig | None,
-):
+
+def pretrain_hooks(_cfg, _trainer):
    """
-    Save initial configurations before training.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        tokenizer: The tokenizer to save.
-        model: The model to save configuration for.
-        peft_config: The PEFT configuration to save if applicable.
+    Run hooks right before kicking off the training
+    :param cfg:
+    :param trainer:
+    :return:
    """
-    # Create output_dir if it doesn't already exist
-    output_dir = Path(cfg.output_dir)
-    if not output_dir.is_dir():
-        os.makedirs(cfg.output_dir, exist_ok=True)
-
-    # Pre-save adapter config so it's available to inspect
-    if peft_config:
-        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}...")
-        peft_config.save_pretrained(cfg.output_dir)
-
-    # Pre-save the tokenizer and model configs
-    LOG.info(f"Pre-saving tokenizer to {cfg.output_dir}...")
-    tokenizer.save_pretrained(str(output_dir))
-    if hasattr(model, "config"):
-        LOG.info(f"Pre-saving model config to {cfg.output_dir}...")
-        model.config.save_pretrained(str(output_dir))


-def setup_model_card(cfg: DictDefault):
+def post_train_hooks(_cfg, _trainer):
    """
-    Set up the Axolotl badge and add the Axolotl config to the model card if available.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
+    Run hooks right after training completes
+    :param cfg:
+    :param trainer:
+    :return:
    """
-    badge_markdown = """[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)"""
-    transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"
-
-    if getattr(cfg, "axolotl_config_path"):
-        raw_axolotl_cfg = Path(cfg.axolotl_config_path)
-        version = importlib.metadata.version("axolotl")
-        if raw_axolotl_cfg.is_file():
-            transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n<details><summary>See axolotl config</summary>\n\naxolotl version: `{version}`\n```yaml\n{raw_axolotl_cfg.read_text(encoding='utf-8')}\n```\n\n</details><br>\n"
-
-
-def handle_untrained_tokens_fix(
-    cfg: DictDefault,
-    model: PreTrainedModel,
-    tokenizer: PreTrainedTokenizer,
-    train_dataset: Dataset,
-    safe_serialization: bool,
-):
-    """
-    Apply fixes for untrained tokens if configured.
-
-    Args:
-        cfg: Dictionary mapping `axolotl` config keys to values.
-        model: The model to apply fixes to.
-        tokenizer: The tokenizer for token identification.
-        train_dataset: The training dataset to use.
-        safe_serialization: Whether to use safe serialization when saving.
-    """
-    if not cfg.fix_untrained_tokens:
-        return
-
-    # Check if the `token_ids_to_fix` kwarg exists in the fix_untrained_tokens args
-    sig = inspect.signature(fix_untrained_tokens)
-
-    # If the function has the `token_ids_to_fix` arg, and fix_untrained_tokens is a list
-    if "token_ids_to_fix" in sig.parameters and isinstance(
-        cfg.fix_untrained_tokens, list
-    ):
-        fix_untrained_tokens(
-            model,
-            tokenizer,
-            train_dataset,
-            token_ids_to_fix=cfg.fix_untrained_tokens,
-        )
-    else:
-        fix_untrained_tokens(model, tokenizer, train_dataset)
-
-    if cfg.local_rank == 0:
-        model.save_pretrained(
-            str(Path(cfg.output_dir)), safe_serialization=safe_serialization
-        )
-
-
-def setup_model_and_trainer(
-    cfg: DictDefault, dataset_meta: TrainDatasetMeta
-) -> tuple[
-    HFRLTrainerBuilder | HFCausalTrainerBuilder,
-    PeftModel | PreTrainedModel,
-    PreTrainedTokenizer,
-    PeftConfig | None,
-]:
-    """
-    Load model, tokenizer, trainer, etc. Helper function to encapsulate the full
-    trainer setup.
-
-    Args:
-        cfg: The configuration dictionary with training parameters.
-        dataset_meta: Object with training, validation datasets and metadata.
-
-    Returns:
-        Tuple of:
-            - Trainer (Causal or RLHF)
-            - Model
-            - Tokenizer
-            - PEFT config
-    """
-    # Load tokenizer, processor and model
-    model, tokenizer, peft_config, processor = setup_model_and_tokenizer(cfg)
-
-    # Set up reference model for RL if needed
-    model_ref = setup_reference_model(cfg, tokenizer)
-
-    # Get datasets from metadata
-    train_dataset = dataset_meta.train_dataset
-    eval_dataset = dataset_meta.eval_dataset
-    total_num_steps = dataset_meta.total_num_steps
-
-    # Set up trainer
-    trainer = setup_trainer(
-        cfg=cfg,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        model=model,
-        tokenizer=tokenizer,
-        processor=processor,
-        total_num_steps=total_num_steps,
-        model_ref=model_ref,
-        peft_config=peft_config,
-    )
-
-    return (
-        trainer,
-        model,
-        tokenizer,
-        peft_config,
-    )
-
-
-def train(
-    cfg: DictDefault, dataset_meta: TrainDatasetMeta
-) -> tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]:
-    """
-    Train a model on the given dataset.
-
-    Args:
-        cfg: The configuration dictionary with training parameters
-        dataset_meta: Object with training, validation datasets and metadata
-
-    Returns:
-        Tuple of (model, tokenizer) after training
-    """
-    # Setup model, tokenizer, (causal or RLHF) trainer etc.
-    (
-        trainer,
-        model,
-        tokenizer,
-        peft_config,
-    ) = setup_model_and_trainer(cfg, dataset_meta)
-
-    # Determine if we need to resume from a checkpoint
-    resume_from_checkpoint = determine_resume_checkpoint(cfg)
-
-    # Configuration for saving
-    safe_serialization = cfg.save_safetensors is True
-
-    # Handle untrained tokens if configured
-    train_dataset = dataset_meta.train_dataset
-    handle_untrained_tokens_fix(
-        cfg, model, tokenizer, train_dataset, safe_serialization
-    )
-
-    # Save initial configs
-    save_initial_configs(cfg, tokenizer, model, peft_config)
-
-    # Set up signal handler for graceful termination
-    setup_signal_handler(cfg, model, safe_serialization)
-
-    # Set up badges and config info for model card
-    setup_model_card(cfg)
-
-    # Execute the training
-    execute_training(cfg, trainer, resume_from_checkpoint)
-
-    # Save the trained model
-    save_trained_model(cfg, trainer, model, safe_serialization)
-
-    # Create model card
-    create_model_card(cfg, trainer)
-
-    return model, tokenizer, trainer
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -64,18 +64,6 @@ class ChatTemplate(str, Enum):
    metharme = "metharme"  # pylint: disable=invalid-name


-class CustomSupportedOptimizers(str, Enum):
-    """Custom supported optimizers"""
-
-    optimi_adamw = "optimi_adamw"  # pylint: disable=invalid-name
-    ao_adamw_4bit = "ao_adamw_4bit"  # pylint: disable=invalid-name
-    ao_adamw_8bit = "ao_adamw_8bit"  # pylint: disable=invalid-name
-    ao_adamw_fp8 = "ao_adamw_fp8"  # pylint: disable=invalid-name
-    adopt_adamw = "adopt_adamw"  # pylint: disable=invalid-name
-    lion_pytorch = "lion_pytorch"  # pylint: disable=invalid-name
-    muon = "muon"  # pylint: disable=invalid-name
-
-
 class DeprecatedParameters(BaseModel):
    """configurations that are deprecated"""

@@ -506,7 +494,17 @@ class HyperparametersConfig(BaseModel):
    embedding_lr_scale: Optional[float] = None
    weight_decay: Optional[float] = 0.0
    optimizer: Optional[
-        Union[OptimizerNames, CustomSupportedOptimizers]
+        Union[
+            OptimizerNames,
+            Literal[
+                "lion_pytorch",
+                "optimi_adamw",
+                "ao_adamw_4bit",
+                "ao_adamw_8bit",
+                "ao_adamw_fp8",
+                "adopt_adamw",
+            ],
+        ]
    ] = OptimizerNames.ADAMW_HF
    optim_args: Optional[Union[str, Dict[str, Any]]] = Field(
        default=None,
@@ -520,7 +518,7 @@ class HyperparametersConfig(BaseModel):
    )
    torchdistx_path: Optional[str] = None
    lr_scheduler: Optional[
-        Union[SchedulerType, Literal["one_cycle"], Literal["rex"]]
+        Union[SchedulerType, Literal["one_cycle"]]
    ] = SchedulerType.COSINE
    lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
    lr_quadratic_warmup: Optional[bool] = None
@@ -750,6 +748,8 @@ class AxolotlInputConfig(
    local_rank: Optional[int] = None
    ddp: Optional[bool] = None

+    tensor_parallel: Optional[bool] = None
+
    seed: Optional[int] = None
    ddp_timeout: Optional[int] = None
    ddp_bucket_cap_mb: Optional[int] = None
@@ -1179,13 +1179,6 @@ class AxolotlInputConfig(
            LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
        return self

-    @model_validator(mode="before")
-    @classmethod
-    def check_lr_groups(cls, data):
-        if data.get("lr_groups") and data.get("loraplus_lr_ratio"):
-            raise ValueError("lr_groups and loraplus_lr_ratio cannot be used together.")
-        return data
-
    @model_validator(mode="before")
    @classmethod
    def check_saves(cls, data):
@@ -1380,6 +1373,13 @@ class AxolotlInputConfig(
            )
        return data

+    @model_validator(mode="before")
+    @classmethod
+    def check_fsdp_tp(cls, data):
+        if data.get("fsdp") and data.get("tensor_parallel"):
+            raise ValueError("FSDP with tensor parallelism is not supported yet.")
+        return data
+
    @model_validator(mode="after")
    def check_fft_possible_bad_config(self):
        if (
--- a/src/axolotl/utils/config/models/input/v0_4_1/trl.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/trl.py
@@ -27,7 +27,6 @@ class TRLConfig(BaseModel):
    vllm_dtype: Optional[str] = "auto"

    reward_funcs: Optional[List[str]] = None
-    reward_weights: Optional[List[float]] = None
    num_generations: Optional[int] = None
    log_completions: Optional[bool] = False

--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -762,6 +762,9 @@ class ModelLoader:
            return hf_ds_cfg

        skip_move_to_device = False
+        if self.cfg.tensor_parallel:
+            self.model_kwargs.pop("device_map", None)  # no KeyError if the key is already absent
+
        if (  # pylint: disable=condition-evals-to-constant)
            (self.cfg.fsdp and self.cfg.fsdp_config.fsdp_cpu_ram_efficient_loading)
            and not qlora_fsdp
--- a/src/axolotl/utils/schedulers.py
+++ b/src/axolotl/utils/schedulers.py
@@ -6,80 +6,6 @@ from torch.optim import Optimizer
 from torch.optim.lr_scheduler import LambdaLR, LRScheduler


-class RexLR(LRScheduler):
-    """
-    Reflected Exponential (REX) learning rate scheduler.
-
-    - Original implementation: https://github.com/IvanVassi/REX_LR
-    - Original license: Apache 2.0
-    - Based on: https://arxiv.org/abs/2107.04197
-
-    Args:
-        optimizer (torch.optim.Optimizer): The optimizer to schedule the learning rate for.
-        max_lr (float): The maximum learning rate.
-        min_lr (float): The minimum learning rate.
-        total_steps (int): The total number of training steps.
-        num_warmup_steps (int): The number of warmup steps.
-        last_step (int): The index of last step.
-    """
-
-    def __init__(
-        self, optimizer, max_lr, min_lr, total_steps=0, num_warmup_steps=0, last_step=0
-    ):
-        if min_lr > max_lr:
-            raise ValueError(
-                f'Value of "min_lr" should be less than value of "max_lr". Got min_lr={min_lr} and max_lr={max_lr}'
-            )
-        if num_warmup_steps > total_steps:
-            raise ValueError(
-                f"num_warmup_steps ({num_warmup_steps}) must be less than or equal to total_steps ({total_steps})."
-            )
-
-        self.min_lr = min_lr
-        self.max_lr = max_lr
-        self.total_steps = total_steps
-        self.num_warmup_steps = num_warmup_steps
-        self.last_step = last_step - 1
-
-        # Ensure each parameter group has an "initial_lr" key to avoid issues when resuming.
-        for group in optimizer.param_groups:
-            group.setdefault("initial_lr", group["lr"])
-
-        # Pass self.last_step as last_epoch to the parent.
-        super().__init__(optimizer, last_epoch=self.last_step)
-
-    @property
-    def last_step(self):
-        return self.last_epoch
-
-    @last_step.setter
-    def last_step(self, value):
-        self.last_epoch = value
-
-    def get_lr(self):
-        # Warmup phase: if defined, increase lr linearly from 0 to max_lr.
-        if 1 <= self.last_step <= self.num_warmup_steps:
-            return [
-                base_lr * self.last_step / self.num_warmup_steps
-                for base_lr in self.base_lrs
-            ]
-
-        # Post-warmup phase: adjust step relative to the end of warmup.
-        step_after = self.last_step - self.num_warmup_steps
-        remaining_steps = self.total_steps - self.num_warmup_steps
-
-        # Avoid LR spiking
-        if step_after >= remaining_steps or step_after == -1 or remaining_steps <= 0:
-            return [self.min_lr for _ in self.base_lrs]
-
-        mod_iter = step_after % remaining_steps
-        z = (remaining_steps - mod_iter) / remaining_steps
-        rex_factor = self.min_lr / self.max_lr + (1.0 - self.min_lr / self.max_lr) * (
-            z / (0.1 + 0.9 * z)
-        )
-        return [base_lr * rex_factor for base_lr in self.base_lrs]
-
-
 class InterpolatingLogScheduler(LRScheduler):
    """
    A scheduler that interpolates learning rates in a logarithmic fashion
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -547,6 +547,7 @@ def prepare_optim_env(cfg):
    if not check_cuda_p2p_ib_support():
        if os.getenv("NCCL_P2P_DISABLE") is None:
            os.environ["NCCL_P2P_DISABLE"] = "1"
+
    if cfg.fsdp:
        setup_fsdp_envs(cfg)
    elif cfg.deepspeed:
@@ -574,40 +575,14 @@ def prepare_opinionated_env(cfg):


 def setup_trainer(
-    cfg,
-    train_dataset,
-    eval_dataset,
-    model,
-    tokenizer,
-    processor,
-    total_num_steps,
-    model_ref=None,
-    peft_config=None,
+    cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
 ):
-    """
-    Helper method for instantiating and building a (causal or RLHF) trainer.
-
-    Args:
-        cfg: Axolotl config object containing training parameters.
-        train_dataset: Dataset to use for training.
-        eval_dataset: Dataset to use for evaluation.
-        model: The model to train.
-        tokenizer: Tokenizer for processing text input.
-        processor: Processor for data preparation.
-        total_num_steps: The total number of training steps.
-        model_ref: Optional reference model for RLHF training. Default is None.
-        peft_config: Optional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.
-
-    Returns:
-        A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based
-            on the provided parameters.
-    """
    if cfg.rl:
-        trainer_builder = HFRLTrainerBuilder(cfg, model, tokenizer, processor)
-        trainer_builder.model_ref = model_ref
-        trainer_builder.peft_config = peft_config
+        trainer_builder = HFRLTrainerBuilder(cfg, model[0], tokenizer, processor)
+        trainer_builder.model_ref = model[1]
+        trainer_builder.peft_config = model[2]
    else:
-        trainer_builder = HFCausalTrainerBuilder(cfg, model, tokenizer, processor)
+        trainer_builder = HFCausalTrainerBuilder(cfg, model[0], tokenizer, processor)

    trainer_builder.train_dataset = train_dataset
    trainer_builder.eval_dataset = eval_dataset
--- a/tests/cli/test_cli_train.py
+++ b/tests/cli/test_cli_train.py
@@ -28,7 +28,7 @@ class TestTrainCommand(BaseCliTest):
        config_path.write_text(valid_test_config)

        with patch("axolotl.cli.train.train") as mock_train:
-            mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
+            mock_train.return_value = (MagicMock(), MagicMock())

            result = cli_runner.invoke(
                cli,
@@ -48,7 +48,7 @@ class TestTrainCommand(BaseCliTest):
        config_path = self._test_cli_overrides(tmp_path, valid_test_config)

        with patch("axolotl.cli.train.train") as mock_train:
-            mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
+            mock_train.return_value = (MagicMock(), MagicMock())

            result = cli_runner.invoke(
                cli,
--- a/tests/e2e/test_mixtral.py
+++ b/tests/e2e/test_mixtral.py
@@ -75,7 +75,7 @@ class TestMixtral(unittest.TestCase):
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
+        model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
@@ -131,7 +131,7 @@ class TestMixtral(unittest.TestCase):
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
+        model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
@@ -190,7 +190,7 @@ class TestMixtral(unittest.TestCase):
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
+        model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
@@ -249,7 +249,7 @@ class TestMixtral(unittest.TestCase):
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
+        model, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
--- a/tests/e2e/test_optimizers.py
+++ b/tests/e2e/test_optimizers.py
@@ -65,9 +65,8 @@ class TestCustomOptimizers(unittest.TestCase):
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
+        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
-        assert trainer.optimizer.optimizer.__class__.__name__ == "AdamW"

    @with_temp_dir
    @require_torch_2_5_1
@@ -112,57 +111,8 @@ class TestCustomOptimizers(unittest.TestCase):
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
+        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
-        assert "ADOPT" in trainer.optimizer.optimizer.__class__.__name__
-
-    @with_temp_dir
-    @require_torch_2_5_1
-    def test_muon(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
-                "sequence_len": 1024,
-                "load_in_8bit": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 5,
-                "micro_batch_size": 8,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "muon",
-                "lr_scheduler": "cosine",
-                "weight_decay": 0.01,
-            }
-        )
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(temp_dir, cfg)
-        assert "Muon" in trainer.optimizer.optimizer.__class__.__name__

    @with_temp_dir
    def test_fft_schedule_free_adamw(self, temp_dir):
--- a/tests/e2e/test_schedulers.py
+++ b/tests/e2e/test_schedulers.py
@@ -1,71 +0,0 @@
-"""
-E2E tests for custom schedulers using Llama
-"""
-
-import logging
-import os
-import unittest
-
-from axolotl.cli.args import TrainerCliArgs
-from axolotl.common.datasets import load_datasets
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, validate_config
-from axolotl.utils.dict import DictDefault
-
-from .utils import check_model_output_exists, with_temp_dir
-
-LOG = logging.getLogger("axolotl.tests.e2e")
-os.environ["WANDB_DISABLED"] = "true"
-
-
-class TestCustomSchedulers(unittest.TestCase):
-    """
-    Test case for Llama models using LoRA
-    """
-
-    @with_temp_dir
-    def test_rex_scheduler(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "JackFram/llama-68m",
-                "tokenizer_type": "LlamaTokenizer",
-                "sequence_len": 1024,
-                "load_in_8bit": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "unk_token": "<unk>",
-                    "bos_token": "<s>",
-                    "eos_token": "</s>",
-                },
-                "datasets": [
-                    {
-                        "path": "mhenrichsen/alpaca_2k_test",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 8,
-                "gradient_accumulation_steps": 1,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_hf",
-                "max_steps": 20,
-                "lr_scheduler": "rex",
-                "warmup_steps": 5,
-                "cosine_min_lr_ratio": 0.05,
-            }
-        )
-
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(temp_dir, cfg)
Author	SHA1	Message	Date
Sung Ching Liu	f68aedd1f8	Update __init__.py	2025-02-26 00:21:16 -05:00
Sunny Liu	3dd5c6f8ec	nit	2025-02-26 00:21:16 -05:00
Sunny Liu	4caa59a087	auto detect tp_size	2025-02-26 00:21:16 -05:00
Sunny Liu	984be14147	add tp_size in config doc	2025-02-26 00:21:16 -05:00
Sunny Liu	64adbf1a15	tp plan not needed	2025-02-26 00:21:16 -05:00
Sunny Liu	438b623031	prepare accelerate envs for tp	2025-02-26 00:21:16 -05:00
Sunny Liu	a74efcecbe	skip move to device	2025-02-26 00:21:16 -05:00
Sunny Liu	d663652216	del device_map for tp	2025-02-26 00:21:16 -05:00
Sunny Liu	dbd43aa18f	set tp_plan	2025-02-26 00:21:16 -05:00
Sunny Liu	dbdf97e828	enabe tp thru tp_size	2025-02-26 00:21:16 -05:00