move import of llmcompressor to reset session inside test

make sure to reset the session after each test
move decorator to test method instead of class
2025-04-30 18:10:44 -04:00 · 2025-04-30 17:21:53 -04:00 · 2025-04-30 17:21:53 -04:00 · 2025-04-30 17:21:53 -04:00 · 2025-04-30 17:21:53 -04:00 · 2025-04-30 17:21:53 -04:00
26 changed files with 40 additions and 546 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -30,7 +30,7 @@ jobs:
            cuda_version: 12.6.3
            python_version: "3.11"
            pytorch: 2.7.0
-            axolotl_extras:
+            axolotl_extras: vllm
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -4,12 +4,6 @@ on:
  pull_request:
    types: [opened, synchronize, reopened]
    # Run the workflow only when one of these files changes
    paths:
      - '**/*.md'      # any Markdown file
      - '**/*.qmd'     # any Quarto file
      - '_quarto.yaml'
 permissions:
  checks: write
  contents: write
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -1,90 +0,0 @@
 {
  "tests": [
    {
      "name": "quick_smoke_test_sft",
      "input": {
        "user_id": "user",
        "model_id": "llama-test",
        "run_id": "llama-test",
        "credentials": {
          "wandb_api_key": "",
          "hf_token": ""
        },
        "args": {
          "base_model": "HuggingFaceTB/SmolLM2-135M",
          "model_type": "AutoModelForCausalLM",
          "tokenizer_type": "AutoTokenizer",
          "load_in_4bit": true,
          "strict": false,
          "datasets": [
            {
              "path": "mhenrichsen/alpaca_2k_test",
              "type": "alpaca",
              "split": "train[:10%]"
            }
          ],
          "val_set_size": 0.02,
          "output_dir": "./outputs/lora-out",
          "sequence_len": 4096,
          "sample_packing": true,
          "eval_sample_packing": false,
          "pad_to_sequence_len": true,
          "adapter": "qlora",
          "lora_r": 32,
          "lora_alpha": 64,
          "lora_dropout": 0.05,
          "lora_target_linear": true,
          "lora_modules_to_save": [
            "embed_tokens",
            "lm_head"
          ],
          "gradient_accumulation_steps": 2,
          "micro_batch_size": 1,
          "num_epochs": 1,
          "optimizer": "adamw_torch_fused",
          "lr_scheduler": "cosine",
          "learning_rate": 0.0002,
          "train_on_inputs": false,
          "group_by_length": false,
          "bf16": "auto",
          "tf32": true,
          "gradient_checkpointing": true,
          "logging_steps": 1,
          "flash_attention": true,
          "warmup_steps": 1,
          "evals_per_epoch": 1,
          "eval_max_new_tokens": 128,
          "saves_per_epoch": 1,
          "weight_decay": 0.0,
          "special_tokens": {
            "pad_token": "<|endoftext|>"
          },
          "max_steps": 20
        }
      },
      "timeout": 100000
    }
  ],
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
 }
--- a/README.md
+++ b/README.md
@@ -51,7 +51,7 @@ Features:
 - NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
 - Python 3.11
- PyTorch ≥2.5.1
+- PyTorch ≥2.4.1
 ### Installation
--- a/requirements.txt
+++ b/requirements.txt
@@ -18,7 +18,7 @@ accelerate==1.6.0
 datasets==3.5.0
 deepspeed>=0.15.4
 trl==0.17.0
-hf_xet==1.1.0
+hf_xet==1.0.0
 hqq==0.2.5
 optimum==1.16.2
--- a/setup.py
+++ b/setup.py
@@ -67,11 +67,13 @@ def parse_requirements(extras_require_map):
            if (major, minor) >= (2, 7):
                _install_requires.pop(_install_requires.index(xformers_version))
                # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
                extras_require_map["vllm"] = ["vllm==0.8.5"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append(
                    "xformers==0.0.29.post2"
                )  # vllm needs post2 w torch 2.6
                extras_require_map["vllm"] = ["vllm==0.8.5"]
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
@@ -145,7 +147,7 @@ extras_require = {
        "ray[train]",
    ],
    "vllm": [
-        "vllm==0.8.5",
+        "vllm==0.7.2",
    ],
    "llmcompressor": [
        "llmcompressor==0.5.1",
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -2,7 +2,4 @@
 import os
 from axolotl.logging_config import configure_logging
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
 configure_logging()
--- a/src/axolotl/cli/checks.py
+++ b/src/axolotl/cli/checks.py
@@ -8,6 +8,9 @@ from accelerate.commands.config import config_args
 from huggingface_hub import HfApi
 from huggingface_hub.utils import LocalTokenNotFoundError
 from axolotl.logging_config import configure_logging
 configure_logging()
 LOG = logging.getLogger(__name__)
--- a/src/axolotl/cli/config.py
+++ b/src/axolotl/cli/config.py
@@ -5,7 +5,6 @@ import logging
 import os
 import tempfile
 from pathlib import Path
 from tempfile import NamedTemporaryFile
 from typing import Union
 from urllib.parse import urlparse
@@ -159,9 +158,7 @@ def plugin_set_cfg(cfg: DictDefault):
        plugin_manager.cfg = cfg
-def load_cfg(
+def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs) -> DictDefault:
    config: str | Path | DictDefault = Path("examples/"), **kwargs
 ) -> DictDefault:
    """
    Loads the `axolotl` configuration stored at `config`, validates it, and performs
    various setup.
@@ -173,24 +170,13 @@ def load_cfg(
    Returns:
        `DictDefault` mapping configuration keys to values.
    """
-    if isinstance(config, (str, Path)):
+    config = check_remote_config(config)
-        config = check_remote_config(config)
+    if Path(config).is_dir():
-        if Path(config).is_dir():
+        config = choose_config(Path(config))
            config = choose_config(Path(config))
-        # Load the config from the yaml file
+    # Load the config from the yaml file
-        with open(config, encoding="utf-8") as file:
+    with open(config, encoding="utf-8") as file:
-            cfg: DictDefault = DictDefault(yaml.safe_load(file))
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
        cfg.axolotl_config_path = config
    else:
        cfg = config
        with NamedTemporaryFile(
            mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
        ) as temp_file:
            temp_file.write(yaml.dump(config.to_dict()))
            temp_file.close()
        cfg.axolotl_config_path = temp_file.name
    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
@@ -204,6 +190,8 @@ def load_cfg(
            else:
                cfg[k] = kwargs[k]
    cfg.axolotl_config_path = config
    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
--- a/src/axolotl/cli/utils.py
+++ b/src/axolotl/cli/utils.py
@@ -20,9 +20,11 @@ from transformers import (
    ProcessorMixin,
 )
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
 configure_logging()
 LOG = logging.getLogger(__name__)
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -47,7 +47,7 @@ def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
 def load_datasets(
    *,
    cfg: DictDefault,
-    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
+    cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
 ) -> TrainDatasetMeta:
    """
    Loads one or more training or evaluation datasets, calling
@@ -64,8 +64,7 @@ def load_datasets(
    tokenizer = load_tokenizer(cfg)
    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
    preprocess_iterable = (
-        cli_args
+        hasattr(cli_args, "iterable")
        and hasattr(cli_args, "iterable")
        and cli_args.iterable is not None
        and cli_args.iterable
    )
@@ -77,7 +76,7 @@ def load_datasets(
        preprocess_iterable=preprocess_iterable,
    )
-    if cli_args and (
+    if (
        cli_args.debug
        or cfg.debug
        or cli_args.debug_text_only
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -488,7 +488,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        # these are all the "standard" kwargs that are def used
        training_arguments_kwargs["max_steps"] = (
-            self.cfg.max_steps if self.cfg.max_steps else -1
+            total_num_steps if self.cfg.max_steps else -1
        )
        training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len
        training_arguments_kwargs["per_device_train_batch_size"] = (
--- a/src/axolotl/core/trainers/dpo/trainer.py
+++ b/src/axolotl/core/trainers/dpo/trainer.py
@@ -177,8 +177,12 @@ class AxolotlDPOTrainer(RngLoaderMixin, SchedulerMixin, DPOTrainer):
            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
            if res["chosen_input_ids"][0] == processing_class.bos_token_id:
                res["chosen_input_ids"] = res["chosen_input_ids"][1:]
                res["chosen_labels"] = res["chosen_labels"][1:]
                res["chosen_attention_mask"] = res["chosen_attention_mask"][1:]
            if res["rejected_input_ids"][0] == processing_class.bos_token_id:
                res["rejected_input_ids"] = res["rejected_input_ids"][1:]
                res["rejected_labels"] = res["rejected_labels"][1:]
                res["rejected_attention_mask"] = res["rejected_attention_mask"][1:]
        return res
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -63,7 +63,6 @@ class GRPOStrategy:
        grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
        grpo_args_kwargs["log_completions"] = trl.log_completions
        grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print
        if trl.reward_weights:
            grpo_args_kwargs["reward_weights"] = trl.reward_weights
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -11,6 +11,7 @@ from accelerate.logging import get_logger
 from datasets import Dataset
 from transformers.trainer import Trainer
 from axolotl.logging_config import configure_logging
 from axolotl.train import (
    TrainDatasetMeta,
    setup_model_and_tokenizer,
@@ -23,6 +24,7 @@ project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 configure_logging()
 LOG = get_logger(__name__)
--- a/src/axolotl/integrations/liger/init.py
+++ b/src/axolotl/integrations/liger/init.py
@@ -151,30 +151,6 @@ class LigerPlugin(BasePlugin):
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3":
            from axolotl.integrations.liger.models.qwen3 import (
                apply_liger_kernel_to_qwen3,
            )
            apply_liger_kernel_to_qwen3(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3_moe":
            from axolotl.integrations.liger.models.qwen3_moe import (
                apply_liger_kernel_to_qwen3_moe,
            )
            apply_liger_kernel_to_qwen3_moe(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        else:
            logging.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
--- a/src/axolotl/integrations/liger/models/qwen3.py
+++ b/src/axolotl/integrations/liger/models/qwen3.py
@@ -1,160 +0,0 @@
 """
 Liger FLCE for Qwen3. Based on transformers v4.51.3.
 """
 import sys
 from typing import Optional, Tuple, Union
 import torch
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 from transformers.cache_utils import Cache
 from transformers.modeling_outputs import CausalLMOutputWithPast
 def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Replacement `forward` for the Qwen3 causal LM that computes the training loss with
    `LigerForCausalLMLoss` from the hidden states and `lm_head` weight, leaving `logits`
    as `None` on that path. Outside of training (or without labels) the logits are
    computed through `self.lm_head` as usual.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
            Only used on the branch that materializes logits.
        **kwargs:
            Forwarded to `self.model` and also to whichever loss function is called.

        All remaining arguments are passed through to `self.model` unchanged;
        `output_attentions` and `output_hidden_states` fall back to the values on
        `self.config` when `None`.

    Returns:
        `CausalLMOutputWithPast` holding `loss`, `logits` and the `past_key_values`,
        `hidden_states` and `attentions` attributes of the `self.model` output.
        `loss` is `None` when `labels` is `None`; `logits` is `None` when the fused
        training path was taken.
    """
    # pylint: disable=duplicate-code
    # Explicit arguments win; otherwise use the defaults stored on the model config.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )
    hidden_states = outputs[0]
    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        # The loss is computed from the hidden states and the lm_head weight directly,
        # so `logits` stays None on this branch.
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )
    else:  # if in inference mode materialize logits
        # With the default `logits_to_keep=0` this is `slice(0, None)`, i.e. every
        # position is kept; a tensor is used as the index as-is.
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            # Evaluation with labels: regular loss over the materialized logits.
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
 def apply_liger_kernel_to_qwen3(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
    # pylint: disable=duplicate-code
    """
    Apply Liger kernels to replace the original implementations in the HuggingFace Qwen3
    modeling module (`transformers.models.qwen3.modeling_qwen3`).

    Each enabled flag monkeypatches one attribute; nothing is patched when all flags are False.

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, `Qwen3ForCausalLM.forward` is replaced with
            `lce_forward`, which does not materialize logits when training with labels.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
        **kwargs: Accepted and ignored.

    Raises:
        AssertionError: If both `cross_entropy` and `fused_linear_cross_entropy` are True.
    """
    # Imported for its side effect: makes sure the module is present in `sys.modules`
    # before it is looked up below.
    import transformers.models.qwen3.modeling_qwen3  # noqa: F401  # pylint: disable=unused-import
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]
    if rms_norm:
        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm
    if glu_activation:
        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP
    if layer_norm:
        # NOTE(review): `modeling_qwen3.nn` is presumably `torch.nn`, in which case this
        # replaces `LayerNorm` for every user of `torch.nn`, not just Qwen3 - confirm.
        modeling_qwen3.nn.LayerNorm = LigerLayerNorm
    if cross_entropy:
        from transformers.loss.loss_utils import nn
        # NOTE(review): same caveat as above - this sets `cross_entropy` on the shared
        # `nn.functional` object that `loss_utils` imported; verify the intended scope.
        nn.functional.cross_entropy = liger_cross_entropy
    if fused_linear_cross_entropy:
        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward
--- a/src/axolotl/integrations/liger/models/qwen3_moe.py
+++ b/src/axolotl/integrations/liger/models/qwen3_moe.py
@@ -1,191 +0,0 @@
 """
 Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
 """
 import sys
 from copy import deepcopy
 from typing import List, Optional, Union
 import torch
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
 from transformers.modeling_outputs import MoeCausalLMOutputWithPast
 from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func
 def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
 ) -> MoeCausalLMOutputWithPast:
    r"""
    Replacement `forward` for the Qwen3 MoE causal LM that computes the training loss with
    `LigerForCausalLMLoss` from the hidden states and `lm_head` weight, leaving `logits`
    as `None` on that path, and optionally adds the router load-balancing loss.

    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
            Only used on the branch that materializes logits.
        output_router_logits (`bool`, *optional*):
            When truthy (after falling back to `self.config.output_router_logits`), the
            load-balancing auxiliary loss is computed from `outputs.router_logits`.
        **kwargs:
            Forwarded to `self.model` and also to whichever loss function is called.

        All remaining arguments are passed through to `self.model` unchanged;
        `output_attentions` and `output_hidden_states` fall back to the values on
        `self.config` when `None`.

    Returns:
        `MoeCausalLMOutputWithPast` holding `loss`, `aux_loss`, `logits` and the
        `past_key_values`, `hidden_states` and `attentions` attributes of the `self.model`
        output. `loss` is `None` when `labels` is `None`; `logits` is `None` when the fused
        training path was taken; `aux_loss` is `None` unless router logits were requested.
    """
    # pylint: disable=duplicate-code
    # Explicit arguments win; otherwise use the defaults stored on the model config.
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        **kwargs,
    )
    hidden_states = outputs[0]
    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        # The loss is computed from the hidden states and the lm_head weight directly,
        # so `logits` stays None on this branch.
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )
    else:  # if in inference mode materialize logits
        # With the default `logits_to_keep=0` this is `slice(0, None)`, i.e. every
        # position is kept; a tensor is used as the index as-is.
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            # Evaluation with labels: regular loss over the materialized logits.
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
    aux_loss = None
    if output_router_logits:
        # The auxiliary loss is reported whenever router logits are requested...
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        # ...but it is only folded into `loss` when labels were given, since `loss`
        # is None otherwise.
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device
    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
 def apply_liger_kernel_to_qwen3_moe(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
 ) -> None:
    # pylint: disable=duplicate-code
    """
    Apply Liger kernels to replace the original implementations in the HuggingFace Qwen3 MoE
    modeling module (`transformers.models.qwen3_moe.modeling_qwen3_moe`).

    Each enabled flag monkeypatches one attribute; nothing is patched when all flags are False.

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, `Qwen3MoeForCausalLM.forward` is replaced with
            `lce_forward`, which does not materialize logits when training with labels.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
        **kwargs: Accepted and ignored.

    Raises:
        AssertionError: If both `cross_entropy` and `fused_linear_cross_entropy` are True.
    """
    # Imported for its side effect: makes sure the module is present in `sys.modules`
    # before it is looked up below.
    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401  # pylint: disable=unused-import
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."
    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]
    if rms_norm:
        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm
    if glu_activation:
        # `Qwen3MoeMLP` is replaced by a factory function rather than a class so that an
        # explicit `intermediate_size` argument can be applied to a copy of the config
        # before `LigerSwiGLUMLP` is built.
        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            # A falsy `intermediate_size` (None or 0) leaves the config value untouched.
            if intermediate_size:
                setattr(config, "intermediate_size", intermediate_size)
            return LigerSwiGLUMLP(config, **kwargs)
        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper
    if layer_norm:
        # NOTE(review): `modeling_qwen3_moe.nn` is presumably `torch.nn`, in which case this
        # replaces `LayerNorm` for every user of `torch.nn`, not just Qwen3 MoE - confirm.
        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm
    if cross_entropy:
        from transformers.loss.loss_utils import nn
        # NOTE(review): same caveat as above - this sets `cross_entropy` on the shared
        # `nn.functional` object that `loss_utils` imported; verify the intended scope.
        nn.functional.cross_entropy = liger_cross_entropy
    if fused_linear_cross_entropy:
        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward
--- a/src/axolotl/monkeypatch/attention/ring_attn/patch.py
+++ b/src/axolotl/monkeypatch/attention/ring_attn/patch.py
@@ -12,8 +12,10 @@ import torch
 import torch.distributed as dist
 from accelerate.logging import get_logger
 from axolotl.logging_config import configure_logging
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
 configure_logging()
 LOG = get_logger(__name__)
--- a/src/axolotl/monkeypatch/trainer/init.py
+++ b/src/axolotl/monkeypatch/trainer/init.py
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -30,6 +30,7 @@ from axolotl.core.trainers.mixins.sequence_parallel import (
    SequenceParallelContextManager,
 )
 from axolotl.integrations.base import PluginManager
 from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.freeze import freeze_layers_except
@@ -41,6 +42,7 @@ try:
 except ImportError:
    BetterTransformer = None
 configure_logging()
 LOG = get_logger(__name__)
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -67,7 +67,7 @@ def resolve_dtype(cfg):
        else:
            LOG.debug("bf16 support not detected, disabling for this configuration.")
            cfg.bf16 = False
-            if cfg.fp16 is None and not cfg.float16:
+            if cfg.fp16 is None:
                cfg.fp16 = True
    if cfg.device == "mps":
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -512,17 +512,10 @@ class AxolotlInputConfig(
    @model_validator(mode="before")
    @classmethod
    def hint_sample_packing_padding(cls, data):
-        if data.get("sample_packing"):
+        if data.get("sample_packing") and not data.get("pad_to_sequence_len"):
-            pad_to_sequence_len = data.get("pad_to_sequence_len")
+            LOG.warning(
-            if pad_to_sequence_len is False:
+                "`pad_to_sequence_len: true` is recommended when using sample_packing"
-                LOG.warning(
+            )
                    "`pad_to_sequence_len: true` is recommended when using sample_packing"
                )
            elif pad_to_sequence_len is None:
                LOG.info(
                    "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
                )
                data["pad_to_sequence_len"] = True
        return data
    @model_validator(mode="before")
--- a/src/axolotl/utils/schemas/trl.py
+++ b/src/axolotl/utils/schemas/trl.py
@@ -67,12 +67,6 @@ class TRLConfig(BaseModel):
        default=False,
        json_schema_extra={"description": "Whether to log completions"},
    )
    num_completions_to_print: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of completions to print. If `log_completions` is `True`, this will be the number of completions logged."
        },
    )
    sync_ref_model: bool | None = Field(
        default=False,
        json_schema_extra={
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -597,8 +597,6 @@ def prepare_optim_env(cfg):
        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
    elif cfg.fp16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
    else:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "no"
 def prepare_opinionated_env(cfg):
--- a/tests/patched/test_validation.py
+++ b/tests/patched/test_validation.py
@@ -648,7 +648,7 @@ class TestValidation(BaseValidation):
            DictDefault(
                {
                    "sample_packing": True,
-                    "pad_to_sequence_len": False,
+                    "pad_to_sequence_len": None,
                    "flash_attention": True,
                }
            )
@@ -662,26 +662,6 @@ class TestValidation(BaseValidation):
                for record in self._caplog.records
            )
    def test_packing_autoset(self, minimal_cfg):
        """
        With `sample_packing` enabled and `pad_to_sequence_len` left as `None`,
        `validate_config` should log the auto-set message and return a config whose
        `pad_to_sequence_len` is `True`.
        """
        cfg = (
            DictDefault(
                {
                    "sample_packing": True,
                    "pad_to_sequence_len": None,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )
        # The expected message is checked against records captured at INFO level.
        with self._caplog.at_level(logging.INFO):
            cfg = validate_config(cfg)
            assert any(
                "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
                in record.message
                for record in self._caplog.records
            )
            assert cfg.pad_to_sequence_len is True
    def test_merge_lora_no_bf16_fail(self, minimal_cfg):
        """
        This is assumed to be run on a CPU machine, so bf16 is not supported.
Author	SHA1	Message	Date
Wing Lian	6affbb1f85	move import of llmcompressor to reset session inside test	2025-04-30 18:10:44 -04:00
Wing Lian	0ed4b4c310	make sure to reset the session after each test	2025-04-30 17:21:53 -04:00
Wing Lian	f4a0f496a0	move decorator to test method instead of class	2025-04-30 17:21:53 -04:00
Wing Lian	82b16bd040	split llmcompressor from vllm checks	2025-04-30 17:21:53 -04:00
Wing Lian	fd5c985038	additional fixes for docker and saving compressed	2025-04-30 17:21:53 -04:00
Rahul Tuli	5246aebc04	Fix: Test Signed-off-by: Rahul Tuli <rtuli@redhat.com>	2025-04-30 17:21:53 -04:00
Rahul Tuli	f4bcc71c86	Apply patch from @winglian Signed-off-by: Rahul Tuli <rtuli@redhat.com>	2025-04-30 17:21:53 -04:00
Rahul Tuli	3a9e172272	Add: line about further optimizations using llmcompressor Signed-off-by: Rahul Tuli <rtuli@redhat.com>	2025-04-30 17:21:53 -04:00
Rahul Tuli	372f0e137b	Address Review Comments: * deleted redundant docs/llm_compressor.qmd * incorporated feedback in integration README.md * added llmcompressor integration to docs/custom_integrations.qmd Signed-off-by: Rahul Tuli <rtuli@redhat.com>	2025-04-30 17:21:52 -04:00
Rahul Tuli	17dffec71d	Add: .qmd file	2025-04-30 17:21:52 -04:00
Rahul Tuli	3a8b637598	Tests, Style, Updates	2025-04-30 17:21:52 -04:00
Rahul Tuli	12cd09e6f5	Rebase and updates!	2025-04-30 17:21:52 -04:00
Rahul Tuli	fe82f62248	Add: `llm_compressor` integration documentation	2025-04-30 17:21:52 -04:00
Rahul Tuli	db31d7ad22	Move: LLMCompressorPlugin into it's own submodule	2025-04-30 17:21:52 -04:00
Rahul Tuli	eb7f2aa4b9	Update model config	2025-04-30 17:21:51 -04:00
Rahul Tuli	f80e36ddd2	Use: absolute import	2025-04-30 17:21:51 -04:00
Rahul Tuli	412d2ec6d0	Rename: sft.yaml to sparse-finetuning.yaml	2025-04-30 17:21:51 -04:00
Rahul Tuli	50fc5e6984	Add: llcompressor installable	2025-04-30 17:21:51 -04:00
Rahul Tuli	83a88b745f	Address review comments from @markurtz	2025-04-30 17:21:51 -04:00
Rahul Tuli	8855bb115f	Apply suggestions from @markurtz Co-authored-by: Mark Kurtz <mark.j.kurtz@gmail.com>	2025-04-30 17:21:51 -04:00
Rahul Tuli	ef9543b371	Update llmcompressor version to latest	2025-04-30 17:21:51 -04:00
Rahul Tuli	25e701e885	Revert: TODO's	2025-04-30 17:21:50 -04:00
Rahul Tuli	891a21e599	Use: warning over warn	2025-04-30 17:21:50 -04:00
Rahul Tuli	8beb2f27ad	pre commit hooks	2025-04-30 17:21:50 -04:00
Rahul Tuli	56ba66b60f	Add:llmcompressor instalable	2025-04-30 17:21:50 -04:00
Rahul Tuli	13d4b865d6	Update: review comments!	2025-04-30 17:21:50 -04:00
Rahul Tuli	3da866b2b9	Add: SFTPlugin with llmcompressor	2025-04-30 17:21:50 -04:00