Compare commits
22 Commits
sp-rl-v3 ... llmcompres
| Author | SHA1 | Date |
|---|---|---|
|  | b708a1cc45 |  |
|  | daa9a58f83 |  |
|  | ae7069e15b |  |
|  | 20d48cd617 |  |
|  | e766a730ba |  |
|  | 7dc797860e |  |
|  | ff4904c8c4 |  |
|  | 45b7293793 |  |
|  | 279c7178bc |  |
|  | e73c3709f9 |  |
|  | 33562189f8 |  |
|  | c057a2268f |  |
|  | 9d7a3809b5 |  |
|  | b7b24d6a64 |  |
|  | 8b82b8f7a1 |  |
|  | 81da58c0a1 |  |
|  | 2cd5a234a7 |  |
|  | 8c1af0747d |  |
|  | a06b360d99 |  |
|  | 0f6456a14f |  |
|  | 47a333ce49 |  |
|  | f9d6776c28 |  |
@@ -1,7 +1,5 @@
 codecov:
   require_ci_to_pass: yes
-  notify:
-    wait_for_ci: true
 
 coverage:
   precision: 2
@@ -49,7 +49,8 @@ sections = [
     ("Knowledge Distillation (KD)", "kd"),
     ("Liger Kernels", "liger"),
     ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum")
+    ("Spectrum", "spectrum"),
+    ("LLMCompressor", "llm_compressor")
 ]
 
 for section_name, folder_name in sections:
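The generator script is only visible down to the loop header above, but a hypothetical sketch of how such a `(title, folder)` list is typically consumed looks like this; the paths, filenames, and output format here are assumptions, not the actual script.

```python
# Hypothetical docs-generation loop driven by a (title, folder) list.
from pathlib import Path

sections = [
    ("Spectrum", "spectrum"),
    ("LLMCompressor", "llm_compressor"),
]

for section_name, folder_name in sections:
    src = Path("docs") / folder_name / "README.md"   # assumed source layout
    if not src.exists():
        continue
    out = Path("docs") / "generated" / f"{folder_name}.md"  # assumed output location
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_text(f"# {section_name}\n\n{src.read_text()}")
```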
examples/llama-3/sparse-finetuning.yaml (new file, 77 lines)
@@ -0,0 +1,77 @@
+base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
+
+plugins:
+  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
+
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.05
+output_dir: ./outputs/out
+
+sequence_len: 4096
+sample_packing: true
+pad_to_sequence_len: true
+eval_sample_packing: false
+
+wandb_project:
+wandb_entity:
+wandb_watch:
+wandb_name:
+wandb_log_model:
+
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 1
+optimizer: paged_adamw_8bit
+lr_scheduler: cosine
+learning_rate: 2e-5
+
+train_on_inputs: false
+group_by_length: false
+bf16: auto
+fp16:
+tf32: false
+
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+early_stopping_patience:
+resume_from_checkpoint:
+logging_steps: 1
+xformers_attention:
+flash_attention: true
+
+warmup_steps: 100
+evals_per_epoch: 2
+eval_table_size:
+saves_per_epoch: 1
+debug:
+deepspeed:
+weight_decay: 0.0
+fsdp:
+fsdp_config:
+special_tokens:
+  pad_token: <|end_of_text|>
+
+llmcompressor:
+  recipe:
+    finetuning_stage:
+      finetuning_modifiers:
+        ConstantPruningModifier:
+          targets: [
+            're:.*q_proj.weight',
+            're:.*k_proj.weight',
+            're:.*v_proj.weight',
+            're:.*o_proj.weight',
+            're:.*gate_proj.weight',
+            're:.*up_proj.weight',
+            're:.*down_proj.weight',
+          ]
+          start: 0
+  save_compressed: true
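The `llmcompressor` recipe above pins the existing 2:4 sparsity of the base model while finetuning: `ConstantPruningModifier` targets the projection weights so their zeroed entries stay zero. A conceptual sketch of that idea in plain PyTorch (this is not llmcompressor's implementation, just an illustration of "constant pruning"): capture the zero mask once, then re-apply it after every optimizer step.

```python
# Conceptual sketch of constant pruning during finetuning (plain PyTorch).
import torch
import torch.nn as nn

layer = nn.Linear(8, 8, bias=False)
with torch.no_grad():
    layer.weight[:, ::2] = 0.0           # stand-in for an existing sparsity pattern
mask = (layer.weight != 0).detach()       # frozen mask of the pretrained sparse weights

opt = torch.optim.AdamW(layer.parameters(), lr=2e-5)
for _ in range(3):
    loss = layer(torch.randn(4, 8)).pow(2).mean()
    loss.backward()
    opt.step()
    opt.zero_grad()
    with torch.no_grad():
        layer.weight.mul_(mask)           # zeroed weights stay zero, as the recipe intends
```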
setup.py (3 changes)
@@ -149,6 +149,9 @@ extras_require = {
     "vllm": [
         "vllm==0.7.2",
     ],
+    "llmcompressor": [
+        "llmcompressor==0.5.1",
+    ],
 }
 
 install_requires, dependency_links, extras_require_build = parse_requirements(
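A minimal sketch of how an `extras_require` entry like the one added above is wired into setuptools; the surrounding `setup()` call is abbreviated and the distribution name is assumed for illustration.

```python
# Abbreviated setup.py sketch showing an optional dependency group.
from setuptools import setup

extras_require = {
    "vllm": ["vllm==0.7.2"],
    "llmcompressor": ["llmcompressor==0.5.1"],
}

setup(
    name="axolotl",                 # assumed distribution name
    extras_require=extras_require,  # users opt in with: pip install "axolotl[llmcompressor]"
)
```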
@@ -14,7 +14,6 @@ from axolotl.utils.data import prepare_dataset
 from axolotl.utils.data.rl import load_prepare_preference_datasets
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_processor, load_tokenizer
-from axolotl.utils.schemas.enums import RLType
 from axolotl.utils.tokenization import check_dataset_labels
 
 LOG = logging.getLogger(__name__)
@@ -126,7 +125,7 @@ def load_preference_datasets(
     total_num_steps: Optional[int] = int(
         math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
     )
-    if cfg.rl is RLType.GRPO:
+    if cfg.rl == "grpo":
         total_num_steps = None
 
     if cli_args.debug or cfg.debug:
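For reference, the step count above is plain ceiling division over the preference dataset; a worked example with illustrative numbers:

```python
# Worked example of total_num_steps = ceil(rows * epochs / batch_size).
import math

num_rows, num_epochs, batch_size = 10_000, 3, 64
total_num_steps = int(math.ceil(num_rows * num_epochs / batch_size))
print(total_num_steps)  # 469; for GRPO the code instead leaves this as None
```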
@@ -84,7 +84,7 @@ from axolotl.utils.collators import (
 )
 from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
 from axolotl.utils.models import ensure_dtype
-from axolotl.utils.schemas.enums import CustomSupportedOptimizers, RLType
+from axolotl.utils.schemas.enums import CustomSupportedOptimizers
 
 try:
     import torch._dynamo  # pylint: disable=ungrouped-imports
@@ -538,6 +538,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
         report_to = []
         if self.cfg.use_wandb:
             report_to.append("wandb")
+            if self.cfg.wandb_name:
+                training_arguments_kwargs["run_name"] = self.cfg.wandb_name
         if self.cfg.use_mlflow:
             report_to.append("mlflow")
         if self.cfg.use_tensorboard:
@@ -930,6 +932,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
            collator = DataCollatorForSeq2Seq
 
        kwargs["return_tensors"] = "pt"
+       if issubclass(collator, DataCollatorForSeq2Seq):
+           kwargs["sequence_parallel_degree"] = training_args.sequence_parallel_degree
+           kwargs["ring_attn_func"] = training_args.ring_attn_func
 
        return collator(
            *collator_args,
@@ -1009,8 +1014,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             training_args_kwargs["dataloader_prefetch_factor"] = (
                 self.cfg.dataloader_prefetch_factor
             )
-        if self.cfg.seed:
-            training_args_kwargs["seed"] = self.cfg.seed
         if self.cfg.gradient_checkpointing:
             training_args_kwargs["gradient_checkpointing"] = (
                 self.cfg.gradient_checkpointing
@@ -1048,13 +1051,9 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         if self.cfg.rpo_alpha is not None:
             training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha
 
-        training_args_kwargs["sequence_parallel_degree"] = (
-            self.cfg.sequence_parallel_degree
-        )
-
         training_args_cls = None
         blocklist_args_kwargs = []
-        if self.cfg.rl is RLType.SIMPO:
+        if self.cfg.rl == "simpo":
             training_args_cls = AxolotlCPOConfig
             training_args_kwargs["loss_type"] = "simpo"
             training_args_kwargs["max_length"] = self.cfg.sequence_len
@@ -1062,13 +1061,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             if self.cfg.cpo_alpha is not None:
                 training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
 
-        elif self.cfg.rl is RLType.ORPO:
+        elif self.cfg.rl == "orpo":
             training_args_cls = AxolotlORPOConfig
             training_args_kwargs["max_length"] = self.cfg.sequence_len
             if self.cfg.max_prompt_len:
                 training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
 
-        elif self.cfg.rl is RLType.KTO:
+        elif self.cfg.rl == "kto":
             training_args_cls = AxolotlKTOConfig
 
             training_args_kwargs["desirable_weight"] = (
@@ -1082,14 +1081,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             if self.cfg.max_prompt_len:
                 training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
 
-        elif self.cfg.rl is RLType.GRPO:
+        elif self.cfg.rl == "grpo":
             training_args_cls = GRPOStrategy.get_training_args_class()
             training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
             blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
 
         else:
             training_args_cls = AxolotlDPOConfig
-            if self.cfg.rl is RLType.IPO:
+            if self.cfg.rl == "ipo":
                 training_args_kwargs["loss_type"] = "ipo"
                 training_args_kwargs["max_length"] = self.cfg.sequence_len
                 training_args_kwargs["max_completion_length"] = None
@@ -1126,33 +1125,33 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
 
     def build(self, total_num_steps):
         training_args = self.build_training_arguments(total_num_steps)
-        trainer_kwargs = {}
-        if self.cfg.rl is RLType.IPO:
+        dpo_trainer_kwargs = {}
+        if self.cfg.rl == "ipo":
             if self.cfg.dpo_label_smoothing:
-                trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
+                dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
         if self.eval_dataset:
-            trainer_kwargs["eval_dataset"] = self.eval_dataset
+            dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
         if self.cfg.adapter and self.peft_config:
-            trainer_kwargs["peft_config"] = self.peft_config
+            dpo_trainer_kwargs["peft_config"] = self.peft_config
         if self.cfg.precompute_ref_log_probs is not None:
-            trainer_kwargs["precompute_ref_log_probs"] = (
+            dpo_trainer_kwargs["precompute_ref_log_probs"] = (
                 self.cfg.precompute_ref_log_probs
             )
-        if self.cfg.rl is RLType.GRPO:
+        if self.cfg.rl == "grpo":
             trainer_cls = GRPOStrategy.get_trainer_class()
             trainer_cls_args = [self.model]
             trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
-            trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
-        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
+            dpo_trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
+        elif self.cfg.rl in ["dpo", "ipo"]:
             trainer_cls = DPOStrategy.get_trainer_class()
             trainer_cls_args = [self.model, self.model_ref]
-        elif self.cfg.rl is RLType.ORPO:
+        elif self.cfg.rl == "orpo":
             trainer_cls = AxolotlORPOTrainer
             trainer_cls_args = [self.model]
-        elif self.cfg.rl is RLType.KTO:
+        elif self.cfg.rl in ["kto"]:
             trainer_cls = AxolotlKTOTrainer
             trainer_cls_args = [self.model]
-        elif self.cfg.rl is RLType.SIMPO:
+        elif self.cfg.rl in ["simpo"]:
             trainer_cls = AxolotlCPOTrainer
             trainer_cls_args = [self.model]
         else:
@@ -1160,33 +1159,33 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
 
         sig = inspect.signature(trainer_cls)
         if "tokenizer" in sig.parameters.keys():
-            trainer_kwargs["tokenizer"] = self.tokenizer
+            dpo_trainer_kwargs["tokenizer"] = self.tokenizer
         else:
-            trainer_kwargs["processing_class"] = self.tokenizer
+            dpo_trainer_kwargs["processing_class"] = self.tokenizer
 
         if self.cfg.datasets is not None and (
             trainer_cls is DPOStrategy.get_trainer_class()
         ):
-            trainer_kwargs["dataset_tags"] = [
+            dpo_trainer_kwargs["dataset_tags"] = [
                 d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
             ]
-        trainer = trainer_cls(
+        dpo_trainer = trainer_cls(
             *trainer_cls_args,
             args=training_args,
             train_dataset=self.train_dataset,
             callbacks=self.get_callbacks(),
-            **trainer_kwargs,
+            **dpo_trainer_kwargs,
         )
         if self.cfg.fsdp:
-            ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
-            if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
-                ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)
+            ensure_dtype(dpo_trainer.model, dtype=self.cfg.torch_dtype)
+            if self.cfg.rl in ["dpo", "ipo"] and dpo_trainer.ref_model:
+                ensure_dtype(dpo_trainer.ref_model, dtype=self.cfg.torch_dtype)
 
-        trainer = self.hook_post_create_trainer(trainer)
-        for callback in self.get_post_trainer_create_callbacks(trainer):
-            trainer.add_callback(callback)
+        dpo_trainer = self.hook_post_create_trainer(dpo_trainer)
+        for callback in self.get_post_trainer_create_callbacks(dpo_trainer):
+            dpo_trainer.add_callback(callback)
 
-        return trainer
+        return dpo_trainer
 
 
 class HFPPOTrainerBuilder(TrainerBuilderBase):
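Several hunks above differ only in how `cfg.rl` is matched: one side compares against `RLType` enum members with `is`, the other against plain strings. A minimal sketch of how the two styles behave, assuming `RLType` is a `str`-backed `Enum` as the mixed usage suggests:

```python
# Minimal sketch, assuming RLType is a str-backed Enum.
from enum import Enum

class RLType(str, Enum):
    DPO = "dpo"
    GRPO = "grpo"

cfg_rl = RLType.GRPO
raw = "grpo"
print(cfg_rl is RLType.GRPO)  # True: identity works between enum members
print(cfg_rl == "grpo")       # True: str-backed members also compare equal to raw strings
print(raw == RLType.GRPO)     # True for the same reason
print(raw is RLType.GRPO)     # False: a plain string is never the enum member itself
```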
@@ -371,15 +371,13 @@ class AxolotlTrainer(
                 num_items_in_batch=num_items_in_batch,
             )
 
-        loss = super().compute_loss(
+        return super().compute_loss(
             model,
             inputs,
             return_outputs=return_outputs,
             num_items_in_batch=num_items_in_batch,
         )
 
-        return loss
-
     @staticmethod
     def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
         concatenated_batch = {}
@@ -3,7 +3,6 @@ DPO Specific Strategy for training
 """
 
 from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer
-from axolotl.utils.schemas.enums import RLType
 
 
 class DPOStrategy:
@@ -24,7 +23,7 @@ class DPOStrategy:
     @classmethod
     def set_training_args_kwargs(cls, cfg):
         training_args_kwargs = {}
-        if cfg.rl is RLType.IPO:
+        if cfg.rl == "ipo":
             training_args_kwargs["loss_type"] = "ipo"
             training_args_kwargs["max_length"] = cfg.sequence_len
             training_args_kwargs["max_completion_length"] = None
@@ -11,4 +11,6 @@ from axolotl.core.training_args import AxolotlTrainingMixins
 
 @dataclass
 class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
-    """Axolotl GRPO Config for GRPO training"""
+    """
+    Axolotl GRPO Config for GRPO training
+    """
@@ -1,124 +0,0 @@
-"""
-Repeat random sampler (akin to the one implemented in
-https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
-sequence parallelism functionality; i.e., duplicating data across ranks in the same
-sequencee parallel group.
-"""
-
-from typing import Sized
-
-import torch
-from torch.utils.data import Sampler
-
-
-class SequenceParallelRepeatRandomSampler(Sampler):
-    """
-    Sampler for GRPO training with sequence parallelism that ensures:
-    1. Ranks in the same sequence parallel group receive identical data
-    2. Each index is repeated multiple times for sampling different completions
-    3. Entire batches are repeated for reuse in multiple updates
-    """
-
-    def __init__(
-        self,
-        dataset: Sized,
-        mini_repeat_count: int,
-        world_size: int,
-        rank: int,
-        batch_size: int = 1,
-        repeat_count: int = 1,
-        sequence_parallel_degree: int = 1,
-        shuffle: bool = True,
-        seed: int = 0,
-        drop_last: bool = False,
-    ):
-        self.dataset = dataset
-        self.mini_repeat_count = mini_repeat_count
-        self.batch_size = batch_size
-        self.repeat_count = repeat_count
-        self.shuffle = shuffle
-        self.seed = seed
-        self.drop_last = drop_last
-        self.epoch = 0
-
-        self.world_size = world_size
-        self.rank = rank
-
-        # Sequence parallelism parameters
-        self.sequence_parallel_degree = sequence_parallel_degree
-        self.num_sp_groups = world_size // sequence_parallel_degree
-        self.sp_group_id = rank // sequence_parallel_degree
-
-        # Adjust dataset size for distributed sampling
-        self.num_samples = len(self.dataset)
-        self.total_size = self.num_samples
-
-        # Calculate effective number of samples per SP group
-        if (
-            self.drop_last
-            and self.total_size % (self.num_sp_groups * self.batch_size) != 0
-        ):
-            # Drop last incomplete batch if drop_last is True
-            self.num_samples_per_sp_group = (
-                self.total_size // self.batch_size // self.num_sp_groups
-            ) * self.batch_size
-        else:
-            # Round up to include last batch if drop_last is False
-            self.num_samples_per_sp_group = (
-                (self.total_size + self.batch_size * self.num_sp_groups - 1)
-                // (self.batch_size * self.num_sp_groups)
-                * self.batch_size
-            )
-
-    def __iter__(self):
-        # Deterministically shuffle based on epoch and seed
-        if self.shuffle:
-            # Use same seed for all ranks in the same SP group
-            g = torch.Generator()
-            seed_value = self.seed + self.epoch + self.sp_group_id * 10000
-            g.manual_seed(seed_value)
-            indices = torch.randperm(len(self.dataset), generator=g).tolist()
-        else:
-            indices = list(range(len(self.dataset)))
-
-        # Add extra samples to make it evenly divisible by batch_size
-        if len(indices) % self.batch_size != 0:
-            padding = indices[: self.batch_size - len(indices) % self.batch_size]
-            indices += padding
-
-        # Subsample based on SP group ID
-        # Each SP group gets distinct batches of data
-        batch_indices = []
-        for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
-            start_idx = i + self.sp_group_id * self.batch_size
-            end_idx = min(start_idx + self.batch_size, len(indices))
-            if start_idx < len(indices):
-                for j in range(self.batch_size):
-                    if start_idx + j < end_idx:
-                        batch_indices.append(indices[start_idx + j])
-
-        # Make sure batch_indices is exactly batch_size * num_batches_per_sp_group
-        if self.drop_last:
-            num_batches_per_sp_group = self.num_samples_per_sp_group // self.batch_size
-            target_len = self.batch_size * num_batches_per_sp_group
-            if len(batch_indices) > target_len:
-                batch_indices = batch_indices[:target_len]
-
-        # Apply the GRPO repeat pattern
-        final_indices = []
-        for _ in range(self.repeat_count):
-            for idx in batch_indices:
-                for _ in range(self.mini_repeat_count):
-                    final_indices.append(idx)
-
-        return iter(final_indices)
-
-    def __len__(self):
-        # Total length including all repetitions
-        return (
-            self.num_samples_per_sp_group * self.mini_repeat_count * self.repeat_count
-        )
-
-    def set_epoch(self, epoch):
-        """Sets the epoch for this sampler"""
-        self.epoch = epoch
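For reference, a usage sketch of the sampler removed above, assuming the class definition is in scope; the numbers are illustrative. With 4 ranks split into 2 sequence-parallel groups and 4 generations per prompt, each selected index is repeated 4 times in a row, and both ranks of a group iterate the identical stream.

```python
# Usage sketch for SequenceParallelRepeatRandomSampler (class defined above).
dataset = list(range(12))
sampler = SequenceParallelRepeatRandomSampler(
    dataset=dataset,
    mini_repeat_count=4,          # num_generations in GRPO
    world_size=4,
    rank=0,
    batch_size=2,
    repeat_count=1,               # num_iterations
    sequence_parallel_degree=2,   # ranks 0-1 and 2-3 form the two SP groups
    shuffle=False,
)
print(list(sampler))
# [0, 0, 0, 0, 1, 1, 1, 1, 4, 4, 4, 4, 5, 5, 5, 5, 8, 8, 8, 8, 9, 9, 9, 9]
```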
@@ -1,279 +1,26 @@
-"""Axolotl GRPO trainer"""
-
-# pylint: disable=too-many-lines,duplicate-code
-
-import warnings
+"""
+Axolotl GRPO trainer
+"""
+
 from contextlib import nullcontext
-from typing import Any
 
-import datasets
-import torch
-import torch.distributed as dist
-from accelerate.utils import (
-    broadcast_object_list,
-    gather,
-    gather_object,
-    is_peft_model,
-)
-from datasets import Dataset, IterableDataset
-from torch import nn
-from torch.utils.data import (
-    BatchSampler,
-    DataLoader,
-    Sampler,
-)
-from transformers import (
-    PreTrainedModel,
-    PreTrainedTokenizerBase,
-    Trainer,
-    TrainerCallback,
-    is_wandb_available,
-)
-from transformers.trainer_utils import seed_worker
-from transformers.utils import is_peft_available
+from accelerate.utils import is_deepspeed_available, is_peft_model
 from trl import GRPOTrainer
-from trl.data_utils import (
-    apply_chat_template,
-    is_conversational,
-    maybe_apply_chat_template,
-)
-from trl.extras.profiling import profiling_context, profiling_decorator
-from trl.import_utils import (
-    is_deepspeed_available,
-    is_rich_available,
-)
-from trl.models import (
-    unwrap_model_for_generation,
-)
-from trl.trainer.grpo_config import GRPOConfig
-from trl.trainer.grpo_trainer import RewardFunc
-from trl.trainer.utils import (
-    pad,
-    print_prompt_completions_sample,
-    selective_log_softmax,
-)
+from trl.extras.profiling import profiling_decorator
 
-from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
-from axolotl.monkeypatch.attention.ring_attn.patch import get_ring_attn_group
-
-if is_peft_available():
-    # pylint: disable=unused-import
-    from peft import PeftConfig
 
 if is_deepspeed_available():
     import deepspeed
 
-if is_wandb_available():
-    import wandb
-
 
 class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
-    """Extend the base GRPOTrainer for axolotl helpers"""
+    """
+    Extend the base GRPOTrainer for axolotl helpers
+    """
 
     _tag_names = ["trl", "grpo", "axolotl"]
 
-    def __init__(
-        self,
-        model: str | PreTrainedModel,
-        reward_funcs: RewardFunc | list[RewardFunc],
-        args: GRPOConfig | None = None,
-        train_dataset: Dataset | IterableDataset | None = None,
-        eval_dataset: (
-            Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None
-        ) = None,
-        processing_class: PreTrainedTokenizerBase | None = None,
-        reward_processing_classes: (
-            PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None
-        ) = None,
-        callbacks: list[TrainerCallback] | None = None,
-        optimizers: tuple[
-            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
-        ] = (None, None),
-        peft_config: "PeftConfig | None" = None,
-    ):
-        # First call the superclass constructor with all arguments
-        super().__init__(
-            model=model,
-            reward_funcs=reward_funcs,
-            args=args,
-            train_dataset=train_dataset,
-            eval_dataset=eval_dataset,
-            processing_class=processing_class,
-            reward_processing_classes=reward_processing_classes,
-            callbacks=callbacks,
-            optimizers=optimizers,
-            peft_config=peft_config,
-        )
-
-        # Now execute your custom logic
-        # Get number of SP groups (number of processes divided by SP degree)
-        num_processes = self.accelerator.num_processes
-        num_sp_groups = num_processes // self.args.sequence_parallel_degree
-
-        # Calculate batch size per SP group (not per process)
-        sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
-        possible_values = [
-            n_gen
-            for n_gen in range(2, sp_group_batch_size + 1)
-            if (sp_group_batch_size) % n_gen == 0
-        ]
-
-        if self.num_generations not in possible_values:
-            raise ValueError(
-                f"The batch size per SP group ({num_sp_groups} x "
-                f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
-                f"the number of generations per prompt ({self.num_generations}). Given "
-                "the current configuration, the valid values for the number of "
-                f"generations are: {possible_values}."
-            )
-
-        if self.args.eval_strategy != "no":
-            # If sequence parallelism is enabled, calculate batch size per SP group
-            sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups  # type: ignore[union-attr]
-            possible_values = [
-                n_gen
-                for n_gen in range(2, sp_group_eval_batch_size + 1)
-                if (sp_group_eval_batch_size) % n_gen == 0
-            ]
-
-            if self.num_generations not in possible_values:
-                raise ValueError(
-                    f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), "
-                    f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
-                    f"must be evenly divisible by the number of generations per prompt "
-                    f"({self.num_generations}). Given the current eval batch size, "
-                    f"the valid values for the number of generations are: {possible_values}."
-                )
-
-        # Initialize the SP group
-        self.sp_group = get_ring_attn_group()
-        self.local_rank = dist.get_rank(group=self.sp_group)
-        self.local_world_size = dist.get_world_size(group=self.sp_group)
-
-        print("end of trainer init")
-
-    def _get_train_sampler(self) -> Sampler:
-        # Get distributed training info
-        world_size = dist.get_world_size()
-        rank = dist.get_rank()
-
-        effective_batch_size = (
-            self.args.per_device_train_batch_size
-            * world_size
-            * self.args.gradient_accumulation_steps
-        )
-
-        return SequenceParallelRepeatRandomSampler(
-            dataset=self.train_dataset,
-            mini_repeat_count=self.num_generations,
-            world_size=world_size,
-            rank=rank,
-            batch_size=effective_batch_size
-            // self.num_generations
-            // self.args.sequence_parallel_degree,
-            repeat_count=self.num_iterations,
-            sequence_parallel_degree=self.args.sequence_parallel_degree,
-            shuffle=True,
-            seed=self.args.seed,
-            drop_last=True,
-        )
-
-    def _create_dataloader_params(self, is_eval=False, custom_batch_size=None):
-        """Create common dataloader parameters for train or eval."""
-        batch_size = custom_batch_size or (
-            self.args.eval_batch_size if is_eval else self._train_batch_size
-        )
-
-        params = {
-            "batch_size": batch_size,
-            "collate_fn": self.data_collator,
-            "num_workers": self.args.dataloader_num_workers,
-            "pin_memory": self.args.dataloader_pin_memory,
-        }
-
-        # Add persistent workers only for training
-        if not is_eval and hasattr(self.args, "dataloader_persistent_workers"):
-            params["persistent_workers"] = self.args.dataloader_persistent_workers
-
-        # Add prefetch factor if specified
-        if self.args.dataloader_prefetch_factor:
-            params["prefetch_factor"] = self.args.dataloader_prefetch_factor
-
-        return params
-
-    def _prepare_dataloader(
-        self, dataset, sampler, is_eval=False, custom_batch_size=None
-    ):
-        """Prepare a dataloader with the given dataset and sampler."""
-        # Get base parameters
-        dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size)
-
-        # Add sampler configuration
-        if not isinstance(dataset, torch.utils.data.IterableDataset):
-            if isinstance(sampler, BatchSampler):
-                # batch_size and batch_sampler are mutually exclusive
-                dataloader_params["batch_sampler"] = sampler
-                del dataloader_params["batch_size"]
-            else:
-                dataloader_params["sampler"] = sampler
-                dataloader_params["drop_last"] = self.args.dataloader_drop_last
-
-            if not is_eval:
-                dataloader_params["worker_init_fn"] = seed_worker
-
-        # Create the dataloader
-        dataloader = DataLoader(dataset, **dataloader_params)
-
-        if self.args.sample_packing and (
-            (not is_eval and not self.args.pretraining)
-            or (is_eval and self.args.eval_sample_packing is not False)
-        ):
-            self.accelerator.even_batches = False
-
-        # Return unprepared dataloader if using sequence parallelism
-        # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
-        # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
-        # slice each batch along the sequence dimension).
-        if self.args.sequence_parallel_degree > 1:
-            return dataloader
-
-        # Otherwise prepare with accelerator
-        return self.accelerator.prepare_data_loader(dataloader)
-
-    def get_train_dataloader(self) -> DataLoader:
-        """Get dataloader for training"""
-        train_dataset = self.train_dataset
-        # pylint: disable=access-member-before-definition
-        data_collator = self.data_collator  # type: ignore
-
-        # Initialize SP group attributes if sequence parallelism is enabled
-        if self.args.sequence_parallel_degree > 1:
-            self.sp_group = get_ring_attn_group()
-            self.local_rank = dist.get_rank(group=self.sp_group)
-            self.local_world_size = dist.get_world_size(group=self.sp_group)
-
-        # Handle dataset preprocessing
-        if isinstance(train_dataset, datasets.Dataset):
-            # Add debug print before any modifications
-            if self.args.sample_packing and not self.args.pretraining:
-                train_dataset = train_dataset.remove_columns(["length"])
-            if not self.args.sample_packing or self.args.pretraining:
-                train_dataset = self._remove_unused_columns(
-                    train_dataset, description="training"
-                )
-        else:
-            self.data_collator = self._get_collator_with_removed_columns(  # pylint: disable=attribute-defined-outside-init
-                data_collator,
-                description="training",
-            )
-
-        # Get sampler and create dataloader
-        sampler = self._get_train_sampler()
-        dataloader = self._prepare_dataloader(train_dataset, sampler, is_eval=False)
-
-        return dataloader
-
     @profiling_decorator
     def _move_model_to_vllm(self):
         # For DeepSpeed ZeRO-3, we need to gather all parameters before operations
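The removed `__init__` above enforced that the effective generation batch per sequence-parallel group is divisible by `num_generations`. A standalone restatement of that check with illustrative numbers:

```python
# Divisibility check from the removed __init__, restated with concrete numbers.
num_processes = 8
sequence_parallel_degree = 2
per_device_train_batch_size = 4
num_generations = 8

num_sp_groups = num_processes // sequence_parallel_degree            # 4
sp_group_batch_size = per_device_train_batch_size * num_sp_groups    # 16
valid = [n for n in range(2, sp_group_batch_size + 1) if sp_group_batch_size % n == 0]
assert num_generations in valid, f"choose num_generations from {valid}"
print(valid)  # [2, 4, 8, 16]
```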
@@ -320,577 +67,3 @@ class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
         # Reset cache on main process
         if self.accelerator.is_main_process:
             self.vllm_client.reset_prefix_cache()
-
-    # [removed: roughly 570 lines of commented-out code — the disabled sequence-parallel
-    #  drafts of _generate_and_score_completions, _get_per_token_logps_v2, and
-    #  compute_loss; the captured listing is truncated partway through this block]
|
||||||
# else per_token_logps.detach()
|
|
||||||
# )
|
|
||||||
# coef_1 = torch.exp(per_token_logps - old_per_token_logps)
|
|
||||||
# coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
|
|
||||||
# per_token_loss1 = coef_1 * advantages.unsqueeze(1)
|
|
||||||
# per_token_loss2 = coef_2 * advantages.unsqueeze(1)
|
|
||||||
# per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
|
|
||||||
|
|
||||||
# if self.beta != 0.0:
|
|
||||||
# per_token_loss = per_token_loss + self.beta * per_token_kl
|
|
||||||
|
|
||||||
# loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()
|
|
||||||
|
|
||||||
# # Log metrics
|
|
||||||
# mode = "eval" if self.control.should_evaluate else "train"
|
|
||||||
|
|
||||||
# if self.beta != 0.0:
|
|
||||||
# mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum()
|
|
||||||
# self._metrics[mode]["kl"].append(
|
|
||||||
# self.accelerator.gather_for_metrics(mean_kl).mean().item()
|
|
||||||
# )
|
|
||||||
|
|
||||||
# is_clipped = (per_token_loss1 < per_token_loss2).float()
|
|
||||||
# clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum()
|
|
||||||
# self._metrics[mode]["clip_ratio"].append(
|
|
||||||
# self.accelerator.gather_for_metrics(clip_ratio).mean().item()
|
|
||||||
# )
|
|
||||||
|
|
||||||
# return loss
|
|
||||||
|
|||||||
@@ -6,4 +6,4 @@
from .optimizer import OptimizerMixin
from .rng_state_loader import RngLoaderMixin
from .scheduler import SchedulerMixin
from .sequence_parallel import SequenceParallelContextManager, SequenceParallelMixin
from .sequence_parallel import SequenceParallelMixin
@@ -1,144 +1,16 @@
"""
"""Module for Axolotl trainer sequence parallelism mixin"""
Module for Axolotl trainer sequence parallelism mixin and training context manager
"""

import functools
import logging

import torch
import torch.distributed as dist
from datasets import Dataset
from torch import nn
from torch.utils.data import DistributedSampler, Sampler
from torch.utils.hooks import RemovableHandle

from axolotl.monkeypatch.attention.ring_attn import (
from axolotl.monkeypatch.attention.ring_attn import get_ring_attn_group
get_ring_attn_group,
update_ring_attn_params,
)
from axolotl.utils.schemas.enums import RingAttnFunc

LOG = logging.getLogger(__name__)


def _handle_logits_to_keep(
logits_to_keep,
local_rank: int,
local_world_size: int,
ring_attn_func: RingAttnFunc,
total_seq_len: int,
):
"""
Handle logits_to_keep parameter for sequence parallelism.

Args:
logits_to_keep: Integer or tensor indicating which positions to compute logits
for.
local_rank: Rank in the sequence parallel group.
local_world_size: World size of the sequence parallel group.
ring_attn_func: Ring attention function being used.
total_seq_len: Full sequence length.

Returns:
Adjusted logits_to_keep appropriate for this rank's sharded sequence
"""
print("start of _handle_logits_to_keep")
print(dist.get_rank(), logits_to_keep)

# No transformation needed if logits_to_keep is None
if logits_to_keep is None:
return None

assert isinstance(
logits_to_keep, int
), "sequence parallelism currently only supports integer logits_to_keep"
assert ring_attn_func in [
RingAttnFunc.VARLEN_LLAMA3,
RingAttnFunc.BATCH_RING,
], "if specifying logits_to_keep, sequence parallelism currently only supports 'batch_ring' and 'varlen_llama3' `ring_attn_func`s"

# For standard sharding, each rank gets a contiguous chunk
chunk_size = total_seq_len // local_world_size
start_idx = local_rank * chunk_size
end_idx = start_idx + chunk_size

# Check if logits_to_keep is in this rank's range
if start_idx <= logits_to_keep < end_idx:
print("end of _handle_logits_to_keep")
print(dist.get_rank(), logits_to_keep - start_idx)
return logits_to_keep - start_idx
else:
print("end of _handle_logits_to_keep")
print(dist.get_rank(), -1)
return -1


def apply_sequence_parallelism(
batch: dict[str, torch.Tensor],
local_rank: int,
local_world_size: int,
ring_attn_func: RingAttnFunc,
) -> dict[str, torch.Tensor]:
"""
Apply sequence parallelism slicing to a batch.

Args:
batch: Batch dictionary (e.g., input_ids, attention_mask, etc.).
local_rank: Local rank in the sequence parallel group.
local_world_size: World size of the sequence parallel group.
ring_attn_func: The ring attention function to use.

Returns:
Sliced batch dictionary.
"""
# Update ring attention params if needed
if batch.get("position_ids") is not None:
update_ring_attn_params(position_ids=batch["position_ids"])

# Slice batch for sequence parallel processing
total_seq_len = batch["input_ids"].size(1)
for key in batch:
if (
isinstance(batch[key], torch.Tensor)
and batch[key].dim() > 1
and batch[key].size(1) == total_seq_len
):
if ring_attn_func in [
RingAttnFunc.VARLEN_LLAMA3,
RingAttnFunc.BATCH_RING,
]:
# Split in sequential fashion and grab this rank's chunk
batch[key] = (
batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
)
elif ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
chunks = batch[key].chunk(2 * local_world_size, dim=1)

# Take rank's chunk and opposing chunk for zigzag pattern
selected_chunks = [
chunks[local_rank],
chunks[2 * local_world_size - local_rank - 1],
]
batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
elif ring_attn_func is RingAttnFunc.BATCH_STRIPE:
# Split into striped data and stack
tensor = torch.stack(
batch[key].split(local_world_size, dim=1),
dim=1,
).transpose(1, 2)
batch[key] = tensor[:, local_rank].contiguous()
if key == "logits_to_keep":
batch[key] = _handle_logits_to_keep(
logits_to_keep=batch[key],
local_rank=local_rank,
local_world_size=local_world_size,
ring_attn_func=ring_attn_func,
total_seq_len=total_seq_len,
)

return batch


class SequenceParallelMixin:
"""
Mixin class for sequence parallelism support in trainers.
@@ -215,160 +87,3 @@ class SequenceParallelMixin:
return self._create_sequence_parallel_sampler(
eval_dataset, shuffle=False, is_eval=True
)


class SequenceParallelContextManager:
"""
Context manager for sequence parallelism operations.

This class provides a context that will automatically apply sequence parallelism
during model forward passes using a pre-forward hook, and gather outputs from
across the sequence parallelism group using a post-forward hook.
"""

def __init__(
self,
model: nn.Module,
sequence_parallel_degree: int,
ring_attn_func: RingAttnFunc,
):
self.model = model
self.sequence_parallel_degree = sequence_parallel_degree
self.ring_attn_func = ring_attn_func
self.process_group = get_ring_attn_group()

# Initialize sequence parallel group details
self.local_rank = dist.get_rank(self.process_group)
self.local_world_size = dist.get_world_size(self.process_group)

# Will store hook handles for removal
self.hook_handles: list[RemovableHandle] = []

# Create a partially applied version of the apply_sequence_parallelism function
# with pre-configured params
self.apply_sequence_parallelism = functools.partial(
apply_sequence_parallelism,
local_rank=self.local_rank,
local_world_size=self.local_world_size,
ring_attn_func=self.ring_attn_func,
)

def __enter__(self):
# Forward pre-hook to apply sequence parallelism
def sequence_parallel_pre_hook(_, args, kwargs):
# Apply sequence parallelism to kwargs
kwargs = self.apply_sequence_parallelism(batch=kwargs)
return args, kwargs

# Forward post-hook to gather outputs
def sequence_parallel_post_hook(_, __, output):
print("start of sequence_parallel_post_hook")
# Gather the sharded outputs
output = self.gather_outputs(output)
print("end of sequence_parallel_post_hook")
return output

# Register both hooks
self.hook_handles.append(
self.model.register_forward_pre_hook(
sequence_parallel_pre_hook, with_kwargs=True
)
)
self.hook_handles.append(
self.model.register_forward_hook(sequence_parallel_post_hook)
)

return self

def __exit__(self, exc_type, exc_val, exc_tb):
# Remove all hooks
for handle in self.hook_handles:
handle.remove()
self.hook_handles = []

def gather_outputs(self, output):
"""Gather sharded outputs from all ranks and reconstruct the full tensor."""
# Handle different output formats (dict, tensor, etc.)
if isinstance(output, dict):
gathered_output = {}
for key, value in output.items():
if isinstance(value, torch.Tensor) and value.dim() > 1:
# Gather logits or other sequence-sharded tensors
gathered_value = self.gather_tensor(value)
gathered_output[key] = gathered_value
else:
gathered_value = value.clone()
dist.all_reduce(
gathered_value, op=dist.ReduceOp.SUM, group=self.process_group
)
gathered_output[key] = gathered_value
return gathered_output
if isinstance(output, torch.Tensor):
return self.gather_tensor(output)

return output

def gather_tensor(self, tensor):
"""Gather a sharded tensor from all ranks."""
# Prepare tensors for all_gather
world_size = self.local_world_size

# Create list to store tensors from all ranks
gathered_tensors = [torch.zeros_like(tensor) for _ in range(world_size)]

# All-gather operation
dist.all_gather(gathered_tensors, tensor, group=self.process_group)

# Concatenate along sequence dimension (typically dim=1)
if self.ring_attn_func in [RingAttnFunc.VARLEN_LLAMA3, RingAttnFunc.BATCH_RING]:
# Simple concatenation for standard sharding
return torch.cat(gathered_tensors, dim=1)

if self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
# Each rank has a pattern of (rank, world_size*2-rank-1)
reconstituted_tensors = [None] * (world_size * 2)

# First, split each gathered tensor into its two chunks
for rank, gathered_tensor in enumerate(gathered_tensors):
# Each tensor contains two chunks in the sequence dimension
chunk_size = gathered_tensor.size(1) // 2
chunk1, chunk2 = gathered_tensor.split(chunk_size, dim=1)

# Place chunks in their original positions
reconstituted_tensors[rank] = chunk1
reconstituted_tensors[world_size * 2 - rank - 1] = chunk2

# Concatenate the reconstituted tensors in the correct order
return torch.cat(reconstituted_tensors, dim=1)

# Otherwise, RingAttnFunc.BATCH_STRIPE
# In striping, each rank has every world_size-th slice
batch_size = tensor.size(0)
hidden_dim = tensor.size(-1)

# First, determine the full sequence length
total_seq_len = 0
for t in gathered_tensors:
total_seq_len += t.size(1)

# Create a tensor to hold the unstriped result
result = torch.zeros(
batch_size,
total_seq_len,
hidden_dim,
dtype=tensor.dtype,
device=tensor.device,
)

# For each rank's tensor, distribute its slices to the correct positions
for rank, gathered_tensor in enumerate(gathered_tensors):
# The rank's tensor contains every world_size-th slice
# starting from its rank position
seq_len = gathered_tensor.size(1)
for i in range(seq_len):
# Calculate the position in the full tensor
pos = i * world_size + rank
if pos < total_seq_len:
result[:, pos] = gathered_tensor[:, i]

return result
@@ -9,7 +9,7 @@ from PIL.Image import Resampling
from transformers import TrainingArguments
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig

from axolotl.utils.schemas.enums import RingAttnFunc
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc


@dataclass
108
src/axolotl/integrations/llm_compressor/README.md
Normal file
@@ -0,0 +1,108 @@

# LLMCompressor Integration

Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).

This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.

It uses Axolotl's plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
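The core idea is simply to keep pruned weights at zero: record which weights are zero before training starts, then re-apply that mask after every optimizer step so gradient updates cannot revive them. The sketch below is illustrative only — the plugin delegates this to LLMCompressor's `ConstantPruningModifier` rather than implementing it by hand:

```python
import torch
from torch import nn


def capture_sparsity_masks(model: nn.Module) -> dict[str, torch.Tensor]:
    """Record which weights are zero before training starts."""
    return {
        name: (param != 0).detach()
        for name, param in model.named_parameters()
        if param.dim() >= 2
    }


@torch.no_grad()
def reapply_sparsity_masks(model: nn.Module, masks: dict[str, torch.Tensor]) -> None:
    """Zero out any pruned weights that an optimizer step may have revived."""
    for name, param in model.named_parameters():
        if name in masks:
            param.mul_(masks[name])


# Illustrative training-loop placement (not the plugin's actual code path):
#   masks = capture_sparsity_masks(model)
#   for batch in dataloader:
#       loss = model(**batch).loss
#       loss.backward()
#       optimizer.step()
#       optimizer.zero_grad()
#       reapply_sparsity_masks(model, masks)
```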

---

## Requirements

- Axolotl with `llmcompressor` extras:

  ```bash
  pip install "axolotl[llmcompressor]"
  ```

- Requires `llmcompressor >= 0.5.1`

This will install all necessary dependencies to fine-tune sparsified models using the integration.
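A quick way to confirm the installed version meets the minimum (assuming a standard pip environment):

```python
from importlib.metadata import version

print(version("llmcompressor"))  # expect >= 0.5.1
```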

---

## Usage

To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:

```yaml
plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin

llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
# ... (other training arguments)
```

This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.

Pre-sparsified checkpoints can be:
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
- Any custom LLM with compatible sparsity patterns that you've created yourself

To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)

### Storage Optimization with save_compressed

Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
- Reduces disk space usage by approximately 40%
- Maintains compatibility with vLLM for accelerated inference
- Maintains compatibility with llmcompressor for further optimization (example: quantization)

This option is highly recommended when working with sparse models to maximize the benefits of model compression.
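Under the hood, the compressed save path is just a patched `save_pretrained` call. A minimal sketch of what happens at save time (mirroring this PR's `save_compressed_model` utility; the model and output paths are placeholders):

```python
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    modify_save_pretrained,
)
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("path/to/your/sparse/model")

# Patch save_pretrained so it accepts compressed-tensors keyword arguments
modify_save_pretrained(model)
model.save_pretrained(
    "path/to/output",                       # placeholder output directory
    save_compressed=True,                   # write compressed tensors
    skip_sparsity_compression_stats=False,  # recompute sparsity stats on save
)
```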

### Example Config

See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.

---

## Inference with vLLM

After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
You can also use LLMCompressor to apply additional quantization to your fine-tuned
sparse model before inference for even greater performance benefits:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM("path/to/your/sparse/model")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).

## Learn More

For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:

[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
5
src/axolotl/integrations/llm_compressor/__init__.py
Normal file
@@ -0,0 +1,5 @@
"""Integration entry point for the LLMCompressor plugin."""

from .plugin import LLMCompressorPlugin

__all__ = ["LLMCompressorPlugin"]
40
src/axolotl/integrations/llm_compressor/args.py
Normal file
@@ -0,0 +1,40 @@
"""
LLMCompressor and Sparse Finetuning config models.
"""

from typing import Any

from pydantic import BaseModel, Field
from typing_extensions import Annotated


class CompressionArgs(BaseModel):
"""Sparse Finetuning config for LLMCompressor."""

# Typing for recipe is set to Any due to:
# https://github.com/vllm-project/llm-compressor/issues/1319
recipe: Annotated[
Any,
Field(
description="The recipe containing the compression algorithms and hyperparameters to apply."
),
]

save_compressed: Annotated[
bool,
Field(
default=False,
description="Whether to save the compressed model after training.",
),
]


class LLMCompressorArgs(BaseModel):
"""LLMCompressor configuration BaseModel."""

llmcompressor: Annotated[
CompressionArgs,
Field(
description="Arguments enabling compression pathways through the LLM Compressor plugins"
),
]
171
src/axolotl/integrations/llm_compressor/plugin.py
Normal file
@@ -0,0 +1,171 @@
"""
Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
by maintaining masks for zero weights during training.
"""

import logging
from functools import wraps
from typing import Any, Callable, Concatenate, ParamSpec, TypeVar

from llmcompressor import active_session, create_session
from llmcompressor.core import callbacks as session_callbacks
from llmcompressor.recipe import Recipe
from torch.nn import Module
from transformers.trainer import Trainer
from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

from axolotl.integrations.base import BasePlugin

P = ParamSpec("P")  # Params for generic function signatures
R = TypeVar("R")  # Return type for generic function signatures

LOG = logging.getLogger("axolotl.integrations.llm_compressor")


class LLMCompressorCallbackHandler(TrainerCallback):
"""
Trainer callback for Sparse Finetuning.
Maintains sparsity patterns during training by applying masks after optimization steps,
ensuring zero-weight updates are canceled out.
"""

def __init__(self, trainer: Trainer, recipe: Any):
"""
Initialize the Sparse Finetuning callback handler.

Args:
trainer (Trainer): Huggingface Trainer instance.
recipe (Recipe | dict): Sparse finetuning recipe to apply.
"""
super().__init__()
self.trainer = trainer
self.recipe = (
Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
)
self.original_compute_loss = trainer.compute_loss
self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
create_session()

def on_train_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
) -> None:
"""
Called at the beginning of training. Initializes the compression session.

Args:
args (TrainingArguments): Training arguments.
state (TrainerState): Trainer state.
control (TrainerControl): Trainer control.
"""
super().on_train_begin(args, state, control, **kwargs)
self.trainer.accelerator.wait_for_everyone()
active_session().initialize(
model=self.trainer.model,
optimizer=self.trainer.optimizer,
start=state.epoch,
recipe=self.recipe,
)
self.trainer.accelerator.wait_for_everyone()

def on_step_begin(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
) -> None:
"""
Called at the beginning of a training step. Triggers batch_start callback.
"""
super().on_step_begin(args, state, control, **kwargs)
session_callbacks.batch_start()

def on_step_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
) -> None:
"""
Called at the end of a training step. Triggers optimizer and batch_end callbacks.
"""
super().on_step_end(args, state, control, **kwargs)
session_callbacks.optim_pre_step()
session_callbacks.optim_post_step()
session_callbacks.batch_end()

def on_train_end(
self,
args: TrainingArguments,
state: TrainerState,
control: TrainerControl,
**kwargs,
) -> None:
"""
Called at the end of training. Finalizes the compression session.
"""
super().on_train_end(args, state, control, **kwargs)
active_session().finalize()
self.trainer.compute_loss_func = self.original_compute_loss


class LLMCompressorPlugin(BasePlugin):
"""
Sparse Finetuning plugin for Axolotl integration.
"""

def get_input_args(self) -> str:
"""
Returns the path to the plugin's argument definition.

Returns:
str: Dotted path to the LLMCompressorArgs class.
"""
return "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"

def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
"""
Adds Sparse Finetuning callback to the Trainer instance.

Args:
cfg (Any): Configuration object containing the sparse recipe.
trainer (Trainer): Huggingface Trainer instance.

Returns:
list: List containing the configured callback instances.
"""
LOG.info("Adding Sparse Finetuning callback to the trainer")
callback = LLMCompressorCallbackHandler(
trainer=trainer,
recipe=cfg.llmcompressor.recipe,
)
return [callback]


def compute_loss_wrapper(
compute_loss_func: Callable[Concatenate[Module, P], R],
) -> Callable[Concatenate[Module, P], R]:
"""
Wraps the loss computation function to trigger the loss_calculated callback.

Args:
compute_loss_func (Callable): Original loss computation function.

Returns:
Callable: Wrapped function that also invokes the loss_calculated callback.
"""

@wraps(compute_loss_func)
def compute_and_notify(model: Module, *args: P.args, **kwargs: P.kwargs) -> R:
loss = compute_loss_func(model, *args, **kwargs)
if active_session().lifecycle.initialized_ and model.training:
session_callbacks.loss_calculated(loss=loss)
return loss

return compute_and_notify
40
src/axolotl/integrations/llm_compressor/utils.py
Normal file
@@ -0,0 +1,40 @@
"""Utilities for llmcompressor integration with axolotl."""

from typing import Union

from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
modify_save_pretrained,
)
from transformers import PreTrainedModel, Trainer


def save_compressed_model(
model: PreTrainedModel,
output_dir: Union[str, bytes],
trainer: Trainer,
safe_serialization: bool = False,
save_compressed: bool = False,
) -> None:
"""
Synchronize processes, apply compression hooks, and save the model.

Args:
model (PreTrainedModel): The model to be saved.
output_dir (str or bytes): Path where the model files will be written.
trainer (Trainer): Hugging Face Trainer for process synchronization.
safe_serialization (bool): Use safe serialization if True.
save_compressed (bool): Write compressed tensors if True.
"""
trainer.accelerator.wait_for_everyone()

# Only the main process writes the files
if not trainer.accelerator.is_main_process:
return

modify_save_pretrained(model)
model.save_pretrained(
output_dir,
safe_serialization=safe_serialization,
save_compressed=save_compressed,
skip_sparsity_compression_stats=not save_compressed,
)
@@ -4,6 +4,7 @@
# flake8: noqa

from .patch import (
RingAttnFunc,
get_ring_attn_group,
register_ring_attn,
set_ring_attn_group,

@@ -28,7 +28,7 @@ from transformers.modeling_flash_attention_utils import (
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

from axolotl.utils.schemas.enums import RingAttnFunc
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc

RING_ATTN_FUNC_MAPPING = {
RingAttnFunc.BATCH_RING: ring_flash_attn_func,

@@ -6,13 +6,14 @@ package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patc
their sequence parallel version of Flash Attention 2.
"""

from enum import Enum

import torch
import torch.distributed as dist
from accelerate.logging import get_logger

from axolotl.logging_config import configure_logging
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
from axolotl.utils.schemas.enums import RingAttnFunc

configure_logging()
LOG = get_logger(__name__)

@@ -42,6 +43,17 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
RING_ATTN_GROUP = ring_attn_group


class RingAttnFunc(str, Enum):
"""Enum class for supported `ring-flash-attn` implementations"""

# VARLEN_RING = "varlen_ring"
# VARLEN_ZIGZAG = "varlen_zigzag"
VARLEN_LLAMA3 = "varlen_llama3"
BATCH_RING = "batch_ring"
BATCH_ZIGZAG = "batch_zigzag"
BATCH_STRIPE = "batch_stripe"


def register_ring_attn(
sequence_parallel_degree: int,
heads_k_stride: int | None,
@@ -6,7 +6,6 @@ import os
import signal
import sys
import weakref
from contextlib import nullcontext
from pathlib import Path
from typing import Any, Dict

@@ -26,15 +25,11 @@ from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
fix_untrained_tokens,
)
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
from axolotl.core.trainers.mixins.sequence_parallel import (
SequenceParallelContextManager,
)
from axolotl.logging_config import configure_logging
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.trainer import setup_trainer

try:

@@ -109,7 +104,7 @@ def setup_reference_model(
Reference model if needed for RL training, `None` otherwise.
"""
model_ref = None
if cfg.rl and cfg.rl != RLType.ORPO:
if cfg.rl and cfg.rl != "orpo":
if cfg.adapter and not cfg.rl_adapter_ref_model:
# use built-in trl autounwrap
LOG.debug("Passing model_ref: None to RL trainer")

@@ -190,28 +185,16 @@ def execute_training(
trainer: The configured trainer object.
resume_from_checkpoint: Path to checkpoint to resume from, if applicable.
"""
# Define the context managers to use
LOG.info("Starting trainer...")
flash_context = (
if cfg.flash_optimum:
torch.backends.cuda.sdp_kernel(
with torch.backends.cuda.sdp_kernel(
# TODO configure these from the YAML w/ sdp_kernel_kwargs: ...
enable_flash=True,
enable_math=True,
enable_mem_efficient=True,
)
):
if cfg.flash_optimum
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
else nullcontext()
else:
)
sequence_parallel_context = (
SequenceParallelContextManager(
model=trainer.model,
sequence_parallel_degree=cfg.sequence_parallel_degree,
ring_attn_func=cfg.ring_attn_func,
)
if cfg.sequence_parallel_degree > 1
else nullcontext()
)

LOG.info("Starting trainer...")
with flash_context, sequence_parallel_context:
trainer.train(resume_from_checkpoint=resume_from_checkpoint)

@@ -288,6 +271,19 @@ def save_trained_model(
os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
except FileNotFoundError:
pass
elif hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
from axolotl.integrations.llm_compressor.utils import (
save_compressed_model,
)

save_compressed_model(
model=model,
output_dir=cfg.output_dir,
trainer=trainer,
safe_serialization=safe_serialization,
save_compressed=cfg.llmcompressor.save_compressed,
)

elif cfg.local_rank == 0:
if cfg.flash_optimum and BetterTransformer:
model = BetterTransformer.reverse(model)

@@ -296,6 +292,7 @@ def save_trained_model(
trainer.model.save_pretrained(
cfg.output_dir, safe_serialization=safe_serialization
)

model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
@@ -1,12 +1,20 @@
"""Data collators for axolotl to pad labels and position_ids for packed sequences"""
"""
Data collators for axolotl to pad labels and position_ids for packed sequences. Also
includes logic for handling sequence parallelism collation.
"""

from dataclasses import dataclass
from typing import Any

import numpy as np
import torch
import torch.distributed as dist
from transformers import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

from axolotl.monkeypatch.attention.ring_attn import update_ring_attn_params
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc


@dataclass
class DataCollatorForSeq2Seq:

@@ -41,6 +49,8 @@ class DataCollatorForSeq2Seq:
The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
return_tensors (`str`):
The type of Tensor to return. Allowable values are "np", "pt" and "tf".
sequence_parallel_degree (`int`):
The degree of sequence parallelism. Default to 1 for no sequence parallelism.
"""

tokenizer: PreTrainedTokenizerBase

@@ -51,6 +61,17 @@ class DataCollatorForSeq2Seq:
label_pad_token_id: int = -100
position_pad_token_id: int = 0
return_tensors: str = "pt"
sequence_parallel_degree: int = 1
ring_attn_func: RingAttnFunc | None = None

def __post_init__(self):
if self.sequence_parallel_degree > 1:
from axolotl.monkeypatch.attention.ring_attn import get_ring_attn_group

# Get information about our position in the SP group
sp_group = get_ring_attn_group()
self.local_rank = dist.get_rank(group=sp_group)
self.local_world_size = dist.get_world_size(group=sp_group)

def __call__(self, features, return_tensors=None):
has_attn_mask = "attention_mask" in features[0].keys()

@@ -120,8 +141,62 @@ class DataCollatorForSeq2Seq:
)
features["decoder_input_ids"] = decoder_input_ids

if self.sequence_parallel_degree > 1:
features = self.apply_sequence_parallelism(features)

return features

def apply_sequence_parallelism(
self, batch: dict[str, torch.Tensor]
) -> torch.Tensor:
"""
Apply sequence parallelism slicing to a batch.

Args:
batch: Batch dictionary from parent collator.

Returns:
Sliced batch dictionary.
"""
# Get local (start, end) for sequence parallelism slicing
total_seq_len = batch["input_ids"].size(1)

# Update params for varlen ring attention calculation
if batch.get("position_ids") is not None:
update_ring_attn_params(position_ids=batch["position_ids"])

# Slice batch for sequence parallel processing
for key in batch:
if batch[key].size(1) == total_seq_len:
if self.ring_attn_func in [
RingAttnFunc.VARLEN_LLAMA3,
RingAttnFunc.BATCH_RING,
]:
batch[key] = (
batch[key]
.chunk(self.local_world_size, dim=1)[self.local_rank]
.contiguous()
)
elif self.ring_attn_func is RingAttnFunc.BATCH_ZIGZAG:
chunks = batch[key].chunk(2 * self.local_world_size, dim=1)

# Take rank's chunk and opposing chunk for zigzag pattern
selected_chunks = [
chunks[self.local_rank],
chunks[2 * self.local_world_size - self.local_rank - 1],
]
batch[key] = torch.cat(selected_chunks, dim=1).contiguous()
elif self.ring_attn_func is RingAttnFunc.BATCH_STRIPE:
# TODO(djsaunde): This doesn't seem to work as expected
# Split into striped data and stack
tensor = torch.stack(
batch[key].split(self.local_world_size, dim=1),
dim=1,
).transpose(1, 2)
batch[key] = tensor[:, self.local_rank].contiguous()

return batch


@dataclass
class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
@@ -126,6 +126,9 @@ def normalize_config(cfg):
with open(ds_config_path, encoding="utf-8") as f:
cfg.deepspeed = json.load(f)

if cfg.sequence_parallel_degree is None:
cfg.sequence_parallel_degree = 1

if cfg.saves_per_epoch:
save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
if save_steps < 1.0:  # prevent saves on every step

@@ -18,9 +18,8 @@ from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_main_process, zero_first
from axolotl.utils.models import load_tokenizer
from axolotl.utils.schemas.enums import RLType

LOG = logging.getLogger(__name__)
LOG = logging.getLogger("axolotl")


def _get_path(ds_hash, cfg):

@@ -81,7 +80,7 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
def drop_long_rl_seq(
sample, rl, tokenizer, sequence_len  # pylint: disable=invalid-name
):
if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO):
if rl in ("dpo", "ipo", "orpo", "simpo"):
if not (
sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
):

@@ -101,7 +100,7 @@ def drop_long_rl_seq(
len_prompt + len_rejected
) <= sequence_len

if rl is RLType.KTO:
if rl == "kto":
if not (sample.get("prompt") and sample.get("completion")):
raise ValueError("Prompt and completion keys are required for KTO datasets")

@@ -115,7 +114,7 @@ def drop_long_rl_seq(

return (len_prompt + len_completion) <= sequence_len

if rl is RLType.GRPO:
if rl == "grpo":
return True

raise ValueError("Unknown RL type")

@@ -138,9 +137,9 @@ def load_prepare_preference_datasets(cfg):
if _type:
if isinstance(_type, DictDefault):
_type = "user_defined.default"
if _cfg.rl is RLType.ORPO:
if _cfg.rl == "orpo":
ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i)
elif _cfg.rl is RLType.KTO:
elif _cfg.rl == "kto":
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
else:
ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)

@@ -151,7 +150,7 @@ def load_prepare_preference_datasets(cfg):
split_datasets[i] = map_dataset(
cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
)
elif _cfg.rl is RLType.KTO:
elif _cfg.rl == "kto":
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
map_kwargs = {}
if isinstance(ds_transform_fn, tuple):
@@ -72,7 +72,6 @@ from axolotl.utils.distributed import (
from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrapper
from axolotl.utils.lora_embeddings import get_linear_embedding_layers
from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant
from axolotl.utils.schemas.enums import RLType

LOG = logging.getLogger(__name__)

@@ -140,6 +139,22 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
hasattr(model_config, "quantization_config")
and model_config.quantization_config
)

# Detect compressed-tensors config
is_compressed_tensors_config = (
quant_config_exists
and model_config.quantization_config.get("quant_method") == "compressed-tensors"
)

if is_compressed_tensors_config:
if model_config.quantization_config.get("config_groups"):
LOG.warning(
"Found `config_groups` in a compressed-tensors config. "
"QAT integration with llmcompressor is not tested."
)
# Skip further quant checks for compressed-tensors
return

quant_config_method_is_gptq = (
quant_config_exists
and "quant_method" in model_config.quantization_config

@@ -1341,7 +1356,7 @@ class ModelLoader:
# then the dpo trainer doesn't want the peft model loaded over it, it just wants the lora/peft config
if (
self.cfg.adapter
and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
and self.cfg.rl in ["dpo", "ipo", "kto"]
and not self.cfg.merge_lora
):
_, lora_config = load_lora(
@@ -18,7 +18,6 @@ from pydantic import (
|
|||||||
)
|
)
|
||||||
from transformers.utils.import_utils import is_torch_npu_available
|
from transformers.utils.import_utils import is_torch_npu_available
|
||||||
|
|
||||||
from axolotl.utils.distributed import is_main_process
|
|
||||||
from axolotl.utils.schemas.datasets import (
|
from axolotl.utils.schemas.datasets import (
|
||||||
DatasetConfig,
|
DatasetConfig,
|
||||||
DPODataset,
|
DPODataset,
|
||||||
@@ -28,7 +27,7 @@ from axolotl.utils.schemas.datasets import (
|
|||||||
StepwiseSupervisedDataset,
|
StepwiseSupervisedDataset,
|
||||||
)
|
)
|
||||||
from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters
|
from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters
|
||||||
from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
|
from axolotl.utils.schemas.enums import ChatTemplate, RLType
|
||||||
from axolotl.utils.schemas.integrations import (
|
from axolotl.utils.schemas.integrations import (
|
||||||
CometConfig,
|
CometConfig,
|
||||||
GradioConfig,
|
GradioConfig,
|
||||||
@@ -260,7 +259,7 @@ class AxolotlInputConfig(
|
|||||||
|
|
||||||
sequence_parallel_degree: int | None = None
|
sequence_parallel_degree: int | None = None
|
||||||
heads_k_stride: int | None = None
|
heads_k_stride: int | None = None
|
||||||
ring_attn_func: RingAttnFunc | None = None
|
ring_attn_func: str | None = None
|
||||||
|
|
||||||
special_tokens: SpecialTokensConfig | None = None
|
special_tokens: SpecialTokensConfig | None = None
|
||||||
tokens: list[str] | None = None
|
tokens: list[str] | None = None
|
||||||
@@ -719,10 +718,9 @@ class AxolotlInputConfig(
|
|||||||
and data.get("eval_sample_packing") is None
|
and data.get("eval_sample_packing") is None
|
||||||
and not data.get("eval_table_size")
|
and not data.get("eval_table_size")
|
||||||
):
|
):
|
||||||
if is_main_process():
|
LOG.info(
|
||||||
LOG.info(
|
"explicitly setting `eval_sample_packing` to match `sample_packing`"
|
||||||
"explicitly setting `eval_sample_packing` to match `sample_packing`"
|
)
|
||||||
)
|
|
||||||
data["eval_sample_packing"] = True
|
data["eval_sample_packing"] = True
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -784,7 +782,7 @@ class AxolotlInputConfig(

     @model_validator(mode="after")
     def check_simpo_warmup(self):
-        if self.rl is RLType.SIMPO and self.warmup_ratio:
+        if self.rl == "simpo" and self.warmup_ratio:
             raise ValueError(
                 "warmup_ratio is not supported with the simpo trainer. Please use `warmup_steps` instead"
             )
@@ -1151,17 +1149,22 @@ class AxolotlInputConfig(

         return data

-    @model_validator(mode="after")
-    def check_sequence_parallel_degree(self):
-        if not self.sequence_parallel_degree:
-            self.sequence_parallel_degree = 1
-        elif self.sequence_parallel_degree > 1:
-            if not self.flash_attention:
+    @field_validator("sequence_parallel_degree", mode="after")
+    @classmethod
+    def check_sequence_parallel_degree(cls, value, info):
+        if not value:
+            value = 1
+
+        if value > 1:
+            if not info.data.get("flash_attention"):
                 raise ValueError(
                     "flash_attention: true must be set with sequence_parallel_degree > 1"
                 )

-            if self.sample_packing and self.micro_batch_size > 1:
+            if (
+                info.data.get("sample_packing")
+                and not info.data["micro_batch_size"] == 1
+            ):
                 raise ValueError(
                     "micro_batch_size must be set to 1 when sample_packing is enabled"
                     "due to a `ring-flash-attn` requirement"
@@ -1179,41 +1182,44 @@ class AxolotlInputConfig(
             # TODO: monkeypatch / callback to average losses correctly across SP ranks
             # / fix gradient scaling across SP ranks. Losses, grads should be scaled
             # according to the proportion of non-padding tokens per rank.
-            if is_main_process():
-                LOG.warning(
-                    "Sequence parallelism (SP) is enabled with "
-                    f"sequence_parallel_degree={self.sequence_parallel_degree}. "
-                    "Please note that logged losses may differ slightly to the non-SP "
-                    "losses due to transformers Trainer implementation details. "
-                    "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
-                    "for more details."
-                )
+            LOG.warning(
+                "Sequence parallelism (SP) is enabled with "
+                f"sequence_parallel_degree={value}. Please note that logged losses may "
+                "differ slightly to the non-SP losses due to transformers Trainer "
+                "implementation details. Please see "
+                "https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
+                "for more details."
+            )

-        return self
+        return value

-    @model_validator(mode="after")
-    def validate_ring_attn_func(self):
-        if getattr(self, "sequence_parallel_degree", 1) == 1:
-            return self
+    @field_validator("ring_attn_func", mode="after")
+    @classmethod
+    def check_ring_attn_func(cls, value, info):
+        if not info.data.get("sequence_parallel_degree", 1) > 1:
+            return value

-        if self.ring_attn_func is not None:
+        from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
+
+        if value is not None:
+            # Set the ring attention function if passed in config
             valid_funcs = list(RingAttnFunc)
-            if self.ring_attn_func in valid_funcs:
-                self.ring_attn_func = RingAttnFunc(self.ring_attn_func)
+            if value in valid_funcs:
+                value = RingAttnFunc(value)
             else:
                 raise ValueError(
-                    f"ring_attn_func: {self.ring_attn_func} must be in {valid_funcs}"
+                    f"ring_attn_func: {value} must be one of {valid_funcs}"
                 )
         else:
             # Default ring attention function selection
-            sample_packing = getattr(self, "sample_packing", False)
-            self.ring_attn_func = (
+            sample_packing = info.data.get("sample_packing")
+            value = (
                 RingAttnFunc.VARLEN_LLAMA3
                 if sample_packing
                 else RingAttnFunc.BATCH_RING
             )

-        return self
+        return value

     @model_validator(mode="before")
     @classmethod
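Note on the validator rewrite above: a minimal, self-contained Pydantic sketch (illustrative only, not part of this diff; the Example model and its fields are hypothetical) of the `field_validator` pattern both rewritten validators now follow. In "after" mode the validator receives only the field's value plus a ValidationInfo whose `.data` holds the fields validated so far, which is why the new code reads `flash_attention` and `sample_packing` through `info.data.get(...)`.

from pydantic import BaseModel, ValidationInfo, field_validator


class Example(BaseModel):
    # hypothetical model for illustration; declaration order matters because
    # info.data only contains fields declared (and validated) earlier
    flash_attention: bool = False
    sequence_parallel_degree: int | None = None

    @field_validator("sequence_parallel_degree", mode="after")
    @classmethod
    def check_degree(cls, value: int | None, info: ValidationInfo) -> int:
        value = value or 1
        if value > 1 and not info.data.get("flash_attention"):
            raise ValueError("flash_attention: true must be set")
        return value


assert Example().sequence_parallel_degree == 1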
@@ -6,12 +6,12 @@ from enum import Enum
 class RLType(str, Enum):
     """RL trainer type configuration subset"""

-    DPO = "dpo"  # pylint: disable=invalid-name
-    GRPO = "grpo"  # pylint: disable=invalid-name
-    IPO = "ipo"  # pylint: disable=invalid-name
-    ORPO = "orpo"  # pylint: disable=invalid-name
-    KTO = "kto"  # pylint: disable=invalid-name
-    SIMPO = "simpo"  # pylint: disable=invalid-name
+    dpo = "dpo"  # pylint: disable=invalid-name
+    grpo = "grpo"  # pylint: disable=invalid-name
+    ipo = "ipo"  # pylint: disable=invalid-name
+    orpo = "orpo"  # pylint: disable=invalid-name
+    kto = "kto"  # pylint: disable=invalid-name
+    simpo = "simpo"  # pylint: disable=invalid-name


 class ChatTemplate(str, Enum):
@@ -53,14 +53,3 @@ class CustomSupportedOptimizers(str, Enum):
     ao_adamw_fp8 = "ao_adamw_fp8"  # pylint: disable=invalid-name
     adopt_adamw = "adopt_adamw"  # pylint: disable=invalid-name
     muon = "muon"  # pylint: disable=invalid-name
-
-
-class RingAttnFunc(str, Enum):
-    """Enum class for supported `ring-flash-attn` implementations"""
-
-    # VARLEN_RING = "varlen_ring"
-    # VARLEN_ZIGZAG = "varlen_zigzag"
-    VARLEN_LLAMA3 = "varlen_llama3"
-    BATCH_RING = "batch_ring"
-    BATCH_ZIGZAG = "batch_zigzag"
-    BATCH_STRIPE = "batch_stripe"
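Side note on the RLType member renames above (illustrative sketch, not from the diff): because RLType subclasses str, only the attribute spelling changes; comparisons against plain strings keep working either way, which is what the `self.cfg.rl in ["dpo", "ipo", "kto"]` style shown earlier relies on.

from enum import Enum


class RLType(str, Enum):
    dpo = "dpo"
    simpo = "simpo"


# str-subclass enum members compare equal to their string values
assert RLType.dpo == "dpo"
assert "dpo" in [RLType.dpo, RLType.simpo]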
@@ -348,7 +348,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
             load_from_cache_file=not cfg.is_preprocess,
             desc="Add position_id column (PoSE)",
         )
-    elif cfg.sample_packing:
+    elif cfg.sample_packing or cfg.sequence_parallel_degree > 1:
         drop_long_kwargs = {}
         if filter_map_kwargs:
             drop_long_kwargs["desc"] = "Add position_id column (Sample Packing)"
@@ -358,7 +358,7 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
             **filter_map_kwargs,
             **drop_long_kwargs,
         )
-        if cfg.eval_sample_packing:
+        if cfg.eval_sample_packing or cfg.sequence_parallel_degree > 1:
             if eval_dataset:
                 eval_dataset = eval_dataset.map(
                     add_position_ids,
tests/e2e/integrations/test_llm_compressor.py (new file, 104 additions)
@@ -0,0 +1,104 @@
+"""
+E2E smoke tests for LLMCompressorPlugin integration
+"""
+
+from pathlib import Path
+
+import pytest
+
+from axolotl.cli.args import TrainerCliArgs
+from axolotl.common.datasets import load_datasets
+from axolotl.train import train
+from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
+from axolotl.utils.dict import DictDefault
+
+from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1
+
+MODELS = [
+    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
+    "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
+]
+
+
+@pytest.mark.parametrize(
+    "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
+)
+@pytest.mark.parametrize(
+    "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
+)
+class TestLLMCompressorIntegration:
+    """
+    e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
+    """
+
+    @require_torch_2_4_1
+    def test_llmcompressor_plugin(
+        self, temp_dir, base_model: str, save_compressed: bool
+    ):
+        # core cfg
+        cfg = DictDefault(
+            {
+                "base_model": base_model,
+                "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
+                "sequence_len": 1024,
+                "val_set_size": 0.05,
+                "special_tokens": {"pad_token": "<|endoftext|>"},
+                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
+                "num_epochs": 1,
+                "micro_batch_size": 2,
+                "gradient_accumulation_steps": 2,
+                "output_dir": temp_dir,
+                "learning_rate": 1e-5,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "save_safetensors": True,
+                "bf16": "auto",
+                "max_steps": 5,
+                "llmcompressor": {
+                    "recipe": {
+                        "finetuning_stage": {
+                            "finetuning_modifiers": {
+                                "ConstantPruningModifier": {
+                                    "targets": [
+                                        "re:.*q_proj.weight",
+                                        "re:.*k_proj.weight",
+                                        "re:.*v_proj.weight",
+                                        "re:.*o_proj.weight",
+                                        "re:.*gate_proj.weight",
+                                        "re:.*up_proj.weight",
+                                        "re:.*down_proj.weight",
+                                    ],
+                                    "start": 0,
+                                },
+                            },
+                        },
+                    },
+                    "save_compressed": save_compressed,
+                },
+            }
+        )
+
+        prepare_plugins(cfg)
+        cfg = validate_config(cfg)
+        normalize_config(cfg)
+        cli_args = TrainerCliArgs()
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+        train(cfg=cfg, dataset_meta=dataset_meta)
+        check_model_output_exists(temp_dir, cfg)
+        _check_llmcompressor_model_outputs(temp_dir, save_compressed)
+
+
+def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
+    # recipe.yaml should exist
+    assert (Path(temp_dir) / "recipe.yaml").exists()
+
+    # sparsity config exists if save_compressed
+    if save_compressed:
+        from compressed_tensors import ModelCompressor
+        from compressed_tensors.config import Sparse24BitMaskConfig
+
+        compressor = ModelCompressor.from_pretrained(temp_dir)
+        assert compressor is not None
+        assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)
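A hedged usage sketch for the new e2e test above (the invocation is illustrative; only the file path and parametrize ids come from the diff): running just the compressed-save variants locally through pytest's Python entry point.

import pytest

# "-k save_compressed" matches the parametrize id declared in the test module,
# so only the save_compressed variants of both base models are collected
exit_code = pytest.main(
    ["tests/e2e/integrations/test_llm_compressor.py", "-k", "save_compressed"]
)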
@@ -1,4 +1,6 @@
-"""E2E tests for mixtral"""
+"""
+E2E tests for mixtral
+"""

 import logging
 import os
@@ -97,7 +99,6 @@ class TestMixtral(unittest.TestCase):
                 "bf16": "auto",
             }
         )
-        cfg = validate_config(cfg)
         normalize_config(cfg)
         cli_args = TrainerCliArgs()
         dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
@@ -2,22 +2,17 @@

 # pylint: disable=redefined-outer-name,unused-argument

-import functools
-import sys
 from unittest.mock import MagicMock, patch

 import pytest
 import torch
 from accelerate.state import PartialState

-from axolotl.core.trainers.mixins.sequence_parallel import apply_sequence_parallelism
 from axolotl.monkeypatch.attention.ring_attn import (
     get_ring_attn_group,
-    register_ring_attn,
     set_ring_attn_group,
 )
 from axolotl.utils.dict import DictDefault
-from axolotl.utils.schemas.enums import RingAttnFunc


 @pytest.fixture
@@ -52,27 +47,6 @@ def fixture_cfg():
     return cfg


-@pytest.fixture
-def sequence_parallel_batch():
-    """Create a test batch for sequence parallelism tests."""
-    batch_size = 1
-    seq_len = 8
-
-    # Create test tensors
-    input_ids = torch.arange(batch_size * seq_len).reshape(batch_size, seq_len)
-    attention_mask = torch.ones(batch_size, seq_len)
-    position_ids = torch.arange(seq_len).expand(batch_size, seq_len)
-
-    # Create test batch
-    batch = {
-        "input_ids": input_ids,
-        "attention_mask": attention_mask,
-        "position_ids": position_ids,
-    }
-
-    return batch
-
-
 class TestRingAttention:
     """Tests for the ring attention functionality."""

@@ -99,6 +73,11 @@ class TestRingAttention:
         self, mock_world_size, mock_rank, mock_new_group, partial_state
     ):
         """Test that ring attention groups are created correctly."""
+        from axolotl.monkeypatch.attention.ring_attn import (
+            RingAttnFunc,
+            register_ring_attn,
+        )
+
         # Setup mocks
         mock_world_size.return_value = 8  # 8 GPUs total
         mock_rank.return_value = 3  # GPU #3
@@ -122,308 +101,88 @@ class TestRingAttention:
         set_ring_attn_group(None)


-class TestConfigValidation:
-    """Tests for validating sequence parallelism configurations."""
-
-    @pytest.fixture(autouse=True)
-    def setup_mocks(self, monkeypatch):
-        """Set up mocks for all tests in this class."""
-        # Mock the ring_flash_attn module
-        monkeypatch.setitem(sys.modules, "ring_flash_attn", MagicMock())
-
-        # Mock the is_main_process function to return True
-        monkeypatch.setattr(
-            "axolotl.utils.schemas.config.is_main_process", lambda: True
-        )
-
-    @pytest.fixture
-    def base_cfg(self):
-        """Create a base configuration for testing."""
-        return DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "learning_rate": 1e-3,
-                "output_dir": "./model-out",
-                "sequence_len": 512,
-                "special_tokens": {"pad_token": "<|endoftext|>"},
-            }
-        )
-
-    @pytest.mark.parametrize(
-        "config_updates, expected_values, should_pass, error_msg",
-        [
-            # Valid configuration
-            (
-                {"sequence_parallel_degree": 2, "flash_attention": True},
-                {"sequence_parallel_degree": 2, "flash_attention": True},
-                True,
-                None,
-            ),
-            # Default sequence_parallel_degree
-            ({}, {"sequence_parallel_degree": 1}, True, None),
-            # Invalid: sequence_parallel_degree > 1 without flash_attention
-            (
-                {"sequence_parallel_degree": 2, "flash_attention": False},
-                None,
-                False,
-                "flash_attention: true must be set",
-            ),
-            # Invalid: sequence_parallel_degree > 1 with sample_packing and micro_batch_size > 1
-            (
-                {
-                    "sequence_parallel_degree": 2,
-                    "flash_attention": True,
-                    "sample_packing": True,
-                    "micro_batch_size": 2,
-                    "pad_to_sequence_len": True,
-                },
-                None,
-                False,
-                "micro_batch_size must be set to 1",
-            ),
-        ],
-        ids=[
-            "valid_config",
-            "default_sp_degree",
-            "without_flash_attention",
-            "sample_packing_with_large_batch",
-        ],
-    )
-    def test_sequence_parallel_config_validation(
-        self, base_cfg, config_updates, expected_values, should_pass, error_msg
-    ):
-        """Test various sequence parallelism configuration scenarios."""
-        from axolotl.utils.schemas.config import AxolotlInputConfig
-
-        # Apply updates to base config
-        cfg = base_cfg
-        cfg.update(config_updates)
-
-        if should_pass:
-            # Should validate without errors
-            config = AxolotlInputConfig(**cfg)
-
-            # Check expected values
-            for key, value in expected_values.items():
-                assert getattr(config, key) == value
-        else:
-            # Should raise exception
-            with pytest.raises(ValueError) as excinfo:
-                AxolotlInputConfig(**cfg)
-            assert error_msg in str(excinfo.value)
-
-    @pytest.mark.parametrize(
-        "ring_attn_func, sample_packing, expected_func",
-        [
-            (None, True, RingAttnFunc.VARLEN_LLAMA3),
-            (None, False, RingAttnFunc.BATCH_RING),
-        ],
-        ids=["default_with_sample_packing", "default_without_sample_packing"],
-    )
-    def test_ring_attn_func_validation(
-        self, base_cfg, ring_attn_func, sample_packing, expected_func
-    ):
-        """Test ring_attn_func validation and defaults."""
-        from axolotl.utils.schemas.config import AxolotlInputConfig
-
-        # Apply updates to base config
-        cfg = base_cfg | {
-            "sequence_parallel_degree": 2,
-            "flash_attention": True,
-            "sample_packing": sample_packing,
-        }
-
-        if ring_attn_func is not None:
-            cfg["ring_attn_func"] = ring_attn_func
-
-        # Should validate without errors
-        config = AxolotlInputConfig(**cfg)
-
-        # Check ring_attn_func value
-        assert config.ring_attn_func.value == expected_func
-
-    def test_invalid_ring_attn_func(self, base_cfg):
-        """Test that an invalid ring_attn_func is rejected."""
-        from axolotl.utils.schemas.config import AxolotlInputConfig
-
-        # Invalid configuration with invalid ring_attn_func
-        cfg = base_cfg | {
-            "sequence_parallel_degree": 2,
-            "flash_attention": True,
-            "ring_attn_func": "INVALID_FUNC",
-        }
-
-        # Should raise ValidationError
-        with pytest.raises(ValueError) as excinfo:
-            AxolotlInputConfig(**cfg)
-
-        # Verify error message
-        assert "ring_attn_func: INVALID_FUNC must be in" in str(excinfo.value)
-
-
-class TestApplySequenceParallelism:
-    """Tests for the apply_sequence_parallelism function."""
-
-    @pytest.fixture(autouse=True)
-    def mock_distributed(self, monkeypatch):
-        """Mock torch.distributed functions for testing."""
-        # Mock is_initialized to return True
-        monkeypatch.setattr(torch.distributed, "is_initialized", lambda: True)
-
-        # Mock get_rank to return 0 by default
-        monkeypatch.setattr(torch.distributed, "get_rank", lambda *args, **kwargs: 0)
-
-        # Mock get_world_size to return 2 by default
-        monkeypatch.setattr(
-            torch.distributed, "get_world_size", lambda *args, **kwargs: 2
-        )
-
-        # Mock the process group
-        monkeypatch.setattr(
-            "axolotl.monkeypatch.attention.ring_attn.get_ring_attn_group",
-            MagicMock,
-        )
-
-        # Mock update_ring_attn_params
-        monkeypatch.setattr(
-            "axolotl.monkeypatch.attention.ring_attn.update_ring_attn_params",
-            lambda **kwargs: None,
-        )
-
-    def test_world_size_one(self, sequence_parallel_batch):
-        """Test that function returns original batch when world size is 1."""
-        result = apply_sequence_parallelism(
-            batch=sequence_parallel_batch,
-            local_rank=0,
-            local_world_size=1,
-            ring_attn_func=RingAttnFunc.BATCH_RING,
-        )
-
-        # Should return the original batch unchanged
-        assert result == sequence_parallel_batch
-
-    def test_batch_ring_rank0(self, sequence_parallel_batch):
-        """Test BATCH_RING sharding for rank 0 in a 2-process group."""
-        batch = sequence_parallel_batch
-        seq_len = batch["input_ids"].size(1)
-
-        result = apply_sequence_parallelism(
-            batch=batch,
-            local_rank=0,
-            local_world_size=2,
-            ring_attn_func=RingAttnFunc.BATCH_RING,
-        )
-
-        # Check that sequence dimension was sharded correctly
-        assert result["input_ids"].shape[1] == seq_len // 2
-        assert result["attention_mask"].shape[1] == seq_len // 2
-
-        # Verify content: rank 0 should get the first half of the sequence
-        assert torch.equal(result["input_ids"], batch["input_ids"][:, : seq_len // 2])
-        assert torch.equal(
-            result["position_ids"], batch["position_ids"][:, : seq_len // 2]
-        )
-
-    def test_batch_ring_rank1(self, sequence_parallel_batch):
-        """Test BATCH_RING sharding for rank 1 in a 2-process group."""
-        batch = sequence_parallel_batch
-        seq_len = batch["input_ids"].size(1)
-        original_input_ids = batch["input_ids"].clone()
-
-        result = apply_sequence_parallelism(
-            batch=batch,
-            local_rank=1,
-            local_world_size=2,
-            ring_attn_func=RingAttnFunc.BATCH_RING,
-        )
-
-        # Verify content: rank 1 should get the second half of the sequence
-        assert torch.equal(result["input_ids"], original_input_ids[:, seq_len // 2 :])
-
-    def test_batch_zigzag(self, sequence_parallel_batch):
-        """Test BATCH_ZIGZAG sharding pattern."""
-        batch = sequence_parallel_batch
-        original_input_ids = batch["input_ids"].clone()
-        seq_len = batch["input_ids"].size(1)
-
-        # Test rank 0
-        result_rank0 = apply_sequence_parallelism(
-            batch={k: v.clone() for k, v in batch.items()},
-            local_rank=0,
-            local_world_size=2,
-            ring_attn_func=RingAttnFunc.BATCH_ZIGZAG,
-        )
-
-        # Test rank 1
-        result_rank1 = apply_sequence_parallelism(
-            batch={k: v.clone() for k, v in batch.items()},
-            local_rank=1,
-            local_world_size=2,
-            ring_attn_func=RingAttnFunc.BATCH_ZIGZAG,
-        )
-
-        # Checks for both ranks
-        assert result_rank0["input_ids"].shape[1] == seq_len // 2
-        assert result_rank1["input_ids"].shape[1] == seq_len // 2
-
-        # For a 2-rank system with 8 tokens, check specific zigzag pattern
-        # Rank 0 should get chunks [0, 1] and [6, 7]
-        # Rank 1 should get chunks [2, 3] and [4, 5]
-        if seq_len == 8:
-            # Create expected tensors for comparison
-            rank0_expected = torch.cat(
-                [original_input_ids[:, :2], original_input_ids[:, 6:8]], dim=1
-            )
-
-            rank1_expected = torch.cat(
-                [original_input_ids[:, 2:4], original_input_ids[:, 4:6]], dim=1
-            )
-
-            assert torch.equal(result_rank0["input_ids"], rank0_expected)
-            assert torch.equal(result_rank1["input_ids"], rank1_expected)
-
-    def test_partial_application(self, sequence_parallel_batch):
-        """Test that we can create a partially applied version of the function."""
-        batch = sequence_parallel_batch
-        original_input_ids = batch["input_ids"].clone()
-
-        # Create a partially applied function
-        rank0_ring_parallel = functools.partial(
-            apply_sequence_parallelism,
-            local_rank=0,
-            local_world_size=2,
-            ring_attn_func=RingAttnFunc.BATCH_RING,
-        )
-
-        # Use the partially applied function
-        result = rank0_ring_parallel(batch=batch)
-
-        # Verify it works as expected
-        assert result["input_ids"].shape[1] == original_input_ids.shape[1] // 2
-        assert torch.equal(
-            result["input_ids"],
-            original_input_ids[:, : original_input_ids.shape[1] // 2],
-        )
-
-    def test_missing_position_ids(self, sequence_parallel_batch):
-        """Test handling of batch without position_ids."""
-        # Create a batch without position_ids
-        batch = {
-            k: v for k, v in sequence_parallel_batch.items() if k != "position_ids"
-        }
-        original_input_ids = batch["input_ids"].clone()
-
-        # This should run without error even though position_ids is missing
-        result = apply_sequence_parallelism(
-            batch=batch,
-            local_rank=0,
-            local_world_size=2,
-            ring_attn_func=RingAttnFunc.BATCH_RING,
-        )
-
-        # Verification should pass
-        assert "position_ids" not in result
-        assert result["input_ids"].shape[1] == original_input_ids.shape[1] // 2
+# Mock a simplified DataCollator test
+@patch("axolotl.monkeypatch.attention.ring_attn.get_ring_attn_group")
+@patch("torch.distributed.get_rank")
+@patch("torch.distributed.get_world_size")
+def test_sequence_parallel_slicing(
+    mock_world_size, mock_rank, mock_get_group, partial_state
+):
+    """Test the basic sequence slicing logic without full collator instantiation."""
+    # Setup mocks
+    mock_get_group.return_value = MagicMock()
+    mock_rank.return_value = 1  # Second GPU
+    mock_world_size.return_value = 4  # 4 GPUs total
+
+    # Create a sample batch
+    batch = {
+        "input_ids": torch.tensor(
+            [
+                [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112],
+                [201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212],
+            ]
+        ),
+        "attention_mask": torch.ones(2, 12),
+    }
+
+    # Simplified slicing logic from SequenceParallelDataCollator
+    def slice_batch(batch, rank, world_size):
+        result = {}
+        for key in batch:
+            seq_len = batch[key].shape[1]
+            slice_size = seq_len // world_size
+            start_idx = rank * slice_size
+            end_idx = start_idx + slice_size if rank < world_size - 1 else seq_len
+            result[key] = batch[key][:, start_idx:end_idx]
+        return result
+
+    # Slice the batch
+    result = slice_batch(
+        batch, rank=mock_rank.return_value, world_size=mock_world_size.return_value
+    )
+
+    # Check slicing
+    assert result["input_ids"].shape == (2, 3)  # 12 tokens / 4 GPUs = 3 tokens per GPU
+    expected_input_ids = torch.tensor(
+        [
+            [104, 105, 106],  # Second slice of first sequence
+            [204, 205, 206],  # Second slice of second sequence
+        ]
+    )
+    assert torch.all(result["input_ids"] == expected_input_ids)
+
+
+@patch.dict("sys.modules", {"ring_flash_attn": MagicMock()})
+def test_config_validation_with_valid_inputs(cfg):
+    """Test that valid sequence parallelism configurations pass validation."""
+    # Import the actual model class with appropriate mocks
+    from axolotl.utils.schemas.config import AxolotlInputConfig
+
+    # Valid configuration: sequence_parallel_degree > 1 and flash_attention is True
+    cfg = cfg | {
+        "sequence_parallel_degree": 2,
+        "flash_attention": True,
+    }
+
+    # Should validate without errors
+    config = AxolotlInputConfig(**cfg)
+    assert config.sequence_parallel_degree == 2
+    assert config.flash_attention is True
+
+
+def test_config_validation_with_invalid_inputs(cfg):
+    """Test that invalid sequence parallelism configurations fail validation."""
+    from axolotl.utils.schemas.config import AxolotlInputConfig
+
+    # Invalid configuration: sequence_parallel_degree > 1 but flash_attention is False
+    cfg = cfg | {
+        "sequence_parallel_degree": 2,
+        "flash_attention": False,
+    }
+
+    # Should raise ValidationError
+    with pytest.raises(ValueError) as excinfo:
+        AxolotlInputConfig(**cfg)
+
+    # Verify error message
+    assert "flash_attention: true must be set" in str(excinfo.value)