use uint8 dtype for qlora

support for configurable group and bin size for sample packing
fix xformers inference
2025-05-05 08:27:34 -04:00 · 2025-05-05 06:57:21 -04:00 · 2025-05-05 06:20:31 -04:00 · 2025-05-05 03:49:01 -04:00 · 2025-05-05 03:22:02 -04:00 · 2025-05-04 23:24:46 -04:00
17 changed files with 781 additions and 160 deletions
--- a/src/axolotl/cli/art.py
+++ b/src/axolotl/cli/art.py
@@ -16,8 +16,15 @@ AXOLOTL_LOGO = """
    @@@@  @@@@@@@@@@@@@@@@
 """
 HAS_PRINTED_LOGO = False
 def print_axolotl_text_art():
    """Prints axolotl ASCII art."""
    global HAS_PRINTED_LOGO  # pylint: disable=global-statement
    if HAS_PRINTED_LOGO:
        return
    if is_main_process():
        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)
--- a/src/axolotl/common/datasets.py
+++ b/src/axolotl/common/datasets.py
@@ -48,6 +48,7 @@ def load_datasets(
    *,
    cfg: DictDefault,
    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
    debug: bool = False,
 ) -> TrainDatasetMeta:
    """
    Loads one or more training or evaluation datasets, calling
@@ -56,6 +57,7 @@ def load_datasets(
    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.
        debug: Whether to print out tokenization of sample
    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
@@ -77,20 +79,25 @@ def load_datasets(
        preprocess_iterable=preprocess_iterable,
    )
-    if cli_args and (
+    if (  # pylint: disable=too-many-boolean-expressions
-        cli_args.debug
+        cli_args
-        or cfg.debug
+        and (
-        or cli_args.debug_text_only
+            cli_args.debug
-        or int(cli_args.debug_num_examples) > 0
+            or cfg.debug
-    ):
+            or cli_args.debug_text_only
            or int(cli_args.debug_num_examples) > 0
        )
    ) or debug:
        LOG.info("check_dataset_labels...")
-        train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
+        num_examples = cli_args.debug_num_examples if cli_args else 1
        text_only = cli_args.debug_text_only if cli_args else False
        train_samples = sample_dataset(train_dataset, num_examples)
        check_dataset_labels(
            train_samples,
            tokenizer,
-            num_examples=cli_args.debug_num_examples,
+            num_examples=num_examples,
-            text_only=cli_args.debug_text_only,
+            text_only=text_only,
        )
        LOG.info("printing prompters...")
--- a/src/axolotl/core/trainers/base.py
+++ b/src/axolotl/core/trainers/base.py
@@ -114,6 +114,8 @@ class AxolotlTrainer(
            packing_efficiency_estimate=self.args.sample_packing_efficiency,
            batch_max_len=batch_max_len,
            batch_size=batch_size,
            group_size=self.args.sample_packing_group_size,
            bin_size=self.args.sample_packing_bin_size,
            sequential=self.args.sample_packing_sequentially,
            drop_last=True,
        )
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -72,7 +72,7 @@ class CutCrossEntropyPlugin(BasePlugin):
        if cfg.cut_cross_entropy:
            self._check_requirements()
-            from axolotl.integrations.cut_cross_entropy.monkeypatch.patch import (
+            from .monkeypatch.patch import (
                cce_patch,
            )
--- a/src/axolotl/integrations/cut_cross_entropy/monkeypatch/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/monkeypatch/init.py
--- a/src/axolotl/monkeypatch/attention/init.py
+++ b/src/axolotl/monkeypatch/attention/init.py
@@ -0,0 +1,19 @@
 """
 attention module for attention monkeypatches
 """
 from transformers.integrations.flash_attention import flash_attention_forward
 def patch_xformers_attn_over_fa2():
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
    from .xformers import xformers_attention_forward
    ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = xformers_attention_forward
 def unpatch_xformers_attn_over_fa2():
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
    ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward()
--- a/src/axolotl/monkeypatch/attention/xformers.py
+++ b/src/axolotl/monkeypatch/attention/xformers.py
@@ -0,0 +1,160 @@
 """
 xformers attention implementation for packing
 """
 from typing import Optional
 import torch
 import xformers
 import xformers.ops.fmha
 from transformers.modeling_flash_attention_utils import (
    _upad_input,
 )
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
 xformers_attention = xformers.ops.fmha.memory_efficient_attention
 def xformers_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    dropout: float = 0.0,  # pylint: disable=unused-argument
    scaling: Optional[float] = None,  # pylint: disable=unused-argument
    sliding_window: Optional[int] = None,  # pylint: disable=unused-argument
    softcap: Optional[float] = None,  # pylint: disable=unused-argument
    cu_seq_lens_q: Optional[torch.LongTensor] = None,
    cu_seq_lens_k: Optional[torch.LongTensor] = None,
    max_length_q: Optional[int] = None,
    max_length_k: Optional[int] = None,  # pylint: disable=unused-argument
    **kwargs,  # pylint: disable=unused-argument
 ):
    # Get dimensions
    # query: [batch, heads, seq_len, hidden_dim]
    batch_size = query.size(0)
    query_length = query.shape[2]
    key_length = key.shape[2]
    # Default causal mask
    attn_bias = xformers.ops.LowerTriangularMask()
    # Check if we have sliding window attention
    has_sliding_window = sliding_window is not None and sliding_window < query_length
    # Transpose dimensions for xformers (Q: [b, h, s, d] -> [b, s, h, d])
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)
    # Get GQA parameters
    num_attention_heads = module.config.num_attention_heads
    num_key_value_heads = module.config.num_key_value_heads
    head_dim = query.size(-1)
    is_gqa = num_attention_heads != num_key_value_heads
    n_groups = num_attention_heads // num_key_value_heads if is_gqa else 1
    # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
    # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
    # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
    if position_ids is not None and (
        max_length_q is not None
        or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
    ):
        if cu_seq_lens_q is None or cu_seq_lens_k is None:
            cu_seq_lens_q = get_cu_seqlens_from_pos_ids(position_ids)[0]
            cu_seq_lens_q = cu_seq_lens_q.squeeze()
            seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
            attn_bias = (
                xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
                    q_seqlen=seq_lengths.tolist(),
                )
            )
        else:
            query = query.reshape(-1, query.size(-2), query.size(-1))
            key = key.reshape(-1, key.size(-2), key.size(-1))
            value = value.reshape(-1, value.size(-2), value.size(-1))
        # Handle GQA
        if is_gqa:
            key = key.repeat_interleave(n_groups, dim=2)
            value = value.repeat_interleave(n_groups, dim=2)
    elif attention_mask is not None:
        query, key, value, _, cu_seq_lens, _ = _upad_input(
            query, key, value, attention_mask, query_length
        )
        cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens
        seq_lengths = []
        for i in range(len(cu_seq_lens_q) - 1):
            seq_lengths.append(cu_seq_lens_q[i + 1] - cu_seq_lens_q[i])
        attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
            q_seqlen=seq_lengths,
            kv_seqlen=seq_lengths,
        )
        # Handle GQA
        if is_gqa:
            key = key.repeat_interleave(n_groups, dim=2)
            value = value.repeat_interleave(n_groups, dim=2)
    else:
        # Handle Group Query Attention (GQA) using view/expand approach from reference
        key = key.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
        value = value.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
        key = key.expand(
            batch_size, key_length, num_key_value_heads, n_groups, head_dim
        )
        value = value.expand(
            batch_size, key_length, num_key_value_heads, n_groups, head_dim
        )
        if module.training:
            key = key.reshape(batch_size, key_length, num_attention_heads, head_dim)
            value = value.reshape(batch_size, key_length, num_attention_heads, head_dim)
            if has_sliding_window:
                query = query.view(
                    1, batch_size * query_length, num_attention_heads, head_dim
                )
                key = key.view(
                    1, batch_size * key_length, num_attention_heads, head_dim
                )
                value = value.view(
                    1, batch_size * key_length, num_attention_heads, head_dim
                )
        else:
            query = query.view(
                batch_size, query_length, num_key_value_heads, n_groups, head_dim
            )
            # If we need a sliding window attention
            if has_sliding_window:
                query = query.view(
                    1,
                    batch_size * query_length,
                    num_key_value_heads,
                    n_groups,
                    head_dim,
                )
                key = key.view(
                    1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
                )
                value = value.view(
                    1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
                )
    # Run the xformers attention
    attn_output = xformers_attention(
        query,
        key,
        value,
        attn_bias=attn_bias,
    )
    attn_output = attn_output.view(
        batch_size, -1, attn_output.size(-2), attn_output.size(-1)
    )
    return attn_output, None
--- a/src/axolotl/monkeypatch/loss/init.py
+++ b/src/axolotl/monkeypatch/loss/init.py
--- a/src/axolotl/monkeypatch/loss/chunked.py
+++ b/src/axolotl/monkeypatch/loss/chunked.py
@@ -0,0 +1,134 @@
 """
 chunked ce loss
 """
 from typing import List, Optional
 import torch
 import torch.nn.functional as F
 # copied and modified from torchtune.modules.loss.CEWithChunkedOutputLoss
 class CEWithChunkedOutputLoss(torch.nn.Module):
    """
    Cross-entropy with chunked outputs that saves memory by only upcasting one chunk at a time.
    For more details, please refer to: https://github.com/pytorch/torchtune/pull/1390
    """
    def __init__(self, num_output_chunks: int = 8, ignore_index: int = -100):
        super().__init__()
        self.num_output_chunks = num_output_chunks
        self.ignore_index = ignore_index
    def compute_cross_entropy(
        self,
        logits: torch.Tensor,
        labels: torch.Tensor,
        normalize: bool = True,  # pylint: disable=unused-argument
    ) -> torch.Tensor:
        """
        Upcast logits to fp32 and compute cross entropy loss.
        """
        return F.cross_entropy(
            logits.float(), labels, ignore_index=self.ignore_index, reduction="sum"
        )
    def forward(
        self, logits: List[torch.Tensor], labels: torch.Tensor, reduction="sum"
    ) -> torch.Tensor:
        """
        Args:
            logits (List[torch.Tensor]): List of chunked logits of length
                ``self.num_output_chunks``, where each chunk has shape
                ``(batch_size, num_tokens / num_output_chunks, vocab_size)``.
            labels (torch.Tensor): Ground truth labels of shape ``(batch_size, num_tokens)``.
            reduction (str): The reduction to apply to the output.
        Returns:
            torch.Tensor: Cross entropy loss of shape (1,).
        """
        total_elements = (labels != self.ignore_index).sum()
        # chunk and reshape labels (bsz, num_tokens, vocab) -> [(bsz*num_tokens/num_chunks, vocab)]
        labels = [
            target_chunk.reshape(-1)
            for target_chunk in labels.chunk(self.num_output_chunks, dim=1)
        ]
        # reshape logits [(bsz, num_tokens/num_chunks, vocab)] -> [(bsz*num_tokens/num_chunks, vocab)]
        logits = [
            logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits
        ]
        # compute one chunk at a time
        total_loss = 0.0
        for logits_chunk, labels_chunk in zip(logits, labels):
            total_loss += self.compute_cross_entropy(logits_chunk, labels_chunk)
        if reduction == "sum":
            return total_loss
        return total_loss / total_elements
 def _build_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100):
    loss_fn_ce = CEWithChunkedOutputLoss(num_output_chunks, ignore_index)
    loss_fn_ce.compute_cross_entropy = torch.compile(
        loss_fn_ce.compute_cross_entropy, backend="inductor"
    )
    return loss_fn_ce
 def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100):
    loss_fn_ce = _build_chunked_ce_loss_fn(num_output_chunks, ignore_index)
    def chunked_fix_cross_entropy(
        source,
        target,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):  # pylint: disable=unused-argument
        reduction = "sum" if num_items_in_batch is not None else "mean"
        logit_chunks = [  # pylint: disable=unnecessary-comprehension
            chunk for chunk in source.chunk(loss_fn_ce.num_output_chunks, dim=1)
        ]
        loss = loss_fn_ce(logit_chunks, target, reduction=reduction)
        if reduction == "sum":
            loss = loss / num_items_in_batch
        return loss
    def for_causal_lm_chunked_loss(
        logits,
        labels,
        vocab_size: int = None,  # pylint: disable=unused-argument
        num_items_in_batch: Optional[int] = None,
        ignore_index: int = -100,
        shift_labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        # skip the upcast to float since we handle that in the chunking loss
        if shift_labels is None:
            # Shift so that tokens < n predict n
            labels = F.pad(labels, (0, 1), value=ignore_index)
            shift_labels = labels[..., 1:].contiguous()
        # Skip Flattening the tokens
        # Enable model parallelism
        shift_labels = shift_labels.to(logits.device)
        loss = chunked_fix_cross_entropy(
            logits, shift_labels, num_items_in_batch, ignore_index, **kwargs
        )
        return loss
    return for_causal_lm_chunked_loss
 def patch_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100):
    import transformers.loss.loss_utils
    for_causal_lm_chunked_loss = get_causal_lm_loss(num_output_chunks, ignore_index)
    transformers.loss.loss_utils.ForCausalLMLoss = for_causal_lm_chunked_loss
    transformers.loss.loss_utils.LOSS_MAPPING["ForCausalLM"] = (
        for_causal_lm_chunked_loss
    )
--- a/src/axolotl/monkeypatch/peft/init.py
+++ b/src/axolotl/monkeypatch/peft/init.py
--- a/src/axolotl/monkeypatch/peft/utils.py
+++ b/src/axolotl/monkeypatch/peft/utils.py
@@ -0,0 +1,78 @@
 """
 Patch prepare_model_for_kbit_training to not upcast everything
 """
 import inspect
 import logging
 import peft
 import axolotl
 from axolotl.monkeypatch.utils import detab_code
 LOG = logging.getLogger(__name__)
 ORIGINAL_PREPARE_CODE = """
        for param in model.parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit":
                param.data = param.data.to(torch.float32)
 """
 PATCHED_PREPARE_CODE = """
        for name, param in model.named_parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit" and "norm" in name:
                param.data = param.data.to(torch.float32)
 """
 def get_peft_prep_code() -> str:
    prepare = inspect.getsource(peft.utils.other.prepare_model_for_kbit_training)
    return prepare
 def check_peft_prep_code_is_patchable() -> bool:
    prep_code = get_peft_prep_code()
    prep_code, _ = detab_code(prep_code)
    return ORIGINAL_PREPARE_CODE in prep_code
 def patch_peft_prep_code():
    """
    monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs
    """
    try:
        prep_code = get_peft_prep_code()
    except OSError:
        return
    peft.utils.other._original_create_accelerator_and_postprocess = (  # pylint: disable=protected-access
        prep_code
    )
    prep_code, _ = detab_code(prep_code)
    if ORIGINAL_PREPARE_CODE not in prep_code:
        return
    prep_code = prep_code.replace(ORIGINAL_PREPARE_CODE, PATCHED_PREPARE_CODE)
    prep_code = prep_code.replace(
        "def prepare_model_for_kbit_training(",
        "def fixed_prepare_model_for_kbit_training(",
        1,
    )
    items_to_import = []
    for item in dir(peft.utils.other):
        if item in prep_code:
            items_to_import.append(item)
    exec(  # pylint: disable=exec-used  # nosec B102
        "from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")",
        globals(),
    )
    exec(prep_code, globals())  # pylint: disable=exec-used  # nosec B102
    LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
    peft.utils.other.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access  # pylint: disable=undefined-variable  # noqa: F821
    axolotl.utils.models.prepare_model_for_kbit_training = fixed_prepare_model_for_kbit_training  # pylint: disable=protected-access  # pylint: disable=undefined-variable  # noqa: F821
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -21,6 +21,7 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer
 from axolotl.cli.art import print_axolotl_text_art
 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
@@ -516,6 +517,8 @@ def train(
    Returns:
        Tuple of (model, tokenizer) after training
    """
    print_axolotl_text_art()
    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
    (
        trainer,
--- a/src/axolotl/utils/config/init.py
+++ b/src/axolotl/utils/config/init.py
@@ -70,6 +70,9 @@ def resolve_dtype(cfg):
            if cfg.fp16 is None and not cfg.float16:
                cfg.fp16 = True
    if cfg.fp16 and cfg.bf16 == "auto":
        cfg.bf16 = False
    if cfg.device == "mps":
        cfg.load_in_8bit = False
        cfg.tf32 = False
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -556,11 +556,30 @@ class ModelLoader:
        self.auto_model_loader = AutoModelForCausalLM  # pylint: disable=invalid-name
    def apply_patches(self) -> None:
        if self.cfg.xformers_attention and self.cfg.sample_packing:
            from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2
            patch_xformers_attn_over_fa2()
            self.cfg.flash_attention = True
        if self.cfg.chunked_cross_entropy:
            from axolotl.monkeypatch.loss.chunked import patch_chunked_ce_loss_fn
            if self.cfg.chunked_cross_entropy_num_chunks:
                patch_chunked_ce_loss_fn(self.cfg.chunked_cross_entropy_num_chunks)
            else:
                patch_chunked_ce_loss_fn()
        if self.cfg.fsdp_config and str(self.cfg.fsdp_config.fsdp_version) == "2":
            from axolotl.monkeypatch.accelerate.fsdp2 import patch_accelerate_fsdp_utils
            patch_accelerate_fsdp_utils()
        if self.cfg.adapter:
            from axolotl.monkeypatch.peft.utils import patch_peft_prep_code
            patch_peft_prep_code()
        if self.cfg.flex_attention:
            from axolotl.monkeypatch.attention.flex_attn import (
                patch_flex_make_mask,
@@ -893,7 +912,7 @@ class ModelLoader:
                "bnb_4bit_compute_dtype": self.cfg.torch_dtype,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_type": "nf4",
-                "bnb_4bit_quant_storage": torch.bfloat16,
+                "bnb_4bit_quant_storage": torch.uint8,
            }
            if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
                self.cfg.deepspeed or self.cfg.fsdp
@@ -1180,7 +1199,7 @@ class ModelLoader:
                ],
            )
-    def prepare_model(self, qlora_fsdp) -> None:
+    def prepare_model(self, qlora_fsdp: bool) -> None:
        skip_prepare_model_for_kbit_training = False
        if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
            # Qwen doesn't play nicely with LoRA if this is enabled
@@ -1309,7 +1328,7 @@ class ModelLoader:
        # make sure these are fp32 per Ramesh et al. (2021)
        embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)
-        if not self.cfg.fsdp:
+        if self.cfg.fsdp:
            # FSDP doesn't like mixed Float and BFloat16
            self.convert_embedding_modules_dtype(
                embedding_modules,
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -1,10 +1,13 @@
 # pylint: skip-file
 """
-Multipack Batch Sampler
+Multipack Batch Sampler - An efficient batch sampler for packing variable-length sequences
 into fixed-capacity batches to optimize memory usage and training throughput.
 """
 import logging
 import math
-from typing import Any, Iterable, List, Union
+from concurrent.futures import ProcessPoolExecutor
 from multiprocessing import cpu_count
 from typing import Iterable, List, Union
 import numba
 import numpy as np
@@ -13,26 +16,39 @@ from torch.utils.data import BatchSampler, Sampler, SequentialSampler
 from axolotl.utils.distributed import reduce_and_broadcast
 LOG = logging.getLogger(__name__)
 LOG.setLevel(logging.INFO)
@numba.njit
-def ffd_check(a: np.ndarray, c: int, n: int):
+def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int):
-    # First-fit-decreasing bin packing
+    """
-    # Check if a[] could fit in n bins with capacity c
+    First-fit-decreasing bin packing algorithm check
    # https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
-    a = np.sort(a)[::-1]
+    Checks if sequences with the given lengths could fit in the specified number of bins
-    bins = np.full((n,), c, dtype=a.dtype)
+
-    for size in a:
+    Args:
        sequence_lengths: Array of sequence lengths
        bin_capacity: Maximum capacity of each bin
        num_bins: Number of bins available
    Returns:
        True if all sequences can be packed, False otherwise
    """
    # Sort sequence lengths in descending order for optimal packing
    sequence_lengths = np.sort(sequence_lengths)[::-1]
    # Initialize all bins with full capacity
    bins = np.full((num_bins,), bin_capacity, dtype=sequence_lengths.dtype)
    # Try to place each sequence in the first bin it fits
    for size in sequence_lengths:
        not_found = True
-        for idx in range(n):
+        for idx in range(num_bins):
            if bins[idx] >= size:
                bins[idx] -= size
                not_found = False
                break
        # If no bin could fit this sequence, packing failed
        if not_found:
            return False
@@ -40,240 +56,380 @@ def ffd_check(a: np.ndarray, c: int, n: int):
@numba.njit
-def ffd_with_result(a: np.ndarray, c: int, start_index: int):
+def pack_group(
-    # First-fit-decreasing bin packing (with result return)
+    sequence_lengths: np.ndarray,
    group_offset: int,
    bin_capacity: int,
    max_bins: int,
    bin_size: int,
    safe_mode: bool = True,
 ):
    """
    Pack a group of sequences into bins using First-Fit Decreasing algorithm
-    indices = np.argsort(a)[::-1]
+    Args:
-    a = a[indices]
+        sequence_lengths: Array of sequence lengths
        group_offset: Offset to apply to indices when returning results
        bin_capacity: Maximum capacity of each bin
        max_bins: Maximum number of bins to use
        bin_size: Maximum number of sequences per bin
        safe_mode: If True, use a more conservative packing approach
-    bins: List[Any] = []
+    Returns:
-    bins_result: List[Any] = []
+        List of bins, where each bin contains indices of sequences assigned to it
-    for a_id, size in enumerate(a):
+    """
-        add_new = True
+    # Get sorting indices and sort lengths in descending order
-        for idx in range(len(bins)):
+    indices = np.argsort(sequence_lengths)[::-1]
-            if bins[idx] >= size:
+    sorted_lengths = sequence_lengths[indices]
-                bins[idx] -= size
+
-                bins_result[idx].append(indices[a_id] + start_index)
+    bins_remaining_space: list = []  # Tracks remaining capacity in each bin
-                add_new = False
+    bins_assigned_sequences: list = []  # Tracks sequence indices assigned to each bin
    for seq_id, size in enumerate(sorted_lengths):
        global_idx = indices[seq_id] + group_offset
        # Try to place sequence in existing bins
        add_new_bin = True
        for bin_idx, _ in enumerate(bins_remaining_space):
            if (
                bins_remaining_space[bin_idx] >= size
                and len(bins_assigned_sequences[bin_idx]) < bin_size
            ):
                bins_remaining_space[bin_idx] -= size
                bins_assigned_sequences[bin_idx].append(global_idx)
                add_new_bin = False
                break
-        if add_new:
+        # Create a new bin if needed and if we haven't reached the limit
-            bins.append(c - size)
+        if add_new_bin:
-            bins_result.append([indices[a_id] + start_index])
+            if len(bins_remaining_space) >= max_bins and safe_mode:
                # In safe mode, skip items that would exceed max_bins
                continue
            bins_remaining_space.append(bin_capacity - size)
            bins_assigned_sequences.append([global_idx])
-    return bins_result
+            # Safety check to avoid infinite bins
            if len(bins_remaining_space) > len(sequence_lengths):
                break
    return bins_assigned_sequences
-@numba.njit
+# Define a standalone function for multiprocessing
-def allocate(
+def _process_group(args):
-    lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
+    group_lengths, start_idx, bin_capacity, max_bins, bin_size, safe_mode = args
    return pack_group(
        group_lengths, start_idx, bin_capacity, max_bins, bin_size, safe_mode
    )
 def pack_parallel(
    sequence_lengths: np.ndarray,
    bin_capacity: int,
    group_size: int,
    bin_size: int,
    num_processes: int | None = None,
    safe_mode: bool = True,
 ):
-    # Dynamic batch allocator, similar to Multifit
+    """
-    # https://en.wikipedia.org/wiki/Multifit_algorithm
+    Pack sequences into bins using parallel processing
    # ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
-    s = 0
+    Args:
-    start_index = 0
+        sequence_lengths: Array of sequence lengths
-    result = []
+        bin_capacity: Maximum capacity of each bin as total number of tokens
        group_size: Number of sequences to process in each group
        bin_size: Maximum number of bins to use
        num_processes: Number of parallel processes to use
        safe_mode: If True, use a more conservative packing approach
-    while True:
+    Returns:
-        # binary search [l, r)
+        List of bins, where each bin contains indices of sequences assigned to it
-        left = 1
+    """
-        right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
+    num_items = len(sequence_lengths)
    if num_processes is None:
        num_processes = max(1, min(num_items // group_size, cpu_count()))
-        while right - left > 1:
+    # Create tasks for parallel processing
-            mid = (left + right) // 2
+    tasks = []
-            if ffd_check(lengths[start_index : start_index + mid], c, n):
+    for i in range(0, num_items, group_size):
-                left = mid
+        group_lengths = sequence_lengths[i : i + group_size]
-            else:
+        max_bins = len(group_lengths)  # Allow as many bins as items in the group
-                right = mid
+        tasks.append((group_lengths, i, bin_capacity, max_bins, bin_size, safe_mode))
-        # use length l
+    # Process groups in parallel
-        batch = ffd_with_result(
+    all_bins = []
-            lengths[start_index : start_index + left], c, start_index
+    with ProcessPoolExecutor(max_workers=num_processes) as executor:
-        )
+        for group_bins in executor.map(_process_group, tasks):
-        assert len(batch) <= n
+            all_bins.extend(group_bins)
        if len(batch) < n:
            break
-        start_index += left
+    return all_bins
        s = lengths_cumsum[start_index - 1]
        # add local rank
        result.append(batch[rank])
    return result, s, len(result) * c * n
@numba.njit
-def allocate_sequentially(lengths: np.ndarray, rank: int, c: int, n: int):
+def allocate_sequentially(
    sequence_lengths: np.ndarray, rank: int, bin_capacity: int, num_ranks: int
 ):
    """
    Sequential allocator that preserves example order
    Parameters:
-    - lengths: The lengths of all examples
+        sequence_lengths: The lengths of all examples
-    - rank: The current rank (for distributed training)
+        rank: The current rank (for distributed training)
-    - c: The capacity of each bin (maximum sequence length)
+        bin_capacity: The capacity of each bin (maximum sequence length)
-    - n: Number of ranks
+        num_ranks: Number of ranks (processes/GPUs)
    Returns:
-    - result: List of batches for the current rank
+        rank_batches: List of batches for the current rank
-    - total_used: Number of actual example tokens
+        total_tokens_used: Number of actual example tokens
-    - total_slots: Maximum theoretical number of example tokens (number of bins * bin capacity)
+        total_token_slots: Maximum theoretical number of example tokens (number of bins * bin capacity)
    """
-    result = []
+    rank_batches = []
-    total_used = 0
+    total_tokens_used = 0
    # First, do sequential packing into bins
    all_bins = []
-    current_bin = [0 for i in range(0)]  # numba hint
+    current_bin = []
-    remaining_capacity = c
+    remaining_capacity = bin_capacity
-    for idx, size in enumerate(lengths):
+    # Process each sequence in order
    for idx, size in enumerate(sequence_lengths):
        if size <= remaining_capacity:
            # Example fits in current bin
            current_bin.append(idx)
            remaining_capacity -= size
-            total_used += size
+            total_tokens_used += size
        else:
            # Example doesn't fit, start a new bin
            if current_bin:  # Add non-empty bin to all_bins
                all_bins.append(current_bin)
            current_bin = [idx]
-            remaining_capacity = c - size
+            remaining_capacity = bin_capacity - size
-            total_used += size
+            total_tokens_used += size
    # Add the last bin if not empty
    if current_bin:
        all_bins.append(current_bin)
-    # Assign bins to ranks - each rank gets every n-th bin
+    # Assign bins to ranks - each rank gets every num_ranks-th bin
-    for bin_idx in range(rank, len(all_bins), n):
+    for bin_idx in range(rank, len(all_bins), num_ranks):
-        result.append(all_bins[bin_idx])
+        rank_batches.append(all_bins[bin_idx])
-    return result, total_used, len(all_bins) * c
+    return rank_batches, total_tokens_used, len(all_bins) * bin_capacity
 class MultipackBatchSampler(BatchSampler):
-    """Batch sampler class for multipack"""
+    """
    Batch sampler class for efficient packing of variable-length sequences
    This sampler packs sequences into fixed-capacity bins (batches) to maximize
    GPU memory utilization and training throughput by reducing padding.
    It supports both parallel packing (using FFD algorithm) and
    sequential packing (preserving original sequence order).
    """
    def __init__(
        self,
        sampler: Union[Sampler[int], Iterable[int]],
-        batch_size: int,
+        batch_size: int,  # Number of bins per batch
-        batch_max_len: int,
+        batch_max_len: int,  # Maximum sequence length (bin capacity)
-        lengths: np.ndarray,
+        lengths: np.ndarray,  # Sequence lengths
-        packing_efficiency_estimate: float = 1.0,
+        packing_efficiency_estimate: float = 1.0,  # Initial efficiency estimate
-        drop_last: bool = False,
+        drop_last: bool = False,  # Whether to drop incomplete batches
-        num_count_samples: int = 16,
+        num_count_samples: int = 16,  # Number of samples to estimate batch count
-        sequential: bool = False,
+        sequential: bool = False,  # Whether to use sequential packing
-        **kwargs,
+        group_size: int = 100_000,  # Size of groups for parallel packing
        bin_size: int = 200,  # The max number of samples that can be packed in a single bin
        num_processes: int | None = None,  # Number of processes for parallel packing
        safe_mode: bool = True,  # Conservative packing to prevent training instability
        **kwargs,  # pylint: disable=unused-argument
    ):
        super().__init__(sampler, batch_size, drop_last)
        self.batch_size = batch_size
        self.batch_max_len = batch_max_len
-        self.lengths: np.ndarray = lengths
+        self.lengths = np.array(lengths, dtype=np.int32)
        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
        self.sequential = sequential
        self.group_size = group_size
        self.bin_size = bin_size
        self.num_processes = num_processes
        self.safe_mode = safe_mode
        assert isinstance(self.lengths, np.ndarray)
        self.epoch = 0
-        # statistics
+        # Efficiency statistics tracking
-        self.eff_total_used = 0
+        self.total_tokens_used = 0
-        self.eff_total_slots = 0
+        self.total_token_slots = 0
-        # The number of times to calculate the batches to determine the minimum packed dataset length for the local rank
+        # The number of times to calculate batches to determine minimum packed dataset length
        self.num_count_samples = num_count_samples
-        # the minimum packed dataset length across all ranks determined by a gather/broadcast
+        # Minimum packed dataset length across all ranks (determined by gather/broadcast)
        self.len_across_ranks = None
        # Cache for batches
        self._batches = None
        if self.sequential and not isinstance(sampler, SequentialSampler):
-            LOG.warn(
+            LOG.warning(
                "using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
            )
    def set_epoch(self, epoch: int):
        """Set the epoch number, used for reproducible shuffling across epochs"""
        self.epoch = epoch
        self._batches = None  # Invalidate batch cache
    def generate_batches(self, set_stats=False):
-        indices = [idx for idx in self.sampler]
+        """
        Generate packed batches for training
-        lengths = self.lengths[indices]
+        Args:
-        lengths_cumsum = np.cumsum(lengths)
+            set_stats: Whether to update efficiency statistics
-        if self.sequential:
+        Returns:
-            batches, total_used, total_slots = allocate_sequentially(
+            List of batches, where each batch contains multiple bins,
-                lengths=lengths,
+            and each bin contains multiple sequence indices
-                rank=0,
+        """
-                c=self.batch_max_len,
+        if self._batches is not None:
-                n=1,
+            return self._batches
            )
        else:
            batches, total_used, total_slots = allocate(
                lengths=lengths,
                lengths_cumsum=lengths_cumsum,
                rank=0,
                c=self.batch_max_len,
                n=1,
            )
-        batches = [
+        # Get indices from the sampler
-            [
+        indices = [  # pylint: disable=unnecessary-comprehension
-                [indices[b_idx] for b_idx in batch]
+            idx for idx in self.sampler
                for batch in batches[i : i + self.batch_size]
            ]
            for i in range(0, len(batches), self.batch_size)
        ]
-        # statistics
+        # Get lengths of the selected sequences
-        if set_stats:
+        lengths = self.lengths[indices]
            self.eff_total_used += total_used
            self.eff_total_slots += total_slots
        # Pack sequences into bins using either sequential or parallel packing
        if self.sequential:
            bins, total_used, total_slots = allocate_sequentially(
                lengths,
                rank=0,
                bin_capacity=self.batch_max_len,
                num_ranks=1,
            )
        else:
            # Use parallel packing
            all_bins = pack_parallel(
                lengths,
                bin_capacity=self.batch_max_len,
                group_size=self.group_size,
                bin_size=self.bin_size,
                num_processes=self.num_processes,
                safe_mode=self.safe_mode,
            )
            # Map bin indices back to original indices
            bins = [
                [indices[b_idx] for b_idx in bin_indices] for bin_indices in all_bins
            ]
            # Calculate efficiency statistics
            total_used = lengths.sum()
            total_slots = len(all_bins) * self.batch_max_len
        # Group bins into batches (each batch contains batch_size bins)
        batches = [
            bins[i : i + self.batch_size] for i in range(0, len(bins), self.batch_size)
        ]
        # Drop last batch if requested and it's incomplete
        if self.drop_last and len(batches[-1]) < self.batch_size:
            batches = batches[:-1]
            # Adjust total_slots if we dropped a batch
            if not self.sequential:
                total_slots -= (self.batch_size - len(batches[-1])) * self.batch_max_len
        # Update statistics if requested
        if set_stats:
            self.total_tokens_used += total_used
            self.total_token_slots += total_slots
        self._batches = batches
        return batches
    def __iter__(self):
        """
        Return an iterator over batches
        The batches are truncated to match the minimum number of batches across all ranks
        to ensure distributed training balance
        """
        batches = self.generate_batches(set_stats=True)
        if self.len_across_ranks:
-            # make sure the batches we iterate over is truncated to the same min length across all ranks
+            # Truncate batches to ensure all ranks have the same number of batches
            batches = batches[: self.len_across_ranks]
        return iter(batches)
    def num_batches(self):
        batches = self.generate_batches(set_stats=True)
        return len(batches)
    def efficiency(self):
-        return self.eff_total_used / self.eff_total_slots
+        """
        Calculate the packing efficiency (ratio of tokens used to total token slots)
        Higher is better - 1.0 would mean perfect packing with no wasted space
        """
        if self.total_token_slots == 0:
            self.generate_batches(set_stats=True)
        if self.total_token_slots == 0:
            return 0.0
        # Return a Python float instead of potentially a numpy float
        return float(self.total_tokens_used / self.total_token_slots)
    def gather_efficiency(self):
        """
        Gather and synchronize packing efficiency estimates across all distributed ranks
        Returns a conservative efficiency estimate based on the measurements
        """
        def calc_sample_packing_eff_est(estimates: List[float]):
            LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}")
-            return math.floor(0.997 * max(estimates))
+            # Use 99.7% of max observed efficiency as a safe estimate
            max_eff = max(float(eff) for eff in estimates)
            return math.floor(0.997 * max_eff)
        # Gather efficiency from all ranks and apply the calculation function
        sample_packing_actual_eff_all = reduce_and_broadcast(
-            lambda: self.efficiency(),  # pylint: disable=unnecessary-lambda
+            lambda: float(self.efficiency()),  # pylint: disable=unnecessary-lambda
            calc_sample_packing_eff_est,
        )
        # Quantize to 0.5% intervals for stability
        sample_packing_eff_est = (
            math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0
        )
        return sample_packing_eff_est
    def gather_len_batches(self, num):
        """
        Gather and synchronize batch counts across all distributed ranks
        Returns the minimum number of batches available on any rank
        """
        def calc_min_len(estimates: list[(int, float)]):
            LOG.info(f"gather_len_batches: {repr(estimates)}")
            return math.floor(min(estimates))
        # Find minimum batch count across ranks to ensure balance
        min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len)
        return min_len_batches
    def __len__(self):
-        if not self.len_across_ranks:
+        """
-            len_batches = min(
+        Return the total number of batches that will be yielded by this sampler
-                [self.num_batches() for _ in range(self.num_count_samples)]
+
        This is calculated as the minimum number of batches available on any rank
        to ensure balanced distributed training
        """
        if self._batches is None:
            self._batches = self.generate_batches(set_stats=True)
        if self.len_across_ranks is None:
            # Sample multiple times to get stable estimate
            len_batches = min(  # pylint: disable=consider-using-generator
                [len(self._batches) for _ in range(self.num_count_samples)]
            )
            # Gather minimum across all ranks
            self.len_across_ranks = self.gather_len_batches(len_batches)
        return self.len_across_ranks
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -242,6 +242,9 @@ class AxolotlInputConfig(
    unsloth_rms_norm: bool | None = None
    unsloth_rope: bool | None = None
    chunked_cross_entropy: bool | None = None
    chunked_cross_entropy_num_chunks: int | None = None
    lora_mlp_kernel: bool | None = None
    lora_qkv_kernel: bool | None = None
    lora_o_kernel: bool | None = None
@@ -435,16 +438,6 @@ class AxolotlInputConfig(
            )
        return data
    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_xformers(cls, data):
        if data.get("sample_packing") and data.get("xformers_attention"):
            raise ValueError(
                "sample_packing not compatible with xformers_attention. Use flash_attention"
            )
        return data
    @model_validator(mode="before")
    @classmethod
    # pylint: disable=duplicate-code
--- a/tests/test_chunked_xentropy.py
+++ b/tests/test_chunked_xentropy.py
@@ -0,0 +1,40 @@
 """
 test suite for chunked cross entropy
 """
 import pytest
 import torch
 from torch import nn
 from axolotl.monkeypatch.loss.chunked import get_causal_lm_loss
@pytest.fixture
 def chunked_fixtures():
    model_dim = 512
    vocab_size = 1024 * 256
    seq_len = 2048
    batch_size = 1
    lm_head = nn.Linear(model_dim, vocab_size)
    hidden_state = torch.randn(batch_size, seq_len, model_dim)
    labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
    return lm_head, hidden_state, labels, vocab_size
 def test_chunked_forward(chunked_fixtures):  # pylint: disable=redefined-outer-name
    lm_head, hidden_state, labels, vocab_size = chunked_fixtures
    lm_loss = get_causal_lm_loss()
    logits = lm_head(hidden_state)
    chunked_lm_loss = lm_loss(logits, labels)
    logits_flattened = logits.view(-1, vocab_size)
    labels_flattened = labels.view(-1)
    loss = nn.functional.cross_entropy(
        logits_flattened.float(), labels_flattened, reduction="mean"
    )
    assert torch.allclose(chunked_lm_loss, loss, atol=1e-2, rtol=1e-2)
Author	SHA1	Message	Date
Wing Lian	985ee95f2d	use uint8 dtype for qlora	2025-05-05 08:27:34 -04:00
Wing Lian	72ece3dadf	support for configurable group and bin size for sample packing	2025-05-05 06:57:21 -04:00
Wing Lian	2e74e1d289	fix xformers inference	2025-05-05 06:20:31 -04:00
Wing Lian	4f478083e7	fix batch size setter	2025-05-05 03:49:01 -04:00
Wing Lian	82453bab7e	handle xformers patch for inference too	2025-05-05 03:22:02 -04:00
Wing Lian	5b2bd75aba	parallel bin packing fix error with lambda and pickling make sure things are in float instead of np.float	2025-05-04 23:24:46 -04:00
Wing Lian	03508c6816	improve readability of multipack sampler	2025-05-04 23:24:40 -04:00
Wing Lian	48b3e14a24	Print axolotl art if train is called outside of cli:	2025-05-04 23:24:35 -04:00
Wing Lian	544b1212d8	use relative import	2025-05-04 07:36:26 -04:00
Wing Lian	695fc2f802	missing __init__	2025-05-04 07:31:01 -04:00
Wing Lian	c7f38ba96b	fix seq lens calc to drop hanging sequences	2025-05-03 21:56:45 -04:00
Wing Lian	372fd08548	fix fp16 / bf16 reset when using fp16 with bf16 auto	2025-05-03 21:56:39 -04:00
Wing Lian	52cab2aa5b	refactor so we can add test	2025-05-03 21:55:34 -04:00
Wing Lian	bed8f354a5	reorder the packing check	2025-05-03 15:38:29 -04:00
Wing Lian	f301a165c3	fix xformers + packing validation	2025-05-03 15:00:33 -04:00
Wing Lian	2b3a09aeae	wire up the patch	2025-05-03 15:00:29 -04:00
Wing Lian	648780de51	xformers attention with packing	2025-05-03 14:59:49 -04:00
Wing Lian	ecc2388274	chunked cross entropy loss	2025-05-03 14:59:43 -04:00
Wing Lian	ebf724a9d9	fix import	2025-05-03 12:03:15 -04:00
Wing Lian	99095573c3	add tabs back to code check	2025-05-03 12:03:15 -04:00
Wing Lian	140083a828	patch peft to not upcast everything	2025-05-03 12:03:15 -04:00
Wing Lian	37c27aedc1	fsdp embeddings should be float32 per comment	2025-05-03 12:03:15 -04:00