Compare commits
6 Commits
feat/soap-...destroy-pg

| Author | SHA1 | Date |
|---|---|---|
|  | 1defb8a955 |  |
|  | 70b466aa67 |  |
|  | ef6eb77cc8 |  |
|  | 32ce167404 |  |
|  | 1c4cc639f5 |  |
|  | 5410195e0b |  |
@@ -243,6 +243,7 @@ website:
- docs/unsloth.qmd
- docs/torchao.qmd
- docs/custom_integrations.qmd
- docs/sequence_parallelism.qmd

- section: "Troubleshooting"
  contents:
@@ -658,6 +658,9 @@ ddp_broadcast_buffers:
# subsequences, or set to 4 to split into four equal-sized subsequences.
# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
sequence_parallel_degree:
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
# Must evenly divide the number of KV heads in your model.
heads_k_stride: 1

# Path to torch distx for optim 'adamw_anyprecision'
torchdistx_path:
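The comment above says `heads_k_stride` must evenly divide the model's number of KV heads. As a quick standalone illustration of that constraint (an added sketch, not part of this diff; the model id is just an example reused from the docs further down):

```python
# Illustrative check only (not Axolotl code): heads_k_stride must evenly
# divide the model's number of KV heads.
from transformers import AutoConfig

model_id = "meta-llama/Llama-3-8B-Instruct"  # example id from the docs below
config = AutoConfig.from_pretrained(model_id)
num_kv_heads = getattr(config, "num_key_value_heads", config.num_attention_heads)

heads_k_stride = 1
assert num_kv_heads % heads_k_stride == 0, (
    f"heads_k_stride={heads_k_stride} must evenly divide num_kv_heads={num_kv_heads}"
)
```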
@@ -18,6 +18,7 @@ Axolotl supports several methods for multi-GPU training:

- DeepSpeed (recommended)
- FSDP (Fully Sharded Data Parallel)
- Sequence parallelism
- FSDP + QLoRA

## DeepSpeed {#sec-deepspeed}

@@ -66,6 +67,28 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```

## Sequence parallelism {#sec-sequence-parallelism}

We support sequence parallelism (SP) via the
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
allows one to split up sequences across GPUs, which is useful in the event that a
single sequence causes OOM errors during model training.

First, install `ring-flash-attn`, recommended via `pip install axolotl[ring-flash-attn]`,
or from source with `pip install .[ring-flash-attn]`.

Your Axolotl YAML config should contain the following lines:

```{.yaml}
sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
flash_attention: true # Required with sequence parallelism

# Optional; strides across the key dimension. Larger values use more memory but will make training faster.
heads_k_stride: 1
```

See our [dedicated guide](sequence_parallelism.qmd) for more details.
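To make the `sequence_parallel_degree: 4` setting above concrete, here is a minimal sketch of the idea (an assumption-based illustration only, not Axolotl's or ring-flash-attention's actual code):

```python
# Minimal sketch: each of the N ranks in a sequence-parallel group gets a
# contiguous 1/N slice of the token sequence; ring attention then exchanges
# K/V blocks between ranks so attention still spans the full sequence.
def shard_sequence(input_ids: list[int], rank: int, sequence_parallel_degree: int) -> list[int]:
    assert len(input_ids) % sequence_parallel_degree == 0, "sequence length must divide evenly"
    chunk = len(input_ids) // sequence_parallel_degree
    return input_ids[rank * chunk : (rank + 1) * chunk]

# An 8192-token sequence with sequence_parallel_degree=4 gives each GPU 2048 tokens.
example = list(range(8192))
assert len(shard_sequence(example, rank=0, sequence_parallel_degree=4)) == 2048
```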

### FSDP + QLoRA {#sec-fsdp-qlora}

For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).

@@ -25,6 +25,8 @@ To enable sequence parallelism, add the following to your configuration file:
```yaml
# Set to a divisor (> 1) of the number of GPUs available
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
```

The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
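The examples that follow "For example:" fall outside this hunk. As a standalone illustration of the divisor rule (added for clarity, not taken from the guide):

```python
# sequence_parallel_degree partitions the GPUs into independent groups:
# num_groups = world_size // sequence_parallel_degree.
def num_sp_groups(world_size: int, sequence_parallel_degree: int) -> int:
    if world_size % sequence_parallel_degree != 0:
        raise ValueError("sequence_parallel_degree must divide the number of GPUs")
    return world_size // sequence_parallel_degree

print(num_sp_groups(8, 4))  # 2 groups of 4 GPUs each
print(num_sp_groups(4, 2))  # 2 groups of 2 GPUs each
```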
@@ -58,11 +60,16 @@ To use sequence parallelism, you need:
## Example

```yaml
# Example config with sequence parallelism
base_model: meta-llama/Llama-3-8B-Instruct
sequence_len: 8192
sequence_parallel_degree: 2 # Split each sequence into 4 parts

...

sequence_parallel_degree: 4 # Split each sequence into 4 parts, one per GPU
flash_attention: true # Required with sequence parallelism
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1

...
```

@@ -69,7 +69,6 @@ from axolotl.utils.callbacks import (
    LossWatchDogCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveBetterTransformerModelCallback,
    SaveModelCallback,
    bench_eval_callback_factory,
    causal_lm_bench_eval_callback_factory,
    log_prediction_callback_factory,
@@ -249,7 +248,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

        if self.cfg.gc_steps:
            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
        callbacks.append(SaveModelCallback())

        return callbacks

@@ -663,11 +661,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):

            optimizer_cls = MuonOptimizerFactory
            optimizer_kwargs.update(adam_kwargs)
        elif self.cfg.optimizer == "soap":
            from axolotl.utils.optimizers.soap import SOAP

            optimizer_cls = SOAP
            optimizer_kwargs.update(adam_kwargs)
        elif self.cfg.optimizer == "optimi_adamw":
            from optimi import AdamW

@@ -942,7 +935,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):

    def get_callbacks(self):
        callbacks = super().get_callbacks()
        callbacks.append(SaveModelCallback())

        return callbacks

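For context on the optimizer branches above: each branch only selects an `optimizer_cls` and fills `optimizer_kwargs`. A simplified, self-contained sketch of that dispatch pattern (an illustration of the pattern, not the builder's actual wiring):

```python
# Simplified sketch: map a config string to an optimizer class, gather shared
# Adam-style kwargs, then instantiate it over the model parameters.
import torch
import torch.nn as nn

def build_optimizer(name: str, model: nn.Module, lr: float = 1e-5):
    adam_kwargs = {"lr": lr, "betas": (0.9, 0.95), "weight_decay": 0.01}
    if name == "adamw_torch":
        optimizer_cls = torch.optim.AdamW
    else:
        raise ValueError(f"unsupported optimizer: {name}")
    return optimizer_cls(model.parameters(), **adam_kwargs)

optimizer = build_optimizer("adamw_torch", nn.Linear(8, 8))
```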
@@ -15,6 +15,7 @@ from axolotl.logging_config import configure_logging
from axolotl.train import TrainDatasetMeta
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.trainer import setup_trainer

@@ -159,4 +160,6 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
    del model
    del tokenizer

    cleanup_distributed()

    return all_metrics

@@ -38,13 +38,19 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
    RING_ATTN_GROUP = ring_attn_group


def register_ring_attn(sequence_parallel_degree: int):
def register_ring_attn(sequence_parallel_degree: int, heads_k_stride: int | None):
    """
    Create ring attention group and substitute flash attn with ring flash attn.

    Args:
        sequence_parallel_degree: Sequence parallelism factor.
        heads_k_stride: Sequence parallelism K head stride size. Passed
            through to `ring_flash_attn.substitute_hf_flash_attn`.
    """
    if get_ring_attn_group() is not None:
        LOG.info("Ring attention already registered, exiting early...")
        return

    LOG.info(
        "Enabling ring attention sequence parallelism: "
        f"each sequence will be processed across {sequence_parallel_degree} GPUs"
@@ -84,6 +90,11 @@ def register_ring_attn(sequence_parallel_degree: int):
    if rank == 0:
        LOG.info(f"Sequence parallel group assignments: {group_assignments}")

    if heads_k_stride is None:
        heads_k_stride = 1

    from ring_flash_attn import substitute_hf_flash_attn

    substitute_hf_flash_attn(get_ring_attn_group(), sequence_parallel_degree)
    substitute_hf_flash_attn(
        process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride
    )

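The hunks above pass `heads_k_stride` through to `ring_flash_attn.substitute_hf_flash_attn`, while surrounding code (not fully shown here) builds one process group per set of `sequence_parallel_degree` ranks. A rough sketch of that grouping, assuming contiguous rank assignment:

```python
# Rough sketch (assumption): partition the world into contiguous groups of
# `sequence_parallel_degree` ranks and create one process group per partition.
import torch.distributed as dist

def make_sp_groups(world_size: int, sequence_parallel_degree: int):
    assert world_size % sequence_parallel_degree == 0
    groups = []
    for start in range(0, world_size, sequence_parallel_degree):
        ranks = list(range(start, start + sequence_parallel_degree))
        # Every rank must call dist.new_group() for every group, even groups it is not in.
        groups.append((ranks, dist.new_group(ranks=ranks)))
    return groups

# With 8 GPUs and sequence_parallel_degree=4 this creates two groups
# ([0-3] and [4-7]), which matches the `mock_new_group.call_count == 2` test below.
```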
@@ -27,6 +27,7 @@ from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
from axolotl.logging_config import configure_logging
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.trainer import setup_trainer
@@ -157,6 +158,8 @@ def setup_signal_handler(
            _model.save_pretrained(
                cfg.output_dir, safe_serialization=safe_serialization
            )

        cleanup_distributed()
        sys.exit(0)

    _model_weakref = weakref.ref(model)
@@ -478,7 +481,7 @@ def train(
    Returns:
        Tuple of (model, tokenizer) after training
    """
    # Setup model, tokenizer, (causal or RLHF) trainer etc.
    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
    (
        trainer,
        model,
@@ -487,34 +490,26 @@ def train(
        processor,
    ) = setup_model_and_trainer(cfg, dataset_meta)

    # Determine if we need to resume from a checkpoint
    resume_from_checkpoint = determine_resume_checkpoint(cfg)

    # Configuration for saving
    safe_serialization = cfg.save_safetensors is True

    # Handle untrained tokens if configured
    safe_serialization = cfg.save_safetensors is True
    train_dataset = dataset_meta.train_dataset
    handle_untrained_tokens_fix(
        cfg, model, tokenizer, train_dataset, safe_serialization
    )

    # Save initial configs
    # Additional setup
    save_initial_configs(cfg, tokenizer, model, peft_config, processor)

    # Set up signal handler for graceful termination
    setup_signal_handler(cfg, model, safe_serialization)

    # Set up badges and config info for model card
    setup_model_card(cfg)

    # Execute the training
    resume_from_checkpoint = determine_resume_checkpoint(cfg)
    execute_training(cfg, trainer, resume_from_checkpoint)

    # Save the trained model
    # Save the trained model and cleanup
    save_trained_model(cfg, trainer, model, safe_serialization)

    # Create model card
    create_model_card(cfg, trainer)
    if not cfg.use_ray:
        cleanup_distributed()

    return model, tokenizer, trainer

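The `setup_signal_handler` hunk above adds `cleanup_distributed()` before `sys.exit(0)`. A minimal, self-contained illustration of that pattern (a sketch assuming a plain SIGINT handler, not the exact Axolotl code):

```python
# Minimal illustration: save, tear down the process group, then exit.
import signal
import sys

import torch.distributed as dist

def install_sigint_handler(save_fn):
    def _handler(signum, frame):  # pylint: disable=unused-argument
        save_fn()  # persist the model first
        if dist.is_initialized():  # then destroy the process group, as cleanup_distributed() does
            dist.destroy_process_group()
        sys.exit(0)

    signal.signal(signal.SIGINT, _handler)
```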
@@ -816,27 +816,6 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
        return control


class SaveModelCallback(TrainerCallback):
    """Callback to save model on train end"""

    def on_step_end(  # pylint: disable=unused-argument
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Save
        if state.global_step >= state.max_steps:
            control.should_save = True

    def on_train_end(  # pylint: disable=unused-argument
        self, args, state, control, **kwargs
    ):
        control.should_save = True
        return control


class GCCallback(TrainerCallback):
    """Callback to garbage collect torch cache"""


@@ -71,8 +71,8 @@ def barrier():


def is_main_process():
    """
    Check if the current process is the main process.
    If not in distributed mode, always return True.
    Check if the current process is the main process. If not in distributed mode,
    always return `True`.
    """
    if not is_distributed():
        return True
@@ -87,6 +87,18 @@ def get_world_size():
    return int(os.getenv("WORLD_SIZE", "1"))


def cleanup_distributed():
    """
    Destroy process group if torch distributed is initialized. Called in training early
    termination or when training successfully completes.
    """
    # Ensure that all operations are completed before destroying the process group
    torch.cuda.synchronize()
    # Destroy the process group
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()


@contextmanager
def zero_only():
    """

@@ -609,7 +609,10 @@ class ModelLoader:
            # Initialize ring attn for sequence parallelism. This must be done after
            # model init but before the first forward pass, since it modifies flash
            # attn to use ring comm for SP training across multiple GPUs.
            register_ring_attn(self.cfg.sequence_parallel_degree)
            register_ring_attn(
                sequence_parallel_degree=self.cfg.sequence_parallel_degree,
                heads_k_stride=self.cfg.heads_k_stride,
            )

    def patch_attention(self) -> None:
        if hasattr(self.model_config, "model_type"):

@@ -1,21 +0,0 @@
MIT License

Copyright (c) 2024 Nikhil Vyas

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
@@ -1,495 +0,0 @@
# pylint: skip-file
# Copied from https://github.com/nikhilvyas/SOAP
from itertools import chain

import torch
import torch.optim as optim

# Parts of the code are modifications of Pytorch's AdamW optimizer
# Parts of the code are modifications of code from https://github.com/jiaweizzhao/GaLore/blob/master/galore_torch/galore_projector.py


class SOAP(optim.Optimizer):
    """
    Implements SOAP algorithm (https://arxiv.org/abs/2409.11321).

    Parameters:
        params (`Iterable[nn.parameter.Parameter]`):
            Iterable of parameters to optimize or dictionaries defining parameter groups.
        lr (`float`, *optional*, defaults to 0.003):
            The learning rate to use.
        betas (`Tuple[float,float]`, *optional*, defaults to `(0.95, 0.95)`):
            Adam's betas parameters (b1, b2).
        shampoo_beta (`float`, *optional*, defaults to -1):
            If >= 0, use this beta for the preconditioner (L and R in paper, state["GG"] below) moving average instead of betas[1].
        eps (`float`, *optional*, defaults to 1e-08):
            Adam's epsilon for numerical stability.
        weight_decay (`float`, *optional*, defaults to 0.01): weight decay coefficient.
        precondition_frequency (`int`, *optional*, defaults to 10):
            How often to update the preconditioner.
        max_precond_dim (`int`, *optional*, defaults to 10000):
            Maximum dimension of the preconditioner.
            Set to 10000, so that we exclude most common vocab sizes while including layers.
        merge_dims (`bool`, *optional*, defaults to `False`):
            Whether or not to merge dimensions of the preconditioner.
        precondition_1d (`bool`, *optional*, defaults to `False`):
            Whether or not to precondition 1D gradients.
        normalize_grads (`bool`, *optional*, defaults to `False`):
            Whether or not to normalize gradients per layer.
            Helps at large precondition_frequency (~100 in our experiments),
            but hurts performance at small precondition_frequency (~10 in our experiments).
        data_format (`str`, *optional*, defaults to `channels_first`):
            Data format of the input for convolutional layers.
            Should be "channels_last" for data_format of NHWC and "channels_first" for NCHW.
        correct_bias (`bool`, *optional*, defaults to `True`):
            Whether or not to use bias correction in Adam.
    """

    def __init__(
        self,
        params,
        lr: float = 3e-3,
        betas=(0.95, 0.95),
        shampoo_beta: float = -1,
        eps: float = 1e-8,
        weight_decay: float = 0.01,
        precondition_frequency: int = 10,
        max_precond_dim: int = 10000,  #
        merge_dims: bool = False,  # Merge dimensions till the product of the dimensions is less than or equal to max_precond_dim.
        precondition_1d: bool = False,
        normalize_grads: bool = False,
        data_format: str = "channels_first",
        correct_bias: bool = True,
    ):
        defaults = {
            "lr": lr,
            "betas": betas,
            "shampoo_beta": shampoo_beta,
            "eps": eps,
            "weight_decay": weight_decay,
            "precondition_frequency": precondition_frequency,
            "max_precond_dim": max_precond_dim,
            "merge_dims": merge_dims,
            "precondition_1d": precondition_1d,
            "normalize_grads": normalize_grads,
            "correct_bias": correct_bias,
        }
        super().__init__(params, defaults)
        self._data_format = data_format

    def merge_dims(self, grad, max_precond_dim):
        """
        Merges dimensions of the gradient tensor till the product of the dimensions is less than or equal to max_precond_dim.
        """
        assert self._data_format in ["channels_first", "channels_last"]
        if self._data_format == "channels_last" and grad.dim() == 4:
            grad = grad.permute(0, 3, 1, 2)
        shape = grad.shape
        new_shape = []

        curr_shape = 1
        for sh in shape:
            temp_shape = curr_shape * sh
            if temp_shape > max_precond_dim:
                if curr_shape > 1:
                    new_shape.append(curr_shape)
                    curr_shape = sh
                else:
                    new_shape.append(sh)
                    curr_shape = 1
            else:
                curr_shape = temp_shape

        if curr_shape > 1 or len(new_shape) == 0:
            new_shape.append(curr_shape)

        new_grad = grad.reshape(new_shape)
        return new_grad

    @torch.no_grad()
    def step(self, closure=None):
        """
        Performs a single optimization step.

        Arguments:
            closure (`Callable`, *optional*): A closure that reevaluates the model and returns the loss.
        """
        if closure is None:
            loss = None
        else:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                if p.grad is None:
                    continue
                grad = p.grad

                state = self.state[p]

                if "step" not in state:
                    state["step"] = 0

                # State initialization
                if "exp_avg" not in state:
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(grad)
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(grad)

                if "Q" not in state:
                    self.init_preconditioner(
                        grad,
                        state,
                        precondition_frequency=group["precondition_frequency"],
                        precondition_1d=group["precondition_1d"],
                        shampoo_beta=(
                            group["shampoo_beta"]
                            if group["shampoo_beta"] >= 0
                            else group["betas"][1]
                        ),
                        max_precond_dim=group["max_precond_dim"],
                        merge_dims=group["merge_dims"],
                    )
                    self.update_preconditioner(
                        grad,
                        state,
                        max_precond_dim=group["max_precond_dim"],
                        merge_dims=group["merge_dims"],
                        precondition_1d=group["precondition_1d"],
                    )
                    continue  # first step is skipped so that we never use the current gradients in the projection.

                # Projecting gradients to the eigenbases of Shampoo's preconditioner
                # i.e. projecting to the eigenbases of matrices in state["GG"]
                grad_projected = self.project(
                    grad,
                    state,
                    merge_dims=group["merge_dims"],
                    max_precond_dim=group["max_precond_dim"],
                )

                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
                beta1, beta2 = group["betas"]

                state["step"] += 1

                # Decay the first and second moment running average coefficient
                # In-place operations to update the averages at the same time
                exp_avg.mul_(beta1).add_(grad_projected, alpha=(1.0 - beta1))
                exp_avg_sq.mul_(beta2).add_(
                    grad_projected.square(), alpha=(1.0 - beta2)
                )

                denom = exp_avg_sq.sqrt().add_(group["eps"])

                # Projecting the exponential moving average of gradients to the eigenbases of Shampoo's preconditioner
                # i.e. projecting to the eigenbases of matrices in state["GG"]
                # exp_avg_projected = self.project(
                #     exp_avg,
                #     state,
                #     merge_dims=group["merge_dims"],
                #     max_precond_dim=group["max_precond_dim"],
                # )
                exp_avg_projected = exp_avg

                step_size = group["lr"]
                if group["correct_bias"]:
                    bias_correction1 = 1.0 - beta1 ** (state["step"])
                    bias_correction2 = 1.0 - beta2 ** (state["step"])
                    step_size = step_size * (bias_correction2**0.5) / bias_correction1

                # Projecting back the preconditioned (by Adam) exponential moving average of gradients
                # to the original space
                norm_grad = self.project_back(
                    exp_avg_projected / denom,
                    state,
                    merge_dims=group["merge_dims"],
                    max_precond_dim=group["max_precond_dim"],
                )

                if group["normalize_grads"]:
                    norm_grad = norm_grad / (1e-30 + torch.mean(norm_grad**2) ** 0.5)

                p.add_(norm_grad, alpha=-step_size)

                # From AdamW code: Just adding the square of the weights to the loss function is *not*
                # the correct way of using L2 regularization/weight decay with Adam,
                # since that will interact with the m and v parameters in strange ways.
                #
                # Instead we want to decay the weights in a manner that doesn't interact
                # with the m/v parameters. This is equivalent to adding the square
                # of the weights to the loss with plain (non-momentum) SGD.
                # Add weight decay at the end (fixed version)
                if group["weight_decay"] > 0.0:
                    p.add_(p, alpha=(-group["lr"] * group["weight_decay"]))

                # Update is done after the gradient step to avoid using current gradients in the projection.
                self.update_preconditioner(
                    grad,
                    state,
                    max_precond_dim=group["max_precond_dim"],
                    merge_dims=group["merge_dims"],
                    precondition_1d=group["precondition_1d"],
                )

        return loss

    def init_preconditioner(
        self,
        grad,
        state,
        precondition_frequency=10,
        shampoo_beta=0.95,
        max_precond_dim=10000,
        precondition_1d=False,
        merge_dims=False,
    ):
        """
        Initializes the preconditioner matrices (L and R in the paper).
        """
        state["GG"] = (
            []
        )  # Will hold all the preconditioner matrices (L and R in the paper).
        if grad.dim() == 1:
            if not precondition_1d or grad.shape[0] > max_precond_dim:
                state["GG"].append([])
            else:
                state["GG"].append(
                    torch.zeros(grad.shape[0], grad.shape[0], device=grad.device)
                )
        else:
            if merge_dims:
                grad = self.merge_dims(grad, max_precond_dim)

            for sh in grad.shape:
                if sh > max_precond_dim:
                    state["GG"].append([])
                else:
                    state["GG"].append(torch.zeros(sh, sh, device=grad.device))

        state["Q"] = None  # Will hold all the eigenbases of the preconditioner.
        state["precondition_frequency"] = precondition_frequency
        state["shampoo_beta"] = shampoo_beta

    def project(self, grad, state, merge_dims=False, max_precond_dim=10000):
        """
        Projects the gradient to the eigenbases of the preconditioner.
        """
        original_shape = grad.shape
        if merge_dims:
            if grad.dim() == 4 and self._data_format == "channels_last":
                permuted_shape = grad.permute(0, 3, 1, 2).shape
            grad = self.merge_dims(grad, max_precond_dim)

        for mat in state["Q"]:
            if len(mat) > 0:
                grad = torch.tensordot(
                    grad,
                    mat,
                    dims=[[0], [0]],
                )
            else:
                permute_order = list(range(1, len(grad.shape))) + [0]
                grad = grad.permute(permute_order)

        if merge_dims:
            if self._data_format == "channels_last" and len(original_shape) == 4:
                grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
            else:
                grad = grad.reshape(original_shape)
        return grad

    def update_preconditioner(
        self,
        grad,
        state,
        max_precond_dim=10000,
        merge_dims=False,
        precondition_1d=False,
    ):
        """
        Updates the preconditioner matrices and the eigenbases (L, R, Q_L, Q_R in the paper).
        """
        if state["Q"] is not None:
            state["exp_avg"] = self.project_back(
                state["exp_avg"],
                state,
                merge_dims=merge_dims,
                max_precond_dim=max_precond_dim,
            )
        if grad.dim() == 1:
            if precondition_1d and grad.shape[0] <= max_precond_dim:
                state["GG"][0].lerp_(
                    grad.unsqueeze(1) @ grad.unsqueeze(0), 1 - state["shampoo_beta"]
                )
        else:
            if merge_dims:
                new_grad = self.merge_dims(grad, max_precond_dim)
                for idx, sh in enumerate(new_grad.shape):
                    if sh <= max_precond_dim:
                        outer_product = torch.tensordot(
                            new_grad,
                            new_grad,
                            dims=[
                                [
                                    *chain(
                                        range(idx), range(idx + 1, len(new_grad.shape))
                                    )
                                ]
                            ]
                            * 2,
                        )
                        state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])
            else:
                for idx, sh in enumerate(grad.shape):
                    if sh <= max_precond_dim:
                        outer_product = torch.tensordot(
                            grad,
                            grad,
                            # Contracts across all dimensions except for k.
                            dims=[[*chain(range(idx), range(idx + 1, len(grad.shape)))]]
                            * 2,
                        )
                        state["GG"][idx].lerp_(outer_product, 1 - state["shampoo_beta"])

        if state["Q"] is None:
            state["Q"] = self.get_orthogonal_matrix(state["GG"])
        if state["step"] > 0 and state["step"] % state["precondition_frequency"] == 0:
            state["Q"] = self.get_orthogonal_matrix_QR(
                state, max_precond_dim, merge_dims
            )
            # state["Q"] = self.get_fast_QR(state, max_precond_dim, merge_dims)

        if state["step"] > 0:
            state["exp_avg"] = self.project(
                state["exp_avg"],
                state,
                merge_dims=merge_dims,
                max_precond_dim=max_precond_dim,
            )

    def project_back(self, grad, state, merge_dims=False, max_precond_dim=10000):
        """
        Projects the gradient back to the original space.
        """
        original_shape = grad.shape
        if merge_dims:
            if self._data_format == "channels_last" and grad.dim() == 4:
                permuted_shape = grad.permute(0, 3, 1, 2).shape
            grad = self.merge_dims(grad, max_precond_dim)
        for mat in state["Q"]:
            if len(mat) > 0:
                grad = torch.tensordot(
                    grad,
                    mat,
                    dims=[[0], [1]],
                )
            else:
                permute_order = list(range(1, len(grad.shape))) + [0]
                grad = grad.permute(permute_order)

        if merge_dims:
            if self._data_format == "channels_last" and len(original_shape) == 4:
                grad = grad.reshape(permuted_shape).permute(0, 2, 3, 1)
            else:
                grad = grad.reshape(original_shape)
        return grad

    def get_orthogonal_matrix(self, mat):
        """
        Computes the eigenbases of the preconditioner using torch.linalg.eigh decomposition.
        """
        matrix = []
        for m in mat:
            if len(m) == 0:
                matrix.append([])
                continue
            if m.data.dtype != torch.float:
                float_data = False
                original_type = m.data.dtype
                original_device = m.data.device
                matrix.append(m.data.float())
            else:
                float_data = True
                matrix.append(m.data)

        final = []
        for m in matrix:
            if len(m) == 0:
                final.append([])
                continue
            try:
                _, Q = torch.linalg.eigh(
                    m + 1e-30 * torch.eye(m.shape[0], device=m.device)
                )
            except:  # pylint: disable=bare-except  # noqa: E722
                _, Q = torch.linalg.eigh(
                    m.to(torch.float64) + 1e-30 * torch.eye(m.shape[0], device=m.device)
                )
                Q = Q.to(m.dtype)
            Q = torch.flip(Q, [1])

            if not float_data:
                Q = Q.to(original_device).type(original_type)
            final.append(Q)
        return final

    def get_orthogonal_matrix_QR(self, state, max_precond_dim=10000, merge_dims=False):
        """
        Computes the eigenbases of the preconditioner using one round of power iteration
        followed by torch.linalg.qr decomposition.
        """
        precond_list = state["GG"]
        orth_list = state["Q"]

        matrix = []
        orth_matrix = []
        for m, o in zip(precond_list, orth_list):
            if len(m) == 0:
                matrix.append([])
                orth_matrix.append([])
                continue
            if m.data.dtype != torch.float:
                float_data = False
                original_type = m.data.dtype
                original_device = m.data.device
                matrix.append(m.data.float())
                orth_matrix.append(o.data.float())
            else:
                float_data = True
                matrix.append(m.data.float())
                orth_matrix.append(o.data.float())

        orig_shape = state["exp_avg_sq"].shape
        if self._data_format == "channels_last" and len(orig_shape) == 4:
            permuted_shape = state["exp_avg_sq"].permute(0, 3, 1, 2).shape
        if merge_dims:
            exp_avg_sq = self.merge_dims(state["exp_avg_sq"], max_precond_dim)
        else:
            exp_avg_sq = state["exp_avg_sq"]

        final = []
        for ind, (m, o) in enumerate(zip(matrix, orth_matrix)):
            if len(m) == 0:
                final.append([])
                continue
            est_eig = torch.diag(o.T @ m @ o)
            sort_idx = torch.argsort(est_eig, descending=True)
            exp_avg_sq = exp_avg_sq.index_select(ind, sort_idx)
            o = o[:, sort_idx]
            power_iter = m @ o
            Q, _ = torch.linalg.qr(power_iter)

            if not float_data:
                Q = Q.to(original_device).type(original_type)
            final.append(Q)

        if merge_dims:
            if self._data_format == "channels_last" and len(orig_shape) == 4:
                exp_avg_sq = exp_avg_sq.reshape(permuted_shape).permute(0, 2, 3, 1)
            else:
                exp_avg_sq = exp_avg_sq.reshape(orig_shape)

        state["exp_avg_sq"] = exp_avg_sq
        return final
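Since the removed file is a standard `torch.optim.Optimizer` subclass, here is a small usage sketch for reference (illustrative only; the toy model and loop are assumptions, not taken from this diff):

```python
# Illustrative usage only: SOAP behaves as a drop-in torch optimizer.
import torch
import torch.nn as nn

model = nn.Linear(16, 4)
optimizer = SOAP(model.parameters(), lr=3e-3, betas=(0.95, 0.95), weight_decay=0.01)

for _ in range(5):
    batch = torch.randn(8, 16)
    loss = model(batch).pow(2).mean()
    loss.backward()
    optimizer.step()  # the first step per parameter only initializes the preconditioner
    optimizer.zero_grad()
```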
@@ -248,6 +248,7 @@ class AxolotlInputConfig(
    val_set_size: float | None = Field(default=0.0)

    sequence_parallel_degree: int | None = None
    heads_k_stride: int | None = None

    special_tokens: SpecialTokensConfig | None = None
    tokens: list[str] | None = None
@@ -1108,7 +1109,7 @@ class AxolotlInputConfig(

    @field_validator("sequence_parallel_degree", mode="before")
    @classmethod
    def check_sequence_parallel_config(cls, value, info):
    def check_sequence_parallel_degree(cls, value, info):
        if not value:
            value = 1


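For readers unfamiliar with pydantic's `field_validator(..., mode="before")`, a simplified standalone sketch of what the renamed validator does (an assumption-based illustration; the real `AxolotlInputConfig` has many more fields and its validator also receives `info`):

```python
from pydantic import BaseModel, field_validator

class SPConfig(BaseModel):
    sequence_parallel_degree: int | None = None
    heads_k_stride: int | None = None

    @field_validator("sequence_parallel_degree", mode="before")
    @classmethod
    def check_sequence_parallel_degree(cls, value):
        # A falsy value (None or 0) means sequence parallelism is disabled -> degree 1.
        return value or 1

print(SPConfig(sequence_parallel_degree=None).sequence_parallel_degree)  # 1
print(SPConfig(sequence_parallel_degree=4).sequence_parallel_degree)     # 4
```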
@@ -52,4 +52,3 @@ class CustomSupportedOptimizers(str, Enum):
    ao_adamw_fp8 = "ao_adamw_fp8"  # pylint: disable=invalid-name
    adopt_adamw = "adopt_adamw"  # pylint: disable=invalid-name
    muon = "muon"  # pylint: disable=invalid-name
    soap = "soap"  # pylint: disable=invalid-name

@@ -110,7 +110,7 @@ class TestRingAttention:
        mock_new_group.return_value = mock_group

        # Call register_ring_attn with size 4
        register_ring_attn(sequence_parallel_degree=4)
        register_ring_attn(sequence_parallel_degree=4, heads_k_stride=1)

        # Verify the number of calls without examining the arguments
        assert mock_new_group.call_count == 2

@@ -201,46 +201,3 @@ class TestCustomOptimizers(unittest.TestCase):

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_soap(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM-135M",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "vicgalle/alpaca-gpt4",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "soap",
                "adam_beta1": 0.9,
                "adam_beta2": 0.95,
                "lr_scheduler": "cosine",
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)