updates

2025-04-25 02:28:38 +00:00
parent 6810f0ee19
commit 926dc4af90
6 changed files with 594 additions and 532 deletions
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -1009,6 +1009,8 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
            training_args_kwargs["dataloader_prefetch_factor"] = (
                self.cfg.dataloader_prefetch_factor
            )
        if self.cfg.seed:
            training_args_kwargs["seed"] = self.cfg.seed
        if self.cfg.gradient_checkpointing:
            training_args_kwargs["gradient_checkpointing"] = (
                self.cfg.gradient_checkpointing
--- a/src/axolotl/core/trainers/grpo/trainer.py
+++ b/src/axolotl/core/trainers/grpo/trainer.py
--- a/src/axolotl/core/trainers/mixins/sequence_parallel.py
+++ b/src/axolotl/core/trainers/mixins/sequence_parallel.py
@@ -4,7 +4,6 @@ Module for Axolotl trainer sequence parallelism mixin and training context manag
 import functools
 import logging
 from contextlib import contextmanager
 import torch
 import torch.distributed as dist
@@ -14,14 +13,66 @@ from torch.utils.data import DistributedSampler, Sampler
 from torch.utils.hooks import RemovableHandle
 from axolotl.monkeypatch.attention.ring_attn import (
    RingAttnFunc,
    get_ring_attn_group,
    update_ring_attn_params,
 )
 from axolotl.utils.schemas.enums import RingAttnFunc
 LOG = logging.getLogger(__name__)
 def _handle_logits_to_keep(
    logits_to_keep,
    local_rank: int,
    local_world_size: int,
    ring_attn_func: RingAttnFunc,
    total_seq_len: int,
 ):
    """
    Handle logits_to_keep parameter for sequence parallelism.
    Args:
        logits_to_keep: Integer or tensor indicating which positions to compute logits
            for.
        local_rank: Rank in the sequence parallel group.
        local_world_size: World size of the sequence parallel group.
        ring_attn_func: Ring attention function being used.
        total_seq_len: Full sequence length.
    Returns:
        Adjusted logits_to_keep appropriate for this rank's sharded sequence
    """
    print("start of _handle_logits_to_keep")
    print(dist.get_rank(), logits_to_keep)
    # No transformation needed if logits_to_keep is None
    if logits_to_keep is None:
        return None
    assert isinstance(
        logits_to_keep, int
    ), "sequence parallelism currently only supports integer logits_to_keep"
    assert ring_attn_func in [
        RingAttnFunc.VARLEN_LLAMA3,
        RingAttnFunc.BATCH_RING,
    ], "if specifying logits_to_keep, sequence parallelism currently only supports 'batch_ring' and 'varlen_llama3' `ring_attn_func`s"
    # For standard sharding, each rank gets a contiguous chunk
    chunk_size = total_seq_len // local_world_size
    start_idx = local_rank * chunk_size
    end_idx = start_idx + chunk_size
    # Check if logits_to_keep is in this rank's range
    if start_idx <= logits_to_keep < end_idx:
        print("end of _handle_logits_to_keep")
        print(dist.get_rank(), logits_to_keep - start_idx)
        return logits_to_keep - start_idx
    else:
        print("end of _handle_logits_to_keep")
        print(dist.get_rank(), -1)
        return -1
 def apply_sequence_parallelism(
    batch: dict[str, torch.Tensor],
    local_rank: int,
@@ -32,10 +83,10 @@ def apply_sequence_parallelism(
    Apply sequence parallelism slicing to a batch.
    Args:
-        batch: Batch dictionary (e.g., input_ids, attention_mask, etc.)
+        batch: Batch dictionary (e.g., input_ids, attention_mask, etc.).
-        local_rank: Local rank in the sequence parallel group
+        local_rank: Local rank in the sequence parallel group.
-        local_world_size: World size of the sequence parallel group
+        local_world_size: World size of the sequence parallel group.
-        ring_attn_func: The ring attention function to use
+        ring_attn_func: The ring attention function to use.
    Returns:
        Sliced batch dictionary.
@@ -48,12 +99,10 @@ def apply_sequence_parallelism(
    total_seq_len = batch["input_ids"].size(1)
    for key in batch:
        if (
-            key in batch
+            isinstance(batch[key], torch.Tensor)
            and isinstance(batch[key], torch.Tensor)
            and batch[key].dim() > 1
            and batch[key].size(1) == total_seq_len
        ):
            if ring_attn_func in [
                RingAttnFunc.VARLEN_LLAMA3,
                RingAttnFunc.BATCH_RING,
@@ -78,6 +127,14 @@ def apply_sequence_parallelism(
                    dim=1,
                ).transpose(1, 2)
                batch[key] = tensor[:, local_rank].contiguous()
        if key == "logits_to_keep":
            batch[key] = _handle_logits_to_keep(
                logits_to_keep=batch[key],
                local_rank=local_rank,
                local_world_size=local_world_size,
                ring_attn_func=ring_attn_func,
                total_seq_len=total_seq_len,
            )
    return batch
@@ -205,8 +262,11 @@ class SequenceParallelContextManager:
        # Forward post-hook to gather outputs
        def sequence_parallel_post_hook(_, __, output):
            print("start of sequence_parallel_post_hook")
            # Gather the sharded outputs
-            return self.gather_outputs(output)
+            output = self.gather_outputs(output)
            print("end of sequence_parallel_post_hook")
            return output
        # Register both hooks
        self.hook_handles.append(
--- a/src/axolotl/utils/schemas/config.py
+++ b/src/axolotl/utils/schemas/config.py
@@ -18,7 +18,6 @@ from pydantic import (
 )
 from transformers.utils.import_utils import is_torch_npu_available
 from axolotl.monkeypatch.attention.ring_attn import RingAttnFunc
 from axolotl.utils.distributed import is_main_process
 from axolotl.utils.schemas.datasets import (
    DatasetConfig,
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -1,6 +1,4 @@
-"""
+"""E2E tests for mixtral"""
 E2E tests for mixtral
 """
 import logging
 import os
@@ -99,6 +97,7 @@ class TestMixtral(unittest.TestCase):
                "bf16": "auto",
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
--- a/tests/e2e/patched/test_sp.py
+++ b/tests/e2e/patched/test_sp.py
@@ -12,12 +12,12 @@ from accelerate.state import PartialState
 from axolotl.core.trainers.mixins.sequence_parallel import apply_sequence_parallelism
 from axolotl.monkeypatch.attention.ring_attn import (
    RingAttnFunc,
    get_ring_attn_group,
    register_ring_attn,
    set_ring_attn_group,
 )
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.schemas.enums import RingAttnFunc
@pytest.fixture