GRPO sequence parallelism (SP) support

Dan Saunders
2025-04-09 00:46:05 +00:00
parent e55dce9995
commit 11b6803ff4
2 changed files with 63 additions and 6 deletions

View File

@@ -1048,6 +1048,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
         if self.cfg.rpo_alpha is not None:
             training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha

+        training_args_kwargs["sequence_parallel_degree"] = (
+            self.cfg.sequence_parallel_degree
+        )
+
         training_args_cls = None
         blocklist_args_kwargs = []
         if self.cfg.rl == "simpo":
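
Note: unlike rpo_alpha, sequence_parallel_degree is forwarded unconditionally, so the training-args class constructed downstream must define that field. A minimal sketch of what such a field could look like on a GRPOConfig subclass; the class name AxolotlGRPOConfig and the default value are illustrative assumptions, not taken from this commit:

from dataclasses import dataclass, field

from trl import GRPOConfig


@dataclass
class AxolotlGRPOConfig(GRPOConfig):  # hypothetical name, for illustration only
    # Number of ranks that jointly process each sequence; 1 disables SP
    sequence_parallel_degree: int = field(
        default=1,
        metadata={"help": "Degree of sequence parallelism for ring attention."},
    )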

View File

@@ -1,23 +1,25 @@
-"""
-Axolotl GRPO trainer
-"""
+"""Axolotl GRPO trainer"""
+
+from contextlib import nullcontext
+
+import torch
+import torch.distributed as dist
 from accelerate.utils import is_deepspeed_available, is_peft_model
 from trl import GRPOTrainer
 from trl.extras.profiling import profiling_decorator

 from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
+from axolotl.monkeypatch.attention.ring_attn import (
+    get_ring_attn_group,
+    update_ring_attn_params,
+)

 if is_deepspeed_available():
     import deepspeed


 class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
-    """
-    Extend the base GRPOTrainer for axolotl helpers
-    """
+    """Extend the base GRPOTrainer for axolotl helpers"""

     _tag_names = ["trl", "grpo", "axolotl"]
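
Note: the get_ring_attn_group import above is what ties the trainer to the ring-attention patch. As a hedged sketch (not axolotl's actual implementation), such a helper conceptually partitions all ranks into contiguous process groups of size sequence_parallel_degree and returns the group containing the current rank:

import torch.distributed as dist


def build_ring_attn_group(sp_degree: int):
    # Illustrative only; the real monkeypatch also rewires attention to
    # communicate along this group.
    world_size = dist.get_world_size()
    assert world_size % sp_degree == 0, "world size must be divisible by sp_degree"
    my_group = None
    for start in range(0, world_size, sp_degree):
        ranks = list(range(start, start + sp_degree))
        # new_group is collective: every rank must call it for every group
        group = dist.new_group(ranks=ranks)
        if dist.get_rank() in ranks:
            my_group = group
    return my_group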
@@ -67,3 +69,54 @@ class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
         # Reset cache on main process
         if self.accelerator.is_main_process:
             self.vllm_client.reset_prefix_cache()
+
+    # Get the per-token log probabilities for the completions for the model and the reference model
+    def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep):
+        if self.args.sequence_parallel_degree > 1:
+            sp_group = get_ring_attn_group()
+            self.local_rank = dist.get_rank(group=sp_group)
+            self.local_world_size = dist.get_world_size(group=sp_group)
+
+            # Pad the sequence so it divides evenly across the SP group
+            total_seq_len = input_ids.shape[1]
+            remainder = total_seq_len % self.local_world_size
+            if remainder != 0:
+                pad_len = self.local_world_size - remainder
+                pad_token_id = self.processing_class.pad_token_id or 0
+                padding = torch.full(
+                    (input_ids.shape[0], pad_len),
+                    pad_token_id,
+                    dtype=input_ids.dtype,
+                    device=input_ids.device,
+                )
+                input_ids = torch.cat([input_ids, padding], dim=1)
+
+                # Also pad the attention mask if it exists
+                if attention_mask is not None:
+                    attn_padding = torch.zeros(
+                        (attention_mask.shape[0], pad_len),
+                        dtype=attention_mask.dtype,
+                        device=attention_mask.device,
+                    )
+                    attention_mask = torch.cat([attention_mask, attn_padding], dim=1)
+
+                # Update total_seq_len after padding
+                total_seq_len += pad_len
+
+            # Get local (start, end) for sequence parallelism slicing
+            slice_size = total_seq_len // self.local_world_size
+            start = self.local_rank * slice_size
+            end = start + slice_size
+
+            # Slice data for sequence parallel processing
+            input_ids = input_ids[:, start:end]
+            if attention_mask is not None:
+                attention_mask = attention_mask[:, start:end]
+
+        return super()._get_per_token_logps(model, input_ids, attention_mask, logits_to_keep)
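
To sanity-check the pad-and-slice arithmetic above, here is a self-contained sketch with no distributed setup (ranks are simulated in a loop; pad_and_slice is a hypothetical helper mirroring the method's logic): a batch of length-10 sequences is padded to length 12 and shards evenly across a 4-rank SP group.

import torch


def pad_and_slice(input_ids, attention_mask, local_rank, local_world_size, pad_token_id=0):
    # Pad the sequence length up to a multiple of the SP world size
    total_seq_len = input_ids.shape[1]
    remainder = total_seq_len % local_world_size
    if remainder != 0:
        pad_len = local_world_size - remainder
        pad = torch.full((input_ids.shape[0], pad_len), pad_token_id, dtype=input_ids.dtype)
        input_ids = torch.cat([input_ids, pad], dim=1)
        if attention_mask is not None:
            zeros = torch.zeros((attention_mask.shape[0], pad_len), dtype=attention_mask.dtype)
            attention_mask = torch.cat([attention_mask, zeros], dim=1)
        total_seq_len += pad_len
    # Each rank takes an equal, contiguous shard of the padded sequence
    slice_size = total_seq_len // local_world_size
    start = local_rank * slice_size
    end = start + slice_size
    mask_shard = attention_mask[:, start:end] if attention_mask is not None else None
    return input_ids[:, start:end], mask_shard


ids = torch.randint(5, 100, (2, 10))  # batch of 2, sequence length 10
mask = torch.ones(2, 10, dtype=torch.long)
for rank in range(4):  # simulate a 4-rank sequence-parallel group
    shard_ids, shard_mask = pad_and_slice(ids, mask, rank, 4)
    assert shard_ids.shape == (2, 3)  # 10 padded to 12; 12 // 4 == 3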