Compare commits


6 Commits

Author SHA1 Message Date
Wing Lian
a9ebff087c remove ref_model when peft model is passed into grpo trainer 2025-02-20 21:53:20 -05:00
NanoCode012
b53a41372f feat: update transformers version to 4.49.0 (#2340) 2025-02-20 21:12:06 -05:00
Wing Lian
02f45e94be calculate sample length fixes and SFT splitting fixes (#2351)
* fix chat template splitting long samples across multiple rows

* make the preprocessing faster
2025-02-20 14:29:58 -05:00
Dan Saunders
954e192f38 quick formatting fix for LoRA optims doc (#2349) 2025-02-19 09:23:31 -05:00
Tobias
8dfadc2b3c Fix sample packing producing longer sequences than specified by sequence_len (#2332)
* Extend MultiPackBatchSampler test to include shorter sequence length and drop long sequences filter

* Fix get_dataset_lengths for datasets that were previously filtered (e.g., with drop_long_seq_in_dataset)

* Update src/axolotl/utils/samplers/utils.py

Fix get_dataset_lengths for datasets that do not have position_ids or length attributes

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>

---------

Co-authored-by: NanoCode012 <kevinvong@rocketmail.com>
2025-02-19 12:02:35 +07:00
Wing Lian
23a9fcb0a7 make sure chatml dpo dataset loading works (#2333) 2025-02-18 16:08:40 -05:00
13 changed files with 234 additions and 132 deletions

View File

@@ -12,6 +12,7 @@ to leverage operator fusion and tensor re-use in order to improve speed and reduce
 memory usage during the forward and backward passes of these calculations.

 We currently support several common model architectures, including (but not limited to):
+
 - `llama`
 - `mistral`
 - `qwen2`

View File

@@ -13,7 +13,7 @@ liger-kernel==0.5.2
 packaging==23.2
 peft==0.14.0
-transformers==4.48.3
+transformers==4.49.0
 tokenizers>=0.21.0
 accelerate==1.3.0
 datasets==3.2.0

View File

@@ -39,6 +39,15 @@ class AxolotlGRPOTrainer(SchedulerMixin, GRPOTrainer):
         self.model = self._enable_gradient_checkpointing(self.model, kwargs["args"])
         # pylint: enable=access-member-before-definition

+        # cleanup the ref_model if we have a peft model passed in
+        # TODO remove this after next major trl release
+        if (
+            self.ref_model  # pylint: disable=access-member-before-definition
+            and is_peft_model(self.model)
+        ):
+            del self.ref_model
+            self.ref_model = None
+
     def _enable_gradient_checkpointing(
         self, model: PreTrainedModel, args: GRPOConfig
     ) -> PreTrainedModel:
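The cleanup above works because a PEFT policy already carries its own reference model: with the adapter disabled, the frozen base weights reproduce the reference policy's outputs, so a separate `ref_model` copy only duplicates memory. A minimal sketch of the idea (illustrative, not repo code; the model name is reused from the test configs elsewhere in this compare):

```python
# Sketch: with a LoRA adapter, the reference policy is recovered by
# temporarily disabling the adapter -- no second model copy required.
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

base = "HuggingFaceTB/SmolLM2-135M"
model = get_peft_model(
    AutoModelForCausalLM.from_pretrained(base),
    LoraConfig(r=8, lora_alpha=16, task_type="CAUSAL_LM"),
)
tokenizer = AutoTokenizer.from_pretrained(base)
inputs = tokenizer("hello world", return_tensors="pt")

with torch.no_grad():
    policy_logits = model(**inputs).logits
    with model.disable_adapter():
        # Adapter bypassed: this forward pass is the reference policy.
        ref_logits = model(**inputs).logits
```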

View File

@@ -4,12 +4,13 @@ import importlib
 import inspect
 import logging
 import types
+from typing import Type

 import torch
 from accelerate.logging import get_logger
 from peft import PeftModelForCausalLM
 from torch import nn
-from transformers.modeling_utils import PreTrainedModel
+from transformers import AutoConfig

 from axolotl.kernels.lora import (
     apply_lora_mlp_geglu,
@@ -95,90 +96,108 @@ def original_apply_o(self: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
     return attn_output


 # pylint: disable=protected-access
-def patch_self_attn_lora(model: PreTrainedModel):
+def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
     """
-    Patches the attention classes in a transformer model with optimized LoRA implementations.
+    Get the appropriate attention class by inspecting the model config.
+
+    Uses dynamic import to support any model architecture that follows
+    the standard transformers naming convention.
+
+    Args:
+        cfg: Dictionary mapping `axolotl` config keys to values.
+
+    Returns:
+        The appropriate attention class for the model.
+
+    Raises:
+        ValueError: If `base_model` not specified or attention class cannot be imported
+        ImportError: If the model module or attention class doesn't exist
+    """
+    if "base_model" not in cfg:
+        raise ValueError("base_model must be specified in config")
+
+    # Get model config without loading the model
+    model_config = AutoConfig.from_pretrained(cfg["base_model"])
+    model_type = model_config.model_type
+
+    # Special case for model_type = "qwen2"
+    if model_type == "qwen2":
+        from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention
+
+        return Qwen2Attention
+
+    try:
+        # Dynamically import the module and attention class
+        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
+        module = __import__(
+            module_path, fromlist=[f"{model_type.capitalize()}Attention"]
+        )
+        attention_cls = getattr(module, f"{model_type.capitalize()}Attention")
+        return attention_cls
+    except (ImportError, AttributeError) as e:
+        raise ValueError(
+            f"Could not import attention class for model_type: {model_type}. "
+            f"Error: {str(e)}"
+        ) from e
+
+
+# pylint: disable=protected-access
+def patch_self_attn_lora(cfg: DictDefault):
+    """
+    Given an `axolotl` config, this method patches the inferred attention class forward
+    pass with optimized LoRA implementations.

     It modifies the attention class to use optimized QKV and output projections. The
     original implementation is preserved and can be restored if needed.

     Args:
-        model: A HuggingFace transformers model.
+        cfg: Dictionary mapping `axolotl` config keys to values.

     Raises:
         AssertionError: If the required code blocks are not found in the attention
             implementation.
     """
-    # Find all attention modules in the model
-    attention_modules = [
-        module
-        for module in model.modules()
-        if "attention" in module.__class__.__name__.lower()
-        and hasattr(module, "forward")
-    ]
+    attention_cls = get_attention_cls_from_config(cfg)

-    if not attention_modules:
-        LOG.warning("No attention modules found in model")
+    # Check if already patched
+    if hasattr(attention_cls, "_original_forward"):
+        LOG.info(f"{attention_cls.__name__} already patched")
         return

-    attention_classes = {type(module) for module in attention_modules}
-    LOG.info(f"Found attention classes: {[cls.__name__ for cls in attention_classes]}")
+    self_attn_forward = inspect.getsource(attention_cls.forward)
+    attention_cls._original_forward = self_attn_forward
+    self_attn_forward, _ = detab_code(self_attn_forward)

-    for attention_cls in attention_classes:
-        # Skip if already patched
-        if hasattr(attention_cls, "_original_forward"):
-            LOG.info(f"{attention_cls.__name__} already patched")
-            continue
+    assert ORIGINAL_QKV_CODE in self_attn_forward, "Original QKV code not found"
+    assert ORIGINAL_O_CODE in self_attn_forward, "Original O code not found"

-        # Get and store original forward implementation
-        self_attn_forward = inspect.getsource(attention_cls.forward)
-        attention_cls._original_forward = self_attn_forward
+    self_attn_forward = self_attn_forward.replace(ORIGINAL_QKV_CODE, PATCHED_QKV_CODE)
+    self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
+    self_attn_forward = self_attn_forward.replace(
+        "def forward(",
+        "def axolotl_attn_forward(",
+        1,
+    )

-        # Remove indentation
-        self_attn_forward, _ = detab_code(self_attn_forward)
+    # Load necessary imports
+    module_name = attention_cls.__module__
+    module = importlib.import_module(module_name)

-        # Verify required code blocks exist
-        assert (
-            ORIGINAL_QKV_CODE in self_attn_forward
-        ), f"Original QKV code not found in {attention_cls.__name__}"
-        assert (
-            ORIGINAL_O_CODE in self_attn_forward
-        ), f"Original O code not found in {attention_cls.__name__}"
+    items_to_import = []
+    for item in dir(module):
+        if item in self_attn_forward:
+            items_to_import.append(item)

-        # Replace code blocks
-        self_attn_forward = self_attn_forward.replace(
-            ORIGINAL_QKV_CODE, PATCHED_QKV_CODE
-        )
-        self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
-        self_attn_forward = self_attn_forward.replace(
-            "def forward(",
-            "def axolotl_attn_forward(",
-            1,
-        )
+    exec(  # pylint: disable=exec-used  # nosec B102
+        f"from {module_name} import ({', '.join(items_to_import)})",
+        globals(),
+    )
+    exec(self_attn_forward, globals())  # pylint: disable=exec-used  # nosec B102

-        # Import necessary symbols from the attention module
-        module_name = attention_cls.__module__
-        module = importlib.import_module(module_name)
-        items_to_import = []
-        for item in dir(module):
-            if item in self_attn_forward:
-                items_to_import.append(item)
-
-        if items_to_import:
-            exec(  # pylint: disable=exec-used  # nosec B102
-                f"from {module_name} import ({', '.join(items_to_import)})",
-                globals(),
-            )
-
-        # Execute the new implementation
-        exec(self_attn_forward, globals())  # pylint: disable=exec-used  # nosec B102
-
-        LOG.info(f"Patched attention class with LoRA optims: {attention_cls.__name__}")
-        attention_cls.forward = (
-            axolotl_attn_forward  # pylint: disable=undefined-variable  # noqa: F821
-        )
+    LOG.info(f"Patched attention class with LoRA optims: {attention_cls.__name__}")
+    attention_cls.forward = (
+        axolotl_attn_forward  # pylint: disable=undefined-variable  # noqa: F821
+    )

def apply_lora_kernel_patches(
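For reference, `get_attention_cls_from_config` leans on a transformers naming convention: the attention class for a given `model_type` lives at `transformers.models.<model_type>.modeling_<model_type>.<ModelType>Attention`. A sketch of an equivalent lookup via `importlib` (note the convention only holds for single-word model types: `"gpt_neox".capitalize()` yields `"Gpt_neox"`, not `GPTNeoX`, so special cases like the `qwen2` branch above remain a useful escape hatch):

```python
# Equivalent lookup sketch using importlib instead of __import__.
import importlib

def resolve_attention_cls(model_type: str):
    # e.g. "llama" -> transformers.models.llama.modeling_llama.LlamaAttention
    module = importlib.import_module(
        f"transformers.models.{model_type}.modeling_{model_type}"
    )
    return getattr(module, f"{model_type.capitalize()}Attention")

print(resolve_attention_cls("llama"))    # LlamaAttention
print(resolve_attention_cls("mistral"))  # MistralAttention
```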

View File

@@ -127,6 +127,8 @@ class ReLoRACallback(TrainerCallback):
         optimizer: torch.optim.Optimizer,
         **_kwargs,
     ):
+        if not optimizer:
+            optimizer = state.optimizer
         if state.global_step > 0 and state.global_step % self.relora_steps == 0:
             checkpoint_folder = os.path.join(
                 args.output_dir,

View File

@@ -272,8 +272,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
                 dict(zip(feature_names, row))
             )
             for key, val in tokenized_prompt.items():
-                for i in range(0, len(val), self.sequence_len):
-                    res[key].append(val[i : i + self.sequence_len])
+                res[key].append(val)

         # If there are no examples left, return an empty dictionary
         if not res:
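The two deleted lines are the crux of the splitting fix: chunking `val` into `sequence_len` windows turned one long conversation into several rows, each cut mid-message. A small illustration of the before/after behavior (illustrative values only, not repo code):

```python
# Old behavior: a 9-token sample with sequence_len=4 became three rows,
# splitting the conversation across row boundaries.
sequence_len = 4
val = [1, 2, 3, 4, 5, 6, 7, 8, 9]
chunks = [val[i : i + sequence_len] for i in range(0, len(val), sequence_len)]
assert chunks == [[1, 2, 3, 4], [5, 6, 7, 8], [9]]

# New behavior: the sample stays whole; over-long rows are dropped later
# by drop_long_seq_in_dataset (see the following hunks) rather than split.
rows = [val]
assert rows == [[1, 2, 3, 4, 5, 6, 7, 8, 9]]
```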

View File

@@ -172,10 +172,11 @@ def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
     )

     try:
-        min_input_len = np.min(get_dataset_lengths(dataset))
-        LOG.debug(f"min_input_len: {min_input_len}")
-        max_input_len = np.max(get_dataset_lengths(dataset))
-        LOG.debug(f"max_input_len: {max_input_len}")
+        ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
+        min_input_len = np.min(ds_lengths)
+        LOG.info(f"min_input_len: {min_input_len}")
+        max_input_len = np.max(ds_lengths)
+        LOG.info(f"max_input_len: {max_input_len}")
     except AttributeError:
         pass

View File

@@ -439,6 +439,11 @@ class ModelLoader:
             patch_mistral_cross_entropy()

+        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
+            from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
+
+            patch_self_attn_lora(self.cfg)
+
     def patch_attention(self) -> None:
         if hasattr(self.model_config, "model_type"):
             if self.model_config.model_type == "mllama" and self.cfg.flash_attention:
@@ -1023,12 +1028,6 @@
         integrate_rope_embeddings()

     def apply_lora_patch(self) -> None:
         """Applies patching relevant to LoRA Triton kernels if enabled."""
-        if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
-            from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
-
-            patch_self_attn_lora(self.model)
-
         if (
             self.cfg.lora_mlp_kernel
             or self.cfg.lora_qkv_kernel
@@ -1182,7 +1181,6 @@
         if self.cfg.adapter is not None:
             log_gpu_memory_usage(LOG, "after adapters", self.model.device)

-        # TODO: Deprecate this.
         self.apply_unsloth_lora_patch()
         self.apply_lora_patch()
@@ -1203,7 +1201,9 @@
     reference_model: bool = False,
     **kwargs,  # pylint: disable=unused-argument
 ) -> Tuple[PreTrainedModel, Optional[PeftConfig]]:
-    """Load a model for a given configuration and tokenizer."""
+    """
+    Load a model for a given configuration and tokenizer.
+    """
     loader = ModelLoader(
         cfg,
         tokenizer,
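Moving from `patch_self_attn_lora(self.model)` to `patch_self_attn_lora(self.cfg)` also changes *when* the patch can run: since the new version rewrites `forward` on the attention class resolved from `AutoConfig`, it works before any weights are loaded, and every instance created afterwards picks it up. A toy sketch of that class-level mechanism (not repo code):

```python
# Class-level patching: rewrite forward on the class before instantiation,
# and all later instances use the patched implementation.
class Attention:
    def forward(self, x):
        return x

def axolotl_attn_forward(self, x):
    return x * 2  # stand-in for the fused LoRA QKV/O projections

Attention._original_forward = Attention.forward  # keep the original around
Attention.forward = axolotl_attn_forward         # patch first...

attn = Attention()                               # ...instantiate later
assert attn.forward(3) == 6
```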

View File

@@ -4,13 +4,17 @@ helper util to calculate dataset lengths

 import numpy as np


-def get_dataset_lengths(dataset):
-    if "length" in dataset.data.column_names:
-        lengths = np.array(dataset.data.column("length"))
-    elif "position_ids" in dataset.data.column_names:
-        position_ids = dataset.data.column("position_ids")
+def get_dataset_lengths(dataset, from_arrow=False):
+    if "length" in dataset.column_names:
+        lengths = np.array(dataset["length"])
+    elif "position_ids" in dataset.column_names:
+        position_ids = dataset["position_ids"]
         lengths = np.array([x[-1] + 1 for x in position_ids])
     else:
-        input_ids = dataset.data.column("input_ids")
-        lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
+        if from_arrow:
+            input_ids = dataset.data.column("input_ids")
+            lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
+        else:
+            input_ids = dataset["input_ids"]
+            lengths = np.array([len(seq) for seq in input_ids])
     return lengths
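The `from_arrow` split addresses the bug named in the commit message: after `Dataset.filter` (e.g. via `drop_long_seq_in_dataset`), the raw Arrow table behind `dataset.data` can still hold the unfiltered rows behind an indices mapping, while normal indexing honors the filter. A sketch of the discrepancy (the exact `num_rows` behavior depends on the `datasets` version):

```python
# Sketch: filtered Dataset vs. its underlying Arrow table.
from datasets import Dataset

ds = Dataset.from_dict({"input_ids": [[1, 2, 3], [1] * 100, [4, 5]]})
ds = ds.filter(lambda ex: len(ex["input_ids"]) <= 10)  # drop the long row

print(len(ds))           # 2 -- the filtered view
print(ds.data.num_rows)  # may still be 3 -- raw table plus indices mapping
print([len(x) for x in ds["input_ids"]])  # [3, 2] -- respects the filter
```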

View File

@@ -9,14 +9,16 @@ from transformers import AutoModelForCausalLM, LlamaForCausalLM
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaAttention

-from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.kernels.lora import (
     apply_lora_mlp_geglu,
     apply_lora_mlp_swiglu,
     apply_lora_o,
     apply_lora_qkv,
 )
-from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches
+from axolotl.monkeypatch.lora_kernels import (
+    apply_lora_kernel_patches,
+    patch_self_attn_lora,
+)
 from axolotl.utils.dict import DictDefault
MODEL_CONFIGS = [
@@ -63,45 +65,15 @@ def small_llama_model():
     return LlamaForCausalLM(LlamaConfig(**config))


 # pylint: disable=duplicate-code
-@pytest.fixture
-def minimal_cfg():
-    "Config of real HuggingFace Hub model"
-    cfg = DictDefault(
-        {
-            "base_model": "HuggingFaceTB/SmolLM2-135M",
-            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
-            "learning_rate": 0.000001,
-            "datasets": [
-                {
-                    "path": "mhenrichsen/alpaca_2k_test",
-                    "type": "alpaca",
-                }
-            ],
-            "micro_batch_size": 1,
-            "gradient_accumulation_steps": 1,
-            "adapter": "lora",
-            "lora_r": 8,
-            "lora_alpha": 16,
-            "lora_dropout": 0.0,
-            "lora_target_linear": True,
-            "sequence_len": 1024,
-            "lora_mlp_kernel": True,
-            "lora_qkv_kernel": True,
-            "lora_o_kernel": True,
-        }
-    )
-
-    return cfg
-
-
-def test_attention_patching_integration(minimal_cfg):
+def test_attention_patching_integration():
     """Test attention patching in integration context."""
+    cfg = {"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0"}
+
     # Store the original implementation
     original_forward = getattr(LlamaAttention, "forward")

-    # Load model
-    _, _ = load_model_and_tokenizer(cfg=minimal_cfg)
+    # Apply patch
+    patch_self_attn_lora(cfg)

     # Get the new forward method
     patched_forward = LlamaAttention.forward
@@ -404,10 +376,38 @@ def test_model_architecture(model_config):


 # pylint: disable=duplicate-code
-def test_kernel_training_integration(minimal_cfg):
+def test_kernel_training_integration():
     """Test model loading with kernel patches enabled."""
+    from axolotl.cli.utils import load_model_and_tokenizer
+
+    # Create minimal config
+    cfg = DictDefault(
+        {
+            "base_model": "HuggingFaceTB/SmolLM2-135M",
+            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
+            "learning_rate": 0.000001,
+            "datasets": [
+                {
+                    "path": "mhenrichsen/alpaca_2k_test",
+                    "type": "alpaca",
+                }
+            ],
+            "micro_batch_size": 1,
+            "gradient_accumulation_steps": 1,
+            "adapter": "lora",
+            "lora_r": 8,
+            "lora_alpha": 16,
+            "lora_dropout": 0.0,
+            "lora_target_linear": True,
+            "sequence_len": 1024,
+            "lora_mlp_kernel": True,
+            "lora_qkv_kernel": True,
+            "lora_o_kernel": True,
+        }
+    )
+
     # Load model
-    model, _ = load_model_and_tokenizer(cfg=minimal_cfg)
+    model, _ = load_model_and_tokenizer(cfg=cfg)

     # Verify correct activation function
     layer = model.model.model.layers[0]

View File

@@ -125,6 +125,12 @@ def fixture_llama3_tokenizer():
     return tokenizer


+@pytest.fixture(name="smollm2_tokenizer", scope="session", autouse=True)
+def fixture_smollm2_tokenizer():
+    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+
+    return tokenizer
+
 @pytest.fixture(name="mistralv03_tokenizer", scope="session", autouse=True)
 def fixture_mistralv03_tokenizer():
     tokenizer = AutoTokenizer.from_pretrained(

View File

@@ -0,0 +1,61 @@
+"""
+Tests for loading DPO preference datasets with chatml formatting
+"""
+import unittest
+
+import pytest
+
+from axolotl.prompt_strategies.dpo import load as load_dpo
+from axolotl.utils.data.rl import load_prepare_preference_datasets
+from axolotl.utils.dict import DictDefault
+
+
+@pytest.fixture(name="minimal_dpo_cfg")
+def fixture_cfg():
+    return DictDefault(
+        {
+            "base_model": "HuggingFaceTB/SmolLM2-135M",
+            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
+            "rl": "dpo",
+            "learning_rate": 0.000001,
+            "micro_batch_size": 1,
+            "gradient_accumulation_steps": 1,
+            "special_tokens": {
+                "pad_token": "<|endoftext|>",
+            },
+            "sequence_len": 2048,
+        }
+    )
+
+
+class TestDPOChatml:
+    """
+    Test loading DPO preference datasets with chatml formatting
+    """
+
+    def test_default(self, minimal_dpo_cfg):
+        cfg = DictDefault(
+            {
+                "datasets": [
+                    {
+                        "path": "argilla/distilabel-intel-orca-dpo-pairs",
+                        "type": "chatml",
+                        "split": "train[:1%]",
+                    }
+                ]
+            }
+            | minimal_dpo_cfg
+        )
+
+        # test that dpo.load works
+        load_dpo("chatml", cfg)
+
+        # now actually load the datasets with the strategy
+        train_ds, _ = load_prepare_preference_datasets(cfg)
+
+        assert train_ds[0]["prompt"].startswith("<|im_start|>")
+        assert train_ds[0]["prompt"].endswith("<|im_start|>assistant\n")
+        assert "chosen" in train_ds[0]
+        assert "rejected" in train_ds[0]
+
+
+if __name__ == "__main__":
+    unittest.main()

View File

@@ -7,6 +7,7 @@ from transformers import AutoTokenizer
 from axolotl.datasets import TokenizedPromptDataset
 from axolotl.prompt_strategies.completion import load
 from axolotl.utils.collators import V2BatchSamplerDataCollatorForSeq2Seq
+from axolotl.utils.data.utils import drop_long_seq_in_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
@@ -18,11 +19,6 @@ def fixture_tokenizer():
     return tokenizer


-@pytest.fixture(name="max_seq_length")
-def fixture_max_seq_length():
-    return 4096
-
-
 class TestBatchedSamplerPacking:
     """
     Test class for packing streaming dataset sequences
@@ -37,6 +33,7 @@
             (2, 2),
         ],
     )
+    @pytest.mark.parametrize("max_seq_length", [4096, 512])
     def test_packing(self, batch_size, num_workers, tokenizer, max_seq_length):
         import axolotl.monkeypatch.data.batch_dataset_fetcher  # pylint: disable=unused-import # noqa: F401
@@ -62,6 +59,9 @@
             dataset,
         )
         train_dataset = concatenate_datasets([dataset_wrapper])
+
+        train_dataset = drop_long_seq_in_dataset(train_dataset, cfg)
+
         lengths = get_dataset_lengths(train_dataset)
         batch_sampler = MultipackBatchSampler(
             sampler=RandomSampler(train_dataset),
@@ -90,7 +90,7 @@
                 batch_idxs.extend(pack)

         for batch in loader:
-            assert len(batch["input_ids"]) <= batch_size * max_seq_length
+            assert batch["input_ids"].numel() <= batch_size * max_seq_length
+            assert batch["input_ids"].shape[1] == max_seq_length

         original_idxs = set(range(len(train_dataset)))
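The assertion change in the last hunk is worth spelling out: for a 2-D `input_ids` tensor, `len()` returns only the batch dimension, so the old bound `len(batch["input_ids"]) <= batch_size * max_seq_length` was almost always trivially satisfied and could not catch packs that exceeded the token budget; `numel()` counts every token. A quick illustration:

```python
import torch

batch = torch.zeros(2, 512, dtype=torch.long)  # (batch_size, seq_len)
assert len(batch) == 2             # just the number of rows
assert batch.numel() == 2 * 512    # the actual token count being bounded
```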