Compare commits
5 Commits
liger-dpo
...
activation
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7ac9cbebb9 | ||
|
|
15f2fa4c8e | ||
|
|
43a2f9a155 | ||
|
|
8b79f1cbf6 | ||
|
|
3872d5eaed |
@@ -12,7 +12,7 @@ liger-kernel==0.4.2
|
||||
|
||||
packaging==23.2
|
||||
peft==0.14.0
|
||||
transformers==4.47.0
|
||||
transformers>=4.46.3
|
||||
tokenizers>=0.20.1
|
||||
accelerate==1.2.0
|
||||
datasets==3.1.0
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,220 +0,0 @@
|
||||
"""
|
||||
extra axolotl specific training args
|
||||
"""
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from transformers import TrainingArguments
|
||||
from trl import CPOConfig, DPOConfig, KTOConfig, ORPOConfig, RewardConfig
|
||||
|
||||
|
||||
@dataclass
class AxolotlTrainingMixins:
    """
    Mixin class for the Axolotl training args.

    Holds every Axolotl-specific knob that is layered on top of the HF/trl
    config classes (sample packing, ReLoRA, LISA, benchmark evals, etc.).
    All fields have defaults so the mixin can be combined with any of the
    downstream trainer configs without extra required arguments.
    """

    # pylint: disable=duplicate-code
    model_type: Optional[str] = field(
        default=None, metadata={"help": "HF model configuration model_type."}
    )
    lr_quadratic_warmup: bool = field(
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
    )
    pretraining: bool = field(
        default=False,
        metadata={
            "help": "Indicates to trainer whether we are doing continued pretraining."
        },
    )
    sample_packing: bool = field(
        default=False,
        metadata={"help": "Use sample packing for efficient training."},
    )
    multipack_real_batches: bool = field(
        default=False,
        metadata={"help": "Use real batches for efficient training."},
    )
    # None (rather than False) so downstream code can distinguish
    # "unset" from an explicit opt-out.
    eval_sample_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Use sample packing for efficient evals."},
    )
    sample_packing_efficiency: float = field(
        default=1.0,
        metadata={"help": "Sample packing efficiency for calculating batch length."},
    )
    sample_packing_bin_size: int = field(
        default=200,
        metadata={
            "help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
        },
    )
    sample_packing_group_size: int = field(
        default=100000,
        metadata={
            "help": "The number of samples to group together for packing. Increase for better packing."
        },
    )
    max_seq_length: int = field(
        default=2048,
        metadata={"help": "The maximum sequence length the model can handle"},
    )
    relora_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for ReLoRA"},
    )
    relora_warmup_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
    )
    # fix: help text previously duplicated relora_warmup_steps' description
    relora_anneal_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how many steps to anneal the LR over after reset for ReLoRA"},
    )
    relora_prune_ratio: Optional[float] = field(
        default=0.9,
        metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
    )
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
    bench_dataset: Optional[str] = field(
        default="pharaouk/dharma-1/dharma_1_mini.json",
        metadata={
            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
        },
    )
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
    do_causal_lm_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
        },
    )
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )
    dataloader_prefetch_factor: Optional[int] = field(
        default=None,
        metadata={"help": "prefetch_factor argument to the dataloader"},
    )
    cosine_min_lr_ratio: Optional[float] = field(
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
    cosine_constant_lr_ratio: Optional[float] = field(
        default=None,
        metadata={
            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
        },
    )
    loraplus_lr_ratio: Optional[float] = field(
        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
    )
    loraplus_lr_embedding: Optional[float] = field(
        default=1e-6,
        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )
    embedding_lr_scale: Optional[float] = field(
        default=None,
        metadata={"help": "Scale the learning rate for the embedding layers."},
    )
    embedding_lr: Optional[float] = field(
        default=None,
        metadata={"help": "absolute learning rate for the embedding layers."},
    )
    qlora: bool = field(
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
    # fix: previously had no help metadata
    orpo_alpha: Optional[float] = field(
        default=None,
        metadata={"help": "alpha weighting for the ORPO odds-ratio loss term"},
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )
    curriculum_sampling: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
    alternate_optimizer: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate optimizer to the HF trainer"
        },
    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "Chat template converting chat messages to text"},
    )
|
||||
|
||||
|
||||
@dataclass
class AxolotlTrainingArguments(AxolotlTrainingMixins, TrainingArguments):
    """
    Training arguments for Causal trainer

    This code is duplicated due to HF TrainingArguments not setting output_dir with a default value
    so it can't be used as a mixin.
    """
|
||||
|
||||
|
||||
@dataclass
class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig):
    """
    DPO config for DPO training

    Combines trl's DPOConfig with the Axolotl-specific mixin fields.
    """
|
||||
|
||||
|
||||
@dataclass
class AxolotlORPOConfig(AxolotlTrainingMixins, ORPOConfig):
    """
    ORPO config for ORPO training

    Combines trl's ORPOConfig with the Axolotl-specific mixin fields.
    """
|
||||
|
||||
|
||||
@dataclass
class AxolotlKTOConfig(AxolotlTrainingMixins, KTOConfig):
    """
    KTO config for KTO training

    Combines trl's KTOConfig with the Axolotl-specific mixin fields.
    """
|
||||
|
||||
|
||||
@dataclass
class AxolotlCPOConfig(AxolotlTrainingMixins, CPOConfig):
    """
    CPO config for CPO training
    """

    # gamma margin term; presumably only consumed when running the SimPO
    # variant of the CPO loss — confirm against the trainer that reads it
    simpo_gamma: Optional[float] = field(
        default=None,
        metadata={"help": "simpo gamma parameter"},
    )
|
||||
|
||||
|
||||
@dataclass
class AxolotlRewardConfig(AxolotlTrainingMixins, RewardConfig):
    """
    Reward config for Reward training

    Combines trl's RewardConfig with the Axolotl-specific mixin fields.
    """
|
||||
@@ -36,8 +36,6 @@ class LigerArgs(BaseModel):
|
||||
liger_cross_entropy: Optional[bool] = None
|
||||
liger_fused_linear_cross_entropy: Optional[bool] = None
|
||||
|
||||
liger_pref_rl: Optional[bool] = None
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_deprecated_swiglu(cls, data):
|
||||
|
||||
@@ -1,253 +0,0 @@
|
||||
"""
|
||||
integration of liger dpo kernels with dpotrainer
|
||||
"""
|
||||
from typing import Dict, List, Literal, Union
|
||||
|
||||
import torch
|
||||
from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss
|
||||
from liger_kernel.transformers.trainer.orpo_trainer import _FSDPForwardRedirection
|
||||
from torch import nn
|
||||
from torch.distributed.fsdp import FullyShardedDataParallel
|
||||
|
||||
from axolotl.core.trainers.base import AxolotlDPOTrainer
|
||||
|
||||
|
||||
class AxolotlLigerDPOTrainer(AxolotlDPOTrainer):
    """
    Extend the DPO Trainer to use LIGER kernels for DPO
    """

    def concatenated_forward(
        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
    ):
        """
        Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together,
        and compute the DPO loss using Liger's fused kernel.

        This method replaces the original `concatenated_forward` implementation to use Liger.

        Returns a 6-tuple:
        (loss, policy_chosen_logps, policy_rejected_logps,
         policy_chosen_logits_mean, policy_rejected_logits_mean, policy_nll_loss)
        """

        # Prepare concatenated inputs
        concatenated_batch = self.concatenated_inputs(batch, self.padding_value)

        # Extract concatenated inputs
        prompt_input_ids = concatenated_batch["prompt_input_ids"]
        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
        completion_input_ids = concatenated_batch["completion_input_ids"]
        completion_attention_mask = concatenated_batch["completion_attention_mask"]

        # For encoder-decoder models, you'd need to construct decoder_input_ids, etc.
        # This example assumes a causal decoder-only model.
        input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1)
        attention_mask = torch.cat(
            (prompt_attention_mask, completion_attention_mask), dim=1
        )

        # Align inputs by removing leading padding
        # NOTE(review): per-row torch.roll assumes left-padded prompts (mask
        # starts with zeros) — confirm the collator pads on the left
        for i in range(attention_mask.size(0)):
            first_one_idx = torch.nonzero(attention_mask[i])[0].item()
            input_ids[i] = torch.roll(input_ids[i], shifts=-first_one_idx)
            attention_mask[i] = torch.roll(attention_mask[i], shifts=-first_one_idx)

        # Remove trailing empty columns
        empty_cols = torch.sum(attention_mask, dim=0) == 0
        if empty_cols.any():
            first_empty_col = torch.nonzero(empty_cols)[0].item()
            input_ids = input_ids[:, :first_empty_col]
            attention_mask = attention_mask[:, :first_empty_col]

        # Truncate to the configured maximum length, if any
        if self.args.max_length is not None:
            input_ids = input_ids[:, : self.args.max_length]
            attention_mask = attention_mask[:, : self.args.max_length]

        # Labels are completion_input_ids shifted by one token right
        # For causal LM, labels are the completion part only
        labels = torch.cat(
            (torch.zeros_like(prompt_input_ids), completion_input_ids), dim=1
        )
        labels = labels[:, 1:]  # shift left by one
        attention_mask = attention_mask[:, 1:]
        labels = labels[:, : attention_mask.size(1)]

        # Mask out the prompt portion from loss
        labels[~attention_mask.bool()] = self.label_pad_token_id

        # Prepare reference model hidden states if ref_model exists
        use_ref_model = self.ref_model is not None and not self.reference_free

        # Run main model forward to get hidden states
        # If using FSDP, redirect forward calls
        if isinstance(model, FullyShardedDataParallel):
            outputs = _FSDPForwardRedirection()(
                model,
                model._fsdp_wrapped_module.model,  # pylint: disable=protected-access
                input_ids,
                attention_mask=attention_mask,
                use_cache=False,
            )
        else:
            # If model is a DataParallel, unwrap
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            # .model is the backbone (sans lm_head); we need hidden states,
            # not logits, because Liger fuses the lm_head projection into
            # the loss kernel below
            outputs = model.model(
                input_ids, attention_mask=attention_mask, use_cache=False
            )

        last_hidden_state = outputs.last_hidden_state

        ref_last_hidden_state = None
        if use_ref_model:
            ref_model = self.ref_model
            if isinstance(ref_model, FullyShardedDataParallel):
                with torch.no_grad():
                    ref_outputs = _FSDPForwardRedirection()(
                        ref_model,
                        ref_model._fsdp_wrapped_module.model,  # pylint: disable=protected-access
                        input_ids,
                        attention_mask=attention_mask,
                        use_cache=False,
                    )
            else:
                if isinstance(ref_model, torch.nn.DataParallel):
                    ref_model = ref_model.module
                with torch.no_grad():
                    ref_outputs = ref_model.model(
                        input_ids, attention_mask=attention_mask, use_cache=False
                    )
            ref_last_hidden_state = ref_outputs.last_hidden_state

        # Retrieve lm_head parameters
        lm_head = model.lm_head
        ref_lm_head = (
            self.ref_model.lm_head
            if (use_ref_model and self.ref_model is not None)
            else None
        )

        # Use Liger fused DPO loss
        dpo_loss_fn = LigerFusedLinearDPOLoss(
            ignore_index=self.label_pad_token_id,
            beta=self.beta,
            compute_nll_loss=False,
            compiled=True,
            use_ref_model=use_ref_model,
        )

        # call fused Liger DPO
        if use_ref_model:
            loss_acc, aux_outputs = dpo_loss_fn(
                lm_head.weight,  # lin_weight
                last_hidden_state,  # _input
                labels,  # target
                bias=lm_head.bias,
                ref_input=ref_last_hidden_state,
                ref_weight=ref_lm_head.weight,
                ref_bias=ref_lm_head.bias,
            )

            (
                policy_chosen_logps,
                policy_rejected_logps,
                policy_chosen_logits_mean,
                policy_rejected_logits_mean,
                policy_nll_loss,
            ) = aux_outputs[:5]

        else:
            # No reference model scenario: Liger kernel treats ref_logps as 0
            loss_acc, aux_outputs = dpo_loss_fn(
                lm_head.weight,
                last_hidden_state,
                labels,
                bias=lm_head.bias,
            )
            (
                policy_chosen_logps,
                policy_rejected_logps,
                policy_chosen_logits_mean,
                policy_rejected_logits_mean,
                policy_nll_loss,
            ) = aux_outputs[:5]

        # Add aux loss if enabled
        if self.aux_loss_enabled and hasattr(outputs, "aux_loss"):
            loss_acc = loss_acc + self.aux_loss_coef * outputs.aux_loss

        # Add RPO loss if requested (RPO is a variant that adds NLL loss)
        if self.args.rpo_alpha is not None:
            # policy_nll_loss: average negative log-likelihood of chosen completions
            loss_acc = loss_acc + self.args.rpo_alpha * policy_nll_loss.mean()

        return (
            loss_acc,
            policy_chosen_logps,
            policy_rejected_logps,
            policy_chosen_logits_mean,
            policy_rejected_logits_mean,
            policy_nll_loss,
        )

    def get_batch_loss_metrics(
        self,
        model,
        batch: Dict[str, Union[List, torch.LongTensor]],
        train_eval: Literal["train", "eval"] = "train",
    ):
        """
        Compute the DPO loss and other metrics for a given batch using the Liger fused kernel.

        Returns (mean_loss, metrics_dict); metric keys are prefixed with
        `train_` or `eval_` according to `train_eval`.
        """
        metrics = {}

        (
            loss,
            policy_chosen_logps,
            policy_rejected_logps,
            policy_chosen_logits_mean,
            policy_rejected_logits_mean,
            policy_nll_loss,
        ) = self.concatenated_forward(model, batch)

        # For metrics, we approximate chosen/rejected rewards as beta * (log π(y) - log π_ref(y)) if ref model used.
        # If no ref model is used, we can't compute reward_accuracies meaningfully. For simplicity, we assume ref_model presence.
        if self.ref_model is not None and not self.reference_free:
            # If you want full parity with original DPOTrainer metrics (like chosen_rewards, rejected_rewards),
            # you'd need to run reference forward or store reference log ps. The Liger kernel currently doesn't
            # return ref_chosen_logps/ref_rejected_logps explicitly. By design, Liger directly computes DPO.
            #
            # Here we approximate chosen_rewards and rejected_rewards from the difference in chosen/rejected logps.
            # Since Liger DPO does not output ref logps separately, you may need to modify the Liger kernel to
            # also output them if you need all the metrics. For now, we'll skip them or provide a placeholder.

            # Placeholder: chosen/rejected "rewards" can't be retrieved directly from Liger as-is.
            # If needed, integrate ref_chosen_logps/ref_rejected_logps into Liger kernel returns.
            chosen_rewards = policy_chosen_logps * self.beta  # approximation
            rejected_rewards = policy_rejected_logps * self.beta  # approximation
            reward_accuracies = (chosen_rewards > rejected_rewards).float()
            metrics[f"{train_eval}_rewards/chosen"] = chosen_rewards.mean().cpu().item()
            metrics[f"{train_eval}_rewards/rejected"] = (
                rejected_rewards.mean().cpu().item()
            )
            metrics[f"{train_eval}_rewards/accuracies"] = (
                reward_accuracies.mean().cpu().item()
            )
            metrics[f"{train_eval}_rewards/margins"] = (
                (chosen_rewards - rejected_rewards).mean().cpu().item()
            )

        metrics[f"{train_eval}_logps/chosen"] = policy_chosen_logps.mean().cpu().item()
        metrics[f"{train_eval}_logps/rejected"] = (
            policy_rejected_logps.mean().cpu().item()
        )
        metrics[f"{train_eval}_logits/chosen"] = (
            policy_chosen_logits_mean.detach().cpu().item()
        )
        metrics[f"{train_eval}_logits/rejected"] = (
            policy_rejected_logits_mean.detach().cpu().item()
        )

        # nll metric only meaningful when the RPO auxiliary NLL term is in use
        if self.args.rpo_alpha is not None:
            metrics[f"{train_eval}_nll_loss"] = (
                policy_nll_loss.mean().detach().cpu().item()
            )

        return loss.mean(), metrics
|
||||
170
src/axolotl/monkeypatch/models/llama/modeling_llama.py
Normal file
170
src/axolotl/monkeypatch/models/llama/modeling_llama.py
Normal file
@@ -0,0 +1,170 @@
|
||||
import contextlib
|
||||
import inspect
|
||||
import types
|
||||
|
||||
from torchtune.training import OffloadActivations
|
||||
from transformers import LlamaConfig, LlamaForCausalLM
|
||||
|
||||
from axolotl.monkeypatch.unsloth_ import detab_code
|
||||
|
||||
HF_MODEL_OUTPUTS = """
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
""".lstrip()
|
||||
|
||||
PATCHED_HF_MODEL_OUTPUTS = """
|
||||
with self.act_offloading_ctx_manager:
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
""".lstrip()
|
||||
|
||||
LCE_MODEL_OUTPUTS = """
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
""".lstrip()
|
||||
|
||||
PATCHED_LCE_OUTPUTS = """
|
||||
with self.act_offloading_ctx_manager:
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
)
|
||||
""".lstrip()
|
||||
|
||||
HF_GA_FORWARD_1 = """
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
""".lstrip()
|
||||
|
||||
PATCHED_HF_GA_FORWARD_1 = """
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# remove num_items_in_batch otherwise self.model attempts to pass it to flash_attention
|
||||
num_items_in_batch = kwargs.pop("num_items_in_batch", None)
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
""".lstrip()
|
||||
|
||||
HF_GA_FORWARD_2 = """
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||
""".lstrip()
|
||||
|
||||
PATCHED_HF_GA_FORWARD_2 = """
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, num_items_in_batch=num_items_in_batch, **kwargs)
|
||||
""".lstrip()
|
||||
|
||||
|
||||
class AxolotlLlamaForCausalLM(LlamaForCausalLM):
    """
    LlamaForCausalLM subclass whose `forward` is rebuilt at runtime from
    patched source text, to optionally insert activation offloading, Liger
    fused-linear-cross-entropy, and a gradient-accumulation bugfix.
    """

    # Shared no-op context manager; swapped for an OffloadActivations()
    # instance by enable_act_offloading().
    act_offloading_ctx_manager = contextlib.nullcontext()

    def __init__(self, config: LlamaConfig):
        super().__init__(config)

    @classmethod
    def set_forward(cls):
        """Rebind `forward` from the (detabbed) upstream LlamaForCausalLM source."""
        forward_source = inspect.getsource(LlamaForCausalLM.forward)
        forward_source, _ = detab_code(forward_source)
        # NOTE(review): compile() yields a code object, which is not itself
        # callable; binding it via types.MethodType would fail when forward
        # is invoked. The usual monkeypatch pattern execs the compiled source
        # and binds the resulting function — confirm this path is exercised.
        cls.forward = types.MethodType(
            compile(forward_source, "<forward>", "exec"), cls
        )

    @classmethod
    def enable_act_offloading(cls):
        """Wrap the backbone call inside `forward` with the activation-offloading context."""
        forward_source = inspect.getsource(cls.forward)
        forward_source = forward_source.replace(
            HF_MODEL_OUTPUTS, PATCHED_HF_MODEL_OUTPUTS
        )
        forward_source, _ = detab_code(forward_source)
        # replace forward method with patched version
        cls.forward = types.MethodType(
            compile(forward_source, "<llama_forward_w_act_offloading>", "exec"), cls
        )
        cls.act_offloading_ctx_manager = OffloadActivations()

    @classmethod
    def enable_liger_fce(cls, enable_act_offloading=True):
        """
        Swap `forward` for Liger's fused linear cross-entropy implementation,
        optionally re-patched to include activation offloading.
        """
        from liger_kernel.transformers.model.llama import (
            lce_forward as llama_lce_forward,
        )

        if enable_act_offloading:
            lce_source = inspect.getsource(llama_lce_forward)
            lce_source = lce_source.replace(LCE_MODEL_OUTPUTS, PATCHED_LCE_OUTPUTS)
            # replace forward method with patched version
            cls.forward = types.MethodType(
                compile(lce_source, "<llama_lce_forward_w_act_offloading>", "exec"),
                cls,
            )
        else:
            # fix: was `types.methodType` (lowercase m), which raises
            # AttributeError — the module only defines types.MethodType
            cls.forward = types.MethodType(llama_lce_forward, cls)

    @classmethod
    def patch_hf_ga(cls):
        # bugfix patch for gradient accumulation: pop num_items_in_batch from
        # kwargs and thread it explicitly into self.loss_function
        forward_source = inspect.getsource(cls.forward)
        forward_source = forward_source.replace(
            HF_GA_FORWARD_1, PATCHED_HF_GA_FORWARD_1
        )
        forward_source = forward_source.replace(
            HF_GA_FORWARD_2, PATCHED_HF_GA_FORWARD_2
        )
        forward_source, _ = detab_code(forward_source)
        # replace forward method with patched version
        cls.forward = types.MethodType(
            compile(forward_source, "<llama_forward_ga_fix>", "exec"), cls
        )
|
||||
|
||||
|
||||
def replace_auto_model():
    """
    Register AxolotlLlamaForCausalLM as the causal-LM class for LlamaConfig
    in transformers' auto-model mapping, and rebind its forward via
    set_forward().

    Returns the patched class so callers can apply further patches
    (e.g. patch_hf_ga, enable_act_offloading).
    """
    from transformers import LlamaConfig
    from transformers.models.auto import MODEL_FOR_CAUSAL_LM_MAPPING

    MODEL_FOR_CAUSAL_LM_MAPPING[LlamaConfig] = AxolotlLlamaForCausalLM
    AxolotlLlamaForCausalLM.set_forward()

    return AxolotlLlamaForCausalLM
|
||||
@@ -8,36 +8,17 @@ def argilla(
|
||||
**kwargs,
|
||||
): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
def transform_fn(sample):
|
||||
if "prompt" in sample.keys():
|
||||
prompt_key = "prompt"
|
||||
elif "input" in sample.keys():
|
||||
prompt_key = "input"
|
||||
elif "question" in sample.keys():
|
||||
prompt_key = "question"
|
||||
else:
|
||||
prompt_key = "instruction"
|
||||
|
||||
if "chosen" in sample.keys():
|
||||
chosen_key = "chosen"
|
||||
else:
|
||||
chosen_key = "chosen_response"
|
||||
|
||||
if "rejected" in sample.keys():
|
||||
rejected_key = "rejected"
|
||||
else:
|
||||
rejected_key = "rejected_response"
|
||||
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
|
||||
f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample[chosen_key]}<|im_end|>"
|
||||
sample["rejected"] = f"{sample[rejected_key]}<|im_end|>"
|
||||
] = f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen_response']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected_response']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
return transform_fn
|
||||
|
||||
@@ -8,37 +8,17 @@ def argilla(
|
||||
**kwargs,
|
||||
): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
def transform_fn(sample):
|
||||
# pylint: disable=duplicate-code
|
||||
if "prompt" in sample.keys():
|
||||
prompt_key = "prompt"
|
||||
elif "input" in sample.keys():
|
||||
prompt_key = "input"
|
||||
elif "question" in sample.keys():
|
||||
prompt_key = "question"
|
||||
else:
|
||||
prompt_key = "instruction"
|
||||
|
||||
if "chosen" in sample.keys():
|
||||
chosen_key = "chosen"
|
||||
else:
|
||||
chosen_key = "chosen_response"
|
||||
|
||||
if "rejected" in sample.keys():
|
||||
rejected_key = "rejected"
|
||||
else:
|
||||
rejected_key = "rejected_response"
|
||||
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
|
||||
f"<|start_header_id|>user<|end_header_id|>\n\n{sample[prompt_key]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
f"<|start_header_id|>user<|end_header_id|>\n\n{sample['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|start_header_id|>user<|end_header_id|>\n\n{sample[prompt_key]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
sample["chosen"] = f"{sample[chosen_key]}<|eot_id|>"
|
||||
sample["rejected"] = f"{sample[rejected_key]}<|eot_id|>"
|
||||
] = f"<|start_header_id|>user<|end_header_id|>\n\n{sample['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
|
||||
sample["chosen"] = f"{sample['chosen_response']}<|eot_id|>"
|
||||
sample["rejected"] = f"{sample['rejected_response']}<|eot_id|>"
|
||||
return sample
|
||||
|
||||
return transform_fn
|
||||
|
||||
@@ -679,6 +679,7 @@ class AxolotlInputConfig(
|
||||
default=False
|
||||
)
|
||||
gradient_checkpointing_kwargs: Optional[Dict[str, Any]] = None
|
||||
activation_offloading: Optional[bool] = None
|
||||
|
||||
unfrozen_parameters: Optional[List[str]] = None
|
||||
|
||||
|
||||
@@ -380,6 +380,15 @@ class ModelLoader:
|
||||
plugin_manager = PluginManager.get_instance()
|
||||
plugin_manager.pre_model_load(self.cfg)
|
||||
|
||||
if self.cfg.model_config_type == "llama":
|
||||
from axolotl.monkeypatch.models.llama.modeling_llama import replace_auto_model
|
||||
|
||||
AxolotlLlamaForCausalLM = replace_auto_model()
|
||||
|
||||
AxolotlLlamaForCausalLM.patch_hf_ga()
|
||||
if self.cfg.activation_offloading:
|
||||
AxolotlLlamaForCausalLM.enable_act_offloading()
|
||||
|
||||
if self.cfg.fsdp:
|
||||
from axolotl.monkeypatch.trainer_fsdp_optim import (
|
||||
patch_training_loop_for_fsdp,
|
||||
@@ -1183,6 +1192,8 @@ class ModelLoader:
|
||||
|
||||
self.apply_lora_patch()
|
||||
|
||||
# self.apply_patches_to_model()
|
||||
|
||||
for _ in range(3):
|
||||
gc.collect()
|
||||
torch.cuda.empty_cache()
|
||||
|
||||
Reference in New Issue
Block a user