Compare commits
3 Commits
accelerato ... dft

| Author | SHA1 | Date |
|---|---|---|
|  | 0a0115493d |  |
|  | 7a4f33802d |  |
|  | 170dca9bb9 |  |
examples/gemma3/gemma-3-1b-fft-dft.yml (new file, 53 lines)
@@ -0,0 +1,53 @@
base_model: google/gemma-3-1b-it

model_type: Gemma3ForCausalLM
cls_model_config: Gemma3TextConfig

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
eot_tokens:
  - <end_of_turn>
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.05
output_dir: ./outputs/gemma-3-1b-fft-dft

sequence_len: 2048

use_dynamic_finetuning: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0
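Apart from `use_dynamic_finetuning: true`, this is a standard Gemma 3 1B full-fine-tune recipe; that flag is the only DFT-specific setting. Assuming a current Axolotl install, the example would presumably be launched the usual way, e.g. `axolotl train examples/gemma3/gemma-3-1b-fft-dft.yml`.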
@@ -373,6 +373,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
         # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
         data_collator_kwargs["pad_to_multiple_of"] = multiple

+        if self.cfg.use_dynamic_finetuning:
+            from axolotl.monkeypatch.loss.dft import dft_loss
+
+            trainer_kwargs["compute_loss_func"] = dft_loss
+
         trainer_cls = self._get_trainer_cls()

         trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
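`compute_loss_func` is the hook recent Hugging Face `Trainer` versions expose for overriding the default loss: when set, the trainer calls it as `compute_loss_func(outputs, labels, num_items_in_batch=...)` instead of using `outputs.loss`. That is why `dft_loss` below is written against a model-output object rather than raw logits, and why enabling `use_dynamic_finetuning` only needs this small bit of wiring in the trainer builder.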
src/axolotl/monkeypatch/loss/dft.py (new file, 98 lines)
@@ -0,0 +1,98 @@
"""Dynamic Fine-Tuning (DFT) loss implementation"""

from typing import Optional

import torch
import torch.nn.functional as F


def selective_log_softmax(logits, index):
    """Memory-efficient log_softmax -> gather"""
    if logits.dtype in [torch.float32, torch.float64]:
        selected_logits = torch.gather(
            logits, dim=-1, index=index.unsqueeze(-1)
        ).squeeze(-1)
        logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
        per_token_logps = selected_logits - logsumexp_values
    else:
        per_token_logps = []
        for row_logits, row_labels in zip(logits, index, strict=True):
            row_logps = F.log_softmax(row_logits, dim=-1)
            row_per_token_logps = row_logps.gather(
                dim=-1, index=row_labels.unsqueeze(-1)
            ).squeeze(-1)
            per_token_logps.append(row_per_token_logps)
        per_token_logps = torch.stack(per_token_logps)
    return per_token_logps


def get_dft_loss(ignore_index: int = -100):
    """Creates DFT loss function"""

    def for_causal_lm_dft_loss(
        logits,
        labels,
        vocab_size: int = None,
        num_items_in_batch: Optional[int] = None,
        ignore_index: int = -100,
        shift_labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        """DFT loss: -exp(logprobs).detach() * logprobs"""
        if shift_labels is None:
            # Shift so that tokens < n predict n
            labels = F.pad(labels, (0, 1), value=ignore_index)
            shift_labels = labels[..., 1:].contiguous()

        shift_labels = shift_labels.to(logits.device)

        # Create loss mask
        loss_mask = shift_labels != ignore_index
        shift_labels_masked = shift_labels.clone()
        shift_labels_masked[~loss_mask] = 0

        # Compute log probabilities
        logprobs = selective_log_softmax(logits, shift_labels_masked)

        # DFT loss: -exp(logprobs).detach() * logprobs
        per_token_loss = -logprobs.exp().detach() * logprobs

        # Sum over valid tokens and normalize
        if num_items_in_batch is None:
            num_items_in_batch = loss_mask.sum()

        loss = (per_token_loss * loss_mask).sum() / num_items_in_batch
        return loss

    return for_causal_lm_dft_loss


def dft_loss(outputs, labels, num_items_in_batch=None):
    """DFT loss compatible with Trainer.compute_loss_func signature.

    This function is designed to be passed to Trainer's compute_loss_func parameter.
    """
    ignore_index = -100

    # Shift labels for causal LM
    labels = F.pad(labels, (0, 1), value=ignore_index)
    shift_labels = labels[..., 1:].contiguous()
    shift_labels = shift_labels.to(outputs.logits.device)

    # Create loss mask
    loss_mask = shift_labels != ignore_index
    shift_labels_masked = shift_labels.clone()
    shift_labels_masked[~loss_mask] = 0

    # Compute log probabilities
    logprobs = selective_log_softmax(outputs.logits, shift_labels_masked)

    # DFT loss: -exp(logprobs).detach() * logprobs
    per_token_loss = -logprobs.exp().detach() * logprobs

    # Sum over valid tokens and normalize
    if num_items_in_batch is None:
        num_items_in_batch = loss_mask.sum()

    loss = (per_token_loss * loss_mask).sum() / num_items_in_batch
    return loss
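In equation form, the objective implemented above is

$$
\mathcal{L}_{\mathrm{DFT}}
= -\frac{1}{N} \sum_{t \in \mathcal{T}} \operatorname{sg}\!\left[ p_\theta\!\left(y_t \mid y_{<t}\right) \right] \log p_\theta\!\left(y_t \mid y_{<t}\right),
$$

where $\mathcal{T}$ is the set of non-ignored label positions, $\operatorname{sg}[\cdot]$ is stop-gradient (`.detach()`), and $N$ is `num_items_in_batch` (defaulting to $|\mathcal{T}|$). Compared with plain cross-entropy, each token's log-likelihood is reweighted by its own detached probability, so well-predicted tokens carry more gradient weight than unlikely ones.

As a quick sanity check, here is a minimal, hypothetical smoke test (not part of the PR); the `SimpleNamespace` stand-in only assumes the output object exposes `.logits`, as a `CausalLMOutput` does:

```python
# Hypothetical smoke test for the dft_loss added above (not part of the PR).
from types import SimpleNamespace

import torch

from axolotl.monkeypatch.loss.dft import dft_loss

torch.manual_seed(0)
batch, seq, vocab = 2, 8, 32
logits = torch.randn(batch, seq, vocab)
labels = torch.randint(0, vocab, (batch, seq))
labels[:, :3] = -100  # treat the first positions as prompt tokens (ignored)

outputs = SimpleNamespace(logits=logits)  # stand-in exposing .logits
loss = dft_loss(outputs, labels)
print(loss)  # finite, non-negative scalar tensor
```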
@@ -676,6 +676,10 @@ class AxolotlInputConfig(
             "description": "Number of chunks to use for chunked cross entropy loss"
         },
     )
+    use_dynamic_finetuning: bool | None = Field(
+        default=None,
+        json_schema_extra={"description": "Enable Dynamic Fine-Tuning loss (DFT)"},
+    )
     tiled_mlp: bool | None = Field(
         default=None,