Compare commits


2 Commits

| Author | SHA1 | Message | Date |
| --- | --- | --- | --- |
| Dan Saunders | 3299f182ba | ungate lora with bias | 2025-09-25 12:40:13 -04:00 |
| Dan Saunders | 2fc430d365 | update lora optims doc | 2025-09-25 12:24:25 -04:00 |
8 changed files with 62 additions and 62 deletions

View File

@@ -5,10 +5,11 @@ description: "Custom autograd functions and Triton kernels in Axolotl for optimi
 Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two
 optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU
-(in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function
-Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was
-to leverage operator fusion and tensor re-use in order to improve speed and reduce
-memory usage during the forward and backward passes of these calculations.
+(including DDP, DeepSpeed, and FSDP2) training. These include (1) SwiGLU and GEGLU
+activation function Triton kernels, and (2) LoRA MLP and attention custom autograd
+functions. Our goal was to leverage operator fusion and tensor re-use in order to
+improve speed and reduce memory usage during the forward and backward passes of these
+calculations.

 We currently support several common model architectures, including (but not limited to):
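
For context, a minimal sketch of how these kernel paths are switched on from an Axolotl config mapping. `DictDefault` and the `lora_mlp_kernel` flag appear verbatim in the test changes later in this diff; the `lora_qkv_kernel` and `lora_o_kernel` names for the attention counterparts, and the import path, are assumptions and may differ.

```python
from axolotl.utils.dict import DictDefault  # assumed import path

cfg = DictDefault(
    {
        "lora_mlp_kernel": True,  # fused SwiGLU/GEGLU LoRA MLP autograd path
        "lora_qkv_kernel": True,  # assumed flag: fused QKV projection path
        "lora_o_kernel": True,    # assumed flag: fused output projection path
    }
)
```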
@@ -92,13 +93,12 @@ Currently, LoRA kernels are not supported for RLHF training, only SFT.
 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
   - Note: Set `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` to enable [memory-efficient attention on AMD GPUs](https://github.com/ROCm/aotriton/issues/16#issuecomment-2346675491)
-- Targeted LoRA adapters cannot use Dropout
-  - This may limit model expressivity / cause overfitting
-- Targeted LoRA adapters cannot have bias terms
+- Targeted LoRA adapters must disable dropout (`lora_dropout: 0`)
+  - This may limit model expressivity
+- Adapters that already include bias terms are supported.

-Models with pre-existing LoRA adapters that use Dropout or have bias terms may need to
-be re-finetuned without these features in order to be useful.
+Models with pre-existing LoRA adapters that use Dropout may need to be re-finetuned
+without it in order to remain performant.

 ## Implementation details
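
To make the updated constraints concrete, here is a hedged sketch using PEFT's `LoraConfig` (the values mirror the test configs later in this diff): dropout is the only remaining blocker, while a bias setting such as `lora_only` no longer prevents patching.

```python
from peft import LoraConfig

# Compliant: zero dropout; a bias setting no longer blocks the kernel patches.
ok = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.0,
    bias="lora_only",
)

# Non-compliant: any dropout causes patching to be skipped with a warning.
blocked = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["gate_proj", "up_proj", "down_proj"],
    lora_dropout=0.1,
)
```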
@@ -131,6 +131,5 @@ computation path.
 ## Future Work

 - Support for additional model architectures
-- Support for the FSDP setting
-- Support for dropout and bias
+- Support for dropout
 - Additional operator fusions

View File

@@ -40,7 +40,7 @@
     "%%capture\n",
     "# This step can take ~5-10 minutes to install dependencies\n",
     "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef\""
    ]
   },
   {

View File

@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""

 print(
     UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"'
 )

View File

@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
 - If you are installing from pip

 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"
 ```

 ## Usage
@@ -31,7 +31,6 @@ plugins:
 ## Supported Models

-- apertus
 - arcee
 - cohere
 - cohere2
@@ -45,13 +44,9 @@ plugins:
 - glm
 - glm4
 - glm4_moe
-- glm4v
-- glm4v_moe
 - gpt_oss
 - granite
 - granitemoe
-- granitemoeshared
-- granitemoehybrid
 - hunyuan_v1_dense
 - hunyuan_v1_moe
 - llama
@@ -70,8 +65,6 @@ plugins:
 - qwen2_5_vl
 - qwen3
 - qwen3_moe
-- qwen3_vl
-- qwen3_vl_moe
 - qwen3_next
 - smollm3
 - seed_oss

View File

@@ -35,7 +35,7 @@ LOG = get_logger(__name__)

 _CCE_INSTALL_MESSAGE = (
     "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"`'
 )

View File

@@ -368,7 +368,6 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
         # removing the call above leads to extra memory usage as explained in the comment above
         if hasattr(model, "tie_weights"):
             model.tie_weights()
-    model = model.to(torch.float32)
     return model

View File

@@ -323,8 +323,8 @@ def apply_lora_kernel_patches(
         AssertionError: If multiple adapters are active (currently unsupported).

     Note:
-        The optimizations require LoRA adapters with no dropout and no bias terms. The
-        function will skip patching if these conditions aren't met.
+        The optimizations require LoRA adapters with no dropout. The function will skip
+        patching if that condition isn't met.
     """
     if not isinstance(model, PeftModelForCausalLM):
         raise TypeError("Model must be a PeftModelForCausalLM")
@@ -340,10 +340,10 @@
     lora_config = model.model.peft_config[active_adapter]

     # Only patch if conditions are met
-    can_patch = lora_config.lora_dropout == 0 and lora_config.bias == "none"
+    can_patch = lora_config.lora_dropout == 0
     if not can_patch:
-        LOG.warning("Cannot patch layers - requires no dropout and no bias")
+        LOG.warning("Cannot patch layers - requires `lora_dropout: 0`")
         LOG.warning("Please specify `lora_dropout: 0` in your axolotl config file")
         return model
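
An end-to-end usage sketch assembled from the test changes below: wrap a small model with a zero-dropout LoRA adapter (bias allowed as of this change) and apply the kernel patches. The import paths for `apply_lora_kernel_patches` and `DictDefault` are assumptions and may differ from the actual module layout.

```python
from peft import PeftModelForCausalLM, get_peft_config
from transformers import AutoModelForCausalLM

from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches  # assumed path
from axolotl.utils.dict import DictDefault  # assumed path

base = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
peft_config = get_peft_config(
    {
        "peft_type": "LORA",
        "task_type": "CAUSAL_LM",
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["gate_proj", "up_proj", "down_proj"],
        "lora_dropout": 0,    # the only remaining gating condition
        "bias": "lora_only",  # no longer blocks patching
    }
)
model = PeftModelForCausalLM(base, peft_config)

# Patching succeeds because dropout is zero; bias is no longer checked.
model = apply_lora_kernel_patches(model, DictDefault({"lora_mlp_kernel": True}))
```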

View File

@@ -221,44 +221,53 @@ def test_model_specific_activation(model_name, expected_activation):
         assert layer.mlp.forward.__func__ is expected_activation


-def test_kernel_patch_conditions():
-    """Test various conditions that should prevent kernel patching."""
-    test_configs = [
-        # Dropout prevents patching
-        {
-            "peft_type": "LORA",
-            "task_type": "CAUSAL_LM",
-            "r": 8,
-            "lora_alpha": 16,
-            "target_modules": ["gate_proj", "up_proj", "down_proj"],
-            "lora_dropout": 0.1,
-            "bias": "none",
-        },
-        # Bias prevents patching
-        {
-            "peft_type": "LORA",
-            "task_type": "CAUSAL_LM",
-            "r": 8,
-            "lora_alpha": 16,
-            "target_modules": ["gate_proj", "up_proj", "down_proj"],
-            "lora_dropout": 0,
-            "bias": "lora_only",
-        },
-    ]
+def test_kernel_patch_requires_zero_dropout():
+    """Kernel patching should be skipped when dropout is enabled."""
+    config = {
+        "peft_type": "LORA",
+        "task_type": "CAUSAL_LM",
+        "r": 8,
+        "lora_alpha": 16,
+        "target_modules": ["gate_proj", "up_proj", "down_proj"],
+        "lora_dropout": 0.1,
+        "bias": "none",
+    }

-    for config in test_configs:
-        model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
-        peft_config = get_peft_config(config)
-        model = PeftModelForCausalLM(model, peft_config)
-        cfg = DictDefault({"lora_mlp_kernel": True})
+    model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+    peft_config = get_peft_config(config)
+    model = PeftModelForCausalLM(model, peft_config)
+    cfg = DictDefault({"lora_mlp_kernel": True})

-        # Should not patch
-        patched_model = apply_lora_kernel_patches(model, cfg)
-        layer = patched_model.model.model.layers[0].mlp
+    patched_model = apply_lora_kernel_patches(model, cfg)
+    layer = patched_model.model.model.layers[0].mlp

-        # Verify no patches applied
-        assert layer.forward.__func__ is not apply_lora_mlp_swiglu
-        assert layer.forward.__func__ is not apply_lora_mlp_geglu
+    # Verify no patches applied when dropout is non-zero
+    assert layer.forward.__func__ is not apply_lora_mlp_swiglu
+    assert layer.forward.__func__ is not apply_lora_mlp_geglu
+
+
+def test_kernel_patch_with_bias_enabled():
+    """Kernel patching should succeed when LoRA bias is enabled."""
+    config = {
+        "peft_type": "LORA",
+        "task_type": "CAUSAL_LM",
+        "r": 8,
+        "lora_alpha": 16,
+        "target_modules": ["gate_proj", "up_proj", "down_proj"],
+        "lora_dropout": 0,
+        "bias": "lora_only",
+    }
+
+    model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+    peft_config = get_peft_config(config)
+    model = PeftModelForCausalLM(model, peft_config)
+    cfg = DictDefault({"lora_mlp_kernel": True})
+
+    patched_model = apply_lora_kernel_patches(model, cfg)
+    layer = patched_model.model.model.layers[0].mlp
+
+    # Verify patches applied when bias support is enabled
+    assert layer.forward.__func__ is apply_lora_mlp_swiglu


 def test_kernel_config_options():