adding fp32 support

Feat(cce): add qwen3_vl, qwen3_vl_moe, granitemoeshared, granitemoehybrid, and upgraded all cce patches (#3178)
* feat: upgrade cce with patches for transformers 4.56
* feat: add missing models to cce readme
2025-09-26 16:32:09 +00:00 · 2025-09-26 12:11:29 +07:00
8 changed files with 62 additions and 62 deletions
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -5,11 +5,10 @@ description: "Custom autograd functions and Triton kernels in Axolotl for optimi
 Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two
 optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU
-(including DDP, DeepSpeed, and FSDP2) training. These include (1) SwiGLU and GEGLU
+(in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function
-activation function Triton kernels, and (2) LoRA MLP and attention custom autograd
+Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was
-functions. Our goal was to leverage operator fusion and tensor re-use in order to
+to leverage operator fusion and tensor re-use in order to improve speed and reduce
-improve speed and reduce memory usage during the forward and backward passes of these
+memory usage during the forward and backward passes of these calculations.
 calculations.
 We currently support several common model architectures, including (but not limited to):
@@ -93,12 +92,13 @@ Currently, LoRA kernels are not supported for RLHF training, only SFT.
 - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
    - Note: Set `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` to enable [memory-efficient attention on AMD GPUs](https://github.com/ROCm/aotriton/issues/16#issuecomment-2346675491)
- Targeted LoRA adapters must disable dropout (`lora_dropout: 0`)
+- Targeted LoRA adapters cannot use Dropout
    - This may limit model expressivity / cause overfitting
 - Targeted LoRA adapters cannot have bias terms
    - This may limit model expressivity
 - Adapters that already include bias terms are supported.
-Models with pre-existing LoRA adapters that use Dropout may need to be re-finetuned
+Models with pre-existing LoRA adapters that use Dropout or have bias terms may need to
-without it in order to be as performant.
+be re-finetuned without these features in order to be useful.
 ## Implementation details
@@ -131,5 +131,6 @@ computation path.
 ## Future Work
 - Support for additional model architectures
- Support for dropout
+- Support for the FSDP setting
 - Support for dropout and bias
 - Additional operator fusions
--- a/examples/colab-notebooks/colab-axolotl-example.ipynb
+++ b/examples/colab-notebooks/colab-axolotl-example.ipynb
@@ -40,7 +40,7 @@
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
-    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef\""
+    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\""
   ]
  },
  {
--- a/scripts/cutcrossentropy_install.py
+++ b/scripts/cutcrossentropy_install.py
@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
 print(
    UNINSTALL_PREFIX
-    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"'
+    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"'
 )
--- a/src/axolotl/integrations/cut_cross_entropy/README.md
+++ b/src/axolotl/integrations/cut_cross_entropy/README.md
@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
 - If you are installing from pip
 ```bash
-pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"
+pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"
 ```
 ## Usage
@@ -31,6 +31,7 @@ plugins:
 ## Supported Models
 - apertus
 - arcee
 - cohere
 - cohere2
@@ -44,9 +45,13 @@ plugins:
 - glm
 - glm4
 - glm4_moe
 - glm4v
 - glm4v_moe
 - gpt_oss
 - granite
 - granitemoe
 - granitemoeshared
 - granitemoehybrid
 - hunyuan_v1_dense
 - hunyuan_v1_moe
 - llama
@@ -65,6 +70,8 @@ plugins:
 - qwen2_5_vl
 - qwen3
 - qwen3_moe
 - qwen3_vl
 - qwen3_vl_moe
 - qwen3_next
 - smollm3
 - seed_oss
--- a/src/axolotl/integrations/cut_cross_entropy/init.py
+++ b/src/axolotl/integrations/cut_cross_entropy/init.py
@@ -35,7 +35,7 @@ LOG = get_logger(__name__)
 _CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
-    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"`'
+    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"`'
 )
--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -368,6 +368,7 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
        # removing the call above leads to extra memory usage as explained in the comment above
        if hasattr(model, "tie_weights"):
            model.tie_weights()
    model = model.to(torch.float32)
    return model
--- a/src/axolotl/monkeypatch/lora_kernels.py
+++ b/src/axolotl/monkeypatch/lora_kernels.py
@@ -323,8 +323,8 @@ def apply_lora_kernel_patches(
        AssertionError: If multiple adapters are active (currently unsupported).
    Note:
-        The optimizations require LoRA adapters with no dropout. The function will skip
+        The optimizations require LoRA adapters with no dropout and no bias terms. The
-        patching if that condition isn't met.
+            function will skip patching if these conditions aren't met.
    """
    if not isinstance(model, PeftModelForCausalLM):
        raise TypeError("Model must be a PeftModelForCausalLM")
@@ -340,10 +340,10 @@ def apply_lora_kernel_patches(
    lora_config = model.model.peft_config[active_adapter]
    # Only patch if conditions are met
-    can_patch = lora_config.lora_dropout == 0
+    can_patch = lora_config.lora_dropout == 0 and lora_config.bias == "none"
    if not can_patch:
-        LOG.warning("Cannot patch layers - requires `lora_dropout: 0`")
+        LOG.warning("Cannot patch layers - requires no dropout and no bias")
        LOG.warning("Please specify `lora_dropout: 0` in your axolotl config file")
        return model
--- a/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
+++ b/tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
@@ -221,53 +221,44 @@ def test_model_specific_activation(model_name, expected_activation):
    assert layer.mlp.forward.__func__ is expected_activation
-def test_kernel_patch_requires_zero_dropout():
+def test_kernel_patch_conditions():
-    """Kernel patching should be skipped when dropout is enabled."""
+    """Test various conditions that should prevent kernel patching."""
-    config = {
+    test_configs = [
-        "peft_type": "LORA",
+        # Dropout prevents patching
-        "task_type": "CAUSAL_LM",
+        {
-        "r": 8,
+            "peft_type": "LORA",
-        "lora_alpha": 16,
+            "task_type": "CAUSAL_LM",
-        "target_modules": ["gate_proj", "up_proj", "down_proj"],
+            "r": 8,
-        "lora_dropout": 0.1,
+            "lora_alpha": 16,
-        "bias": "none",
+            "target_modules": ["gate_proj", "up_proj", "down_proj"],
-    }
+            "lora_dropout": 0.1,
            "bias": "none",
        },
        # Bias prevents patching
        {
            "peft_type": "LORA",
            "task_type": "CAUSAL_LM",
            "r": 8,
            "lora_alpha": 16,
            "target_modules": ["gate_proj", "up_proj", "down_proj"],
            "lora_dropout": 0,
            "bias": "lora_only",
        },
    ]
-    model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
+    for config in test_configs:
-    peft_config = get_peft_config(config)
+        model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
-    model = PeftModelForCausalLM(model, peft_config)
+        peft_config = get_peft_config(config)
-    cfg = DictDefault({"lora_mlp_kernel": True})
+        model = PeftModelForCausalLM(model, peft_config)
        cfg = DictDefault({"lora_mlp_kernel": True})
-    patched_model = apply_lora_kernel_patches(model, cfg)
+        # Should not patch
-    layer = patched_model.model.model.layers[0].mlp
+        patched_model = apply_lora_kernel_patches(model, cfg)
        layer = patched_model.model.model.layers[0].mlp
-    # Verify no patches applied when dropout is non-zero
+        # Verify no patches applied
-    assert layer.forward.__func__ is not apply_lora_mlp_swiglu
+        assert layer.forward.__func__ is not apply_lora_mlp_swiglu
-    assert layer.forward.__func__ is not apply_lora_mlp_geglu
+        assert layer.forward.__func__ is not apply_lora_mlp_geglu
 def test_kernel_patch_with_bias_enabled():
    """Kernel patching should succeed when LoRA bias is enabled."""
    config = {
        "peft_type": "LORA",
        "task_type": "CAUSAL_LM",
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["gate_proj", "up_proj", "down_proj"],
        "lora_dropout": 0,
        "bias": "lora_only",
    }
    model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
    peft_config = get_peft_config(config)
    model = PeftModelForCausalLM(model, peft_config)
    cfg = DictDefault({"lora_mlp_kernel": True})
    patched_model = apply_lora_kernel_patches(model, cfg)
    layer = patched_model.model.model.layers[0].mlp
    # Verify patches applied when bias support is enabled
    assert layer.forward.__func__ is apply_lora_mlp_swiglu
 def test_kernel_config_options():
Author	SHA1	Message	Date
Salman Mohammadi	1d0562dedd	adding fp32 support	2025-09-26 16:32:09 +00:00
NanoCode012	7fa8ac40cd	Feat(cce): add qwen3_vl, qwen3_vl_moe, granitemoeshared, granitemoehybrid, and upgraded all cce patches (#3178) * feat: upgrade cce with patches for transformers 4.56 * feat: add missing models to cce readme	2025-09-26 12:11:29 +07:00