Compare commits

..

2 Commits

Author SHA1 Message Date
Salman Mohammadi
1d0562dedd adding fp32 support 2025-09-26 16:32:09 +00:00
NanoCode012
7fa8ac40cd Feat(cce): add qwen3_vl, qwen3_vl_moe, granitemoeshared, granitemoehybrid, and upgraded all cce patches (#3178)
* feat: upgrade cce with patches for transformers 4.56

* feat: add missing models to cce readme
2025-09-26 12:11:29 +07:00
8 changed files with 62 additions and 62 deletions

View File

@@ -5,11 +5,10 @@ description: "Custom autograd functions and Triton kernels in Axolotl for optimi
Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two
optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU
(including DDP, DeepSpeed, and FSDP2) training. These include (1) SwiGLU and GEGLU (in the DDP and DeepSpeed settings) training. These include (1) SwiGLU and GEGLU activation function
activation function Triton kernels, and (2) LoRA MLP and attention custom autograd Triton kernels, and (2) LoRA MLP and attention custom autograd functions. Our goal was
functions. Our goal was to leverage operator fusion and tensor re-use in order to to leverage operator fusion and tensor re-use in order to improve speed and reduce
improve speed and reduce memory usage during the forward and backward passes of these memory usage during the forward and backward passes of these calculations.
calculations.
We currently support several common model architectures, including (but not limited to): We currently support several common model architectures, including (but not limited to):
@@ -93,12 +92,13 @@ Currently, LoRA kernels are not supported for RLHF training, only SFT.
- One or more NVIDIA or AMD GPUs (in order to use the Triton kernels) - One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
- Note: Set `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` to enable [memory-efficient attention on AMD GPUs](https://github.com/ROCm/aotriton/issues/16#issuecomment-2346675491) - Note: Set `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` to enable [memory-efficient attention on AMD GPUs](https://github.com/ROCm/aotriton/issues/16#issuecomment-2346675491)
- Targeted LoRA adapters must disable dropout (`lora_dropout: 0`) - Targeted LoRA adapters cannot use Dropout
- This may limit model expressivity / cause overfitting
- Targeted LoRA adapters cannot have bias terms
- This may limit model expressivity - This may limit model expressivity
- Adapters that already include bias terms are supported.
Models with pre-existing LoRA adapters that use Dropout may need to be re-finetuned Models with pre-existing LoRA adapters that use Dropout or have bias terms may need to
without it in order to be as performant. be re-finetuned without these features in order to be useful.
## Implementation details ## Implementation details
@@ -131,5 +131,6 @@ computation path.
## Future Work ## Future Work
- Support for additional model architectures - Support for additional model architectures
- Support for dropout - Support for the FSDP setting
- Support for dropout and bias
- Additional operator fusions - Additional operator fusions

View File

@@ -40,7 +40,7 @@
"%%capture\n", "%%capture\n",
"# This step can take ~5-10 minutes to install dependencies\n", "# This step can take ~5-10 minutes to install dependencies\n",
"!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n", "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
"!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef\"" "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28\""
] ]
}, },
{ {

View File

@@ -29,5 +29,5 @@ UV_PREFIX = "uv " if USE_UV else ""
print( print(
UNINSTALL_PREFIX UNINSTALL_PREFIX
+ f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"' + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"'
) )

View File

@@ -19,7 +19,7 @@ python scripts/cutcrossentropy_install.py | sh
- If you are installing from pip - If you are installing from pip
```bash ```bash
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef" pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"
``` ```
## Usage ## Usage
@@ -31,6 +31,7 @@ plugins:
## Supported Models ## Supported Models
- apertus
- arcee - arcee
- cohere - cohere
- cohere2 - cohere2
@@ -44,9 +45,13 @@ plugins:
- glm - glm
- glm4 - glm4
- glm4_moe - glm4_moe
- glm4v
- glm4v_moe
- gpt_oss - gpt_oss
- granite - granite
- granitemoe - granitemoe
- granitemoeshared
- granitemoehybrid
- hunyuan_v1_dense - hunyuan_v1_dense
- hunyuan_v1_moe - hunyuan_v1_moe
- llama - llama
@@ -65,6 +70,8 @@ plugins:
- qwen2_5_vl - qwen2_5_vl
- qwen3 - qwen3
- qwen3_moe - qwen3_moe
- qwen3_vl
- qwen3_vl_moe
- qwen3_next - qwen3_next
- smollm3 - smollm3
- seed_oss - seed_oss

View File

@@ -35,7 +35,7 @@ LOG = get_logger(__name__)
_CCE_INSTALL_MESSAGE = ( _CCE_INSTALL_MESSAGE = (
"Please install Axolotl's fork of cut_cross_entropy with transformers support using " "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
'`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@c5aa3ef"`' '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@147ea28"`'
) )

View File

@@ -368,6 +368,7 @@ def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
# removing the call above leads to extra memory usage as explained in the comment above # removing the call above leads to extra memory usage as explained in the comment above
if hasattr(model, "tie_weights"): if hasattr(model, "tie_weights"):
model.tie_weights() model.tie_weights()
model = model.to(torch.float32)
return model return model

View File

@@ -323,8 +323,8 @@ def apply_lora_kernel_patches(
AssertionError: If multiple adapters are active (currently unsupported). AssertionError: If multiple adapters are active (currently unsupported).
Note: Note:
The optimizations require LoRA adapters with no dropout. The function will skip The optimizations require LoRA adapters with no dropout and no bias terms. The
patching if that condition isn't met. function will skip patching if these conditions aren't met.
""" """
if not isinstance(model, PeftModelForCausalLM): if not isinstance(model, PeftModelForCausalLM):
raise TypeError("Model must be a PeftModelForCausalLM") raise TypeError("Model must be a PeftModelForCausalLM")
@@ -340,10 +340,10 @@ def apply_lora_kernel_patches(
lora_config = model.model.peft_config[active_adapter] lora_config = model.model.peft_config[active_adapter]
# Only patch if conditions are met # Only patch if conditions are met
can_patch = lora_config.lora_dropout == 0 can_patch = lora_config.lora_dropout == 0 and lora_config.bias == "none"
if not can_patch: if not can_patch:
LOG.warning("Cannot patch layers - requires `lora_dropout: 0`") LOG.warning("Cannot patch layers - requires no dropout and no bias")
LOG.warning("Please specify `lora_dropout: 0` in your axolotl config file") LOG.warning("Please specify `lora_dropout: 0` in your axolotl config file")
return model return model

View File

@@ -221,53 +221,44 @@ def test_model_specific_activation(model_name, expected_activation):
assert layer.mlp.forward.__func__ is expected_activation assert layer.mlp.forward.__func__ is expected_activation
def test_kernel_patch_requires_zero_dropout(): def test_kernel_patch_conditions():
"""Kernel patching should be skipped when dropout is enabled.""" """Test various conditions that should prevent kernel patching."""
config = { test_configs = [
"peft_type": "LORA", # Dropout prevents patching
"task_type": "CAUSAL_LM", {
"r": 8, "peft_type": "LORA",
"lora_alpha": 16, "task_type": "CAUSAL_LM",
"target_modules": ["gate_proj", "up_proj", "down_proj"], "r": 8,
"lora_dropout": 0.1, "lora_alpha": 16,
"bias": "none", "target_modules": ["gate_proj", "up_proj", "down_proj"],
} "lora_dropout": 0.1,
"bias": "none",
},
# Bias prevents patching
{
"peft_type": "LORA",
"task_type": "CAUSAL_LM",
"r": 8,
"lora_alpha": 16,
"target_modules": ["gate_proj", "up_proj", "down_proj"],
"lora_dropout": 0,
"bias": "lora_only",
},
]
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M") for config in test_configs:
peft_config = get_peft_config(config) model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
model = PeftModelForCausalLM(model, peft_config) peft_config = get_peft_config(config)
cfg = DictDefault({"lora_mlp_kernel": True}) model = PeftModelForCausalLM(model, peft_config)
cfg = DictDefault({"lora_mlp_kernel": True})
patched_model = apply_lora_kernel_patches(model, cfg) # Should not patch
layer = patched_model.model.model.layers[0].mlp patched_model = apply_lora_kernel_patches(model, cfg)
layer = patched_model.model.model.layers[0].mlp
# Verify no patches applied when dropout is non-zero # Verify no patches applied
assert layer.forward.__func__ is not apply_lora_mlp_swiglu assert layer.forward.__func__ is not apply_lora_mlp_swiglu
assert layer.forward.__func__ is not apply_lora_mlp_geglu assert layer.forward.__func__ is not apply_lora_mlp_geglu
def test_kernel_patch_with_bias_enabled():
"""Kernel patching should succeed when LoRA bias is enabled."""
config = {
"peft_type": "LORA",
"task_type": "CAUSAL_LM",
"r": 8,
"lora_alpha": 16,
"target_modules": ["gate_proj", "up_proj", "down_proj"],
"lora_dropout": 0,
"bias": "lora_only",
}
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
peft_config = get_peft_config(config)
model = PeftModelForCausalLM(model, peft_config)
cfg = DictDefault({"lora_mlp_kernel": True})
patched_model = apply_lora_kernel_patches(model, cfg)
layer = patched_model.model.model.layers[0].mlp
# Verify patches applied when bias support is enabled
assert layer.forward.__func__ is apply_lora_mlp_swiglu
def test_kernel_config_options(): def test_kernel_config_options():