fix: qwen3-next to use fla causal-conv1d to support packing (#3437)
* fix: qwen3-next to use fla causal-conv1d to support packing * fix: causal import and update doc for v5 * fix: hard fail for packing without fla
This commit is contained in:
@@ -6,30 +6,13 @@ This guide shows how to fine-tune it with Axolotl with multi-turn conversations
|
||||
|
||||
## Getting started
|
||||
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Qwen3-Next is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
|
||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).
|
||||
|
||||
Here is an example of how to install from main for pip:
|
||||
|
||||
```bash
|
||||
# Ensure you have PyTorch installed (PyTorch 2.6.0 min)
|
||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
|
||||
pip3 install --no-build-isolation -e '.[flash-attn]'
|
||||
|
||||
# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
```
|
||||
|
||||
2. Install Qwen3-Next transformers commit
|
||||
```bash
|
||||
pip3 uninstall -y transformers && pip3 install "git+https://github.com/huggingface/transformers.git@b9282355bea846b54ed850a066901496b19da654"
|
||||
```
|
||||
2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.
|
||||
|
||||
3. Install FLA for improved performance
|
||||
```bash
|
||||
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.3.2
|
||||
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
|
||||
```
|
||||
|
||||
4. Run the finetuning example:
|
||||
@@ -38,7 +21,7 @@ pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.3.2
|
||||
axolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
|
||||
```
|
||||
|
||||
This config uses about 45.62 GiB VRAM.
|
||||
This config uses ~47 GiB (no target experts) and ~71 GiB (target experts) of VRAM.
|
||||
|
||||
Let us know how it goes. Happy finetuning! 🚀
|
||||
|
||||
|
||||
@@ -9,6 +9,8 @@ plugins:
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
|
||||
quantize_moe_experts: true
|
||||
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
@@ -25,7 +27,7 @@ sample_packing: true
|
||||
|
||||
lora_r: 16
|
||||
lora_alpha: 8
|
||||
lora_dropout: 0.05
|
||||
lora_dropout: 0
|
||||
lora_target_modules:
|
||||
- linear_attn.in_proj_ba
|
||||
- linear_attn.in_proj_qkvz
|
||||
@@ -34,12 +36,19 @@ lora_target_modules:
|
||||
- shared_expert.down_proj
|
||||
- shared_expert.gate_proj
|
||||
- shared_expert_gate
|
||||
- mlp.gate
|
||||
- q_proj
|
||||
- v_proj
|
||||
- k_proj
|
||||
- o_proj
|
||||
|
||||
# lora_target_parameters:
|
||||
# - mlp.experts.gate_up_proj
|
||||
# - mlp.experts.down_proj
|
||||
|
||||
lora_mlp_kernel: false
|
||||
lora_qkv_kernel: false
|
||||
lora_o_kernel: false
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
|
||||
@@ -9,6 +9,11 @@ from axolotl.utils.logging import get_logger
|
||||
|
||||
LOG = get_logger(__name__)
|
||||
|
||||
try:
|
||||
from fla.modules.convolution import causal_conv1d as fla_causal_conv1d
|
||||
except ImportError:
|
||||
fla_causal_conv1d = None
|
||||
|
||||
|
||||
def get_cu_seqlens(position_ids):
|
||||
"""
|
||||
@@ -137,6 +142,11 @@ def patch_qwen3_next_gateddelta_layer():
|
||||
and cache_position is not None
|
||||
)
|
||||
|
||||
# Compute cu_seqlens early for use by both causal_conv1d and chunk_gated_delta_rule
|
||||
cu_seqlens = None
|
||||
if not use_precomputed_states and position_ids is not None:
|
||||
cu_seqlens = get_cu_seqlens(position_ids=position_ids)
|
||||
|
||||
# getting projected states from cache if it exists
|
||||
if cache_params is not None:
|
||||
conv_state = cache_params.conv_states[self.layer_idx]
|
||||
@@ -151,12 +161,11 @@ def patch_qwen3_next_gateddelta_layer():
|
||||
x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value)
|
||||
)
|
||||
|
||||
mixed_qkv = torch.cat((query, key, value), dim=-1)
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2)
|
||||
mixed_qkv = torch.cat((query, key, value), dim=-1) # [B, T, D]
|
||||
|
||||
if use_precomputed_states:
|
||||
# 2. Convolution sequence transformation
|
||||
# NOTE: the conv state is updated in `causal_conv1d_update`
|
||||
# Inference single-token path: causal_conv1d_update expects [B, D, T]
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2)
|
||||
mixed_qkv = self.causal_conv1d_update(
|
||||
mixed_qkv,
|
||||
conv_state,
|
||||
@@ -164,24 +173,41 @@ def patch_qwen3_next_gateddelta_layer():
|
||||
self.conv1d.bias,
|
||||
self.activation,
|
||||
)
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2)
|
||||
else:
|
||||
if cache_params is not None:
|
||||
# Cache state expects [B, D, T] for the inference update path
|
||||
mixed_qkv_t = mixed_qkv.transpose(1, 2)
|
||||
conv_state = F.pad(
|
||||
mixed_qkv, (self.conv_kernel_size - mixed_qkv.shape[-1], 0)
|
||||
mixed_qkv_t,
|
||||
(self.conv_kernel_size - mixed_qkv_t.shape[-1], 0),
|
||||
)
|
||||
cache_params.conv_states[self.layer_idx] = conv_state
|
||||
if self.causal_conv1d_fn is not None:
|
||||
mixed_qkv = self.causal_conv1d_fn(
|
||||
|
||||
if fla_causal_conv1d is not None:
|
||||
# FLA Triton causal_conv1d: [B, T, D] in/out, with cu_seqlens support
|
||||
mixed_qkv, _ = fla_causal_conv1d(
|
||||
x=mixed_qkv,
|
||||
weight=self.conv1d.weight.squeeze(1),
|
||||
bias=self.conv1d.bias,
|
||||
activation=self.activation,
|
||||
seq_idx=None,
|
||||
cu_seqlens=cu_seqlens,
|
||||
)
|
||||
else:
|
||||
# PyTorch fallback (no cu_seqlens support)
|
||||
if cu_seqlens is not None and cu_seqlens.shape[0] > batch_size + 1:
|
||||
raise RuntimeError(
|
||||
"Packed sequences require fla.modules.convolution.causal_conv1d "
|
||||
"(cu_seqlens support). Install flash-linear-attention or disable packing."
|
||||
)
|
||||
LOG.warning_once(
|
||||
"FLA causal_conv1d not available. Falling back to PyTorch conv1d."
|
||||
)
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2)
|
||||
mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2)
|
||||
|
||||
mixed_qkv = mixed_qkv.transpose(1, 2)
|
||||
# mixed_qkv is [B, T, D] in all paths
|
||||
query, key, value = torch.split(
|
||||
mixed_qkv,
|
||||
[
|
||||
@@ -203,7 +229,6 @@ def patch_qwen3_next_gateddelta_layer():
|
||||
key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
|
||||
|
||||
if not use_precomputed_states:
|
||||
cu_seqlens = get_cu_seqlens(position_ids=position_ids)
|
||||
core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
|
||||
query,
|
||||
key,
|
||||
|
||||
Reference in New Issue
Block a user