bump transformers to 5.5.4 and trl to latest 1.1.0 (#3603)
* bump transformers to 5.5.4 and trl to latest 1.1.0
* more upgrades
* update peft too
* adapt lora_merge to peft 0.19 layer config API

  PEFT 0.19 requires a LoraConfig object on Linear/ParamWrapper/Conv layer constructors and moved use_rslora, use_dora, fan_in_fan_out, lora_dropout, and lora_bias into that config. Build the config per branch in _build_peft_layer_and_get_delta so the merge utility works with the upgraded peft.

* allow lora_dropout on mixed attention+MoE configs under peft 0.19

  PEFT 0.19's convert_peft_config_for_transformers auto-remaps old MoE target_modules (w1/w2/w3 on Mixtral, etc.) into target_parameters for transformers v5's fused 3D expert Parameters. Those targets get wrapped with ParamWrapper, which rejects lora_dropout != 0 because the 3D einsum can't factor dropout out of lora_B(lora_A(dropout(x))).

  Monkeypatch ParamWrapper.__init__ to internally use a copy of the LoraConfig with lora_dropout=0, so its dropout slot becomes nn.Identity while the shared config still delivers real dropout to sibling Linear LoRA layers (attention q/k/v/o). A probe runs the same conversion on a deep copy to detect the situation and emit a warning before patching.
This commit is contained in:
@@ -10,15 +10,15 @@ liger-kernel==0.7.0
|
|||||||
|
|
||||||
packaging==26.0
|
packaging==26.0
|
||||||
huggingface_hub>=1.1.7
|
huggingface_hub>=1.1.7
|
||||||
peft>=0.18.1
|
peft>=0.19.0,<0.20.0
|
||||||
tokenizers>=0.22.1
|
tokenizers>=0.22.1
|
||||||
transformers==5.5.3
|
transformers==5.5.4
|
||||||
accelerate==1.13.0
|
accelerate==1.13.0
|
||||||
datasets==4.5.0
|
datasets>=4.8.4,<4.9.0
|
||||||
deepspeed>=0.18.6,<0.19.0
|
deepspeed>=0.18.6,<0.19.0
|
||||||
trl==0.29.0
|
trl==1.1.0
|
||||||
hf_xet==1.3.2
|
hf_xet==1.4.3
|
||||||
kernels==0.12.2
|
kernels==0.13.0
|
||||||
|
|
||||||
fla-core==0.4.1
|
fla-core==0.4.1
|
||||||
flash-linear-attention==0.4.1
|
flash-linear-attention==0.4.1
|
||||||
|
|||||||
@@ -315,15 +315,27 @@ def _build_peft_layer_and_get_delta(
|
|||||||
"weight", nn.Parameter(base_tensor.clone(), requires_grad=False)
|
"weight", nn.Parameter(base_tensor.clone(), requires_grad=False)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# ParamWrapper rejects dropout/fan_in_fan_out/lora_bias/use_dora, so
|
||||||
|
# build a minimal config with only the fields it accepts.
|
||||||
|
pw_config = LoraConfig(
|
||||||
|
r=r,
|
||||||
|
lora_alpha=lora_alpha,
|
||||||
|
lora_dropout=0.0,
|
||||||
|
fan_in_fan_out=False,
|
||||||
|
use_rslora=use_rslora,
|
||||||
|
use_dora=False,
|
||||||
|
lora_bias=False,
|
||||||
|
)
|
||||||
|
|
||||||
with warnings.catch_warnings():
|
with warnings.catch_warnings():
|
||||||
warnings.simplefilter("ignore", UserWarning)
|
warnings.simplefilter("ignore", UserWarning)
|
||||||
layer = ParamWrapper(
|
layer = ParamWrapper(
|
||||||
fake,
|
fake,
|
||||||
adapter_name=adapter_name,
|
adapter_name=adapter_name,
|
||||||
parameter_name="weight",
|
parameter_name="weight",
|
||||||
|
config=pw_config,
|
||||||
r=r,
|
r=r,
|
||||||
lora_alpha=lora_alpha,
|
lora_alpha=lora_alpha,
|
||||||
use_rslora=use_rslora,
|
|
||||||
)
|
)
|
||||||
layer.lora_A[adapter_name].weight.data = lora_a
|
layer.lora_A[adapter_name].weight.data = lora_a
|
||||||
layer.lora_B[adapter_name].weight.data = lora_b
|
layer.lora_B[adapter_name].weight.data = lora_b
|
||||||
@@ -375,14 +387,19 @@ def _build_peft_layer_and_get_delta(
|
|||||||
)
|
)
|
||||||
base_layer.weight.data = base_tensor.clone()
|
base_layer.weight.data = base_tensor.clone()
|
||||||
|
|
||||||
layer = PeftConvCls(
|
conv_config = LoraConfig(
|
||||||
base_layer,
|
|
||||||
adapter_name=adapter_name,
|
|
||||||
r=r_total,
|
r=r_total,
|
||||||
lora_alpha=lora_alpha,
|
lora_alpha=lora_alpha,
|
||||||
use_rslora=use_rslora,
|
use_rslora=use_rslora,
|
||||||
use_dora=use_dora,
|
use_dora=use_dora,
|
||||||
)
|
)
|
||||||
|
layer = PeftConvCls(
|
||||||
|
base_layer,
|
||||||
|
adapter_name=adapter_name,
|
||||||
|
config=conv_config,
|
||||||
|
r=r_total,
|
||||||
|
lora_alpha=lora_alpha,
|
||||||
|
)
|
||||||
layer.lora_A[adapter_name].weight.data = lora_a
|
layer.lora_A[adapter_name].weight.data = lora_a
|
||||||
layer.lora_B[adapter_name].weight.data = lora_b
|
layer.lora_B[adapter_name].weight.data = lora_b
|
||||||
|
|
||||||
@@ -410,15 +427,20 @@ def _build_peft_layer_and_get_delta(
|
|||||||
or lora_config_dict.get("lora_fan_in_fan_out", False)
|
or lora_config_dict.get("lora_fan_in_fan_out", False)
|
||||||
)
|
)
|
||||||
|
|
||||||
layer = LoraLinear(
|
linear_config = LoraConfig(
|
||||||
base_layer,
|
|
||||||
adapter_name=adapter_name,
|
|
||||||
r=r_total,
|
r=r_total,
|
||||||
lora_alpha=lora_alpha,
|
lora_alpha=lora_alpha,
|
||||||
fan_in_fan_out=fan_in_fan_out,
|
fan_in_fan_out=fan_in_fan_out,
|
||||||
use_rslora=use_rslora,
|
use_rslora=use_rslora,
|
||||||
use_dora=use_dora,
|
use_dora=use_dora,
|
||||||
)
|
)
|
||||||
|
layer = LoraLinear(
|
||||||
|
base_layer,
|
||||||
|
adapter_name=adapter_name,
|
||||||
|
config=linear_config,
|
||||||
|
r=r_total,
|
||||||
|
lora_alpha=lora_alpha,
|
||||||
|
)
|
||||||
layer.lora_A[adapter_name].weight.data = lora_a
|
layer.lora_A[adapter_name].weight.data = lora_a
|
||||||
layer.lora_B[adapter_name].weight.data = lora_b
|
layer.lora_B[adapter_name].weight.data = lora_b
|
||||||
|
|
||||||
|
|||||||
@@ -124,6 +124,101 @@ def _patch_peft_clippable_linear():
|
|||||||
LoraModel._axolotl_clippable_patched = True
|
LoraModel._axolotl_clippable_patched = True
|
||||||
|
|
||||||
|
|
||||||
|
def _peft_will_auto_convert_target_params(model, lora_config) -> bool:
|
||||||
|
"""Check whether PEFT will auto-populate target_parameters for this model.
|
||||||
|
|
||||||
|
PEFT 0.19's ``convert_peft_config_for_transformers`` rewrites old MoE
|
||||||
|
``target_modules`` (e.g. ``w1``/``w2``/``w3`` on Mixtral) into
|
||||||
|
``target_parameters`` (``gate_up_proj``/``down_proj``) because
|
||||||
|
transformers v5 fused those expert linears into 3D ``nn.Parameter``
|
||||||
|
tensors. PEFT wraps the resulting 3D params with ``ParamWrapper``,
|
||||||
|
which rejects ``lora_dropout != 0``. This probe runs the conversion on
|
||||||
|
a copy of the config so we can detect the situation before
|
||||||
|
``get_peft_model`` blows up.
|
||||||
|
"""
|
||||||
|
if getattr(lora_config, "target_parameters", None):
|
||||||
|
return False
|
||||||
|
|
||||||
|
try:
|
||||||
|
from peft.utils.transformers_weight_conversion import (
|
||||||
|
convert_peft_config_for_transformers,
|
||||||
|
get_model_conversion_mapping,
|
||||||
|
)
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
import copy
|
||||||
|
|
||||||
|
probe_cfg = copy.deepcopy(lora_config)
|
||||||
|
try:
|
||||||
|
convert_peft_config_for_transformers(
|
||||||
|
probe_cfg,
|
||||||
|
model=model,
|
||||||
|
conversions=get_model_conversion_mapping(model),
|
||||||
|
)
|
||||||
|
except Exception: # pylint: disable=broad-except
|
||||||
|
return False
|
||||||
|
|
||||||
|
return bool(getattr(probe_cfg, "target_parameters", None))
|
||||||
|
|
||||||
|
|
||||||
|
def _patch_peft_param_wrapper_dropout():
    """Let PEFT's ``ParamWrapper`` silently accept ``lora_dropout != 0``.

    ``ParamWrapper`` wraps 3D expert ``nn.Parameter`` tensors and rejects
    non-zero dropout because dropout can't be factored out of
    ``lora_B(lora_A(dropout(x)))`` when the inner op is an expert-indexed
    matmul. For mixed configs (attention + MoE experts) this is too
    aggressive — the non-expert ``Linear`` LoRA layers *can* apply dropout
    and that's usually what the user intended. We pass a copy of the
    ``LoraConfig`` with ``lora_dropout=0`` only to ``ParamWrapper.__init__``
    so it builds with ``nn.Identity`` for its internal dropout slot while
    every other layer type still receives the real dropout value.

    The patch is applied at most once per process: a class-level
    ``_axolotl_dropout_patched`` flag makes repeated calls a no-op.
    """
    import copy
    import functools

    from peft.tuners.lora.layer import ParamWrapper

    if getattr(ParamWrapper, "_axolotl_dropout_patched", False):
        return

    _orig_init = ParamWrapper.__init__

    # wraps() keeps the original __init__'s name, docstring and signature
    # visible through the patched replacement.
    @functools.wraps(_orig_init)
    def _patched_init(
        self,
        base_layer,
        adapter_name,
        parameter_name,
        config,
        *args,
        **kwargs,
    ):
        if getattr(config, "lora_dropout", 0):
            # Shallow-copy so the shared config object keeps its real
            # dropout value; only this wrapper sees lora_dropout=0.
            config = copy.copy(config)
            config.lora_dropout = 0.0
        return _orig_init(
            self,
            base_layer,
            adapter_name,
            parameter_name,
            config,
            *args,
            **kwargs,
        )

    ParamWrapper.__init__ = _patched_init
    ParamWrapper._axolotl_dropout_patched = True
|
||||||
|
|
||||||
|
|
||||||
def load_lora(
|
def load_lora(
|
||||||
model: PreTrainedModel,
|
model: PreTrainedModel,
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
@@ -191,6 +286,20 @@ def load_lora(
|
|||||||
if config_only:
|
if config_only:
|
||||||
return None, lora_config
|
return None, lora_config
|
||||||
|
|
||||||
|
if getattr(
|
||||||
|
lora_config, "lora_dropout", 0
|
||||||
|
) and _peft_will_auto_convert_target_params(model, lora_config):
|
||||||
|
LOG.warning(
|
||||||
|
"lora_dropout=%s requested but PEFT will wrap this model's fused "
|
||||||
|
"MoE expert parameters with ParamWrapper, which cannot apply "
|
||||||
|
"dropout (the 3D einsum can't factor dropout out of "
|
||||||
|
"lora_B(lora_A(dropout(x)))). Dropout will still be applied to "
|
||||||
|
"non-expert LoRA layers (e.g. attention), and expert LoRA layers "
|
||||||
|
"will use nn.Identity for the dropout slot.",
|
||||||
|
lora_config.lora_dropout,
|
||||||
|
)
|
||||||
|
_patch_peft_param_wrapper_dropout()
|
||||||
|
|
||||||
rank = int(os.environ.get("LOCAL_RANK", 0))
|
rank = int(os.environ.get("LOCAL_RANK", 0))
|
||||||
|
|
||||||
if (
|
if (
|
||||||
|
|||||||
Reference in New Issue
Block a user