From 318f8bb03cf0b080d86a36c72b4514762a8572bf Mon Sep 17 00:00:00 2001
From: NanoCode012
Date: Tue, 11 Nov 2025 17:25:08 +0700
Subject: [PATCH] feat: add configuration file and fix import [skip ci]

---
 src/axolotl/loaders/patch_manager.py            |  36 +++++
 .../models/kimi_linear/configuration_kimi.py    | 146 ++++++++++++++++++
 .../models/kimi_linear/modeling_kimi.py         |   9 +-
 3 files changed, 190 insertions(+), 1 deletion(-)
 create mode 100644 src/axolotl/monkeypatch/models/kimi_linear/configuration_kimi.py

diff --git a/src/axolotl/loaders/patch_manager.py b/src/axolotl/loaders/patch_manager.py
index 81e4dd786..d2e341b6a 100644
--- a/src/axolotl/loaders/patch_manager.py
+++ b/src/axolotl/loaders/patch_manager.py
@@ -190,6 +190,42 @@ class PatchManager:
 
             apply_mistral_tokenizer_image_patch()
 
+        if self.cfg.model_config_type == "kimi_linear":
+            tokenizer_for_class_loading = AutoTokenizer.from_pretrained(
+                self.cfg.tokenizer_config, trust_remote_code=True
+            )
+            tokenizer_class = tokenizer_for_class_loading.__class__
+            del tokenizer_for_class_loading
+
+            def patched_apply_chat_template(
+                self,
+                conversation,
+                tools: Optional[list[dict]] = None,
+                tokenize: bool = True,  # <-- FIXED DEFAULT
+                add_generation_prompt: bool = False,  # <-- FIXED DEFAULT
+                **kwargs,
+            ):
+                """
+                A patched version of apply_chat_template with corrected defaults and no
+                external dependencies like deep_sort_dict.
+                """
+                # The line `tools = deep_sort_dict(tools)` has been removed.
+                # Now we just call the superclass method, passing all arguments along.
+                return super(tokenizer_class, self).apply_chat_template(
+                    conversation=conversation,
+                    tools=tools,
+                    tokenize=tokenize,
+                    add_generation_prompt=add_generation_prompt,
+                    **kwargs,
+                )
+
+            tokenizer_class.apply_chat_template = patched_apply_chat_template
+
+            print(
+                f"Successfully patched 'apply_chat_template' on class '{tokenizer_class.__name__}' "
+                "with new defaults (tokenize=True, add_generation_prompt=False)."
+            )
+
     def _apply_fp8_patches(self):
         """Apply patches for FP8 support."""
         if self.cfg.fp8:
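
For reference, a minimal sketch of what the corrected defaults mean for callers once the patch above is applied; the model id and messages below are placeholders, not part of the change:

```python
from transformers import AutoTokenizer

# Placeholder repo id; any tokenizer whose custom class was patched above behaves the same.
tokenizer = AutoTokenizer.from_pretrained(
    "moonshotai/Kimi-Linear-48B-A3B-Instruct", trust_remote_code=True
)

conversation = [
    {"role": "user", "content": "Hello!"},
    {"role": "assistant", "content": "Hi there."},
]

# With the patched defaults this returns token ids (tokenize=True) and does not
# append a generation prompt (add_generation_prompt=False).
token_ids = tokenizer.apply_chat_template(conversation)

# Inference-style prompting now has to opt in explicitly.
prompt_text = tokenizer.apply_chat_template(
    conversation, tokenize=False, add_generation_prompt=True
)
```
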
diff --git a/src/axolotl/monkeypatch/models/kimi_linear/configuration_kimi.py b/src/axolotl/monkeypatch/models/kimi_linear/configuration_kimi.py
new file mode 100644
index 000000000..3b1977d94
--- /dev/null
+++ b/src/axolotl/monkeypatch/models/kimi_linear/configuration_kimi.py
@@ -0,0 +1,146 @@
+"""
+Kimi-Linear configuration.
+
+Source: https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct/blob/main/configuration_kimi.py
+Revision: 6e163f3
+"""
+
+from typing import Optional
+
+from transformers.configuration_utils import PretrainedConfig
+
+
+class KimiLinearConfig(PretrainedConfig):
+    model_type = "kimi_linear"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        model_type="kimi_linear",
+        vocab_size=163840,
+        hidden_size=4096,
+        head_dim=None,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        tie_word_embeddings=False,
+        moe_intermediate_size: Optional[int] = None,
+        moe_renormalize: bool = True,
+        moe_router_activation_func: str = "sigmoid",
+        num_experts: Optional[int] = None,
+        num_experts_per_token: Optional[int] = None,
+        num_shared_experts: int = 0,
+        routed_scaling_factor: float = 1.0,
+        first_k_dense_replace: int = 0,
+        moe_layer_freq: int = 1,
+        use_grouped_topk: bool = True,
+        num_expert_group: int = 1,
+        topk_group: int = 1,
+        q_lora_rank: Optional[int] = None,
+        kv_lora_rank: Optional[int] = None,
+        qk_nope_head_dim: Optional[int] = None,
+        qk_rope_head_dim: Optional[int] = None,
+        v_head_dim: Optional[int] = None,
+        mla_use_nope: Optional[bool] = False,
+        num_nextn_predict_layers: int = 0,
+        linear_attn_config: Optional[dict] = None,
+        **kwargs,
+    ):
+        self.model_type = model_type
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.head_dim = (
+            head_dim if head_dim is not None else hidden_size // num_attention_heads
+        )
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+
+        self.q_lora_rank = q_lora_rank
+        self.kv_lora_rank = kv_lora_rank
+        self.qk_nope_head_dim = qk_nope_head_dim
+        self.qk_rope_head_dim = qk_rope_head_dim
+        self.v_head_dim = v_head_dim
+        self.mla_use_nope = mla_use_nope
+        # moe config
+        self.num_experts = num_experts
+        self.num_experts_per_token = num_experts_per_token
+        self.moe_renormalize = moe_renormalize
+        self.num_shared_experts = num_shared_experts
+        self.routed_scaling_factor = routed_scaling_factor
+        self.moe_router_activation_func = moe_router_activation_func
+        assert self.moe_router_activation_func in ("softmax", "sigmoid")
+        self.moe_intermediate_size = moe_intermediate_size
+        self.first_k_dense_replace = first_k_dense_replace
+        self.moe_layer_freq = moe_layer_freq
+        self.use_grouped_topk = use_grouped_topk
+        self.num_expert_group = num_expert_group
+        self.topk_group = topk_group
+        self.num_nextn_predict_layers = num_nextn_predict_layers
+
+        if linear_attn_config is not None:
+            assert linear_attn_config["kda_layers"] is not None
+            assert linear_attn_config["full_attn_layers"] is not None
+        self.linear_attn_config = linear_attn_config
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    @property
+    def is_mla(self):
+        return (
+            self.q_lora_rank is not None
+            or self.kv_lora_rank is not None
+            or self.qk_nope_head_dim is not None
+            or self.qk_rope_head_dim is not None
+            or self.v_head_dim is not None
+            or self.mla_use_nope is True
+        )
+
+    @property
+    def is_moe(self):
+        return self.num_experts is not None
+
+    @property
+    def is_linear_attn(self) -> bool:
+        return not (
+            self.linear_attn_config is None
+            or (
+                isinstance(self.linear_attn_config, dict)
+                and self.linear_attn_config["kda_layers"] is not None
+                and len(self.linear_attn_config["kda_layers"]) == 0
+            )
+        )
+
+    def is_kda_layer(self, layer_idx: int):
+        return (
+            self.linear_attn_config is not None
+            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
+        )
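
A small usage sketch for the configuration class added above; the field values are illustrative assumptions, not the shipped checkpoint's config:

```python
from axolotl.monkeypatch.models.kimi_linear.configuration_kimi import KimiLinearConfig

# Illustrative values only; the real checkpoint's config.json defines the actual layout.
config = KimiLinearConfig(
    num_hidden_layers=4,
    num_experts=64,
    num_experts_per_token=8,
    kv_lora_rank=512,
    linear_attn_config={
        "kda_layers": [1, 2, 3],  # 1-based indices of layers using KDA (linear attention)
        "full_attn_layers": [4],  # layers that keep full attention
    },
)

assert config.is_moe          # num_experts is set
assert config.is_mla          # kv_lora_rank is set
assert config.is_linear_attn  # kda_layers is non-empty

# is_kda_layer() takes a 0-based layer index and compares (layer_idx + 1)
# against the 1-based entries in kda_layers.
assert config.is_kda_layer(0) and not config.is_kda_layer(3)
```
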
diff --git a/src/axolotl/monkeypatch/models/kimi_linear/modeling_kimi.py b/src/axolotl/monkeypatch/models/kimi_linear/modeling_kimi.py
index 12701c603..5c01474d4 100644
--- a/src/axolotl/monkeypatch/models/kimi_linear/modeling_kimi.py
+++ b/src/axolotl/monkeypatch/models/kimi_linear/modeling_kimi.py
@@ -1,3 +1,10 @@
+"""
+Adapted Kimi-Linear modeling to make MoE differentiable.
+
+Source: https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct/blob/main/modeling_kimi.py
+Revision: 6e163f3
+"""
+
 import math
 from collections.abc import Callable
 from typing import Any, List, Optional, Tuple, Union
@@ -38,7 +45,7 @@ except ImportError as err:
         "Plese run `pip uninstall fla-core flash-linear-attention -y && pip install git+https://github.com/fla-org/flash-linear-attention@v0.4.0`"
     ) from err
 
-from .configuration_kimi import KimiLinearConfig
+from axolotl.monkeypatch.models.kimi_linear.configuration_kimi import KimiLinearConfig
 
 assert version.parse(transformers.__version__) >= version.parse("4.56.0"), (
     "Please upgrade transformers to >= 4.56.0"
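
A possible pre-flight check mirroring the guards referenced in modeling_kimi.py; the `fla` module name is assumed from the flash-linear-attention package named in the error message:

```python
import transformers
from packaging import version

# Mirror the transformers version guard from modeling_kimi.py.
assert version.parse(transformers.__version__) >= version.parse("4.56.0"), (
    "Please upgrade transformers to >= 4.56.0"
)

try:
    import fla  # assumed import name for fla-core / flash-linear-attention
except ImportError as err:
    raise ImportError(
        "Please run `pip uninstall fla-core flash-linear-attention -y && "
        "pip install git+https://github.com/fla-org/flash-linear-attention@v0.4.0`"
    ) from err

# The absolute import below works regardless of how the monkeypatch package is
# laid out, which is why the relative `.configuration_kimi` import was replaced.
from axolotl.monkeypatch.models.kimi_linear.configuration_kimi import KimiLinearConfig
```
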