fix: remove doc string imports in monkeypatches (#2671) [skip ci]
This commit is contained in:
@@ -20,25 +20,15 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.cohere.modeling_cohere import (
|
from transformers.models.cohere.modeling_cohere import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
COHERE_INPUTS_DOCSTRING,
|
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(COHERE_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -17,25 +17,15 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.gemma.modeling_gemma import (
|
from transformers.models.gemma.modeling_gemma import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
GEMMA_INPUTS_DOCSTRING,
|
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(GEMMA_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -20,15 +20,11 @@ from torch import nn
|
|||||||
from transformers.cache_utils import Cache, HybridCache
|
from transformers.cache_utils import Cache, HybridCache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.gemma3.modeling_gemma3 import (
|
from transformers.models.gemma3.modeling_gemma3 import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
GEMMA3_INPUTS_DOCSTRING,
|
|
||||||
Gemma3CausalLMOutputWithPast,
|
Gemma3CausalLMOutputWithPast,
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
from transformers.utils import (
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
is_torchdynamo_compiling,
|
is_torchdynamo_compiling,
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
@@ -38,10 +34,6 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
@@ -170,10 +162,6 @@ def cce_forward(
|
|||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward_multimodal(
|
def cce_forward_multimodal(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -19,15 +19,9 @@ from transformers.modeling_outputs import (
|
|||||||
CausalLMOutputWithPast,
|
CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from transformers.models.llama.modeling_llama import (
|
from transformers.models.llama.modeling_llama import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
LLAMA_INPUTS_DOCSTRING,
|
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
from transformers.utils.generic import can_return_tuple
|
from transformers.utils.generic import can_return_tuple
|
||||||
|
|
||||||
@@ -36,10 +30,6 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(LLAMA_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -16,22 +16,12 @@ from torch import nn
|
|||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
from transformers.models.llama4.modeling_llama4 import (
|
from transformers.models.llama4.modeling_llama4 import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
LLAMA4_INPUTS_DOCSTRING,
|
|
||||||
Llama4CausalLMOutputWithPast,
|
Llama4CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings_to_model_forward(LLAMA4_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
@@ -160,9 +150,6 @@ def cce_forward(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=Llama4CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward_multimodal(
|
def cce_forward_multimodal(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None, # type: ignore
|
input_ids: torch.LongTensor | None = None, # type: ignore
|
||||||
|
|||||||
@@ -19,15 +19,11 @@ from transformers.models.mistral3.modeling_mistral3 import (
|
|||||||
Mistral3CausalLMOutputWithPast,
|
Mistral3CausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from transformers.models.mistral.modeling_mistral import (
|
from transformers.models.mistral.modeling_mistral import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
MISTRAL_INPUTS_DOCSTRING,
|
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
from transformers.utils import (
|
from transformers.utils import (
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
is_torchdynamo_compiling,
|
is_torchdynamo_compiling,
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
@@ -35,10 +31,6 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(MISTRAL_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward(
|
def cce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor | None = None,
|
input_ids: torch.LongTensor | None = None,
|
||||||
|
|||||||
@@ -13,16 +13,10 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
apply_lce,
|
apply_lce,
|
||||||
)
|
)
|
||||||
from transformers.models.qwen2_moe.modeling_qwen2_moe import (
|
from transformers.models.qwen2_moe.modeling_qwen2_moe import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
QWEN2MOE_INPUTS_DOCSTRING,
|
|
||||||
MoeCausalLMOutputWithPast,
|
MoeCausalLMOutputWithPast,
|
||||||
MoeModelOutputWithPast,
|
MoeModelOutputWithPast,
|
||||||
load_balancing_loss_func,
|
load_balancing_loss_func,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
from transformers.utils.generic import can_return_tuple
|
from transformers.utils.generic import can_return_tuple
|
||||||
|
|
||||||
@@ -31,10 +25,6 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(QWEN2MOE_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -14,22 +14,12 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
)
|
)
|
||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from transformers.models.qwen2_vl.modeling_qwen2_vl import (
|
from transformers.models.qwen2_vl.modeling_qwen2_vl import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
QWEN2_VL_INPUTS_DOCSTRING,
|
|
||||||
Qwen2VLCausalLMOutputWithPast,
|
Qwen2VLCausalLMOutputWithPast,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
|
|
||||||
_PATCH_OPTS: PatchOptions | None = None
|
_PATCH_OPTS: PatchOptions | None = None
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings_to_model_forward(QWEN2_VL_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=Qwen2VLCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def cce_forward_multimodal(
|
def cce_forward_multimodal(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -12,20 +12,13 @@ from cut_cross_entropy.transformers.utils import (
|
|||||||
TransformersModelT,
|
TransformersModelT,
|
||||||
apply_lce,
|
apply_lce,
|
||||||
)
|
)
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
|
||||||
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
|
from transformers.models.qwen3_moe.modeling_qwen3_moe import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
QWEN3_MOE_INPUTS_DOCSTRING,
|
|
||||||
KwargsForCausalLM,
|
KwargsForCausalLM,
|
||||||
MoeCausalLMOutputWithPast,
|
MoeCausalLMOutputWithPast,
|
||||||
MoeModelOutputWithPast,
|
MoeModelOutputWithPast,
|
||||||
load_balancing_loss_func,
|
load_balancing_loss_func,
|
||||||
)
|
)
|
||||||
from transformers.processing_utils import Unpack
|
from transformers.processing_utils import Unpack
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
from transformers.utils.generic import can_return_tuple
|
from transformers.utils.generic import can_return_tuple
|
||||||
|
|
||||||
@@ -34,10 +27,6 @@ _PATCH_OPTS: PatchOptions | None = None
|
|||||||
|
|
||||||
@can_return_tuple
|
@can_return_tuple
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(QWEN3_MOE_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def forward(
|
def forward(
|
||||||
self,
|
self,
|
||||||
input_ids: Optional[torch.LongTensor] = None,
|
input_ids: Optional[torch.LongTensor] = None,
|
||||||
|
|||||||
@@ -14,10 +14,6 @@ from torch.nn import CrossEntropyLoss
|
|||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||||
|
|
||||||
|
|
||||||
# @add_start_docstrings_to_model_forward(DeepseekV2_INPUTS_DOCSTRING)
|
|
||||||
# @replace_return_docstrings(
|
|
||||||
# output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
# )
|
|
||||||
def lce_forward(
|
def lce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor = None,
|
input_ids: torch.LongTensor = None,
|
||||||
|
|||||||
@@ -13,21 +13,11 @@ from liger_kernel.transformers.fused_linear_cross_entropy import (
|
|||||||
from torch.nn import CrossEntropyLoss
|
from torch.nn import CrossEntropyLoss
|
||||||
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
|
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
|
||||||
from transformers.models.jamba.modeling_jamba import (
|
from transformers.models.jamba.modeling_jamba import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
JAMBA_INPUTS_DOCSTRING,
|
|
||||||
HybridMambaAttentionDynamicCache,
|
HybridMambaAttentionDynamicCache,
|
||||||
load_balancing_loss_func,
|
load_balancing_loss_func,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@add_start_docstrings_to_model_forward(JAMBA_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=MoeCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def lce_forward(
|
def lce_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor = None,
|
input_ids: torch.LongTensor = None,
|
||||||
|
|||||||
@@ -7,24 +7,16 @@ from typing import Optional, Tuple, Union
|
|||||||
import torch
|
import torch
|
||||||
from transformers.cache_utils import Cache
|
from transformers.cache_utils import Cache
|
||||||
from transformers.models.gemma3.modeling_gemma3 import (
|
from transformers.models.gemma3.modeling_gemma3 import (
|
||||||
_CONFIG_FOR_DOC,
|
|
||||||
GEMMA3_INPUTS_DOCSTRING,
|
|
||||||
Gemma3CausalLMOutputWithPast,
|
Gemma3CausalLMOutputWithPast,
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
from transformers.utils import (
|
from transformers.utils import (
|
||||||
add_start_docstrings_to_model_forward,
|
|
||||||
is_torchdynamo_compiling,
|
is_torchdynamo_compiling,
|
||||||
replace_return_docstrings,
|
|
||||||
)
|
)
|
||||||
from transformers.utils.deprecation import deprecate_kwarg
|
from transformers.utils.deprecation import deprecate_kwarg
|
||||||
|
|
||||||
|
|
||||||
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
|
||||||
@add_start_docstrings_to_model_forward(GEMMA3_INPUTS_DOCSTRING)
|
|
||||||
@replace_return_docstrings(
|
|
||||||
output_type=Gemma3CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC
|
|
||||||
)
|
|
||||||
def new_forward(
|
def new_forward(
|
||||||
self,
|
self,
|
||||||
input_ids: torch.LongTensor = None,
|
input_ids: torch.LongTensor = None,
|
||||||
|
|||||||
Reference in New Issue
Block a user