Refactor and fix test isolation issues
This commit is contained in:
@@ -1,4 +1,5 @@
|
||||
"""CLI to convert a transformers model's attns to diff attns."""
|
||||
"""CLI to convert a transformers model's attention layers to differential attention layers."""
|
||||
|
||||
import logging
|
||||
import warnings
|
||||
from pathlib import Path
|
||||
@@ -127,6 +128,7 @@ def convert_diff_transformer(cfg, cli_args, config_path):
|
||||
else:
|
||||
modified_cfg["plugins"] = [plugin_class]
|
||||
|
||||
# Write out the updated axolotl config while preserving original ordering / formatting
|
||||
dump_yaml_preserved_order(
|
||||
data=modified_cfg,
|
||||
reference_yaml_path=config_path,
|
||||
|
||||
@@ -12,14 +12,12 @@ from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_model, load_tokenizer
|
||||
|
||||
configure_logging()
|
||||
LOG = logging.getLogger("axolotl.common.cli")
|
||||
LOG = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@dataclass
|
||||
class PreprocessCliArgs:
|
||||
"""
|
||||
dataclass with arguments for preprocessing only
|
||||
"""
|
||||
"""dataclass with arguments for preprocessing only"""
|
||||
|
||||
debug: bool = field(default=False)
|
||||
debug_text_only: bool = field(default=False)
|
||||
@@ -30,9 +28,7 @@ class PreprocessCliArgs:
|
||||
|
||||
@dataclass
|
||||
class TrainerCliArgs:
|
||||
"""
|
||||
dataclass with various non-training arguments
|
||||
"""
|
||||
"""dataclass with various non-training arguments"""
|
||||
|
||||
debug: bool = field(default=False)
|
||||
debug_text_only: bool = field(default=False)
|
||||
@@ -45,9 +41,7 @@ class TrainerCliArgs:
|
||||
|
||||
@dataclass
|
||||
class EvaluateCliArgs:
|
||||
"""
|
||||
dataclass with various evaluation arguments
|
||||
"""
|
||||
"""dataclass with various evaluation arguments"""
|
||||
|
||||
debug: bool = field(default=False)
|
||||
debug_text_only: bool = field(default=False)
|
||||
@@ -56,9 +50,7 @@ class EvaluateCliArgs:
|
||||
|
||||
@dataclass
|
||||
class ConvertDiffTransformerCliArgs:
|
||||
"""
|
||||
dataclass with arguments for convert-diff-transformer CLI
|
||||
"""
|
||||
"""dataclass with arguments for convert-diff-transformer CLI"""
|
||||
|
||||
debug: bool = field(default=False)
|
||||
zero_init: bool = field(default=False)
|
||||
|
||||
@@ -98,9 +98,13 @@ def convert_to_diff_attn(
|
||||
|
||||
# Iterate through module children, convert any attn layers to diff attn
|
||||
for name, child in module.named_children():
|
||||
if isinstance(child, tuple(ATTENTION_MAPPING.keys())):
|
||||
# Choose appropriate differential attention class
|
||||
attention_class = ATTENTION_MAPPING[type(child)]
|
||||
child_class_name = type(child).__name__
|
||||
if child_class_name in [k.__name__ for k in ATTENTION_MAPPING]:
|
||||
# Find matching attention class by name
|
||||
for orig_class, diff_class in ATTENTION_MAPPING.items():
|
||||
if orig_class.__name__ == child_class_name:
|
||||
attention_class = diff_class
|
||||
break
|
||||
|
||||
layer_type = type(child).__name__
|
||||
logger.info(
|
||||
|
||||
@@ -21,7 +21,6 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
|
||||
"""torch.repeat_interleave(x, dim=1, repeats=n_rep)"""
|
||||
batch_size, n_kv_heads, slen, head_dim = x.shape
|
||||
if n_rep == 1:
|
||||
return x
|
||||
@@ -249,6 +248,7 @@ class LlamaDifferentialAttention(DifferentialAttentionBase):
|
||||
class LlamaDifferentialSdpaAttention(DifferentialAttentionBase):
|
||||
"""SDPA-based implementation of differential attention."""
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -312,6 +312,7 @@ class LlamaDifferentialSdpaAttention(DifferentialAttentionBase):
|
||||
class LlamaDifferentialFlashAttention2(DifferentialAttentionBase):
|
||||
"""Flash Attention 2-based implementation of differential attention."""
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
|
||||
@@ -84,6 +84,11 @@ class OrderedDumper(yaml.SafeDumper):
|
||||
"""Custom YAML dumper that maintains dictionary order."""
|
||||
|
||||
|
||||
def represent_none(self, _):
|
||||
"""Represent None values as empty fields."""
|
||||
return self.represent_scalar("tag:yaml.org,2002:null", "")
|
||||
|
||||
|
||||
def ordered_dict_representer(dumper: OrderedDumper, data: Dict) -> Any:
|
||||
"""Custom representer for dictionaries that maintains order."""
|
||||
return dumper.represent_mapping("tag:yaml.org,2002:map", data.items())
|
||||
@@ -121,7 +126,8 @@ def dump_yaml_preserved_order(
|
||||
# Reorder the data
|
||||
ordered_data = reorder_dict(data, tracker.structure)
|
||||
|
||||
# Register the custom representer
|
||||
# Register the custom representers
|
||||
OrderedDumper.add_representer(type(None), represent_none)
|
||||
OrderedDumper.add_representer(dict, ordered_dict_representer)
|
||||
OrderedDumper.add_representer(OrderedDict, ordered_dict_representer)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user