Compare commits
16 Commits
feat/liger
...
flex_patch
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
deb01959d2 | ||
|
|
76ae4ae238 | ||
|
|
f85861a0b2 | ||
|
|
630e40dd13 | ||
|
|
bf9efe2a09 | ||
|
|
2f147cc6ff | ||
|
|
6f47b1e896 | ||
|
|
e1a8dfbe8c | ||
|
|
cdb16069af | ||
|
|
75c565d476 | ||
|
|
bdaaba2784 | ||
|
|
04624c5a8d | ||
|
|
b98dbafc31 | ||
|
|
4d320e2e4d | ||
|
|
421e0ee499 | ||
|
|
4e8677027a |
@@ -68,7 +68,7 @@ def run_cmd(cmd: str, run_folder: str):
|
||||
@app.function(
|
||||
image=cicd_image,
|
||||
gpu=GPU_CONFIG,
|
||||
timeout=60 * 60,
|
||||
timeout=90 * 60,
|
||||
cpu=8.0,
|
||||
memory=131072 * N_GPUS,
|
||||
volumes=VOLUME_CONFIG,
|
||||
|
||||
10
examples/llama-4/README.md
Normal file
10
examples/llama-4/README.md
Normal file
@@ -0,0 +1,10 @@
|
||||
# Llama 4 by Meta AI
|
||||
|
||||
## Available Examples
|
||||
|
||||
### Llama 4 Scout 17Bx16Experts (109B)
|
||||
- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
|
||||
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
|
||||
- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)
|
||||
|
||||
Our single-GPU implementation for Llama 4 Scout uses only 68.5GB of VRAM for post-training with a 4k context length, at 546 tokens/second.
|
||||
86
examples/llama-4/scout-qlora-single-h100.yaml
Normal file
86
examples/llama-4/scout-qlora-single-h100.yaml
Normal file
@@ -0,0 +1,86 @@
|
||||
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||
model_type: Llama4ForConditionalGeneration
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
strict: false
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm: true
|
||||
liger_layer_norm: true
|
||||
|
||||
llama4_linearized_experts: true
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_target_modules:
|
||||
- self_attn.q_proj
|
||||
- self_attn.k_proj
|
||||
- self_attn.v_proj
|
||||
- self_attn.o_proj
|
||||
- shared_expert.gate_proj
|
||||
- shared_expert.up_proj
|
||||
- shared_expert.down_proj
|
||||
# - experts.gate_projs.[0-9]+$
|
||||
# - experts.up_projs.[0-9]+$
|
||||
# - experts.down_projs.[0-9]+$
|
||||
lora_modules_to_save:
|
||||
# - lm_head
|
||||
# - embed_tokens
|
||||
|
||||
lora_mlp_kernel: true
|
||||
lora_qkv_kernel: true
|
||||
lora_o_kernel: true
|
||||
|
||||
chat_template: llama4
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
type: chat_template
|
||||
split: train[:20%]
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096 # up to 8k will work on a single H100
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_4bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 1e-4
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
gradient_checkpointing: offload
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
eos_token: <|eot|>
|
||||
@@ -1,13 +1,28 @@
|
||||
base_model: meta-llama/Llama-4-Scout-17B-16E
|
||||
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||
model_type: Llama4ForConditionalGeneration
|
||||
processor_type: Llama4Processor
|
||||
# Automatically upload checkpoint and final model to HF
|
||||
# hub_model_id: username/custom_model_name
|
||||
|
||||
strict: false
|
||||
|
||||
# torch_compile: true
|
||||
# these 3 lines are needed for now to handle vision chat templates w images
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
adapter: lora
|
||||
sequence_len: 4096
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
|
||||
liger_glu_activation: true
|
||||
liger_rms_norm: true
|
||||
liger_layer_norm: true
|
||||
|
||||
llama4_linearized_experts: true # use Axolotl's customized model
|
||||
load_in_4bit: true
|
||||
adapter: qlora
|
||||
lora_r: 32
|
||||
lora_alpha: 64
|
||||
lora_target_modules:
|
||||
@@ -15,60 +30,59 @@ lora_target_modules:
|
||||
- self_attn.k_proj
|
||||
- self_attn.v_proj
|
||||
- self_attn.o_proj
|
||||
- shared_expert.gate_proj
|
||||
- shared_expert.up_proj
|
||||
- shared_expert.down_proj
|
||||
- vision_adapter.mlp.fc1
|
||||
- vision_adapter.mlp.fc2
|
||||
# - experts.gate_projs.[0-9]+$
|
||||
# - experts.up_projs.[0-9]+$
|
||||
# - experts.down_projs.[0-9]+$
|
||||
lora_modules_to_save:
|
||||
- lm_head
|
||||
- embed_tokens
|
||||
|
||||
chat_template: llama4
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:20%]
|
||||
field_messages: conversations
|
||||
message_property_mappings:
|
||||
role: from
|
||||
content: value
|
||||
split: train[:1%]
|
||||
field_messages: messages
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch_8bit
|
||||
optimizer: adamw_torch_4bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
bf16: true
|
||||
tf32: true
|
||||
|
||||
# gradient_checkpointing: true
|
||||
# gradient_checkpointing_kwargs:
|
||||
# use_reentrant: false
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
- auto_wrap
|
||||
- full_shard
|
||||
fsdp_config:
|
||||
fsdp_version: 2
|
||||
fsdp_offload_params: false
|
||||
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||
fsdp_limit_all_gathers: true
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_offload_params: true
|
||||
fsdp_use_orig_params: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_reshard_after_forward: true
|
||||
fsdp_activation_checkpointing: true
|
||||
special_tokens:
|
||||
pad_token: <|finetune_right_pad_id|>
|
||||
@@ -12,7 +12,7 @@ liger-kernel==0.5.6
|
||||
packaging==23.2
|
||||
|
||||
peft==0.15.1
|
||||
transformers==4.51.0
|
||||
transformers==4.51.1
|
||||
tokenizers>=0.21.1
|
||||
accelerate==1.6.0
|
||||
datasets==3.5.0
|
||||
|
||||
@@ -185,21 +185,7 @@ class LigerPlugin(BasePlugin):
|
||||
rms_norm=cfg.liger_rms_norm,
|
||||
layer_norm=cfg.liger_layer_norm,
|
||||
)
|
||||
# Not fully tested. No suitable small MoE model to test
|
||||
# with train-ready modeling source
|
||||
elif cfg.model_config_type == "deepseek_v3":
|
||||
from axolotl.integrations.liger.models.deepseekv3 import (
|
||||
apply_liger_kernel_to_deepseekv3,
|
||||
else:
|
||||
logging.warning(
|
||||
f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
|
||||
)
|
||||
|
||||
apply_liger_kernel_to_deepseekv3(
|
||||
base_model=cfg.base_model,
|
||||
trust_remote_code=cfg.trust_remote_code,
|
||||
cross_entropy=cfg.liger_cross_entropy,
|
||||
fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
|
||||
rms_norm=cfg.liger_rms_norm,
|
||||
glu_activation=cfg.liger_glu_activation,
|
||||
layer_norm=cfg.liger_layer_norm,
|
||||
)
|
||||
elif cfg.model_config_type in ["deepseek_v3"]:
|
||||
raise ValueError(f"Unsupported model config type: {cfg.model_config_type}")
|
||||
|
||||
@@ -1,464 +0,0 @@
|
||||
"""
|
||||
DeepseekV3 model with LigerFusedLinearCrossEntropyLoss
|
||||
"""
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
|
||||
import sys
|
||||
from copy import deepcopy
|
||||
from functools import partial
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
|
||||
from torch.nn import functional as F
|
||||
from transformers.cache_utils import Cache, DynamicCache
|
||||
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
|
||||
from transformers.modeling_outputs import (
|
||||
BaseModelOutputWithPast,
|
||||
CausalLMOutputWithPast,
|
||||
)
|
||||
from transformers.models.deepseek_v3.modeling_deepseek_v3 import (
|
||||
KwargsForCausalLM,
|
||||
logger,
|
||||
)
|
||||
from transformers.processing_utils import Unpack
|
||||
|
||||
|
||||
def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs: Unpack[KwargsForCausalLM],
) -> CausalLMOutputWithPast:
    """
    Causal-LM forward pass that uses Liger's fused linear cross entropy loss.

    When the module is in training mode and `labels` are given, the loss is
    computed by `LigerForCausalLMLoss` straight from the final hidden states
    and the `lm_head` weight; in that case the returned `logits` are `None`.
    In every other case the logits are computed through `lm_head` and, if
    `labels` are given, the loss comes from `self.loss_function`.

    Args:
        labels: Optional target token ids. When omitted, no loss is computed.
        logits_to_keep: If an `int`, the hidden states are sliced with
            `slice(-logits_to_keep, None)` along dimension 1 before `lm_head`
            is applied (so `0` keeps every position). Any other value is used
            directly as the index along that dimension.
        **kwargs: Forwarded to `self.model` and to whichever loss function
            is used.

    All remaining arguments are passed through to `self.model` unchanged,
    except that `output_attentions` / `output_hidden_states` fall back to the
    values on `self.config` when they are `None`.

    Returns:
        A `CausalLMOutputWithPast` carrying the loss, the logits (or `None`),
        and the cache / hidden states / attentions reported by `self.model`.
    """
    # Fall back to the configured defaults when the caller did not decide.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states

    backbone_out: BaseModelOutputWithPast = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )
    final_hidden = backbone_out.last_hidden_state

    loss = None
    logits = None

    if self.training and labels is not None:
        # Fused path: the logits are never materialized.
        loss = LigerForCausalLMLoss(
            hidden_states=final_hidden,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )
    else:
        # Only compute the logits that were asked for.
        if isinstance(logits_to_keep, int):
            keep = slice(-logits_to_keep, None)
        else:
            keep = logits_to_keep
        logits = self.lm_head(final_hidden[:, keep, :])

        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=backbone_out.past_key_values,
        hidden_states=backbone_out.hidden_states,
        attentions=backbone_out.attentions,
    )
|
||||
|
||||
|
||||
# adapted from https://github.com/ScienceOne-AI/DeepSeek-671B-SFT-Guide/blob/ccf17c581b9c42eca007aae793e164b66a0fbaab/model/DeepSeek-V3-BF16/modeling_deepseek.py#L424
|
||||
def moe_forward(self, hidden_states):
    """
    Select the top-k experts for every token and compute the gate weights.

    Args:
        hidden_states: Tensor that unpacks as `(bsz, seq_len, h)`. It is
            flattened to `(bsz * seq_len, h)` before scoring.

    Returns:
        A tuple `(topk_idx, topk_weight, aux_loss)`. `topk_idx` and
        `topk_weight` have one row per token and `self.top_k` columns.
        `aux_loss` is `None` unless the module is training and
        `self.alpha > 0`.

    Raises:
        NotImplementedError: If `self.scoring_func` or `self.topk_method`
            is not one of the values handled below.
    """
    bsz, seq_len, h = hidden_states.shape
    n_tokens = bsz * seq_len

    # compute gating score (always in float32)
    hidden_states = hidden_states.view(-1, h)
    logits = F.linear(
        hidden_states.type(torch.float32), self.weight.type(torch.float32), None
    )
    if self.scoring_func == "sigmoid":
        scores = logits.sigmoid()
    elif self.scoring_func == "softmax":
        scores = logits.softmax(dim=-1, dtype=torch.float32)
    else:
        raise NotImplementedError(
            f"insupportable scoring function for MoE gating: {self.scoring_func}"
        )

    # select top-k experts
    if self.topk_method == "noaux_tc":
        # Experts are chosen on bias-corrected scores, but the returned
        # weights are gathered from the uncorrected scores.
        scores_for_choice = scores.view(
            n_tokens, -1
        ) + self.e_score_correction_bias.unsqueeze(0)
        # A group's score is the sum of its two best corrected scores.
        group_scores = (
            scores_for_choice.view(n_tokens, self.n_group, -1)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )  # [n, n_group]
        score_mask = _group_limited_expert_mask(
            group_scores, self.topk_group, self.n_routed_experts // self.n_group
        )  # [n, e]
        tmp_scores = scores_for_choice.masked_fill(~score_mask, 0.0)  # [n, e]
        _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
        topk_weight = scores.gather(1, topk_idx)
    elif self.topk_method == "greedy":
        topk_weight, topk_idx = torch.topk(scores, k=self.top_k, dim=-1, sorted=False)
    elif self.topk_method == "group_limited_greedy":
        # A group's score is the best score among its experts.
        group_scores = (
            scores.view(n_tokens, self.n_group, -1).max(dim=-1).values
        )  # [n, n_group]
        score_mask = _group_limited_expert_mask(
            group_scores, self.topk_group, self.n_routed_experts // self.n_group
        )  # [n, e]
        tmp_scores = scores.masked_fill(~score_mask, 0.0)  # [n, e]
        topk_weight, topk_idx = torch.topk(
            tmp_scores, k=self.top_k, dim=-1, sorted=False
        )
    else:
        # Without this branch an unknown method fell through and crashed
        # later with an UnboundLocalError on `topk_weight`.
        raise NotImplementedError(
            f"unsupported top-k method for MoE gating: {self.topk_method}"
        )

    # norm gate to sum 1
    # NOTE(review): `routed_scaling_factor` is applied only when the weights
    # are NOT normalized; confirm this matches the reference implementation
    # this function was adapted from.
    if self.top_k > 1 and self.norm_topk_prob:
        denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
        topk_weight = topk_weight / denominator
    else:
        topk_weight = topk_weight * self.routed_scaling_factor

    # expert-level computation auxiliary loss
    if self.training and self.alpha > 0.0:
        scores_for_aux = scores
        aux_topk = self.top_k
        # always compute aux loss based on the naive greedy topk method
        topk_idx_for_aux_loss = topk_idx.view(bsz, -1)
        if self.seq_aux:
            scores_for_seq_aux = scores_for_aux.view(bsz, seq_len, -1)
            ce = torch.zeros(bsz, self.n_routed_experts, device=hidden_states.device)
            ce.scatter_add_(
                1,
                topk_idx_for_aux_loss,
                torch.ones(bsz, seq_len * aux_topk, device=hidden_states.device),
            ).div_(seq_len * aux_topk / self.n_routed_experts)
            aux_loss = (ce * scores_for_seq_aux.mean(dim=1)).sum(
                dim=1
            ).mean() * self.alpha
        else:
            mask_ce = F.one_hot(
                topk_idx_for_aux_loss.view(-1), num_classes=self.n_routed_experts
            )
            ce = mask_ce.float().mean(0)
            pi = scores_for_aux.mean(0)
            fi = ce * self.n_routed_experts
            aux_loss = (pi * fi).sum() * self.alpha
    else:
        aux_loss = None
    return topk_idx, topk_weight, aux_loss


def _group_limited_expert_mask(group_scores, topk_group, experts_per_group):
    """Return a boolean `[n, e]` mask that is True for experts in each row's `topk_group` best groups."""
    n_tokens, n_group = group_scores.shape
    group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[
        1
    ]  # [n, top_k_group]
    group_mask = torch.zeros_like(group_scores)  # [n, n_group]
    group_mask.scatter_(1, group_idx, 1)  # [n, n_group]
    return (
        group_mask.unsqueeze(-1)
        .expand(n_tokens, n_group, experts_per_group)
        .reshape(n_tokens, -1)
        .bool()
    )  # [n, e]
|
||||
|
||||
|
||||
# from transformers main but using this requires patching private function _causal_mask etc
|
||||
def model_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    **flash_attn_kwargs: Unpack[FlashAttentionKwargs],
) -> BaseModelOutputWithPast:
    """
    Decoder-stack forward pass: embed, build the causal mask, run every layer, normalize.

    Exactly one of `input_ids` / `inputs_embeds` must be given. Options left
    as `None` (`output_attentions`, `output_hidden_states`, `use_cache`) fall
    back to the values on `self.config`. `use_cache` is forced to `False`
    when gradient checkpointing is active during training.

    Returns:
        A `BaseModelOutputWithPast` with the normalized final hidden state,
        the cache (only when `use_cache` is set), and the optional per-layer
        hidden states / attentions.

    Raises:
        ValueError: If both or neither of `input_ids` / `inputs_embeds` are
            given, or if `past_key_values` is neither `None` nor a `Cache`.
    """
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states
    if use_cache is None:
        use_cache = self.config.use_cache

    if (input_ids is None) ^ (inputs_embeds is not None):
        raise ValueError("You must specify exactly one of input_ids or inputs_embeds")

    checkpointing = self.gradient_checkpointing and self.training
    if checkpointing and use_cache:
        logger.warning_once(
            "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
        )
        use_cache = False

    # TODO (joao): remove this exception in v4.56 -- it exists for users that try to pass a legacy cache
    if past_key_values is not None and not isinstance(past_key_values, Cache):
        raise ValueError(
            "The `past_key_values` should be either a `Cache` object or `None`."
        )

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)

    if use_cache and past_key_values is None:
        past_key_values = DynamicCache()

    if cache_position is None:
        # Positions continue from whatever is already stored in the cache.
        if past_key_values is not None:
            seen = past_key_values.get_seq_length()
        else:
            seen = 0
        cache_position = torch.arange(
            seen,
            seen + inputs_embeds.shape[1],
            device=inputs_embeds.device,
        )

    if position_ids is None:
        position_ids = cache_position.unsqueeze(0)

    causal_mask = self._update_causal_mask(  # pylint: disable=protected-access
        attention_mask,
        inputs_embeds,
        cache_position,
        past_key_values,
        output_attentions,
    )

    hidden_states = inputs_embeds

    # rotary position embeddings are computed once and shared by every layer
    position_embeddings = self.rotary_emb(hidden_states, position_ids)

    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None

    for layer in self.layers[: self.config.num_hidden_layers]:
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        if self.gradient_checkpointing and self.training:
            layer_call = partial(layer.__call__, **flash_attn_kwargs)
            layer_outputs = (
                self._gradient_checkpointing_func(  # pylint: disable=protected-access
                    layer_call,
                    hidden_states,
                    causal_mask,
                    position_ids,
                    past_key_values,
                    output_attentions,
                    use_cache,
                    cache_position,
                    position_embeddings,
                )
            )
        else:
            layer_outputs = layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=position_ids,
                past_key_value=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **flash_attn_kwargs,
            )

        hidden_states = layer_outputs[0]

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # the normalized output of the last layer is recorded as well
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    returned_cache = past_key_values if use_cache else None
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=returned_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )
|
||||
|
||||
|
||||
def apply_liger_kernel_to_deepseekv3(
    base_model: str,
    trust_remote_code: bool = False,
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,  # pylint: disable=unused-argument
) -> None:
    """
    Apply Liger kernels to replace original implementation in HuggingFace DeepseekV3 models

    The modeling module is located by instantiating `base_model` under
    `init_empty_weights()` and looking up the module of the resulting class;
    the patches below are applied to that module's attributes.

    Args:
        base_model (str): Model name or path passed to
            `AutoModelForCausalLM.from_pretrained`.
        trust_remote_code (bool): Forwarded to `from_pretrained`. Default is False.
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
        **kwargs: Accepted for call-site compatibility and ignored.
    """

    assert not (
        cross_entropy and fused_linear_cross_entropy
    ), "cross_entropy and fused_linear_cross_entropy cannot both be True."

    # from transformers.models.deepseek_v3 import modeling_deepseek_v3
    from accelerate import init_empty_weights
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
    from transformers import AutoModelForCausalLM

    with init_empty_weights():
        model = AutoModelForCausalLM.from_pretrained(
            base_model, trust_remote_code=trust_remote_code or False
        )
        modeling_mod = sys.modules[model.__class__.__module__]

    # patch moe
    modeling_mod.MoEGate.forward = moe_forward

    original_model_forward = modeling_mod.DeepseekV3Model.forward

    def wrapped_model_forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[list[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
        num_items_in_batch: Optional[int] = None,  # pylint: disable=unused-argument
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        """Call the original forward, swallowing `cache_position` and `num_items_in_batch`."""
        # `original_model_forward` was read off the class, so it is a plain
        # function and the instance must be passed explicitly. The outer
        # function's `**kwargs` are deliberately not forwarded: they are
        # patch-time options, not model inputs.
        # NOTE(review): `cache_position` / `num_items_in_batch` are dropped,
        # presumably because the wrapped forward does not accept them - confirm.
        return original_model_forward(
            self,
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    # patch model forward
    modeling_mod.DeepseekV3Model.forward = wrapped_model_forward

    if rms_norm:
        modeling_mod.DeepseekV3RMSNorm = LigerRMSNorm
    if glu_activation:

        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            if intermediate_size:
                setattr(config, "intermediate_size", intermediate_size)
            return LigerSwiGLUMLP(config, **kwargs)

        modeling_mod.DeepseekV3MLP = _liger_swiglu_mlp_wrapper
    if layer_norm:
        # NOTE(review): if `modeling_mod.nn` is the shared `torch.nn` module,
        # this rebinds `LayerNorm` for every user of `torch.nn` - confirm intended.
        modeling_mod.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_mod.DeepseekV3ForCausalLM.forward = lce_forward
|
||||
@@ -3,6 +3,7 @@ Liger FLCE for llama4
|
||||
"""
|
||||
|
||||
import sys
|
||||
from copy import deepcopy
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
@@ -158,7 +159,16 @@ def apply_liger_kernel_to_llama4(
|
||||
if rms_norm:
|
||||
modeling_llama4.Llama4TextRMSNorm = LigerRMSNorm
|
||||
if glu_activation:
|
||||
modeling_llama4.Llama4TextMLP = LigerSwiGLUMLP
|
||||
|
||||
def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
|
||||
"Accepts intermediate_size to pass to LigerSwiGLUMLP"
|
||||
# clone config to avoid modifying the original
|
||||
config = deepcopy(config)
|
||||
if intermediate_size:
|
||||
setattr(config, "intermediate_size", intermediate_size)
|
||||
return LigerSwiGLUMLP(config, **kwargs)
|
||||
|
||||
modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper
|
||||
if layer_norm:
|
||||
modeling_llama4.nn.LayerNorm = LigerLayerNorm
|
||||
|
||||
|
||||
@@ -1,171 +0,0 @@
|
||||
"""Flex attention monkey patch"""
|
||||
|
||||
import sys
|
||||
from typing import Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
|
||||
def patch_flex_wrapper():
    """
    Replace transformers' `WrappedFlexAttention` with a locally defined singleton.

    The patch is applied only when torch is a 2.6 release and the installed
    transformers release is numerically below 4.51.0; otherwise this is a
    no-op.
    """
    # TODO remove this patch when transformers#37285 is merged and in a release
    is_torch_2_6 = torch.__version__.startswith("2.6")
    # Compare release numbers numerically: a plain string comparison orders
    # "4.9.0" after "4.51.0" and "4.100.0" before it.
    is_transformers_below_4_51 = _release_tuple(transformers.__version__) < (4, 51, 0)

    if not (is_torch_2_6 and is_transformers_below_4_51):
        return

    from torch.nn.attention.flex_attention import flex_attention

    class WrappedFlexAttention:
        """
        We are doing a singleton class so that flex attention is compiled once when it's first called.
        """

        _instance = None
        _is_flex_compiled = False
        _compiled_flex_attention = None

        def __new__(cls, *args, **kwargs):
            if cls._instance is None:
                # Create a new instance if one doesn't already exist
                cls._instance = super().__new__(cls)
            return cls._instance

        @torch.compiler.disable(recursive=False)
        def __init__(self):
            """
            Initialize or update the singleton instance.
            """
            # `__init__` runs on every construction; the flag stored on the
            # shared instance keeps the compile from being repeated.
            if not self._is_flex_compiled:
                self._compiled_flex_attention = torch.compile(
                    flex_attention,
                    dynamic=False,
                    mode="max-autotune-no-cudagraphs",
                    fullgraph=True,
                )
                self._is_flex_compiled = True

        def __call__(self):
            return self._compiled_flex_attention

    transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention


def _release_tuple(version):
    """Return the leading `major.minor.patch` numbers of a version string as a 3-tuple of ints."""
    numbers = []
    # Drop any local version suffix ("+cu124") before splitting.
    for piece in version.split("+")[0].split(".")[:3]:
        digits = ""
        for char in piece:
            if not char.isdigit():
                break
            digits += char
        numbers.append(int(digits) if digits else 0)
    # Pad short versions such as "4.51" so tuples compare component-wise.
    while len(numbers) < 3:
        numbers.append(0)
    return tuple(numbers)
|
||||
|
||||
|
||||
def patch_flex_make_mask():
    """
    Install a replacement `make_flex_block_causal_mask` into transformers.

    Only active when torch is a 2.6 release and transformers is exactly
    4.51.0. The replacement is written into
    `transformers.integrations.flex_attention` and into every already-imported
    module whose name contains ".modeling_" (but not "llama4") and that
    exposes a `make_flex_block_causal_mask` attribute.
    """
    if not torch.__version__.startswith("2.6"):
        return
    if transformers.__version__ != "4.51.0":
        return

    from torch.nn.attention.flex_attention import (
        BlockMask,
    )
    from torch.nn.attention.flex_attention import (
        create_block_mask as build_block_mask,
    )

    Offset = Union[torch.Tensor, int]

    def patched_make_flex_block_causal_mask(
        attention_mask_2d: torch.Tensor,
        attention_chunk_size: Optional[int] = None,
        query_length=None,
        key_length=None,
        offsets: Optional[Tuple[Offset, Offset]] = None,
    ) -> "BlockMask":
        """
        Build a block-causal document mask for packed or unpacked batches.

        The mask logic is handed to
        :func:`torch.nn.attention.flex_attention.create_block_mask`, which
        returns a compressed BlockMask representation.
        See: https://pytorch.org/blog/flexattention/

        Args:
            attention_mask_2d (torch.Tensor): Attention mask for packed and padded sequences
            of shape (batch_size, total_seq_len). e.g.

            For unpacked sequence:
            [[1, 1, 1, 1, 0, 0, 0],
             [1, 1, 1, 1, 1, 0, 0]]

            For packed sequence:
            [[1, 1, 1, 2, 2, 2, 0],
             [1, 1, 2, 2, 2, 3, 3]]

        Returns:
            BlockMask
        """
        batch_size, total_seq_len = attention_mask_2d.shape
        # Missing (or zero) lengths default to the mask's own width.
        key_length = key_length or total_seq_len
        query_length = query_length or total_seq_len

        # Right-pad the mask with `key_length` zeros.
        attention_mask_2d = torch.nn.functional.pad(
            attention_mask_2d, value=0, pad=(0, key_length)
        )
        device = attention_mask_2d.device
        document_ids = attention_mask_2d.clone()

        if attention_chunk_size is not None:
            # we create an arange, then we just // by chunk size to get [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3]
            positions = document_ids.fill_(1).cumsum(-1) - 1
            document_ids = positions // (attention_chunk_size)

        # flex attention takes a mask_mod callable rather than a mask tensor;
        # for sample packing it has to combine causal, padding and
        # same-document constraints.
        # See https://pytorch.org/blog/flexattention/#mask-mods
        def causal_mask_mod(
            batch_idx, head_idx, q_idx, kv_idx
        ):  # pylint: disable=unused-argument
            """
            Allow attention only to earlier-or-equal positions of the same
            document, and only from non-padding query positions.
            """
            same_document = (
                document_ids[batch_idx, q_idx] == document_ids[batch_idx, kv_idx]
            )
            not_padding = attention_mask_2d[batch_idx, q_idx] > 0
            is_causal = q_idx >= kv_idx  # not valid when decoding
            return is_causal & not_padding & same_document

        if offsets is None:
            mask_mod = causal_mask_mod
        else:
            q_offset = offsets[0]
            kv_offset = offsets[1]

            def mask_mod(batch_idx, head_idx, q_idx, kv_idx):
                return causal_mask_mod(
                    batch_idx, head_idx, q_idx + q_offset, kv_idx + kv_offset
                )

        return build_block_mask(
            mask_mod=mask_mod,
            B=batch_size,
            H=None,  # attention head
            Q_LEN=query_length,
            KV_LEN=key_length,
            device=device,
            _compile=True,
        )

    # Rebind the name in modeling modules that already imported the original.
    for n in tuple(sys.modules):
        if ".modeling_" not in n or "llama4" in n:
            continue
        if not hasattr(sys.modules[n], "make_flex_block_causal_mask"):
            continue
        sys.modules[n].make_flex_block_causal_mask = (
            patched_make_flex_block_causal_mask
        )

    transformers.integrations.flex_attention.make_flex_block_causal_mask = (
        patched_make_flex_block_causal_mask
    )
|
||||
@@ -906,20 +906,7 @@ class ModelLoader:
|
||||
"""
|
||||
sample packing uses custom FA2 patch
|
||||
"""
|
||||
if self.cfg.flex_attention:
|
||||
self.model_kwargs["attn_implementation"] = "flex_attention"
|
||||
self.model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||
"flex_attention"
|
||||
)
|
||||
from axolotl.monkeypatch.attention.flex_attn import (
|
||||
patch_flex_make_mask,
|
||||
patch_flex_wrapper,
|
||||
)
|
||||
|
||||
patch_flex_wrapper()
|
||||
patch_flex_make_mask()
|
||||
|
||||
elif self.cfg.flash_attention:
|
||||
if self.cfg.flash_attention:
|
||||
if not self.cfg.sample_packing and self.cfg.s2_attention:
|
||||
pass
|
||||
self.model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
|
||||
@@ -1316,8 +1316,29 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
|
||||
|
||||
if version.parse(torch_version) < version.parse("2.6.0"):
|
||||
raise ValueError(
|
||||
"Flex attention is not supported on torch version < 2.6.0"
|
||||
"Flex attention is not supported on torch version < 2.6.0."
|
||||
)
|
||||
if version.parse(torch_version) < version.parse("2.7.0"):
|
||||
LOG.warning(
|
||||
f"You are currently using torch version {torch_version}. "
|
||||
"We recommend using the latest version of torch for flex attention. "
|
||||
"You may encounter unexpected issues with flex attention on older versions of torch. "
|
||||
"Please upgrade to the latest stable, or nightly version of torch. "
|
||||
)
|
||||
|
||||
transformers_version = env_capabilities.get("transformers_version")
|
||||
if transformers_version is None:
|
||||
import transformers
|
||||
|
||||
transformers_version = str(transformers.__version__).split(
|
||||
"+", maxsplit=1
|
||||
)[0]
|
||||
|
||||
if version.parse(transformers_version) < version.parse("4.45.1"):
|
||||
raise ValueError(
|
||||
"Transformers version < 4.45.1 is not supported with flex attention. "
|
||||
)
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
|
||||
@@ -16,7 +16,7 @@ from transformers.testing_utils import get_torch_dist_unique_port
|
||||
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from tests.e2e.utils import check_tensorboard, require_torch_2_6_0
|
||||
from tests.e2e.utils import check_tensorboard, require_torch_2_6_0, require_torch_2_7_0
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
@@ -458,17 +458,11 @@ class TestMultiGPULlama:
|
||||
)
|
||||
|
||||
@require_torch_2_6_0
|
||||
@pytest.mark.parametrize(
|
||||
"attention_backend",
|
||||
["flash", "flex"],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"fsdp_reshard_after_forward",
|
||||
[True, False],
|
||||
)
|
||||
def test_fsdp2_packed(
|
||||
self, temp_dir, attention_backend, fsdp_reshard_after_forward
|
||||
):
|
||||
def test_fsdp2_packed_flash(self, temp_dir, fsdp_reshard_after_forward):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
@@ -509,13 +503,79 @@ class TestMultiGPULlama:
|
||||
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
|
||||
},
|
||||
"use_tensorboard": True,
|
||||
"flash_attention": True,
|
||||
}
|
||||
)
|
||||
if attention_backend == "flash":
|
||||
cfg.flash_attention = True
|
||||
elif attention_backend == "flex":
|
||||
cfg.flex_attention = True
|
||||
# write cfg to yaml file
|
||||
Path(temp_dir).mkdir(parents=True, exist_ok=True)
|
||||
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
|
||||
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
|
||||
|
||||
execute_subprocess_async(
|
||||
[
|
||||
"axolotl",
|
||||
"train",
|
||||
str(Path(temp_dir) / "config.yaml"),
|
||||
"--num-processes",
|
||||
"2",
|
||||
"--main-process-port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
]
|
||||
)
|
||||
|
||||
check_tensorboard(
|
||||
temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss is too high"
|
||||
)
|
||||
|
||||
@require_torch_2_7_0
|
||||
@pytest.mark.parametrize(
|
||||
"fsdp_reshard_after_forward",
|
||||
[True, False],
|
||||
)
|
||||
def test_fsdp2_packed_flex(self, temp_dir, fsdp_reshard_after_forward):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.05,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "tatsu-lab/alpaca",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"max_steps": 2,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 2,
|
||||
"gradient_checkpointing": True,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"fsdp": [
|
||||
"auto_wrap",
|
||||
],
|
||||
"fsdp_config": {
|
||||
"fsdp_version": 2,
|
||||
# "fsdp_forward_prefetch": True, # not yet implemented in accelerate
|
||||
"fsdp_offload_params": False,
|
||||
"fsdp_cpu_ram_efficient_loading": False,
|
||||
"fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
|
||||
"fsdp_state_dict_type": "SHARDED_STATE_DICT",
|
||||
"fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
|
||||
"fsdp_reshard_after_forward": fsdp_reshard_after_forward,
|
||||
},
|
||||
"use_tensorboard": True,
|
||||
"flex_attention": True,
|
||||
}
|
||||
)
|
||||
# write cfg to yaml file
|
||||
Path(temp_dir).mkdir(parents=True, exist_ok=True)
|
||||
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
|
||||
@@ -617,12 +677,6 @@ class TestMultiGPULlama:
|
||||
temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
|
||||
)
|
||||
|
||||
# TODO: remove skip once deepspeed regression is fixed
|
||||
# see https://github.com/huggingface/transformers/pull/37324
|
||||
@pytest.mark.skipif(
|
||||
transformers_version_eq("4.51.0"),
|
||||
reason="zero3 is not supported with transformers==4.51.0",
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"gradient_accumulation_steps",
|
||||
[1, 2],
|
||||
|
||||
@@ -14,7 +14,7 @@ from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config, validate_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from ..utils import check_tensorboard, require_torch_2_6_0, with_temp_dir
|
||||
from ..utils import check_tensorboard, require_torch_2_7_0, with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
@@ -25,7 +25,7 @@ class TestPackedFlex(unittest.TestCase):
|
||||
Test case for Packed training of llama models
|
||||
"""
|
||||
|
||||
@require_torch_2_6_0
|
||||
@require_torch_2_7_0
|
||||
@with_temp_dir
|
||||
def test_loss_llama(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
|
||||
@@ -33,6 +33,18 @@ def with_temp_dir(test_func):
|
||||
return wrapper
|
||||
|
||||
|
||||
def require_torch_2_7_0(test_case):
    """
    Skip the decorated test unless the installed torch is at least 2.7.0.

    The version comparison runs once, when the decorator is applied, and the
    result is handed to ``unittest.skipUnless``.
    """
    installed = version.parse(torch.__version__)
    minimum = version.parse("2.7.0")
    skip_unless_recent = unittest.skipUnless(
        installed >= minimum, "test requires torch>=2.7.0"
    )
    return skip_unless_recent(test_case)
|
||||
|
||||
|
||||
def most_recent_subdir(path):
|
||||
base_path = Path(path)
|
||||
subdirectories = [d for d in base_path.iterdir() if d.is_dir()]
|
||||
|
||||
Reference in New Issue
Block a user