Compare commits
10 Commits
version-de
...
transforme
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
31723ac523 | ||
|
|
2e9e423dfd | ||
|
|
cbe61186dc | ||
|
|
2a83580bdc | ||
|
|
825f66b9fd | ||
|
|
3b44989205 | ||
|
|
811224d7b7 | ||
|
|
84a14fc604 | ||
|
|
86cf62ca46 | ||
|
|
fc54e10455 |
@@ -2,7 +2,7 @@ ARG BASE_TAG=main
|
|||||||
FROM axolotlai/axolotl:$BASE_TAG
|
FROM axolotlai/axolotl:$BASE_TAG
|
||||||
|
|
||||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||||
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||||
|
|
||||||
|
|||||||
@@ -2,7 +2,7 @@ ARG BASE_TAG=main
|
|||||||
FROM axolotlai/axolotl:$BASE_TAG
|
FROM axolotlai/axolotl:$BASE_TAG
|
||||||
|
|
||||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||||
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||||
|
|
||||||
|
|||||||
@@ -1,10 +1,10 @@
|
|||||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||||
packaging==23.2
|
packaging==23.2
|
||||||
peft==0.14.0
|
peft==0.14.0
|
||||||
transformers==4.46.3
|
transformers==4.47.0
|
||||||
tokenizers>=0.20.1
|
tokenizers>=0.20.1
|
||||||
bitsandbytes==0.45.0
|
bitsandbytes==0.45.0
|
||||||
accelerate==1.1.0
|
accelerate==1.2.0
|
||||||
datasets==3.1.0
|
datasets==3.1.0
|
||||||
deepspeed==0.15.4
|
deepspeed==0.15.4
|
||||||
pydantic==2.6.3
|
pydantic==2.6.3
|
||||||
@@ -42,7 +42,7 @@ s3fs>=2024.5.0
|
|||||||
gcsfs>=2024.5.0
|
gcsfs>=2024.5.0
|
||||||
# adlfs
|
# adlfs
|
||||||
|
|
||||||
trl==0.12.0
|
trl==0.12.1
|
||||||
zstandard==0.22.0
|
zstandard==0.22.0
|
||||||
fastcore
|
fastcore
|
||||||
|
|
||||||
|
|||||||
@@ -442,7 +442,7 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
|
|||||||
"compute_capability": gpu_version,
|
"compute_capability": gpu_version,
|
||||||
},
|
},
|
||||||
env_capabilities={
|
env_capabilities={
|
||||||
"torch_version": str(torch.__version__).split("+", maxsplit=1)[0]
|
"torch_version": str(torch.__version__).split("+", maxsplit=1)[0],
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -957,13 +957,15 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
|
|||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
def log(self, logs: Dict[str, float]) -> None:
|
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||||
"""
|
"""
|
||||||
Log `logs` on the various objects watching training, including stored metrics.
|
Log `logs` on the various objects watching training, including stored metrics.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
logs (`Dict[str, float]`):
|
logs (`Dict[str, float]`):
|
||||||
The values to log.
|
The values to log.
|
||||||
|
start_time (`Optional[float]`):
|
||||||
|
The start of training.
|
||||||
"""
|
"""
|
||||||
# logs either has 'loss' or 'eval_loss'
|
# logs either has 'loss' or 'eval_loss'
|
||||||
train_eval = "train" if "loss" in logs else "eval"
|
train_eval = "train" if "loss" in logs else "eval"
|
||||||
@@ -971,7 +973,7 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
|
|||||||
for key, metrics in self._stored_metrics[train_eval].items():
|
for key, metrics in self._stored_metrics[train_eval].items():
|
||||||
logs[key] = torch.tensor(metrics).mean().item()
|
logs[key] = torch.tensor(metrics).mean().item()
|
||||||
del self._stored_metrics[train_eval]
|
del self._stored_metrics[train_eval]
|
||||||
return super().log(logs)
|
return super().log(logs, start_time)
|
||||||
|
|
||||||
def store_metrics(
|
def store_metrics(
|
||||||
self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train"
|
self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train"
|
||||||
@@ -1155,6 +1157,18 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
|
|||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
|
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||||
|
# TODO remove once trl supports the updated to the Trainer.log method
|
||||||
|
# logs either has 'loss' or 'eval_loss'
|
||||||
|
train_eval = "train" if "loss" in logs else "eval"
|
||||||
|
# Add averaged stored metrics to logs
|
||||||
|
for key, metrics in self._stored_metrics[train_eval].items():
|
||||||
|
logs[key] = torch.tensor(metrics).mean().item()
|
||||||
|
del self._stored_metrics[train_eval]
|
||||||
|
return super(DPOTrainer, self).log( # pylint: disable=bad-super-call
|
||||||
|
logs, start_time
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
|
class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
|
||||||
"""
|
"""
|
||||||
@@ -1163,6 +1177,18 @@ class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
|
|||||||
|
|
||||||
tag_names = ["axolotl", "orpo"]
|
tag_names = ["axolotl", "orpo"]
|
||||||
|
|
||||||
|
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||||
|
# TODO remove once trl supports the updated to the Trainer.log method
|
||||||
|
# logs either has 'loss' or 'eval_loss'
|
||||||
|
train_eval = "train" if "loss" in logs else "eval"
|
||||||
|
# Add averaged stored metrics to logs
|
||||||
|
for key, metrics in self._stored_metrics[train_eval].items():
|
||||||
|
logs[key] = torch.tensor(metrics).mean().item()
|
||||||
|
del self._stored_metrics[train_eval]
|
||||||
|
return super(ORPOTrainer, self).log( # pylint: disable=bad-super-call
|
||||||
|
logs, start_time
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
|
class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
|
||||||
"""
|
"""
|
||||||
@@ -1171,6 +1197,45 @@ class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
|
|||||||
|
|
||||||
tag_names = ["axolotl", "kto"]
|
tag_names = ["axolotl", "kto"]
|
||||||
|
|
||||||
|
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||||
|
# TODO remove once trl supports the updated to the Trainer.log method
|
||||||
|
# logs either has 'loss' or 'eval_loss'
|
||||||
|
train_eval = "train" if "loss" in logs else "eval"
|
||||||
|
# train metrics should have no prefix, eval should have 'eval_'
|
||||||
|
prefix = "eval_" if train_eval == "eval" else ""
|
||||||
|
# accumulate average metrics from sums and lengths
|
||||||
|
for split in ["chosen", "rejected"]:
|
||||||
|
if f"count/{split}" in self._stored_metrics[train_eval]:
|
||||||
|
count_sum = (
|
||||||
|
torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"])
|
||||||
|
.sum()
|
||||||
|
.item()
|
||||||
|
)
|
||||||
|
for metric in ["rewards", "logps", "logits"]:
|
||||||
|
logs[f"{prefix}{metric}/{split}"] = (
|
||||||
|
torch.Tensor(
|
||||||
|
self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
|
||||||
|
)
|
||||||
|
.sum()
|
||||||
|
.item()
|
||||||
|
/ count_sum
|
||||||
|
)
|
||||||
|
# delete obsolete metric
|
||||||
|
del self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
|
||||||
|
del self._stored_metrics[train_eval][f"count/{split}"]
|
||||||
|
# calculate reward margin
|
||||||
|
if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs:
|
||||||
|
logs[f"{prefix}rewards/margins"] = (
|
||||||
|
logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"]
|
||||||
|
)
|
||||||
|
# Add averaged stored metrics to logs
|
||||||
|
for key, metrics in self._stored_metrics[train_eval].items():
|
||||||
|
logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item()
|
||||||
|
del self._stored_metrics[train_eval]
|
||||||
|
return super(KTOTrainer, self).log( # pylint: disable=bad-super-call
|
||||||
|
logs, start_time
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
|
class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
|
||||||
"""
|
"""
|
||||||
@@ -1179,6 +1244,18 @@ class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
|
|||||||
|
|
||||||
tag_names = ["axolotl", "cpo"]
|
tag_names = ["axolotl", "cpo"]
|
||||||
|
|
||||||
|
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||||
|
# TODO remove once trl supports the updated to the Trainer.log method
|
||||||
|
# logs either has 'loss' or 'eval_loss'
|
||||||
|
train_eval = "train" if "loss" in logs else "eval"
|
||||||
|
# Add averaged stored metrics to logs
|
||||||
|
for key, metrics in self._stored_metrics[train_eval].items():
|
||||||
|
logs[key] = torch.tensor(metrics).mean().item()
|
||||||
|
del self._stored_metrics[train_eval]
|
||||||
|
return super(CPOTrainer, self).log( # pylint: disable=bad-super-call
|
||||||
|
logs, start_time
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
|
class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
|
||||||
"""
|
"""
|
||||||
@@ -1187,6 +1264,12 @@ class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
|
|||||||
|
|
||||||
tag_names = ["axolotl", "reward"]
|
tag_names = ["axolotl", "reward"]
|
||||||
|
|
||||||
|
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||||
|
# TODO remove once trl supports the updated to the Trainer.log method
|
||||||
|
return super(RewardTrainer, self).log( # pylint: disable=bad-super-call
|
||||||
|
logs, start_time
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TrainerBuilderBase(abc.ABC):
|
class TrainerBuilderBase(abc.ABC):
|
||||||
"""
|
"""
|
||||||
|
|||||||
197
src/axolotl/monkeypatch/trainer_grad_accum.py
Normal file
197
src/axolotl/monkeypatch/trainer_grad_accum.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
"""
|
||||||
|
fix for FSDP gradient accumulation
|
||||||
|
see https://github.com/huggingface/transformers/pull/35128
|
||||||
|
"""
|
||||||
|
import inspect
|
||||||
|
|
||||||
|
from accelerate.logging import get_logger
|
||||||
|
from transformers import LlamaForCausalLM
|
||||||
|
from transformers.trainer import Trainer
|
||||||
|
|
||||||
|
from axolotl.monkeypatch.unsloth_ import detab_code
|
||||||
|
|
||||||
|
LOG = get_logger("axolotl.monkeypatch.trainer_grad_accum")
|
||||||
|
|
||||||
|
ORIGINAL_CONTEXT_CODE = """
|
||||||
|
with self.compute_loss_context_manager():
|
||||||
|
if self.model_accepts_loss_kwargs:
|
||||||
|
loss = self.compute_loss(model, inputs)
|
||||||
|
else:
|
||||||
|
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
||||||
|
"""
|
||||||
|
|
||||||
|
PATCHED_CONTEXT_CODE = """
|
||||||
|
with self.compute_loss_context_manager():
|
||||||
|
if self.model_accepts_loss_kwargs:
|
||||||
|
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
||||||
|
else:
|
||||||
|
loss = self.compute_loss(model, inputs)
|
||||||
|
"""
|
||||||
|
|
||||||
|
ORIGINAL_LLAMA_FCLM_CODE = """
|
||||||
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
|
output_hidden_states = (
|
||||||
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||||
|
)
|
||||||
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|
||||||
|
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||||
|
outputs = self.model(
|
||||||
|
input_ids=input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
position_ids=position_ids,
|
||||||
|
past_key_values=past_key_values,
|
||||||
|
inputs_embeds=inputs_embeds,
|
||||||
|
use_cache=use_cache,
|
||||||
|
output_attentions=output_attentions,
|
||||||
|
output_hidden_states=output_hidden_states,
|
||||||
|
return_dict=return_dict,
|
||||||
|
cache_position=cache_position,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
|
||||||
|
hidden_states = outputs[0]
|
||||||
|
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||||
|
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||||
|
|
||||||
|
loss = None
|
||||||
|
if labels is not None:
|
||||||
|
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||||
|
"""
|
||||||
|
|
||||||
|
PATCHED_LLAMA_FCLM_CODE = """
|
||||||
|
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||||
|
output_hidden_states = (
|
||||||
|
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||||
|
)
|
||||||
|
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||||
|
|
||||||
|
# remove num_items_in_batch otherwise self.model attempts to pass it to flash_attention
|
||||||
|
num_items_in_batch = kwargs.pop("num_items_in_batch")
|
||||||
|
|
||||||
|
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||||
|
outputs = self.model(
|
||||||
|
input_ids=input_ids,
|
||||||
|
attention_mask=attention_mask,
|
||||||
|
position_ids=position_ids,
|
||||||
|
past_key_values=past_key_values,
|
||||||
|
inputs_embeds=inputs_embeds,
|
||||||
|
use_cache=use_cache,
|
||||||
|
output_attentions=output_attentions,
|
||||||
|
output_hidden_states=output_hidden_states,
|
||||||
|
return_dict=return_dict,
|
||||||
|
cache_position=cache_position,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
hidden_states = outputs[0]
|
||||||
|
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||||
|
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||||
|
|
||||||
|
loss = None
|
||||||
|
if labels is not None:
|
||||||
|
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, num_items_in_batch=num_items_in_batch, **kwargs)
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def get_training_step_code() -> str:
|
||||||
|
training_step = inspect.getsource(
|
||||||
|
Trainer.training_step # pylint: disable=protected-access
|
||||||
|
)
|
||||||
|
return training_step
|
||||||
|
|
||||||
|
|
||||||
|
def check_training_step_is_patchable() -> bool:
|
||||||
|
training_step = get_training_step_code()
|
||||||
|
training_step, _ = detab_code(training_step)
|
||||||
|
return ORIGINAL_CONTEXT_CODE in training_step
|
||||||
|
|
||||||
|
|
||||||
|
def patch_training_step_for_ga():
|
||||||
|
"""
|
||||||
|
monkeypatch for fixing the training loop for gradient accumulation
|
||||||
|
"""
|
||||||
|
|
||||||
|
training_step = get_training_step_code()
|
||||||
|
Trainer._original_training_step = training_step # pylint: disable=protected-access
|
||||||
|
training_step, _ = detab_code(training_step)
|
||||||
|
assert (
|
||||||
|
ORIGINAL_CONTEXT_CODE in training_step
|
||||||
|
), "Original training_step code not found"
|
||||||
|
|
||||||
|
training_step = training_step.replace(ORIGINAL_CONTEXT_CODE, PATCHED_CONTEXT_CODE)
|
||||||
|
training_step = training_step.replace(
|
||||||
|
"def training_step(",
|
||||||
|
"def _fixed_training_step(",
|
||||||
|
1,
|
||||||
|
)
|
||||||
|
|
||||||
|
# load imports necessary
|
||||||
|
import transformers.trainer
|
||||||
|
|
||||||
|
items_to_import = []
|
||||||
|
for item in dir(transformers.trainer):
|
||||||
|
if item in training_step:
|
||||||
|
items_to_import.append(item)
|
||||||
|
|
||||||
|
exec( # pylint: disable=exec-used # nosec B102
|
||||||
|
"from transformers.trainer import ("
|
||||||
|
+ ", ".join(x for x in items_to_import)
|
||||||
|
+ ")",
|
||||||
|
globals(),
|
||||||
|
)
|
||||||
|
exec(training_step, globals()) # pylint: disable=exec-used # nosec B102
|
||||||
|
LOG.info("patching training_step", main_process_only=True)
|
||||||
|
Trainer.training_step = ( # pylint: disable=protected-access
|
||||||
|
_fixed_training_step # pylint: disable=undefined-variable # noqa: F821
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_model_forward_code() -> str:
|
||||||
|
forward = inspect.getsource(
|
||||||
|
LlamaForCausalLM.forward # pylint: disable=protected-access
|
||||||
|
)
|
||||||
|
return forward
|
||||||
|
|
||||||
|
|
||||||
|
def check_forward_is_patchable() -> bool:
|
||||||
|
forward = get_model_forward_code()
|
||||||
|
forward, _ = detab_code(forward)
|
||||||
|
return ORIGINAL_LLAMA_FCLM_CODE in forward
|
||||||
|
|
||||||
|
|
||||||
|
def patch_forward_for_ga():
|
||||||
|
"""
|
||||||
|
monkeypatch for fixing the training loop for gradient accumulation
|
||||||
|
"""
|
||||||
|
|
||||||
|
forward = get_model_forward_code()
|
||||||
|
LlamaForCausalLM._original_forward = forward # pylint: disable=protected-access
|
||||||
|
forward, _ = detab_code(forward)
|
||||||
|
assert ORIGINAL_LLAMA_FCLM_CODE in forward, "Original forward code not found"
|
||||||
|
|
||||||
|
forward = forward.replace(ORIGINAL_LLAMA_FCLM_CODE, PATCHED_LLAMA_FCLM_CODE)
|
||||||
|
forward = forward.replace(
|
||||||
|
"def forward(",
|
||||||
|
"def _fixed_forward(",
|
||||||
|
1,
|
||||||
|
)
|
||||||
|
|
||||||
|
# load imports necessary
|
||||||
|
import transformers.models.llama.modeling_llama
|
||||||
|
|
||||||
|
items_to_import = []
|
||||||
|
for item in dir(transformers.models.llama.modeling_llama):
|
||||||
|
if item in forward:
|
||||||
|
items_to_import.append(item)
|
||||||
|
|
||||||
|
exec( # pylint: disable=exec-used # nosec B102
|
||||||
|
"from transformers.models.llama.modeling_llama import ("
|
||||||
|
+ ", ".join(x for x in items_to_import)
|
||||||
|
+ ")",
|
||||||
|
globals(),
|
||||||
|
)
|
||||||
|
exec(forward, globals()) # pylint: disable=exec-used # nosec B102
|
||||||
|
LOG.info("patching forward", main_process_only=True)
|
||||||
|
LlamaForCausalLM.forward = ( # pylint: disable=protected-access
|
||||||
|
_fixed_forward # pylint: disable=undefined-variable # noqa: F821
|
||||||
|
)
|
||||||
@@ -1432,6 +1432,20 @@ class AxolotlInputConfig(
|
|||||||
)
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
@model_validator(mode="before")
|
||||||
|
@classmethod
|
||||||
|
def notify_qlora_unsloth(cls, data):
|
||||||
|
if (
|
||||||
|
data.get("unsloth_lora_mlp")
|
||||||
|
or data.get("unsloth_lora_qkv")
|
||||||
|
or data.get("unsloth_lora_o")
|
||||||
|
):
|
||||||
|
LOG.info(
|
||||||
|
"Unsloth may not be well supported with the latest version of Transformers, "
|
||||||
|
"resulting in loss that is incorrect."
|
||||||
|
)
|
||||||
|
return data
|
||||||
|
|
||||||
@model_validator(mode="before")
|
@model_validator(mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
def check_torch_compile_deepspeed(cls, data):
|
def check_torch_compile_deepspeed(cls, data):
|
||||||
|
|||||||
@@ -386,6 +386,15 @@ class ModelLoader:
|
|||||||
if self.cfg.flash_attention:
|
if self.cfg.flash_attention:
|
||||||
self.patch_attention()
|
self.patch_attention()
|
||||||
|
|
||||||
|
if self.cfg.model_config_type == "llama":
|
||||||
|
from axolotl.monkeypatch.trainer_grad_accum import (
|
||||||
|
patch_forward_for_ga,
|
||||||
|
patch_training_step_for_ga,
|
||||||
|
)
|
||||||
|
|
||||||
|
patch_forward_for_ga()
|
||||||
|
patch_training_step_for_ga()
|
||||||
|
|
||||||
if self.cfg.sample_packing and self.cfg.s2_attention:
|
if self.cfg.sample_packing and self.cfg.s2_attention:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Received `sample_packing=true` and `s2_attention=true`; however, \
|
"Received `sample_packing=true` and `s2_attention=true`; however, \
|
||||||
|
|||||||
@@ -20,6 +20,7 @@ os.environ["WANDB_DISABLED"] = "true"
|
|||||||
|
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
|
@pytest.mark.skip(reason="latest unsloth doesn't work with latest transformers")
|
||||||
class TestUnslothQLoRA:
|
class TestUnslothQLoRA:
|
||||||
"""
|
"""
|
||||||
Test class for Unsloth QLoRA Llama models
|
Test class for Unsloth QLoRA Llama models
|
||||||
|
|||||||
25
tests/patched/test_llama_trainer_ga.py
Normal file
25
tests/patched/test_llama_trainer_ga.py
Normal file
@@ -0,0 +1,25 @@
|
|||||||
|
""""Test module for checking whether the Hugging Face Transformers is working as expected."""
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from axolotl.monkeypatch.trainer_grad_accum import (
|
||||||
|
check_forward_is_patchable,
|
||||||
|
check_training_step_is_patchable,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestTrainerGAIntegration(unittest.TestCase):
|
||||||
|
"""llama monkeypatch integration tests."""
|
||||||
|
|
||||||
|
def test_train_step_patchable(self):
|
||||||
|
# ensures the current version of transformers has loss code that matches our patching code
|
||||||
|
self.assertTrue(
|
||||||
|
check_training_step_is_patchable(),
|
||||||
|
"HF transformers Trainer.training_step has changed and isn't patchable",
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_model_forward_patchable(self):
|
||||||
|
# ensures the current version of transformers has loss code that matches our patching code
|
||||||
|
self.assertTrue(
|
||||||
|
check_forward_is_patchable(),
|
||||||
|
"HF transformers LlamaForCausalLM.forward has changed and isn't patchable",
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user