Compare commits
10 Commits
coderabbit
...
transforme
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
31723ac523 | ||
|
|
2e9e423dfd | ||
|
|
cbe61186dc | ||
|
|
2a83580bdc | ||
|
|
825f66b9fd | ||
|
|
3b44989205 | ||
|
|
811224d7b7 | ||
|
|
84a14fc604 | ||
|
|
86cf62ca46 | ||
|
|
fc54e10455 |
@@ -2,7 +2,7 @@ ARG BASE_TAG=main
|
||||
FROM axolotlai/axolotl:$BASE_TAG
|
||||
|
||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@ ARG BASE_TAG=main
|
||||
FROM axolotlai/axolotl:$BASE_TAG
|
||||
|
||||
ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
|
||||
ENV HUGGINGFACE_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
packaging==23.2
|
||||
peft==0.14.0
|
||||
transformers==4.46.3
|
||||
transformers==4.47.0
|
||||
tokenizers>=0.20.1
|
||||
bitsandbytes==0.45.0
|
||||
accelerate==1.1.0
|
||||
accelerate==1.2.0
|
||||
datasets==3.1.0
|
||||
deepspeed==0.15.4
|
||||
pydantic==2.6.3
|
||||
@@ -42,7 +42,7 @@ s3fs>=2024.5.0
|
||||
gcsfs>=2024.5.0
|
||||
# adlfs
|
||||
|
||||
trl==0.12.0
|
||||
trl==0.12.1
|
||||
zstandard==0.22.0
|
||||
fastcore
|
||||
|
||||
|
||||
@@ -442,7 +442,7 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
|
||||
"compute_capability": gpu_version,
|
||||
},
|
||||
env_capabilities={
|
||||
"torch_version": str(torch.__version__).split("+", maxsplit=1)[0]
|
||||
"torch_version": str(torch.__version__).split("+", maxsplit=1)[0],
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@@ -957,13 +957,15 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
|
||||
|
||||
return res
|
||||
|
||||
def log(self, logs: Dict[str, float]) -> None:
|
||||
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||
"""
|
||||
Log `logs` on the various objects watching training, including stored metrics.
|
||||
|
||||
Args:
|
||||
logs (`Dict[str, float]`):
|
||||
The values to log.
|
||||
start_time (`Optional[float]`):
|
||||
The start of training.
|
||||
"""
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
@@ -971,7 +973,7 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
|
||||
for key, metrics in self._stored_metrics[train_eval].items():
|
||||
logs[key] = torch.tensor(metrics).mean().item()
|
||||
del self._stored_metrics[train_eval]
|
||||
return super().log(logs)
|
||||
return super().log(logs, start_time)
|
||||
|
||||
def store_metrics(
|
||||
self, metrics: Dict[str, float], train_eval: Literal["train", "eval"] = "train"
|
||||
@@ -1155,6 +1157,18 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
|
||||
torch.cuda.empty_cache()
|
||||
return loss
|
||||
|
||||
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||
# TODO remove once trl supports the updated to the Trainer.log method
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
# Add averaged stored metrics to logs
|
||||
for key, metrics in self._stored_metrics[train_eval].items():
|
||||
logs[key] = torch.tensor(metrics).mean().item()
|
||||
del self._stored_metrics[train_eval]
|
||||
return super(DPOTrainer, self).log( # pylint: disable=bad-super-call
|
||||
logs, start_time
|
||||
)
|
||||
|
||||
|
||||
class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
|
||||
"""
|
||||
@@ -1163,6 +1177,18 @@ class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
|
||||
|
||||
tag_names = ["axolotl", "orpo"]
|
||||
|
||||
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||
# TODO remove once trl supports the updated to the Trainer.log method
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
# Add averaged stored metrics to logs
|
||||
for key, metrics in self._stored_metrics[train_eval].items():
|
||||
logs[key] = torch.tensor(metrics).mean().item()
|
||||
del self._stored_metrics[train_eval]
|
||||
return super(ORPOTrainer, self).log( # pylint: disable=bad-super-call
|
||||
logs, start_time
|
||||
)
|
||||
|
||||
|
||||
class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
|
||||
"""
|
||||
@@ -1171,6 +1197,45 @@ class AxolotlKTOTrainer(SchedulerMixin, KTOTrainer):
|
||||
|
||||
tag_names = ["axolotl", "kto"]
|
||||
|
||||
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||
# TODO remove once trl supports the updated to the Trainer.log method
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
# train metrics should have no prefix, eval should have 'eval_'
|
||||
prefix = "eval_" if train_eval == "eval" else ""
|
||||
# accumulate average metrics from sums and lengths
|
||||
for split in ["chosen", "rejected"]:
|
||||
if f"count/{split}" in self._stored_metrics[train_eval]:
|
||||
count_sum = (
|
||||
torch.Tensor(self._stored_metrics[train_eval][f"count/{split}"])
|
||||
.sum()
|
||||
.item()
|
||||
)
|
||||
for metric in ["rewards", "logps", "logits"]:
|
||||
logs[f"{prefix}{metric}/{split}"] = (
|
||||
torch.Tensor(
|
||||
self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
|
||||
)
|
||||
.sum()
|
||||
.item()
|
||||
/ count_sum
|
||||
)
|
||||
# delete obsolete metric
|
||||
del self._stored_metrics[train_eval][f"{metric}/{split}_sum"]
|
||||
del self._stored_metrics[train_eval][f"count/{split}"]
|
||||
# calculate reward margin
|
||||
if f"{prefix}rewards/chosen" in logs and f"{prefix}rewards/rejected" in logs:
|
||||
logs[f"{prefix}rewards/margins"] = (
|
||||
logs[f"{prefix}rewards/chosen"] - logs[f"{prefix}rewards/rejected"]
|
||||
)
|
||||
# Add averaged stored metrics to logs
|
||||
for key, metrics in self._stored_metrics[train_eval].items():
|
||||
logs[f"{prefix}{key}"] = torch.Tensor(metrics).mean().item()
|
||||
del self._stored_metrics[train_eval]
|
||||
return super(KTOTrainer, self).log( # pylint: disable=bad-super-call
|
||||
logs, start_time
|
||||
)
|
||||
|
||||
|
||||
class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
|
||||
"""
|
||||
@@ -1179,6 +1244,18 @@ class AxolotlCPOTrainer(SchedulerMixin, CPOTrainer):
|
||||
|
||||
tag_names = ["axolotl", "cpo"]
|
||||
|
||||
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||
# TODO remove once trl supports the updated to the Trainer.log method
|
||||
# logs either has 'loss' or 'eval_loss'
|
||||
train_eval = "train" if "loss" in logs else "eval"
|
||||
# Add averaged stored metrics to logs
|
||||
for key, metrics in self._stored_metrics[train_eval].items():
|
||||
logs[key] = torch.tensor(metrics).mean().item()
|
||||
del self._stored_metrics[train_eval]
|
||||
return super(CPOTrainer, self).log( # pylint: disable=bad-super-call
|
||||
logs, start_time
|
||||
)
|
||||
|
||||
|
||||
class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
|
||||
"""
|
||||
@@ -1187,6 +1264,12 @@ class AxolotlRewardTrainer(SchedulerMixin, RewardTrainer):
|
||||
|
||||
tag_names = ["axolotl", "reward"]
|
||||
|
||||
def log(self, logs: Dict[str, float], start_time: Optional[float] = None) -> None:
|
||||
# TODO remove once trl supports the updated to the Trainer.log method
|
||||
return super(RewardTrainer, self).log( # pylint: disable=bad-super-call
|
||||
logs, start_time
|
||||
)
|
||||
|
||||
|
||||
class TrainerBuilderBase(abc.ABC):
|
||||
"""
|
||||
|
||||
197
src/axolotl/monkeypatch/trainer_grad_accum.py
Normal file
197
src/axolotl/monkeypatch/trainer_grad_accum.py
Normal file
@@ -0,0 +1,197 @@
|
||||
"""
|
||||
fix for FSDP gradient accumulation
|
||||
see https://github.com/huggingface/transformers/pull/35128
|
||||
"""
|
||||
import inspect
|
||||
|
||||
from accelerate.logging import get_logger
|
||||
from transformers import LlamaForCausalLM
|
||||
from transformers.trainer import Trainer
|
||||
|
||||
from axolotl.monkeypatch.unsloth_ import detab_code
|
||||
|
||||
LOG = get_logger("axolotl.monkeypatch.trainer_grad_accum")
|
||||
|
||||
ORIGINAL_CONTEXT_CODE = """
|
||||
with self.compute_loss_context_manager():
|
||||
if self.model_accepts_loss_kwargs:
|
||||
loss = self.compute_loss(model, inputs)
|
||||
else:
|
||||
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
||||
"""
|
||||
|
||||
PATCHED_CONTEXT_CODE = """
|
||||
with self.compute_loss_context_manager():
|
||||
if self.model_accepts_loss_kwargs:
|
||||
loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
|
||||
else:
|
||||
loss = self.compute_loss(model, inputs)
|
||||
"""
|
||||
|
||||
ORIGINAL_LLAMA_FCLM_CODE = """
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hidden_states = outputs[0]
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
|
||||
"""
|
||||
|
||||
PATCHED_LLAMA_FCLM_CODE = """
|
||||
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
|
||||
output_hidden_states = (
|
||||
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
|
||||
)
|
||||
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
|
||||
|
||||
# remove num_items_in_batch otherwise self.model attempts to pass it to flash_attention
|
||||
num_items_in_batch = kwargs.pop("num_items_in_batch")
|
||||
|
||||
# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
|
||||
outputs = self.model(
|
||||
input_ids=input_ids,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=output_hidden_states,
|
||||
return_dict=return_dict,
|
||||
cache_position=cache_position,
|
||||
**kwargs,
|
||||
)
|
||||
hidden_states = outputs[0]
|
||||
# Only compute necessary logits, and do not upcast them to float if we are not computing the loss
|
||||
logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, num_items_in_batch=num_items_in_batch, **kwargs)
|
||||
"""
|
||||
|
||||
|
||||
def get_training_step_code() -> str:
|
||||
training_step = inspect.getsource(
|
||||
Trainer.training_step # pylint: disable=protected-access
|
||||
)
|
||||
return training_step
|
||||
|
||||
|
||||
def check_training_step_is_patchable() -> bool:
|
||||
training_step = get_training_step_code()
|
||||
training_step, _ = detab_code(training_step)
|
||||
return ORIGINAL_CONTEXT_CODE in training_step
|
||||
|
||||
|
||||
def patch_training_step_for_ga():
|
||||
"""
|
||||
monkeypatch for fixing the training loop for gradient accumulation
|
||||
"""
|
||||
|
||||
training_step = get_training_step_code()
|
||||
Trainer._original_training_step = training_step # pylint: disable=protected-access
|
||||
training_step, _ = detab_code(training_step)
|
||||
assert (
|
||||
ORIGINAL_CONTEXT_CODE in training_step
|
||||
), "Original training_step code not found"
|
||||
|
||||
training_step = training_step.replace(ORIGINAL_CONTEXT_CODE, PATCHED_CONTEXT_CODE)
|
||||
training_step = training_step.replace(
|
||||
"def training_step(",
|
||||
"def _fixed_training_step(",
|
||||
1,
|
||||
)
|
||||
|
||||
# load imports necessary
|
||||
import transformers.trainer
|
||||
|
||||
items_to_import = []
|
||||
for item in dir(transformers.trainer):
|
||||
if item in training_step:
|
||||
items_to_import.append(item)
|
||||
|
||||
exec( # pylint: disable=exec-used # nosec B102
|
||||
"from transformers.trainer import ("
|
||||
+ ", ".join(x for x in items_to_import)
|
||||
+ ")",
|
||||
globals(),
|
||||
)
|
||||
exec(training_step, globals()) # pylint: disable=exec-used # nosec B102
|
||||
LOG.info("patching training_step", main_process_only=True)
|
||||
Trainer.training_step = ( # pylint: disable=protected-access
|
||||
_fixed_training_step # pylint: disable=undefined-variable # noqa: F821
|
||||
)
|
||||
|
||||
|
||||
def get_model_forward_code() -> str:
|
||||
forward = inspect.getsource(
|
||||
LlamaForCausalLM.forward # pylint: disable=protected-access
|
||||
)
|
||||
return forward
|
||||
|
||||
|
||||
def check_forward_is_patchable() -> bool:
|
||||
forward = get_model_forward_code()
|
||||
forward, _ = detab_code(forward)
|
||||
return ORIGINAL_LLAMA_FCLM_CODE in forward
|
||||
|
||||
|
||||
def patch_forward_for_ga():
|
||||
"""
|
||||
monkeypatch for fixing the training loop for gradient accumulation
|
||||
"""
|
||||
|
||||
forward = get_model_forward_code()
|
||||
LlamaForCausalLM._original_forward = forward # pylint: disable=protected-access
|
||||
forward, _ = detab_code(forward)
|
||||
assert ORIGINAL_LLAMA_FCLM_CODE in forward, "Original forward code not found"
|
||||
|
||||
forward = forward.replace(ORIGINAL_LLAMA_FCLM_CODE, PATCHED_LLAMA_FCLM_CODE)
|
||||
forward = forward.replace(
|
||||
"def forward(",
|
||||
"def _fixed_forward(",
|
||||
1,
|
||||
)
|
||||
|
||||
# load imports necessary
|
||||
import transformers.models.llama.modeling_llama
|
||||
|
||||
items_to_import = []
|
||||
for item in dir(transformers.models.llama.modeling_llama):
|
||||
if item in forward:
|
||||
items_to_import.append(item)
|
||||
|
||||
exec( # pylint: disable=exec-used # nosec B102
|
||||
"from transformers.models.llama.modeling_llama import ("
|
||||
+ ", ".join(x for x in items_to_import)
|
||||
+ ")",
|
||||
globals(),
|
||||
)
|
||||
exec(forward, globals()) # pylint: disable=exec-used # nosec B102
|
||||
LOG.info("patching forward", main_process_only=True)
|
||||
LlamaForCausalLM.forward = ( # pylint: disable=protected-access
|
||||
_fixed_forward # pylint: disable=undefined-variable # noqa: F821
|
||||
)
|
||||
@@ -1432,6 +1432,20 @@ class AxolotlInputConfig(
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def notify_qlora_unsloth(cls, data):
|
||||
if (
|
||||
data.get("unsloth_lora_mlp")
|
||||
or data.get("unsloth_lora_qkv")
|
||||
or data.get("unsloth_lora_o")
|
||||
):
|
||||
LOG.info(
|
||||
"Unsloth may not be well supported with the latest version of Transformers, "
|
||||
"resulting in loss that is incorrect."
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_torch_compile_deepspeed(cls, data):
|
||||
|
||||
@@ -386,6 +386,15 @@ class ModelLoader:
|
||||
if self.cfg.flash_attention:
|
||||
self.patch_attention()
|
||||
|
||||
if self.cfg.model_config_type == "llama":
|
||||
from axolotl.monkeypatch.trainer_grad_accum import (
|
||||
patch_forward_for_ga,
|
||||
patch_training_step_for_ga,
|
||||
)
|
||||
|
||||
patch_forward_for_ga()
|
||||
patch_training_step_for_ga()
|
||||
|
||||
if self.cfg.sample_packing and self.cfg.s2_attention:
|
||||
raise ValueError(
|
||||
"Received `sample_packing=true` and `s2_attention=true`; however, \
|
||||
|
||||
@@ -20,6 +20,7 @@ os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
@pytest.mark.skip(reason="latest unsloth doesn't work with latest transformers")
|
||||
class TestUnslothQLoRA:
|
||||
"""
|
||||
Test class for Unsloth QLoRA Llama models
|
||||
|
||||
25
tests/patched/test_llama_trainer_ga.py
Normal file
25
tests/patched/test_llama_trainer_ga.py
Normal file
@@ -0,0 +1,25 @@
|
||||
""""Test module for checking whether the Hugging Face Transformers is working as expected."""
|
||||
import unittest
|
||||
|
||||
from axolotl.monkeypatch.trainer_grad_accum import (
|
||||
check_forward_is_patchable,
|
||||
check_training_step_is_patchable,
|
||||
)
|
||||
|
||||
|
||||
class TestTrainerGAIntegration(unittest.TestCase):
|
||||
"""llama monkeypatch integration tests."""
|
||||
|
||||
def test_train_step_patchable(self):
|
||||
# ensures the current version of transformers has loss code that matches our patching code
|
||||
self.assertTrue(
|
||||
check_training_step_is_patchable(),
|
||||
"HF transformers Trainer.training_step has changed and isn't patchable",
|
||||
)
|
||||
|
||||
def test_model_forward_patchable(self):
|
||||
# ensures the current version of transformers has loss code that matches our patching code
|
||||
self.assertTrue(
|
||||
check_forward_is_patchable(),
|
||||
"HF transformers LlamaForCausalLM.forward has changed and isn't patchable",
|
||||
)
|
||||
Reference in New Issue
Block a user