Compare commits
2 Commits
08fc7de87e ... accelerato
| Author | SHA1 | Date |
|---|---|---|
| | d260eeb57d | |
| | 5a7f007d20 | |
@@ -258,6 +258,11 @@ class TrainerBuilderBase(abc.ABC):
         bf16 = bf16 if bf16 is not None else False
         training_args_kwargs["bf16"] = bf16
 
+        if self.cfg.fp8:
+            training_args_kwargs["fp8"] = True
+            if self.cfg.fp8_enable_fsdp_float8_all_gather:
+                training_args_kwargs["enable_fsdp_float8_all_gather"] = True
+
     def _configure_scheduler(self, training_args_kwargs: dict):
         if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
             training_args_kwargs["lr_scheduler_type"] = "cosine"
@@ -584,11 +584,9 @@ class AxolotlTrainer(
         super().create_accelerator_and_postprocess()
 
-    def additional_accelerator_args(
-        self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs
-    ) -> dict[str, Any]:
-        ret_kwargs = {}
-        if fp8:
+    def build_fp8_accelerator_args(self) -> dict[str, Any]:
+        args = {}
+        if self.args.fp8:
             from accelerate.utils import AORecipeKwargs
             from torchao.float8 import Float8LinearConfig
 
@@ -596,15 +594,22 @@ class AxolotlTrainer(
             # scaling strategy. See more details here:
             # https://github.com/pytorch/ao/tree/main/torchao/float8.
             config = Float8LinearConfig(
-                enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
-                force_recompute_fp8_weight_in_bwd=enable_fsdp_float8_all_gather is True,
+                enable_fsdp_float8_all_gather=self.args.enable_fsdp_float8_all_gather,
+                force_recompute_fp8_weight_in_bwd=self.args.enable_fsdp_float8_all_gather
+                is True,
             )
 
-            ret_kwargs["mixed_precision"] = "fp8"
-            ret_kwargs["kwargs_handlers"] = [AORecipeKwargs(config=config)]  # type: ignore
+            args["mixed_precision"] = "fp8"
+            args["kwargs_handlers"] = [AORecipeKwargs(config=config)]  # type: ignore
+            os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8"
 
-        return ret_kwargs
+        return args
+
+    def _build_accelerator_args(self, **kwargs) -> dict[str, Any]:
+        args = super().build_accelerator_args(**kwargs)
+        fp8_args = self.build_fp8_accelerator_args()
+        args.update(fp8_args)
+        return args
 
     def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
         """
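The hunk above wires FP8 training through torchao's float8 recipe via Accelerate. As a rough standalone sketch (not Axolotl's code; assuming an accelerate release that ships `AORecipeKwargs` and a torchao release with `Float8LinearConfig`), the same recipe handed directly to an `Accelerator` would look like this:

```python
# Hedged sketch: build a torchao FP8 recipe and pass it to Accelerate,
# mirroring what build_fp8_accelerator_args() assembles for the Trainer.
import os

from accelerate import Accelerator
from accelerate.utils import AORecipeKwargs
from torchao.float8 import Float8LinearConfig

# Stand-in for cfg.fp8_enable_fsdp_float8_all_gather.
enable_fsdp_float8_all_gather = False

config = Float8LinearConfig(
    enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
    force_recompute_fp8_weight_in_bwd=enable_fsdp_float8_all_gather,
)

os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8"  # the diff also sets this env var
accelerator = Accelerator(
    mixed_precision="fp8",
    kwargs_handlers=[AORecipeKwargs(config=config)],
)
# model, optimizer, dataloader = accelerator.prepare(model, optimizer, dataloader)
```

With the trainer building these accelerator kwargs itself, the source-level monkeypatch removed further down in this diff is no longer required.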
@@ -263,3 +263,13 @@ class AxolotlTrainingMixins:
     dion_rank_multiple_of: int | None = field(
         default=None,
     )
 
+    fp8: bool | None = field(
+        default=None,
+        metadata={"help": "Whether to use FP8 precision for training"},
+    )
+
+    enable_fsdp_float8_all_gather: bool | None = field(
+        default=None,
+        metadata={"help": "Whether to use FSDP with FP8 precision for all_gather"},
+    )
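For illustration only, here is a minimal sketch (a hypothetical `_FP8ArgsSketch` dataclass, not the real training-args class) of how the two new fields above pair with the `training_args_kwargs` dict assembled in the first hunk:

```python
# Hedged sketch: the kwargs built by the trainer builder are splatted into a
# training-args dataclass that declares matching optional fields.
from dataclasses import dataclass, field


@dataclass
class _FP8ArgsSketch:  # hypothetical stand-in for the real training args class
    fp8: bool | None = field(
        default=None,
        metadata={"help": "Whether to use FP8 precision for training"},
    )
    enable_fsdp_float8_all_gather: bool | None = field(
        default=None,
        metadata={"help": "Whether to use FSDP with FP8 precision for all_gather"},
    )


training_args_kwargs = {"fp8": True, "enable_fsdp_float8_all_gather": True}
args = _FP8ArgsSketch(**training_args_kwargs)
assert args.fp8 and args.enable_fsdp_float8_all_gather
```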
@@ -100,7 +100,6 @@ class PatchManager:
         self._apply_fsdp_patches()
         self._apply_adapter_patches()
         self._apply_model_specific_patches()
-        self._apply_fp8_patches()
         self._apply_flash_attention_peft_patches()
         self._apply_gradient_checkpointing_patches()
         self._patch_attention()
@@ -235,17 +234,6 @@ class PatchManager:
 
         patch_kimi_model()
 
-    def _apply_fp8_patches(self):
-        """Apply patches for FP8 support."""
-        if self.cfg.fp8:
-            from axolotl.monkeypatch.trainer_accelerator_args import (
-                patch_create_accelerate_code_for_fp8,
-            )
-
-            patch_create_accelerate_code_for_fp8(
-                self.cfg.fp8_enable_fsdp_float8_all_gather
-            )
-
     def _apply_flash_attention_peft_patches(self):
         """Apply patches for Flash Attention with PEFT."""
         if self.cfg.adapter:
@@ -1,83 +0,0 @@
-"""
-allow adding additional kwargs to Accelerator init
-"""
-
-import inspect
-
-from transformers import Trainer
-
-from axolotl.monkeypatch.utils import detab_code
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-ORIGINAL_TRAINER_CODE = """
-    # create accelerator object
-    self.accelerator = Accelerator(**args)
-"""
-
-PATCHED_TRAINER_CODE = """
-    if hasattr(self, "additional_accelerator_args"):
-        additional_args = self.additional_accelerator_args(fp8=True, enable_fsdp_float8_all_gather={enable_fsdp_float8_all_gather}, **args)
-        if additional_args:
-            args.update(additional_args)
-
-    # create accelerator object
-    self.accelerator = Accelerator(**args)
-"""
-
-
-def get_create_accelerate_code() -> str:
-    training_loop = inspect.getsource(Trainer.create_accelerator_and_postprocess)
-    return training_loop
-
-
-def check_create_accelerate_code_is_patchable() -> bool:
-    create_code = get_create_accelerate_code()
-    create_code, _ = detab_code(create_code)
-    return ORIGINAL_TRAINER_CODE in create_code
-
-
-def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool):
-    """
-    Monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs.
-    """
-
-    try:
-        create_code = get_create_accelerate_code()
-    except OSError:
-        return
-    Trainer._original_create_accelerator_and_postprocess = create_code
-    create_code, _ = detab_code(create_code)
-    if ORIGINAL_TRAINER_CODE not in create_code:
-        return
-
-    patched_trainer_code = PATCHED_TRAINER_CODE.format(
-        enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather
-    )
-    create_code = create_code.replace(ORIGINAL_TRAINER_CODE, patched_trainer_code)
-    create_code = create_code.replace(
-        "def create_accelerator_and_postprocess(",
-        "def fixed_create_accelerator_and_postprocess(",
-        1,
-    )
-
-    # load imports necessary
-    import transformers.trainer
-
-    items_to_import = []
-    for item in dir(transformers.trainer):
-        if item in create_code:
-            items_to_import.append(item)
-
-    exec(
-        "from transformers.trainer import ("
-        + ", ".join(x for x in items_to_import)
-        + ")",
-        globals(),
-    )
-    exec(create_code, globals())
-    LOG.info("patching create_accelerator_and_postprocess to allow for overrides")
-    Trainer.create_accelerator_and_postprocess = (
-        fixed_create_accelerator_and_postprocess
-    )
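For context, the deleted module patched `Trainer.create_accelerator_and_postprocess` by rewriting its source at runtime: fetch the source with `inspect`, splice in extra code with a string replace, `exec` the result, and rebind the new function on the class. A minimal, self-contained sketch of that general technique (illustrative names only, not Axolotl's or transformers' code):

```python
# Hedged sketch of source-rewriting monkeypatching, as used by the removed module.
import inspect
import textwrap


class Greeter:
    def greet(self):
        message = "hello"
        return message


ORIGINAL = 'message = "hello"'
PATCHED = 'message = "hello, patched"'

# Grab the method's source, dedent it, and splice in the replacement line.
source = textwrap.dedent(inspect.getsource(Greeter.greet))
source = source.replace(ORIGINAL, PATCHED)
source = source.replace("def greet(", "def patched_greet(", 1)

# Compile the rewritten function and rebind it on the class.
namespace: dict = {}
exec(source, globals(), namespace)
Greeter.greet = namespace["patched_greet"]

print(Greeter().greet())  # -> hello, patched
```

This only works when the class lives in a real source file (`inspect.getsource` raises `OSError` otherwise), which is why the deleted patch wrapped the source lookup in a try/except.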