GC every n steps (#2209)
This commit is contained in:
@@ -56,6 +56,7 @@ from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
|
|||||||
from axolotl.utils import is_comet_available, is_mlflow_available
|
from axolotl.utils import is_comet_available, is_mlflow_available
|
||||||
from axolotl.utils.callbacks import (
|
from axolotl.utils.callbacks import (
|
||||||
EvalFirstStepCallback,
|
EvalFirstStepCallback,
|
||||||
|
GCCallback,
|
||||||
GPUStatsCallback,
|
GPUStatsCallback,
|
||||||
LossWatchDogCallback,
|
LossWatchDogCallback,
|
||||||
SaveAxolotlConfigtoWandBCallback,
|
SaveAxolotlConfigtoWandBCallback,
|
||||||
@@ -1452,6 +1453,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
if self.cfg.loss_watchdog_threshold is not None:
|
if self.cfg.loss_watchdog_threshold is not None:
|
||||||
callbacks.append(LossWatchDogCallback(self.cfg))
|
callbacks.append(LossWatchDogCallback(self.cfg))
|
||||||
|
|
||||||
|
if self.cfg.gc_steps:
|
||||||
|
callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
|
||||||
callbacks.append(SaveModelCallback())
|
callbacks.append(SaveModelCallback())
|
||||||
|
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|||||||
@@ -2,6 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import gc
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
@@ -842,3 +843,17 @@ class SaveModelCallback(TrainerCallback):
|
|||||||
):
|
):
|
||||||
control.should_save = True
|
control.should_save = True
|
||||||
return control
|
return control
|
||||||
|
|
||||||
|
|
||||||
|
class GCCallback(TrainerCallback):
    """Trainer callback that periodically frees Python and CUDA-cached memory.

    Whenever ``state.global_step`` is a multiple of ``gc_steps``, the callback
    runs Python's garbage collector and then asks torch to empty its CUDA
    cache.

    Args:
        gc_steps: Interval (in global steps) between collections. ``None`` or
            ``0`` disables the callback, making it a no-op.
    """

    def __init__(self, gc_steps=None):
        self.gc_steps = gc_steps

    def on_step_end(
        self, args, state, control, **kwargs  # pylint: disable=unused-argument
    ):
        """Run a collection pass when the current global step hits the interval."""
        # A falsy interval (None, the constructor default, or 0) would make the
        # modulo below raise TypeError / ZeroDivisionError; treat it as disabled.
        if not self.gc_steps:
            return

        if state.global_step % self.gc_steps == 0:
            # Collect first: objects that are only kept alive by reference
            # cycles still occupy their memory until the collector runs, so
            # emptying the cache beforehand could not release those blocks.
            gc.collect()
            torch.cuda.empty_cache()
|
||||||
|
|||||||
@@ -666,6 +666,8 @@ class AxolotlInputConfig(
|
|||||||
loss_watchdog_threshold: Optional[float] = None
|
loss_watchdog_threshold: Optional[float] = None
|
||||||
loss_watchdog_patience: Optional[int] = None
|
loss_watchdog_patience: Optional[int] = None
|
||||||
|
|
||||||
|
gc_steps: Optional[int] = None
|
||||||
|
|
||||||
bf16: Optional[Union[Literal["auto"], bool]] = "auto"
|
bf16: Optional[Union[Literal["auto"], bool]] = "auto"
|
||||||
fp16: Optional[bool] = None
|
fp16: Optional[bool] = None
|
||||||
bfloat16: Optional[bool] = None # for non-AMP cases
|
bfloat16: Optional[bool] = None # for non-AMP cases
|
||||||
|
|||||||
Reference in New Issue
Block a user