clear cuda cache to help with memory leak/creep (#1858)

* clear cuda cache to help with memory leak/creep * reverse order of gc
2024-08-26 15:50:26 -04:00
parent 2dac1edf72
commit 17af1d7081
1 changed files with 11 additions and 1 deletions
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -4,6 +4,7 @@ Builder for the training args and trainer
 """
 import abc
 import gc
 import importlib
 import importlib.util
 import logging
@@ -15,11 +16,12 @@ from collections import defaultdict
 from dataclasses import dataclass, field
 from functools import wraps
 from pathlib import Path
-from typing import Dict, List, Literal, Optional, Type, Union
+from typing import Any, Dict, List, Literal, Optional, Type, Union
 import torch
 import transformers
 from datasets import Dataset
 from torch import nn
 from torch.optim.lr_scheduler import OneCycleLR
 from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
 from transformers import (
@@ -997,6 +999,14 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
                res[key] = res[key][1:]
        return res
    def training_step(
        self, model: nn.Module, inputs: Dict[str, Union[torch.Tensor, Any]]
    ) -> torch.Tensor:
        """
        Run the parent trainer's training step, then release cached memory.

        The loss is computed entirely by the parent class; this override only
        adds cleanup afterwards, intended to help with memory leak/creep over
        the course of training.

        Args:
            model: The module being trained, forwarded unchanged to the parent.
            inputs: The batch for this step, forwarded unchanged to the parent.

        Returns:
            The loss tensor produced by the parent's ``training_step``.
        """
        step_loss: torch.Tensor = super().training_step(model, inputs)

        # Order matters: run Python's garbage collector before asking the CUDA
        # allocator to drop its cache, presumably so that memory freed by the
        # collector can be released in the same pass.
        gc.collect()
        torch.cuda.empty_cache()

        return step_loss
 class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
    """