move the setting of PYTORCH_CUDA_ALLOC_CONF to the cli rather than train module (#2183) [skip ci]

* move the setting of PYTORCH_CUDA_ALLOC_CONF to the cli rather than train module * move set_pytorch_cuda_alloc_conf to a different module to have fewer loaded dependencies for the CLI
2024-12-17 13:56:48 -05:00
parent 1c14c4a15c
commit 8ddc18ec8d
5 changed files with 17 additions and 17 deletions
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -13,6 +13,7 @@ from axolotl.cli.utils import (
    fetch_from_github,
 )
 from axolotl.common.cli import EvaluateCliArgs, PreprocessCliArgs, TrainerCliArgs
 from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig
@@ -48,6 +49,9 @@ def train(config: str, accelerate: bool, **kwargs):
    """Train or fine-tune a model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    # Enable expandable segments for cuda allocation to improve VRAM usage
    set_pytorch_cuda_alloc_conf()
    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
        if config:
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -12,9 +12,10 @@ from accelerate.logging import get_logger
 from axolotl.common.cli import TrainerCliArgs
 from axolotl.logging_config import configure_logging
 from axolotl.train import TrainDatasetMeta
 from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
-from axolotl.utils.trainer import set_pytorch_cuda_alloc_conf, setup_trainer
+from axolotl.utils.trainer import setup_trainer
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 src_dir = os.path.join(project_root, "src")
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -24,7 +24,7 @@ from axolotl.logging_config import configure_logging
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.freeze import freeze_layers_except
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
-from axolotl.utils.trainer import set_pytorch_cuda_alloc_conf, setup_trainer
+from axolotl.utils.trainer import setup_trainer
 try:
    from optimum.bettertransformer import BetterTransformer
@@ -53,9 +53,6 @@ class TrainDatasetMeta:
 def train(
    *, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
 ) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
    # Enable expandable segments for cuda allocation to improve VRAM usage
    set_pytorch_cuda_alloc_conf()
    # Load tokenizer
    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
--- a/src/axolotl/utils/init.py
+++ b/src/axolotl/utils/init.py
@@ -3,6 +3,7 @@ Basic utils for Axolotl
 """
 import importlib.util
 import os
 import re
 import torch
@@ -33,4 +34,12 @@ def get_pytorch_version() -> tuple[int, int, int]:
    return major, minor, patch
-# pylint: enable=duplicate-code
+def set_pytorch_cuda_alloc_conf():
    """Set up CUDA allocation config if using PyTorch >= 2.2"""
    torch_version = torch.__version__.split(".")
    torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
    if torch_major == 2 and torch_minor >= 2:
        if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
            os.environ[
                "PYTORCH_CUDA_ALLOC_CONF"
            ] = "expandable_segments:True,roundup_power2_divisions:16"
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -512,17 +512,6 @@ def prepare_opinionated_env(cfg):
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
 def set_pytorch_cuda_alloc_conf():
    """Set up CUDA allocation config if using PyTorch >= 2.2"""
    torch_version = torch.__version__.split(".")
    torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
    if torch_major == 2 and torch_minor >= 2:
        if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
            os.environ[
                "PYTORCH_CUDA_ALLOC_CONF"
            ] = "expandable_segments:True,roundup_power2_divisions:16"
 def setup_trainer(
    cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
 ):