Move the setting of PYTORCH_CUDA_ALLOC_CONF to the CLI rather than the train module (#2183) [skip ci]
* Move the setting of PYTORCH_CUDA_ALLOC_CONF to the CLI rather than the train module.
* Move set_pytorch_cuda_alloc_conf to a different module so that the CLI loads fewer dependencies.
This commit is contained in:
@@ -13,6 +13,7 @@ from axolotl.cli.utils import (
|
|||||||
fetch_from_github,
|
fetch_from_github,
|
||||||
)
|
)
|
||||||
from axolotl.common.cli import EvaluateCliArgs, PreprocessCliArgs, TrainerCliArgs
|
from axolotl.common.cli import EvaluateCliArgs, PreprocessCliArgs, TrainerCliArgs
|
||||||
|
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||||
from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig
|
from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig
|
||||||
|
|
||||||
|
|
||||||
@@ -48,6 +49,9 @@ def train(config: str, accelerate: bool, **kwargs):
|
|||||||
"""Train or fine-tune a model."""
|
"""Train or fine-tune a model."""
|
||||||
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
||||||
|
|
||||||
|
# Enable expandable segments for cuda allocation to improve VRAM usage
|
||||||
|
set_pytorch_cuda_alloc_conf()
|
||||||
|
|
||||||
if accelerate:
|
if accelerate:
|
||||||
base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
|
base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
|
||||||
if config:
|
if config:
|
||||||
|
|||||||
@@ -12,9 +12,10 @@ from accelerate.logging import get_logger
|
|||||||
from axolotl.common.cli import TrainerCliArgs
|
from axolotl.common.cli import TrainerCliArgs
|
||||||
from axolotl.logging_config import configure_logging
|
from axolotl.logging_config import configure_logging
|
||||||
from axolotl.train import TrainDatasetMeta
|
from axolotl.train import TrainDatasetMeta
|
||||||
|
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.models import load_model, load_processor, load_tokenizer
|
from axolotl.utils.models import load_model, load_processor, load_tokenizer
|
||||||
from axolotl.utils.trainer import set_pytorch_cuda_alloc_conf, setup_trainer
|
from axolotl.utils.trainer import setup_trainer
|
||||||
|
|
||||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
src_dir = os.path.join(project_root, "src")
|
src_dir = os.path.join(project_root, "src")
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from axolotl.logging_config import configure_logging
|
|||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.freeze import freeze_layers_except
|
from axolotl.utils.freeze import freeze_layers_except
|
||||||
from axolotl.utils.models import load_model, load_processor, load_tokenizer
|
from axolotl.utils.models import load_model, load_processor, load_tokenizer
|
||||||
from axolotl.utils.trainer import set_pytorch_cuda_alloc_conf, setup_trainer
|
from axolotl.utils.trainer import setup_trainer
|
||||||
|
|
||||||
try:
|
try:
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
@@ -53,9 +53,6 @@ class TrainDatasetMeta:
|
|||||||
def train(
|
def train(
|
||||||
*, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
|
*, cfg: DictDefault, cli_args: TrainerCliArgs, dataset_meta: TrainDatasetMeta
|
||||||
) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
|
) -> Tuple[Union[PeftModel, PreTrainedModel], PreTrainedTokenizer]:
|
||||||
# Enable expandable segments for cuda allocation to improve VRAM usage
|
|
||||||
set_pytorch_cuda_alloc_conf()
|
|
||||||
|
|
||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
LOG.debug(
|
LOG.debug(
|
||||||
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
|
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ Basic utils for Axolotl
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
import importlib.util
|
import importlib.util
|
||||||
|
import os
|
||||||
import re
|
import re
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
@@ -33,4 +34,12 @@ def get_pytorch_version() -> tuple[int, int, int]:
|
|||||||
return major, minor, patch
|
return major, minor, patch
|
||||||
|
|
||||||
|
|
||||||
# pylint: enable=duplicate-code
|
def set_pytorch_cuda_alloc_conf():
|
||||||
|
"""Set up CUDA allocation config if using PyTorch >= 2.2"""
|
||||||
|
torch_version = torch.__version__.split(".")
|
||||||
|
torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
|
||||||
|
if torch_major == 2 and torch_minor >= 2:
|
||||||
|
if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
|
||||||
|
os.environ[
|
||||||
|
"PYTORCH_CUDA_ALLOC_CONF"
|
||||||
|
] = "expandable_segments:True,roundup_power2_divisions:16"
|
||||||
|
|||||||
@@ -512,17 +512,6 @@ def prepare_opinionated_env(cfg):
|
|||||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||||
|
|
||||||
|
|
||||||
def set_pytorch_cuda_alloc_conf():
|
|
||||||
"""Set up CUDA allocation config if using PyTorch >= 2.2"""
|
|
||||||
torch_version = torch.__version__.split(".")
|
|
||||||
torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
|
|
||||||
if torch_major == 2 and torch_minor >= 2:
|
|
||||||
if os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None:
|
|
||||||
os.environ[
|
|
||||||
"PYTORCH_CUDA_ALLOC_CONF"
|
|
||||||
] = "expandable_segments:True,roundup_power2_divisions:16"
|
|
||||||
|
|
||||||
|
|
||||||
def setup_trainer(
|
def setup_trainer(
|
||||||
cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
|
cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
|
||||||
):
|
):
|
||||||
|
|||||||
Reference in New Issue
Block a user