Compare commits
2 Commits
activation
...
fix/vllm-v
| Author | SHA1 | Date | |
|---|---|---|---|
| | 3474a9df88 | | |
| | f6151ce5cb | | |
@@ -51,7 +51,7 @@ Features:
|
||||
|
||||
- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
|
||||
- Python 3.11
|
||||
- PyTorch ≥2.4.1
|
||||
- PyTorch ≥2.5.1
|
||||
|
||||
### Installation
|
||||
|
||||
|
||||
4
setup.py
4
setup.py
@@ -67,13 +67,11 @@ def parse_requirements(extras_require_map):
|
||||
if (major, minor) >= (2, 7):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
# _install_requires.append("xformers==0.0.29.post3") # xformers seems to be hard pinned to 2.6.0
|
||||
extras_require_map["vllm"] = ["vllm==0.8.5"]
|
||||
elif (major, minor) >= (2, 6):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append(
|
||||
"xformers==0.0.29.post2"
|
||||
) # vllm needs post2 w torch 2.6
|
||||
extras_require_map["vllm"] = ["vllm==0.8.5"]
|
||||
elif (major, minor) >= (2, 5):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
@@ -147,7 +145,7 @@ extras_require = {
|
||||
"ray[train]",
|
||||
],
|
||||
"vllm": [
|
||||
"vllm==0.7.2",
|
||||
"vllm==0.8.5",
|
||||
],
|
||||
"llmcompressor": [
|
||||
"llmcompressor==0.5.1",
|
||||
|
||||
@@ -15,7 +15,7 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
|
||||
from axolotl.cli.config import load_cfg
|
||||
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
||||
from axolotl.evaluate import evaluate
|
||||
from axolotl.utils import patch_optimized_env
|
||||
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
LOG = logging.getLogger(__name__)
|
||||
@@ -32,7 +32,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
|
||||
cli_args: CLI arguments.
|
||||
"""
|
||||
# Enable expandable segments for cuda allocation to improve VRAM usage
|
||||
patch_optimized_env()
|
||||
set_pytorch_cuda_alloc_conf()
|
||||
|
||||
# pylint: disable=duplicate-code
|
||||
print_axolotl_text_art()
|
||||
|
||||
@@ -29,7 +29,7 @@ from axolotl.cli.utils import (
|
||||
filter_none_kwargs,
|
||||
)
|
||||
from axolotl.integrations.lm_eval.cli import lm_eval
|
||||
from axolotl.utils import patch_optimized_env
|
||||
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils.schemas.config import AxolotlInputConfig
|
||||
|
||||
|
||||
@@ -55,8 +55,6 @@ def preprocess(config: str, cloud: Optional[str] = None, **kwargs) -> None:
|
||||
kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
|
||||
config options.
|
||||
"""
|
||||
patch_optimized_env()
|
||||
|
||||
if cloud:
|
||||
from axolotl.cli.cloud import do_cli_preprocess
|
||||
|
||||
@@ -102,7 +100,7 @@ def train(
|
||||
config options.
|
||||
"""
|
||||
# Enable expandable segments for cuda allocation to improve VRAM usage
|
||||
patch_optimized_env()
|
||||
set_pytorch_cuda_alloc_conf()
|
||||
|
||||
if "use_ray" in kwargs and kwargs["use_ray"]:
|
||||
accelerate = False
|
||||
|
||||
@@ -18,7 +18,7 @@ from axolotl.cli.config import load_cfg
|
||||
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.train import train
|
||||
from axolotl.utils import patch_optimized_env
|
||||
from axolotl.utils import set_pytorch_cuda_alloc_conf
|
||||
from axolotl.utils.config import normalize_config, resolve_dtype
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
@@ -36,7 +36,7 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
|
||||
cli_args: Training-specific CLI arguments.
|
||||
"""
|
||||
# Enable expandable segments for cuda allocation to improve VRAM usage
|
||||
patch_optimized_env()
|
||||
set_pytorch_cuda_alloc_conf()
|
||||
|
||||
print_axolotl_text_art()
|
||||
check_accelerate_default_config()
|
||||
|
||||
@@ -610,15 +610,3 @@ class AxolotlTrainer(
|
||||
output_dir = os.path.join(run_dir, checkpoint_folder)
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
return super()._save_checkpoint(model, trial, **kwargs)
|
||||
|
||||
def compute_loss_context_manager(self):
|
||||
from contextlib import ExitStack
|
||||
|
||||
from torchtune.training import OffloadActivations
|
||||
|
||||
stack = ExitStack()
|
||||
|
||||
stack.enter_context(super().compute_loss_context_manager())
|
||||
stack.enter_context(OffloadActivations())
|
||||
|
||||
return stack
|
||||
|
||||
@@ -18,8 +18,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
|
||||
"mixtral",
|
||||
"qwen2",
|
||||
"qwen2_moe",
|
||||
"qwen3",
|
||||
"qwen3_moe",
|
||||
"falcon",
|
||||
"phi",
|
||||
"phi3",
|
||||
|
||||
@@ -43,12 +43,3 @@ def set_pytorch_cuda_alloc_conf():
|
||||
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = (
|
||||
"expandable_segments:True,roundup_power2_divisions:16"
|
||||
)
|
||||
|
||||
|
||||
def patch_optimized_env():
|
||||
"""
|
||||
Patch environment variables to improve VRAM usage and increase download speed
|
||||
"""
|
||||
if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
set_pytorch_cuda_alloc_conf()
|
||||
|
||||
@@ -59,7 +59,7 @@ def choose_device(cfg):
|
||||
|
||||
def resolve_dtype(cfg):
|
||||
if (
|
||||
not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
|
||||
cfg.bf16 == "auto" and not cfg.use_ray
|
||||
): # if we use ray we want to defer this check to the worker node
|
||||
if is_torch_bf16_gpu_available():
|
||||
LOG.debug("bf16 support detected, enabling for this configuration.")
|
||||
|
||||
@@ -2,13 +2,6 @@
|
||||
|
||||
from functools import partial
|
||||
|
||||
import torch
|
||||
from torch.utils.checkpoint import (
|
||||
CheckpointPolicy,
|
||||
checkpoint,
|
||||
create_selective_checkpoint_contexts,
|
||||
)
|
||||
|
||||
from axolotl.utils.gradient_checkpointing.unsloth import (
|
||||
Unsloth_Offloaded_Gradient_Checkpointer,
|
||||
)
|
||||
@@ -25,32 +18,3 @@ def hf_grad_checkpoint_offload_wrapper(
|
||||
),
|
||||
*args,
|
||||
)
|
||||
|
||||
|
||||
aten = torch.ops.aten
|
||||
compute_intensive_ops = [
|
||||
aten.mm.default,
|
||||
aten.bmm.default,
|
||||
aten.addmm.default,
|
||||
]
|
||||
|
||||
|
||||
def policy_fn(ctx, op, *args, **kwargs):
|
||||
if op in compute_intensive_ops:
|
||||
return CheckpointPolicy.MUST_SAVE
|
||||
else:
|
||||
return CheckpointPolicy.PREFER_RECOMPUTE
|
||||
|
||||
|
||||
context_fn = partial(create_selective_checkpoint_contexts, policy_fn)
|
||||
|
||||
|
||||
def checkpoint_w_policy(
|
||||
decoder_layer, *args, use_reentrant=None
|
||||
): # pylint: disable=unused-argument
|
||||
return checkpoint(
|
||||
decoder_layer,
|
||||
*args,
|
||||
use_reentrant=use_reentrant,
|
||||
context_fn=context_fn,
|
||||
)
|
||||
|
||||
@@ -190,7 +190,7 @@ class MultipackBatchSampler(BatchSampler):
|
||||
self.len_across_ranks = None
|
||||
|
||||
if self.sequential and not isinstance(sampler, SequentialSampler):
|
||||
LOG.warning(
|
||||
LOG.warn(
|
||||
"using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
|
||||
)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user