Compare commits
16 Commits
runpod-sls
...
a8e5ba000e
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a8e5ba000e | ||
|
|
bc3dfa666d | ||
|
|
4371f3459e | ||
|
|
cc58d5e072 | ||
|
|
d197b054e3 | ||
|
|
7e1e153831 | ||
|
|
42de3096cf | ||
|
|
27758840a1 | ||
|
|
8dbf5c215a | ||
|
|
6411ca3fe1 | ||
|
|
813809c54d | ||
|
|
af7cfdc30b | ||
|
|
b76d2d1130 | ||
|
|
7946f89df4 | ||
|
|
8b33ae1c4f | ||
|
|
dc4da4a7e2 |
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -24,7 +24,7 @@ jobs:
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras: vllm
|
||||
axolotl_extras:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
|
||||
2
.github/workflows/multi-gpu-e2e.yml
vendored
2
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -43,7 +43,7 @@ jobs:
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
axolotl_extras: vllm
|
||||
axolotl_extras:
|
||||
num_gpus: 2
|
||||
nightly_build: "true"
|
||||
- cuda: 126
|
||||
|
||||
2
.github/workflows/tests.yml
vendored
2
.github/workflows/tests.yml
vendored
@@ -269,7 +269,7 @@ jobs:
|
||||
python_version: "3.11"
|
||||
pytorch: 2.5.1
|
||||
num_gpus: 1
|
||||
axolotl_extras: vllm
|
||||
axolotl_extras:
|
||||
- cuda: 126
|
||||
cuda_version: 12.6.3
|
||||
python_version: "3.11"
|
||||
|
||||
@@ -20,4 +20,4 @@ pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
|
||||
--cov-report=xml:multigpu-coverage.xml
|
||||
|
||||
# Upload coverage to Codecov
|
||||
codecov upload-process -t $CODECOV_TOKEN -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
|
||||
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
|
||||
|
||||
76
examples/llama-3/sparse-finetuning.yaml
Normal file
76
examples/llama-3/sparse-finetuning.yaml
Normal file
@@ -0,0 +1,76 @@
|
||||
base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: tatsu-lab/alpaca
|
||||
type: alpaca
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
eval_sample_packing: false
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 8
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: paged_adamw_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: <|end_of_text|>
|
||||
|
||||
llmcompressor:
|
||||
recipe:
|
||||
finetuning_stage:
|
||||
finetuning_modifiers:
|
||||
ConstantPruningModifier:
|
||||
targets: [
|
||||
're:.*q_proj.weight',
|
||||
're:.*k_proj.weight',
|
||||
're:.*v_proj.weight',
|
||||
're:.*o_proj.weight',
|
||||
're:.*gate_proj.weight',
|
||||
're:.*up_proj.weight',
|
||||
're:.*down_proj.weight',
|
||||
]
|
||||
start: 0
|
||||
@@ -11,13 +11,13 @@ liger-kernel==0.5.8
|
||||
|
||||
packaging==23.2
|
||||
|
||||
peft==0.15.1
|
||||
peft==0.15.2
|
||||
transformers==4.51.3
|
||||
tokenizers>=0.21.1
|
||||
accelerate==1.6.0
|
||||
datasets==3.5.0
|
||||
deepspeed>=0.15.4
|
||||
trl==0.16.1
|
||||
trl==0.17.0
|
||||
hf_xet==1.0.0
|
||||
hqq==0.2.5
|
||||
|
||||
|
||||
7
setup.py
7
setup.py
@@ -67,13 +67,13 @@ def parse_requirements(extras_require_map):
|
||||
if (major, minor) >= (2, 7):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
# _install_requires.append("xformers==0.0.29.post3") # xformers seems to be hard pinned to 2.6.0
|
||||
extras_require_map["vllm"] = ["vllm==0.8.3"]
|
||||
extras_require_map["vllm"] = ["vllm==0.8.4"]
|
||||
elif (major, minor) >= (2, 6):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append(
|
||||
"xformers==0.0.29.post2"
|
||||
) # vllm needs post2 w torch 2.6
|
||||
extras_require_map["vllm"] = ["vllm==0.8.3"]
|
||||
extras_require_map["vllm"] = ["vllm==0.8.4"]
|
||||
elif (major, minor) >= (2, 5):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
if patch == 0:
|
||||
@@ -149,6 +149,9 @@ extras_require = {
|
||||
"vllm": [
|
||||
"vllm==0.7.2",
|
||||
],
|
||||
"llmcompressor": [
|
||||
"llmcompressor~=0.5.0",
|
||||
],
|
||||
}
|
||||
|
||||
install_requires, dependency_links, extras_require_build = parse_requirements(
|
||||
|
||||
@@ -135,7 +135,9 @@ class GRPOStrategy:
|
||||
try:
|
||||
# use importlib to dynamically load the reward function from the module
|
||||
reward_func_module_name = reward_func_fqn.split(".")[-1]
|
||||
reward_func_module = importlib.import_module(reward_func_fqn.split(".")[-2])
|
||||
reward_func_module = importlib.import_module(
|
||||
".".join(reward_func_fqn.split(".")[:-1])
|
||||
)
|
||||
reward_func = getattr(reward_func_module, reward_func_module_name)
|
||||
if not len(inspect.signature(reward_func).parameters) >= 2:
|
||||
raise ValueError(
|
||||
|
||||
5
src/axolotl/integrations/llm_compressor/__init__.py
Normal file
5
src/axolotl/integrations/llm_compressor/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
"""Integration entry point for the LLMCompressor plugin."""
|
||||
|
||||
from .plugin import LLMCompressorPlugin
|
||||
|
||||
__all__ = ["LLMCompressorPlugin"]
|
||||
40
src/axolotl/integrations/llm_compressor/args.py
Normal file
40
src/axolotl/integrations/llm_compressor/args.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""
|
||||
LLMCompressor and Sparse Finetuning config models.
|
||||
"""
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from typing_extensions import Annotated
|
||||
|
||||
|
||||
class CompressionArgs(BaseModel):
    """Options for the LLMCompressor sparse fine-tuning pathway."""

    # Re-run validation whenever an attribute is assigned after construction.
    model_config = ConfigDict(validate_assignment=True)

    # `recipe` is intentionally typed as Any; see
    # https://github.com/vllm-project/llm-compressor/issues/1319
    recipe: Annotated[
        Any,
        Field(
            description="The recipe containing the compression algorithms and hyperparameters to apply."
        ),
    ]
|
||||
|
||||
|
||||
class LLMCompressorArgs(BaseModel):
    """Top-level config model exposing the `llmcompressor` section."""

    # Re-run validation whenever an attribute is assigned after construction.
    model_config = ConfigDict(validate_assignment=True)

    # Nested settings block; its contents are validated by CompressionArgs.
    llmcompressor: Annotated[
        CompressionArgs,
        Field(
            description="Arguments enabling compression pathways through the LLM Compressor plugins"
        ),
    ]
|
||||
164
src/axolotl/integrations/llm_compressor/plugin.py
Normal file
164
src/axolotl/integrations/llm_compressor/plugin.py
Normal file
@@ -0,0 +1,164 @@
|
||||
"""
|
||||
Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
|
||||
by maintaining masks for zero weights during training.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, ParamSpec, TypeVar
|
||||
|
||||
from llmcompressor import active_session
|
||||
from llmcompressor.core import callbacks as session_callbacks
|
||||
from llmcompressor.recipe import Recipe
|
||||
from transformers.trainer import Trainer
|
||||
from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
|
||||
from transformers.training_args import TrainingArguments
|
||||
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
|
||||
P = ParamSpec("P") # Params for generic function signatures
|
||||
R = TypeVar("R") # Return type for generic function signatures
|
||||
|
||||
LOG = logging.getLogger("axolotl.integrations.llm_compressor")
|
||||
|
||||
|
||||
class LLMCompressorCallbackHandler(TrainerCallback):
    """
    Trainer callback that drives Sparse Finetuning.

    Each Trainer event handled here is forwarded to the active LLMCompressor
    session (or its module-level callbacks). The stated purpose is to keep
    sparsity patterns intact during training by re-applying masks after
    optimization steps, so updates to zeroed weights are canceled out.
    """

    def __init__(self, trainer: Trainer, recipe: Any):
        """
        Store the trainer and recipe, and hook the trainer's loss computation.

        Args:
            trainer (Trainer): Huggingface Trainer instance.
            recipe (Recipe | dict): Sparse finetuning recipe to apply. Anything
                that is not already a `Recipe` is validated into one.
        """
        super().__init__()
        self.trainer = trainer

        if isinstance(recipe, Recipe):
            self.recipe = recipe
        else:
            self.recipe = Recipe.model_validate(recipe)

        # Replace the trainer's compute_loss with a wrapper that also reports
        # every computed loss through the loss_calculated callback.
        original_compute_loss = self.trainer.compute_loss
        self.trainer.compute_loss = compute_loss_wrapper(original_compute_loss)

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Initialize the compression session when training starts.

        The session receives the trainer's model and optimizer, the current
        epoch as its start point, and the recipe held by this handler.

        Args:
            args (TrainingArguments): Training arguments.
            state (TrainerState): Trainer state.
            control (TrainerControl): Trainer control.
        """
        super().on_train_begin(args, state, control, **kwargs)
        active_session().initialize(
            model=self.trainer.model,
            optimizer=self.trainer.optimizer,
            start=state.epoch,
            recipe=self.recipe,
        )

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Fire the batch_start callback at the beginning of a training step.
        """
        super().on_step_begin(args, state, control, **kwargs)
        session_callbacks.batch_start()

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Fire the optimizer and batch_end callbacks at the end of a training step.
        """
        super().on_step_end(args, state, control, **kwargs)
        # Order matters: pre-step, post-step, then batch end.
        for notify in (
            session_callbacks.optim_pre_step,
            session_callbacks.optim_post_step,
            session_callbacks.batch_end,
        ):
            notify()

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Finalize the compression session when training ends.
        """
        super().on_train_end(args, state, control, **kwargs)
        active_session().finalize()
|
||||
|
||||
|
||||
class LLMCompressorPlugin(BasePlugin):
    """
    Axolotl plugin that enables Sparse Finetuning.
    """

    def get_input_args(self) -> str:
        """
        Tell Axolotl where this plugin's config model lives.

        Returns:
            str: Dotted path to the LLMCompressorArgs class.
        """
        return "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"

    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Build the Sparse Finetuning callback for an existing Trainer.

        Args:
            cfg (Any): Configuration object; the recipe is read from
                `cfg.llmcompressor.recipe`.
            trainer (Trainer): Huggingface Trainer instance.

        Returns:
            list: Single-element list holding the configured callback.
        """
        LOG.info("Adding Sparse Finetuning callback to the trainer")
        handler = LLMCompressorCallbackHandler(
            trainer=trainer, recipe=cfg.llmcompressor.recipe
        )
        return [handler]
|
||||
|
||||
|
||||
def compute_loss_wrapper(compute_loss_func: Callable[P, R]) -> Callable[P, R]:
    """
    Decorate a loss function so each result is reported via loss_calculated.

    Args:
        compute_loss_func (Callable): Original loss computation function.

    Returns:
        Callable: Function with the same signature and return value that
            additionally invokes the loss_calculated callback.
    """

    def _notifying_compute_loss(*args: P.args, **kwargs: P.kwargs) -> R:
        result = compute_loss_func(*args, **kwargs)
        # Report the value before handing it back unchanged.
        session_callbacks.loss_calculated(loss=result)
        return result

    # Copy the wrapped function's metadata onto the wrapper.
    return wraps(compute_loss_func)(_notifying_compute_loss)
|
||||
@@ -139,6 +139,22 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
|
||||
hasattr(model_config, "quantization_config")
|
||||
and model_config.quantization_config
|
||||
)
|
||||
|
||||
# Detect compressed-tensors config
|
||||
is_compressed_tensors_config = (
|
||||
quant_config_exists
|
||||
and model_config.quantization_config.get("quant_method") == "compressed-tensors"
|
||||
)
|
||||
|
||||
if is_compressed_tensors_config:
|
||||
if model_config.quantization_config.get("config_groups"):
|
||||
LOG.warning(
|
||||
"Found `config_groups` in a compressed-tensors config. "
|
||||
"QAT integration with llmcompressor is not tested."
|
||||
)
|
||||
# Skip further quant checks for compressed-tensors
|
||||
return
|
||||
|
||||
quant_config_method_is_gptq = (
|
||||
quant_config_exists
|
||||
and "quant_method" in model_config.quantization_config
|
||||
|
||||
@@ -4,11 +4,14 @@ GRPO test suite
|
||||
|
||||
import os
|
||||
import random
|
||||
import shutil
|
||||
import subprocess # nosec B404
|
||||
import sys
|
||||
import tempfile
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import psutil
|
||||
import pytest
|
||||
import requests
|
||||
import yaml
|
||||
@@ -21,8 +24,8 @@ from tests.e2e.utils import require_vllm
|
||||
|
||||
|
||||
def start_vllm(
|
||||
model: str, env: dict | None = None, wait: int | None = None, quiet=False, **kwargs
|
||||
) -> int:
|
||||
model: str, env: dict, wait: int | None = None, quiet=False, **kwargs
|
||||
) -> subprocess.Popen:
|
||||
"""
|
||||
helper function to start the VLLM server in the background, mostly for testing purposes
|
||||
"""
|
||||
@@ -46,10 +49,41 @@ def start_vllm(
|
||||
# print out the command to be executed
|
||||
print(" ".join(cmd))
|
||||
|
||||
vllm_logging_json = Path(tempfile.mkdtemp()) / "vllm_logging.json"
|
||||
with open(vllm_logging_json, "w", encoding="utf-8") as temp_file:
|
||||
temp_file.write(
|
||||
"""{
|
||||
"formatters": {
|
||||
"json": {
|
||||
"class": "pythonjsonlogger.jsonlogger.JsonFormatter"
|
||||
}
|
||||
},
|
||||
"handlers": {
|
||||
"file": {
|
||||
"class": "logging.FileHandler",
|
||||
"formatter": "json",
|
||||
"level": "DEBUG",
|
||||
"filename": "/tmp/vllm.log",
|
||||
"mode": "a"
|
||||
}
|
||||
},
|
||||
"loggers": {
|
||||
"vllm": {
|
||||
"handlers": ["file"],
|
||||
"level": "DEBUG",
|
||||
"propagate": false
|
||||
}
|
||||
},
|
||||
"version": 1
|
||||
}"""
|
||||
)
|
||||
|
||||
cmd_env = env.copy()
|
||||
cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json})
|
||||
# start `trl vllm-serve` command in the background and capture the process id
|
||||
process = subprocess.Popen( # pylint: disable=consider-using-with
|
||||
cmd,
|
||||
env=env,
|
||||
env=cmd_env,
|
||||
stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
|
||||
stderr=subprocess.DEVNULL if quiet else subprocess.PIPE,
|
||||
) # nosec B603
|
||||
@@ -58,32 +92,51 @@ def start_vllm(
|
||||
print(f"VLLM server process started (PID: {process.pid})")
|
||||
|
||||
# wait until the http server is ready, even if it 404s, but time out after `wait` seconds
|
||||
period_seconds = 5
|
||||
started = False
|
||||
if wait and host and port:
|
||||
for _ in range(int(wait)):
|
||||
for i in range(0, int(wait), period_seconds):
|
||||
try:
|
||||
response = requests.get(f"http://{host}:{port}", timeout=1)
|
||||
print(f"{i}: VLLM server (status: {response.status_code})")
|
||||
if int(response.status_code) in [200, 404]:
|
||||
started = True
|
||||
break
|
||||
except requests.exceptions.RequestException:
|
||||
pass
|
||||
except requests.exceptions.RequestException as exc:
|
||||
print(f"{i}: VLLM server failed to start: {str(exc)}")
|
||||
|
||||
# also check if the process.pid is still running
|
||||
if not process.poll() is None:
|
||||
break
|
||||
|
||||
time.sleep(1)
|
||||
time.sleep(period_seconds)
|
||||
|
||||
if wait and not started:
|
||||
print(
|
||||
f"VLLM server process did not start within {wait} seconds. Please check your server logs."
|
||||
)
|
||||
process.kill()
|
||||
recursive_kill(process)
|
||||
with open("/tmp/vllm.log", "r", encoding="utf-8") as log_file:
|
||||
print(log_file.read())
|
||||
shutil.rmtree("/tmp/vllm.log")
|
||||
raise RuntimeError(f"VLLM server process did not start within {wait} seconds.")
|
||||
|
||||
# return the process id
|
||||
return process.pid
|
||||
# return the process
|
||||
return process
|
||||
|
||||
|
||||
def recursive_kill(process: subprocess.Popen):
    """
    Forcefully kill a process together with all of its descendants.

    Safe to call on a process that has already exited, so cleanup paths
    (``finally`` blocks, startup-failure handling) do not raise and mask the
    error that triggered the cleanup.

    Args:
        process: Handle of the root of the process tree, as returned by
            ``subprocess.Popen``.
    """
    # Once the child has exited and been reaped, its pid may already belong
    # to an unrelated process, so never signal it.
    if process.poll() is not None:
        return

    try:
        root = psutil.Process(process.pid)
        # Snapshot the descendants before killing anything, children first.
        doomed = root.children(recursive=True)
    except psutil.NoSuchProcess:
        return
    doomed.append(root)

    for proc in doomed:
        try:
            proc.kill()
        except psutil.NoSuchProcess:
            # Exited between the snapshot and the kill; nothing left to do.
            continue

    # Reap the root so it does not linger as a zombie and `process.returncode`
    # is populated for callers.
    try:
        process.wait(timeout=10)
    except subprocess.TimeoutExpired:
        print(f"process {process.pid} did not exit within 10 seconds of being killed")
|
||||
|
||||
|
||||
class TestGRPO:
|
||||
@@ -174,16 +227,17 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
||||
|
||||
current_env = os.environ.copy()
|
||||
env = {
|
||||
"NCCL_P2P_LEVEL": "LOC",
|
||||
"NCCL_P2P_LEVEL": "NVL",
|
||||
**current_env,
|
||||
"CUDA_VISIBLE_DEVICES": "1",
|
||||
"VLLM_USE_V1": "0",
|
||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||
# "VLLM_USE_V1": "0",
|
||||
}
|
||||
vllm_process_id = start_vllm(
|
||||
vllm_process = start_vllm(
|
||||
cfg.base_model,
|
||||
env=env,
|
||||
quiet=True,
|
||||
wait=120,
|
||||
wait=300,
|
||||
gpu_memory_utilization=0.15,
|
||||
max_model_len=cfg.vllm.max_model_len,
|
||||
enable_prefix_caching=cfg.vllm.enable_prefix_caching,
|
||||
@@ -202,10 +256,14 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
||||
"--main-process-port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
],
|
||||
env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
|
||||
env={
|
||||
"NCCL_P2P_LEVEL": "NVL",
|
||||
"NCCL_DEBUG": "INFO",
|
||||
**current_env,
|
||||
},
|
||||
)
|
||||
finally:
|
||||
os.kill(vllm_process_id, 9)
|
||||
recursive_kill(vllm_process)
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"num_gpus",
|
||||
@@ -262,16 +320,17 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
||||
|
||||
current_env = os.environ.copy()
|
||||
env = {
|
||||
"NCCL_P2P_LEVEL": "LOC", # nccl can be brittle, assume P2P isn't reliable
|
||||
"NCCL_P2P_LEVEL": "NVL", # nccl can be brittle, assume P2P isn't reliable
|
||||
**current_env,
|
||||
"CUDA_VISIBLE_DEVICES": "1",
|
||||
"VLLM_USE_V1": "0",
|
||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||
# "VLLM_USE_V1": "0",
|
||||
}
|
||||
vllm_process_id = start_vllm(
|
||||
vllm_process = start_vllm(
|
||||
cfg.base_model,
|
||||
env=env,
|
||||
quiet=True,
|
||||
wait=120,
|
||||
wait=300,
|
||||
gpu_memory_utilization=0.15,
|
||||
max_model_len=cfg.vllm.max_model_len,
|
||||
enable_prefix_caching=cfg.vllm.enable_prefix_caching,
|
||||
@@ -290,7 +349,11 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
||||
"--main-process-port",
|
||||
f"{get_torch_dist_unique_port()}",
|
||||
],
|
||||
env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
|
||||
env={
|
||||
"NCCL_P2P_LEVEL": "NVL",
|
||||
"NCCL_DEBUG": "INFO",
|
||||
**current_env,
|
||||
},
|
||||
)
|
||||
finally:
|
||||
os.kill(vllm_process_id, 9)
|
||||
recursive_kill(vllm_process)
|
||||
|
||||
Reference in New Issue
Block a user