Compare commits
10 Commits
a8e5ba000e
...
llmcompres
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
c9880977be | ||
|
|
f196941315 | ||
|
|
5be047ac46 | ||
|
|
758115b8c6 | ||
|
|
0dc1da5876 | ||
|
|
f3e876dbfc | ||
|
|
99c13ef60c | ||
|
|
2c24434ee0 | ||
|
|
586268a0d7 | ||
|
|
b600e119b6 |
6
.github/workflows/tests.yml
vendored
6
.github/workflows/tests.yml
vendored
@@ -258,6 +258,12 @@ jobs:
|
||||
fail-fast: false
|
||||
matrix:
|
||||
include:
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
pytorch: 2.6.0
|
||||
num_gpus: 1
|
||||
axolotl_extras: llmcompressor
|
||||
- cuda: 124
|
||||
cuda_version: 12.4.1
|
||||
python_version: "3.11"
|
||||
|
||||
@@ -49,7 +49,8 @@ sections = [
|
||||
("Knowledge Distillation (KD)", "kd"),
|
||||
("Liger Kernels", "liger"),
|
||||
("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
|
||||
("Spectrum", "spectrum")
|
||||
("Spectrum", "spectrum"),
|
||||
("LLMCompressor", "llm_compressor")
|
||||
]
|
||||
|
||||
for section_name, folder_name in sections:
|
||||
|
||||
@@ -74,3 +74,4 @@ llmcompressor:
|
||||
're:.*down_proj.weight',
|
||||
]
|
||||
start: 0
|
||||
save_compressed: true
|
||||
|
||||
2
setup.py
2
setup.py
@@ -150,7 +150,7 @@ extras_require = {
|
||||
"vllm==0.7.2",
|
||||
],
|
||||
"llmcompressor": [
|
||||
"llmcompressor~=0.5.0",
|
||||
"llmcompressor==0.5.1",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
108
src/axolotl/integrations/llm_compressor/README.md
Normal file
108
src/axolotl/integrations/llm_compressor/README.md
Normal file
@@ -0,0 +1,108 @@
|
||||
# LLMCompressor Integration
|
||||
|
||||
Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).
|
||||
|
||||
This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.
|
||||
|
||||
It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
|
||||
|
||||
---
|
||||
|
||||
## Requirements
|
||||
|
||||
- Axolotl with `llmcompressor` extras:
|
||||
|
||||
```bash
|
||||
pip install "axolotl[llmcompressor]"
|
||||
```
|
||||
|
||||
- Requires `llmcompressor >= 0.5.1`
|
||||
|
||||
Installing the extra pulls in all dependencies needed to fine-tune sparsified models with this integration.
|
||||
|
||||
---
|
||||
|
||||
## Usage
|
||||
|
||||
To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:
|
||||
|
||||
```yaml
|
||||
plugins:
|
||||
- axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
||||
|
||||
llmcompressor:
|
||||
recipe:
|
||||
finetuning_stage:
|
||||
finetuning_modifiers:
|
||||
ConstantPruningModifier:
|
||||
targets: [
|
||||
're:.*q_proj.weight',
|
||||
're:.*k_proj.weight',
|
||||
're:.*v_proj.weight',
|
||||
're:.*o_proj.weight',
|
||||
're:.*gate_proj.weight',
|
||||
're:.*up_proj.weight',
|
||||
're:.*down_proj.weight',
|
||||
]
|
||||
start: 0
|
||||
save_compressed: true
|
||||
# ... (other training arguments)
|
||||
```
|
||||
|
||||
This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.
|
||||
|
||||
Pre-sparsified checkpoints can be:
|
||||
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
|
||||
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
|
||||
- Custom models with compatible sparsity patterns that you have created yourself
|
||||
|
||||
To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
|
||||
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
|
||||
|
||||
### Storage Optimization with save_compressed
|
||||
|
||||
Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
|
||||
- Reduces disk space usage by approximately 40%
|
||||
- Maintains compatibility with vLLM for accelerated inference
|
||||
- Maintains compatibility with llmcompressor for further optimization (example: quantization)
|
||||
|
||||
This option is highly recommended when working with sparse models to maximize the benefits of model compression.
|
||||
|
||||
### Example Config
|
||||
|
||||
See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
|
||||
|
||||
---
|
||||
|
||||
## Inference with vLLM
|
||||
|
||||
After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
|
||||
You can also use LLMCompressor to apply additional quantization to your fine-tuned
|
||||
sparse model before inference for even greater performance benefits. The example below runs inference with vLLM:
|
||||
|
||||
```python
|
||||
from vllm import LLM, SamplingParams
|
||||
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||
llm = LLM("path/to/your/sparse/model")
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
```
|
||||
|
||||
For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).
|
||||
|
||||
## Learn More
|
||||
|
||||
For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
|
||||
|
||||
[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
|
||||
@@ -4,7 +4,7 @@ LLMCompressor and Sparse Finetuning config models.
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel, ConfigDict, Field
|
||||
from pydantic import BaseModel, Field
|
||||
from typing_extensions import Annotated
|
||||
|
||||
|
||||
@@ -20,9 +20,13 @@ class CompressionArgs(BaseModel):
|
||||
),
|
||||
]
|
||||
|
||||
model_config = ConfigDict(
|
||||
validate_assignment=True,
|
||||
)
|
||||
save_compressed: Annotated[
|
||||
bool,
|
||||
Field(
|
||||
default=False,
|
||||
description="Whether to save the compressed model after training.",
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
class LLMCompressorArgs(BaseModel):
|
||||
@@ -34,7 +38,3 @@ class LLMCompressorArgs(BaseModel):
|
||||
description="Arguments enabling compression pathways through the LLM Compressor plugins"
|
||||
),
|
||||
]
|
||||
|
||||
model_config = ConfigDict(
|
||||
validate_assignment=True,
|
||||
)
|
||||
|
||||
@@ -5,11 +5,12 @@ by maintaining masks for zero weights during training.
|
||||
|
||||
import logging
|
||||
from functools import wraps
|
||||
from typing import Any, Callable, ParamSpec, TypeVar
|
||||
from typing import Any, Callable, Concatenate, ParamSpec, TypeVar
|
||||
|
||||
from llmcompressor import active_session
|
||||
from llmcompressor import active_session, create_session
|
||||
from llmcompressor.core import callbacks as session_callbacks
|
||||
from llmcompressor.recipe import Recipe
|
||||
from torch.nn import Module
|
||||
from transformers.trainer import Trainer
|
||||
from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
|
||||
from transformers.training_args import TrainingArguments
|
||||
@@ -42,7 +43,9 @@ class LLMCompressorCallbackHandler(TrainerCallback):
|
||||
self.recipe = (
|
||||
Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
|
||||
)
|
||||
self.original_compute_loss = trainer.compute_loss
|
||||
self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
|
||||
create_session()
|
||||
|
||||
def on_train_begin(
|
||||
self,
|
||||
@@ -60,13 +63,14 @@ class LLMCompressorCallbackHandler(TrainerCallback):
|
||||
control (TrainerControl): Trainer control.
|
||||
"""
|
||||
super().on_train_begin(args, state, control, **kwargs)
|
||||
session = active_session()
|
||||
session.initialize(
|
||||
self.trainer.accelerator.wait_for_everyone()
|
||||
active_session().initialize(
|
||||
model=self.trainer.model,
|
||||
optimizer=self.trainer.optimizer,
|
||||
start=state.epoch,
|
||||
recipe=self.recipe,
|
||||
)
|
||||
self.trainer.accelerator.wait_for_everyone()
|
||||
|
||||
def on_step_begin(
|
||||
self,
|
||||
@@ -107,8 +111,8 @@ class LLMCompressorCallbackHandler(TrainerCallback):
|
||||
Called at the end of training. Finalizes the compression session.
|
||||
"""
|
||||
super().on_train_end(args, state, control, **kwargs)
|
||||
session = active_session()
|
||||
session.finalize()
|
||||
active_session().finalize()
|
||||
self.trainer.compute_loss_func = self.original_compute_loss
|
||||
|
||||
|
||||
class LLMCompressorPlugin(BasePlugin):
|
||||
@@ -144,7 +148,9 @@ class LLMCompressorPlugin(BasePlugin):
|
||||
return [callback]
|
||||
|
||||
|
||||
def compute_loss_wrapper(compute_loss_func: Callable[P, R]) -> Callable[P, R]:
|
||||
def compute_loss_wrapper(
|
||||
compute_loss_func: Callable[Concatenate[Module, P], R],
|
||||
) -> Callable[Concatenate[Module, P], R]:
|
||||
"""
|
||||
Wraps the loss computation function to trigger the loss_calculated callback.
|
||||
|
||||
@@ -156,9 +162,10 @@ def compute_loss_wrapper(compute_loss_func: Callable[P, R]) -> Callable[P, R]:
|
||||
"""
|
||||
|
||||
@wraps(compute_loss_func)
|
||||
def compute_and_notify(*args: P.args, **kwargs: P.kwargs) -> R:
|
||||
loss = compute_loss_func(*args, **kwargs)
|
||||
session_callbacks.loss_calculated(loss=loss)
|
||||
def compute_and_notify(model: Module, *args: P.args, **kwargs: P.kwargs) -> R:
|
||||
loss = compute_loss_func(model, *args, **kwargs)
|
||||
if active_session().lifecycle.initialized_ and model.training:
|
||||
session_callbacks.loss_calculated(loss=loss)
|
||||
return loss
|
||||
|
||||
return compute_and_notify
|
||||
|
||||
40
src/axolotl/integrations/llm_compressor/utils.py
Normal file
40
src/axolotl/integrations/llm_compressor/utils.py
Normal file
@@ -0,0 +1,40 @@
|
||||
"""Utilities for llmcompressor integration with axolotl."""
|
||||
|
||||
from typing import Union
|
||||
|
||||
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
|
||||
modify_save_pretrained,
|
||||
)
|
||||
from transformers import PreTrainedModel, Trainer
|
||||
|
||||
|
||||
def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    safe_serialization: bool = False,
    save_compressed: bool = False,
) -> None:
    """
    Save the model from the main process after all processes have synchronized.

    Every process waits at the accelerator barrier. Afterwards only the main
    process applies ``modify_save_pretrained`` to the model and calls
    ``save_pretrained``; all other processes return without writing.

    Args:
        model (PreTrainedModel): The model to write out.
        output_dir (str or bytes): Destination path for the model files.
        trainer (Trainer): Trainer whose accelerator provides the barrier and
            the main-process check.
        safe_serialization (bool): Forwarded to ``save_pretrained``.
        save_compressed (bool): Forwarded to ``save_pretrained``; its negation
            is passed as ``skip_sparsity_compression_stats``.
    """
    accelerator = trainer.accelerator
    accelerator.wait_for_everyone()

    # Non-main processes have nothing to write.
    if accelerator.is_main_process:
        modify_save_pretrained(model)
        model.save_pretrained(
            output_dir,
            safe_serialization=safe_serialization,
            save_compressed=save_compressed,
            skip_sparsity_compression_stats=not save_compressed,
        )
|
||||
@@ -295,8 +295,23 @@ def save_trained_model(
|
||||
trainer.model.save_pretrained(
|
||||
cfg.output_dir, safe_serialization=safe_serialization
|
||||
)
|
||||
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
|
||||
if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
|
||||
# TODO: add integration support so this can be implemented completely within the plugin
|
||||
from axolotl.integrations.llm_compressor.utils import (
|
||||
save_compressed_model,
|
||||
)
|
||||
|
||||
save_compressed_model(
|
||||
model=model,
|
||||
output_dir=cfg.output_dir,
|
||||
trainer=trainer,
|
||||
safe_serialization=safe_serialization,
|
||||
save_compressed=cfg.llmcompressor.save_compressed,
|
||||
)
|
||||
|
||||
|
||||
def create_model_card(cfg: DictDefault, trainer: Trainer):
|
||||
"""
|
||||
|
||||
106
tests/e2e/integrations/test_llm_compressor.py
Normal file
106
tests/e2e/integrations/test_llm_compressor.py
Normal file
@@ -0,0 +1,106 @@
|
||||
"""
|
||||
E2E smoke tests for LLMCompressorPlugin integration
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from axolotl.cli.args import TrainerCliArgs
|
||||
from axolotl.common.datasets import load_datasets
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from tests.e2e.utils import (
|
||||
check_model_output_exists,
|
||||
require_llmcompressor,
|
||||
require_torch_2_4_1,
|
||||
)
|
||||
|
||||
MODELS = [
|
||||
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
|
||||
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
)
@pytest.mark.parametrize(
    "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
)
@require_llmcompressor
class TestLLMCompressorIntegration:
    """
    End-to-end smoke test for
    axolotl.integrations.llm_compressor.LLMCompressorPlugin
    """

    @require_torch_2_4_1
    def test_llmcompressor_plugin(
        self, temp_dir, base_model: str, save_compressed: bool
    ):
        """
        Run a 5-step training job with the plugin enabled and check the outputs.
        """
        # Weight tensors the ConstantPruningModifier is pointed at.
        pruning_targets = [
            "re:.*q_proj.weight",
            "re:.*k_proj.weight",
            "re:.*v_proj.weight",
            "re:.*o_proj.weight",
            "re:.*gate_proj.weight",
            "re:.*up_proj.weight",
            "re:.*down_proj.weight",
        ]
        recipe = {
            "finetuning_stage": {
                "finetuning_modifiers": {
                    "ConstantPruningModifier": {
                        "targets": pruning_targets,
                        "start": 0,
                    },
                },
            },
        }
        llmcompressor_section = {
            "recipe": recipe,
            "save_compressed": save_compressed,
        }

        # core cfg
        cfg = DictDefault(
            {
                "base_model": base_model,
                "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 1e-5,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
                "max_steps": 5,
                "llmcompressor": llmcompressor_section,
            }
        )

        # Plugins are registered before validation, as in the original ordering.
        prepare_plugins(cfg)
        cfg = validate_config(cfg)
        normalize_config(cfg)

        dataset_meta = load_datasets(cfg=cfg, cli_args=TrainerCliArgs())

        train(cfg=cfg, dataset_meta=dataset_meta)

        check_model_output_exists(temp_dir, cfg)
        _check_llmcompressor_model_outputs(temp_dir, save_compressed)
|
||||
|
||||
|
||||
def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
|
||||
if save_compressed:
|
||||
assert (Path(temp_dir) / "recipe.yaml").exists()
|
||||
|
||||
from compressed_tensors import ModelCompressor
|
||||
from compressed_tensors.config import Sparse24BitMaskConfig
|
||||
|
||||
compressor = ModelCompressor.from_pretrained(temp_dir)
|
||||
assert compressor is not None
|
||||
assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)
|
||||
@@ -109,6 +109,24 @@ def require_vllm(test_case):
|
||||
)(test_case)
|
||||
|
||||
|
||||
def require_llmcompressor(test_case):
    """
    Decorator that skips a test unless the llmcompressor package can be imported
    """
    try:
        import llmcompressor  # pylint: disable=unused-import # noqa: F401
    except ImportError:
        available = False
    else:
        available = True

    return unittest.skipUnless(
        available, "test requires a llmcompressor to be installed"
    )(test_case)
|
||||
|
||||
|
||||
def is_hopper():
|
||||
compute_capability = torch.cuda.get_device_capability()
|
||||
return compute_capability == (9, 0)
|
||||
|
||||
Reference in New Issue
Block a user