Compare commits
10 Commits
3a8b637598
...
llmcompres
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
6affbb1f85 | ||
|
|
0ed4b4c310 | ||
|
|
f4a0f496a0 | ||
|
|
82b16bd040 | ||
|
|
fd5c985038 | ||
|
|
5246aebc04 | ||
|
|
f4bcc71c86 | ||
|
|
3a9e172272 | ||
|
|
372f0e137b | ||
|
|
17dffec71d |
12
.github/workflows/tests.yml
vendored
12
.github/workflows/tests.yml
vendored
@@ -261,6 +261,18 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.6.0
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras: llmcompressor
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.4.1
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras:
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
|
|||||||
@@ -49,7 +49,8 @@ sections = [
|
|||||||
("Knowledge Distillation (KD)", "kd"),
|
("Knowledge Distillation (KD)", "kd"),
|
||||||
("Liger Kernels", "liger"),
|
("Liger Kernels", "liger"),
|
||||||
("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
|
("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
|
||||||
("Spectrum", "spectrum")
|
("Spectrum", "spectrum"),
|
||||||
|
("LLMCompressor", "llm_compressor")
|
||||||
]
|
]
|
||||||
|
|
||||||
for section_name, folder_name in sections:
|
for section_name, folder_name in sections:
|
||||||
|
|||||||
@@ -45,6 +45,7 @@ llmcompressor:
|
|||||||
're:.*down_proj.weight',
|
're:.*down_proj.weight',
|
||||||
]
|
]
|
||||||
start: 0
|
start: 0
|
||||||
|
save_compressed: true
|
||||||
# ... (other training arguments)
|
# ... (other training arguments)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -52,19 +53,56 @@ This plugin **does not apply pruning or sparsification itself** — it is intend
|
|||||||
|
|
||||||
Pre-sparsified checkpoints can be:
|
Pre-sparsified checkpoints can be:
|
||||||
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
|
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
|
||||||
- Or downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
|
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
|
||||||
|
- Any custom LLM with compatible sparsity patterns that you've created yourself
|
||||||
|
|
||||||
To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
|
To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
|
||||||
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
|
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
|
||||||
|
|
||||||
|
### Storage Optimization with save_compressed
|
||||||
|
|
||||||
|
Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
|
||||||
|
- Reduces disk space usage by approximately 40%
|
||||||
|
- Maintains compatibility with vLLM for accelerated inference
|
||||||
|
- Maintains compatibility with llmcompressor for further optimization (example: quantization)
|
||||||
|
|
||||||
|
This option is highly recommended when working with sparse models to maximize the benefits of model compression.
|
||||||
|
|
||||||
### Example Config
|
### Example Config
|
||||||
|
|
||||||
See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
|
See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Inference with vLLM
|
||||||
|
|
||||||
|
After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
|
||||||
|
You can also use LLMCompressor to apply additional quantization to your fine-tuned
|
||||||
|
sparse model before inference for even greater performance benefits:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
llm = LLM("path/to/your/sparse/model")
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).
|
||||||
|
|
||||||
## Learn More
|
## Learn More
|
||||||
|
|
||||||
For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
|
For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
|
||||||
|
|
||||||
👉 [https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
|
[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
|
||||||
|
|||||||
@@ -288,7 +288,19 @@ def save_trained_model(
|
|||||||
os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
|
os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
|
||||||
except FileNotFoundError:
|
except FileNotFoundError:
|
||||||
pass
|
pass
|
||||||
elif hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
|
elif cfg.local_rank == 0:
|
||||||
|
if cfg.flash_optimum and BetterTransformer:
|
||||||
|
model = BetterTransformer.reverse(model)
|
||||||
|
|
||||||
|
if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
|
||||||
|
trainer.model.save_pretrained(
|
||||||
|
cfg.output_dir, safe_serialization=safe_serialization
|
||||||
|
)
|
||||||
|
|
||||||
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
|
|
||||||
|
if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
|
||||||
|
# TODO: add integration support so this can be implemented completely within the plugin
|
||||||
from axolotl.integrations.llm_compressor.utils import (
|
from axolotl.integrations.llm_compressor.utils import (
|
||||||
save_compressed_model,
|
save_compressed_model,
|
||||||
)
|
)
|
||||||
@@ -301,17 +313,6 @@ def save_trained_model(
|
|||||||
save_compressed=cfg.llmcompressor.save_compressed,
|
save_compressed=cfg.llmcompressor.save_compressed,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif cfg.local_rank == 0:
|
|
||||||
if cfg.flash_optimum and BetterTransformer:
|
|
||||||
model = BetterTransformer.reverse(model)
|
|
||||||
|
|
||||||
if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
|
|
||||||
trainer.model.save_pretrained(
|
|
||||||
cfg.output_dir, safe_serialization=safe_serialization
|
|
||||||
)
|
|
||||||
|
|
||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
|
||||||
|
|
||||||
|
|
||||||
def create_model_card(cfg: DictDefault, trainer: Trainer):
|
def create_model_card(cfg: DictDefault, trainer: Trainer):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -9,10 +9,14 @@ import pytest
|
|||||||
from axolotl.cli.args import TrainerCliArgs
|
from axolotl.cli.args import TrainerCliArgs
|
||||||
from axolotl.common.datasets import load_datasets
|
from axolotl.common.datasets import load_datasets
|
||||||
from axolotl.train import train
|
from axolotl.train import train
|
||||||
from axolotl.utils.config import normalize_config, prepare_plugins
|
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1
|
from tests.e2e.utils import (
|
||||||
|
check_model_output_exists,
|
||||||
|
require_llmcompressor,
|
||||||
|
require_torch_2_4_1,
|
||||||
|
)
|
||||||
|
|
||||||
MODELS = [
|
MODELS = [
|
||||||
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
|
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
|
||||||
@@ -31,10 +35,13 @@ class TestLLMCompressorIntegration:
|
|||||||
e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@require_llmcompressor
|
||||||
@require_torch_2_4_1
|
@require_torch_2_4_1
|
||||||
def test_llmcompressor_plugin(
|
def test_llmcompressor_plugin(
|
||||||
self, temp_dir, base_model: str, save_compressed: bool
|
self, temp_dir, base_model: str, save_compressed: bool
|
||||||
):
|
):
|
||||||
|
from llmcompressor import active_session
|
||||||
|
|
||||||
# core cfg
|
# core cfg
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
{
|
{
|
||||||
@@ -79,22 +86,23 @@ class TestLLMCompressorIntegration:
|
|||||||
)
|
)
|
||||||
|
|
||||||
prepare_plugins(cfg)
|
prepare_plugins(cfg)
|
||||||
|
cfg = validate_config(cfg)
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
cli_args = TrainerCliArgs()
|
cli_args = TrainerCliArgs()
|
||||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
try:
|
||||||
check_model_output_exists(temp_dir, cfg)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
_check_llmcompressor_model_outputs(temp_dir, save_compressed)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
_check_llmcompressor_model_outputs(temp_dir, save_compressed)
|
||||||
|
finally:
|
||||||
|
active_session().reset()
|
||||||
|
|
||||||
|
|
||||||
def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
|
def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
|
||||||
|
|
||||||
# recipe.yaml should exist
|
|
||||||
assert (Path(temp_dir) / "recipe.yaml").exists()
|
|
||||||
|
|
||||||
# sparsity config exists if save_compressed
|
|
||||||
if save_compressed:
|
if save_compressed:
|
||||||
|
assert (Path(temp_dir) / "recipe.yaml").exists()
|
||||||
|
|
||||||
from compressed_tensors import ModelCompressor
|
from compressed_tensors import ModelCompressor
|
||||||
from compressed_tensors.config import Sparse24BitMaskConfig
|
from compressed_tensors.config import Sparse24BitMaskConfig
|
||||||
|
|
||||||
|
|||||||
@@ -105,7 +105,25 @@ def require_vllm(test_case):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
return unittest.skipUnless(
|
return unittest.skipUnless(
|
||||||
is_vllm_installed(), "test requires a vllm to be installed"
|
is_vllm_installed(), "test requires vllm to be installed"
|
||||||
|
)(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
def require_llmcompressor(test_case):
|
||||||
|
"""
|
||||||
|
Decorator marking a test that requires llmcompressor to be installed
|
||||||
|
"""
|
||||||
|
|
||||||
|
def is_llmcompressor_installed():
|
||||||
|
try:
|
||||||
|
import llmcompressor # pylint: disable=unused-import # noqa: F401
|
||||||
|
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return unittest.skipUnless(
|
||||||
|
is_llmcompressor_installed(), "test requires llmcompressor to be installed"
|
||||||
)(test_case)
|
)(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user