Compare commits
49 Commits
v0.9.2
...
revert-mul
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
e910e3e164 | ||
|
|
0f3587174d | ||
|
|
25e6c5f9bd | ||
|
|
32f51bca35 | ||
|
|
9daa04da90 | ||
|
|
0d71b0aa5f | ||
|
|
63aaccf85b | ||
|
|
ff0fe767c8 | ||
|
|
8e4158cc0b | ||
|
|
cd84325253 | ||
|
|
0b140fef83 | ||
|
|
e4cfebe995 | ||
|
|
a6cac5dd32 | ||
|
|
b71c0e3447 | ||
|
|
ddaebf8309 | ||
|
|
679743087a | ||
|
|
f720b6e72d | ||
|
|
a980618fd0 | ||
|
|
54960d4de0 | ||
|
|
ed922796b7 | ||
|
|
3dd9c3bf3f | ||
|
|
0ba7d362fa | ||
|
|
e4f73bc98e | ||
|
|
bcb59c70e2 | ||
|
|
6a3e6f8c53 | ||
|
|
fee3c13bb5 | ||
|
|
996fc124e5 | ||
|
|
e963990ad7 | ||
|
|
c3f2b1c5c2 | ||
|
|
6ba5c0ed2c | ||
|
|
24ff5f53f8 | ||
|
|
5e949eaa07 | ||
|
|
89ca14d9a0 | ||
|
|
8446b4ad28 | ||
|
|
fc79606b6d | ||
|
|
baeb00231b | ||
|
|
2413688b08 | ||
|
|
5bb1f3da56 | ||
|
|
a21b9cc472 | ||
|
|
41a1ec0c95 | ||
|
|
ecac731922 | ||
|
|
742fef4200 | ||
|
|
a39caf8824 | ||
|
|
07e4f2e25b | ||
|
|
c7d07de6b4 | ||
|
|
6565ae85d8 | ||
|
|
80b4edb4a7 | ||
|
|
fedbcc0254 | ||
|
|
8175896ada |
2
.github/workflows/multi-gpu-e2e.yml
vendored
2
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -3,7 +3,7 @@ name: docker-multigpu-tests-biweekly
|
|||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
paths:
|
paths:
|
||||||
- 'tests/e2e/multigpu/**.py'
|
- 'tests/e2e/multigpu/*.py'
|
||||||
- 'requirements.txt'
|
- 'requirements.txt'
|
||||||
- 'setup.py'
|
- 'setup.py'
|
||||||
- 'pyproject.toml'
|
- 'pyproject.toml'
|
||||||
|
|||||||
276
.github/workflows/tests.yml
vendored
276
.github/workflows/tests.yml
vendored
@@ -44,102 +44,96 @@ jobs:
|
|||||||
env:
|
env:
|
||||||
SKIP: no-commit-to-branch
|
SKIP: no-commit-to-branch
|
||||||
|
|
||||||
# preload-cache:
|
preload-cache:
|
||||||
# name: Preload HF cache
|
name: Preload HF cache
|
||||||
# runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# strategy:
|
strategy:
|
||||||
# fail-fast: false
|
fail-fast: false
|
||||||
# matrix:
|
matrix:
|
||||||
# python_version: ["3.11"]
|
python_version: ["3.11"]
|
||||||
# pytorch_version: ["2.6.0"]
|
pytorch_version: ["2.6.0"]
|
||||||
# timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
#
|
|
||||||
# env:
|
env:
|
||||||
# AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
|
AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
|
||||||
#
|
|
||||||
# steps:
|
steps:
|
||||||
# - name: Check out repository code
|
- name: Check out repository code
|
||||||
# uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
#
|
|
||||||
# - name: Restore HF cache
|
- name: Restore HF cache
|
||||||
# id: hf-cache-restore
|
id: hf-cache-restore
|
||||||
# uses: actions/cache/restore@v4
|
uses: actions/cache/restore@v4
|
||||||
# with:
|
with:
|
||||||
# path: |
|
path: |
|
||||||
# /home/runner/.cache/huggingface/hub/datasets--*
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
# /home/runner/.cache/huggingface/hub/models--*
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
# key: ${{ runner.os }}-hf-hub-cache-v2
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
#
|
|
||||||
# - name: Restore Cache from S3
|
- name: Setup Python
|
||||||
# id: hf-cache-restore-s3
|
uses: actions/setup-python@v5
|
||||||
# run: |
|
with:
|
||||||
# mkdir -p /home/runner/.cache/huggingface/hub
|
python-version: ${{ matrix.python_version }}
|
||||||
# curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
cache: 'pip' # caching pip dependencies
|
||||||
#
|
|
||||||
# - name: Setup Python
|
- name: upgrade pip
|
||||||
# uses: actions/setup-python@v5
|
run: |
|
||||||
# with:
|
pip3 install --upgrade pip
|
||||||
# python-version: ${{ matrix.python_version }}
|
pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
|
||||||
# cache: 'pip' # caching pip dependencies
|
|
||||||
#
|
- name: Install PyTorch
|
||||||
# - name: upgrade pip
|
run: |
|
||||||
# run: |
|
pip3 install torch==${{ matrix.pytorch_version }}
|
||||||
# pip3 install --upgrade pip
|
|
||||||
# pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
|
- name: Install dependencies
|
||||||
#
|
run: |
|
||||||
# - name: Install PyTorch
|
pip3 show torch
|
||||||
# run: |
|
pip3 install --no-build-isolation -U -e .
|
||||||
# pip3 install torch==${{ matrix.pytorch_version }}
|
python scripts/unsloth_install.py | sh
|
||||||
#
|
python scripts/cutcrossentropy_install.py | sh
|
||||||
# - name: Install dependencies
|
pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
||||||
# run: |
|
|
||||||
# pip3 show torch
|
- name: Make sure PyTorch version wasn't clobbered
|
||||||
# pip3 install --no-build-isolation -U -e .
|
run: |
|
||||||
# python scripts/unsloth_install.py | sh
|
python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
||||||
# python scripts/cutcrossentropy_install.py | sh
|
|
||||||
# pip3 install -r requirements-dev.txt -r requirements-tests.txt
|
- name: Ensure axolotl CLI was installed
|
||||||
#
|
run: |
|
||||||
# - name: Make sure PyTorch version wasn't clobbered
|
axolotl --help
|
||||||
# run: |
|
|
||||||
# python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
|
- name: Pre-Download dataset fixture
|
||||||
#
|
run: |
|
||||||
# - name: Ensure axolotl CLI was installed
|
huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
||||||
# run: |
|
|
||||||
# axolotl --help
|
- name: Run tests
|
||||||
#
|
run: |
|
||||||
# - name: Pre-Download dataset fixture
|
pytest -v tests/conftest.py
|
||||||
# run: |
|
|
||||||
# huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
|
- name: Upload coverage to Codecov
|
||||||
#
|
uses: codecov/codecov-action@v5
|
||||||
# - name: Run tests
|
with:
|
||||||
# run: |
|
token: ${{ secrets.CODECOV_TOKEN }}
|
||||||
# pytest -v tests/conftest.py
|
files: ./coverage.xml
|
||||||
#
|
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||||
# - name: Upload coverage to Codecov
|
fail_ci_if_error: false
|
||||||
# uses: codecov/codecov-action@v5
|
|
||||||
# with:
|
- name: cleanup pip cache
|
||||||
# token: ${{ secrets.CODECOV_TOKEN }}
|
run: |
|
||||||
# files: ./coverage.xml
|
find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
||||||
# flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
|
||||||
# fail_ci_if_error: false
|
- name: Save HF cache
|
||||||
#
|
id: hf-cache
|
||||||
# - name: cleanup pip cache
|
uses: actions/cache/save@v4
|
||||||
# run: |
|
with:
|
||||||
# find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
|
path: |
|
||||||
#
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
# - name: Save HF cache
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
# id: hf-cache
|
key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
|
||||||
# uses: actions/cache/save@v4
|
|
||||||
# with:
|
|
||||||
# path: |
|
|
||||||
# /home/runner/.cache/huggingface/hub/datasets--*
|
|
||||||
# /home/runner/.cache/huggingface/hub/models--*
|
|
||||||
# key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
|
|
||||||
|
|
||||||
pytest:
|
pytest:
|
||||||
name: PyTest
|
name: PyTest
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# needs: [preload-cache]
|
needs: [preload-cache]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -151,20 +145,14 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
# - name: Restore HF cache
|
- name: Restore HF cache
|
||||||
# id: hf-cache-restore
|
id: hf-cache-restore
|
||||||
# uses: actions/cache/restore@v4
|
uses: actions/cache/restore@v4
|
||||||
# with:
|
with:
|
||||||
# path: |
|
path: |
|
||||||
# /home/runner/.cache/huggingface/hub/datasets--*
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
# /home/runner/.cache/huggingface/hub/models--*
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
# key: ${{ runner.os }}-hf-hub-cache-v2
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Restore Cache from S3
|
|
||||||
id: hf-cache-restore-s3
|
|
||||||
run: |
|
|
||||||
mkdir -p /home/runner/.cache/huggingface/hub
|
|
||||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -222,7 +210,7 @@ jobs:
|
|||||||
pytest-sdist:
|
pytest-sdist:
|
||||||
name: PyTest from Source Dist
|
name: PyTest from Source Dist
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
# needs: [preload-cache]
|
needs: [preload-cache]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
@@ -234,20 +222,14 @@ jobs:
|
|||||||
- name: Check out repository code
|
- name: Check out repository code
|
||||||
uses: actions/checkout@v4
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
# - name: Restore HF cache
|
- name: Restore HF cache
|
||||||
# id: hf-cache-restore
|
id: hf-cache-restore
|
||||||
# uses: actions/cache/restore@v4
|
uses: actions/cache/restore@v4
|
||||||
# with:
|
with:
|
||||||
# path: |
|
path: |
|
||||||
# /home/runner/.cache/huggingface/hub/datasets--*
|
/home/runner/.cache/huggingface/hub/datasets--*
|
||||||
# /home/runner/.cache/huggingface/hub/models--*
|
/home/runner/.cache/huggingface/hub/models--*
|
||||||
# key: ${{ runner.os }}-hf-hub-cache-v2
|
key: ${{ runner.os }}-hf-hub-cache-v2
|
||||||
|
|
||||||
- name: Restore Cache from S3
|
|
||||||
id: hf-cache-restore-s3
|
|
||||||
run: |
|
|
||||||
mkdir -p /home/runner/.cache/huggingface/hub
|
|
||||||
curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd
|
|
||||||
|
|
||||||
- name: Setup Python
|
- name: Setup Python
|
||||||
uses: actions/setup-python@v5
|
uses: actions/setup-python@v5
|
||||||
@@ -347,6 +329,18 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
include:
|
include:
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.6.0
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras: llmcompressor
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.4.1
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras:
|
||||||
- cuda: 124
|
- cuda: 124
|
||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
@@ -383,43 +377,3 @@ jobs:
|
|||||||
- name: Run tests job on Modal
|
- name: Run tests job on Modal
|
||||||
run: |
|
run: |
|
||||||
modal run cicd.e2e_tests
|
modal run cicd.e2e_tests
|
||||||
|
|
||||||
docker-e2e-cleanup:
|
|
||||||
runs-on: [self-hosted, modal]
|
|
||||||
timeout-minutes: 90
|
|
||||||
needs: [docker-e2e-tests]
|
|
||||||
|
|
||||||
strategy:
|
|
||||||
fail-fast: false
|
|
||||||
matrix:
|
|
||||||
include:
|
|
||||||
- cuda: 124
|
|
||||||
cuda_version: 12.4.1
|
|
||||||
python_version: "3.11"
|
|
||||||
pytorch: 2.6.0
|
|
||||||
num_gpus: 1
|
|
||||||
axolotl_extras: vllm
|
|
||||||
steps:
|
|
||||||
- name: Checkout
|
|
||||||
uses: actions/checkout@v4
|
|
||||||
- name: Install Python
|
|
||||||
uses: actions/setup-python@v5
|
|
||||||
with:
|
|
||||||
python-version: "3.11"
|
|
||||||
- name: Install Modal
|
|
||||||
run: |
|
|
||||||
python -m pip install --upgrade pip
|
|
||||||
pip install modal==0.71.8 jinja2
|
|
||||||
- name: Update env vars
|
|
||||||
run: |
|
|
||||||
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
|
||||||
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
|
||||||
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
|
||||||
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
|
||||||
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
|
||||||
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
|
||||||
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
|
||||||
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
|
||||||
- name: Run tests job on Modal
|
|
||||||
run: |
|
|
||||||
modal run cicd.cleanup
|
|
||||||
|
|||||||
@@ -57,10 +57,8 @@ async def handler(job):
|
|||||||
logger.info("Training Complete.")
|
logger.info("Training Complete.")
|
||||||
|
|
||||||
# Cleanup
|
# Cleanup
|
||||||
if "WANDB_API_KEY" in os.environ:
|
del os.environ["WANDB_API_KEY"]
|
||||||
del os.environ["WANDB_API_KEY"]
|
del os.environ["HF_TOKEN"]
|
||||||
if "HF_TOKEN" in os.environ:
|
|
||||||
del os.environ["HF_TOKEN"]
|
|
||||||
|
|
||||||
|
|
||||||
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
|
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
|
||||||
|
|||||||
@@ -124,8 +124,7 @@ quartodoc:
|
|||||||
- utils.optimizers.adopt
|
- utils.optimizers.adopt
|
||||||
- utils.data.pretraining
|
- utils.data.pretraining
|
||||||
- utils.data.sft
|
- utils.data.sft
|
||||||
- utils.gradient_checkpointing.offload_cpu
|
- utils.gradient_checkpointing.unsloth
|
||||||
- utils.gradient_checkpointing.offload_disk
|
|
||||||
- title: Schemas
|
- title: Schemas
|
||||||
desc: Pydantic data models for Axolotl config
|
desc: Pydantic data models for Axolotl config
|
||||||
contents:
|
contents:
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
|
|||||||
--cov-append
|
--cov-append
|
||||||
|
|
||||||
# Run patched tests excluding lora kernels with coverage append
|
# Run patched tests excluding lora kernels with coverage append
|
||||||
pytest --full-trace -vvv --durations=10 \
|
pytest -v --durations=10 \
|
||||||
--ignore=tests/e2e/patched/lora_kernels \
|
--ignore=tests/e2e/patched/lora_kernels \
|
||||||
/workspace/axolotl/tests/e2e/patched \
|
/workspace/axolotl/tests/e2e/patched \
|
||||||
--cov=axolotl \
|
--cov=axolotl \
|
||||||
|
|||||||
@@ -1,19 +0,0 @@
|
|||||||
"""Modal app to run axolotl GPU cleanup"""
|
|
||||||
|
|
||||||
from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
|
|
||||||
|
|
||||||
|
|
||||||
@app.function(
|
|
||||||
image=cicd_image,
|
|
||||||
timeout=60 * 60,
|
|
||||||
cpu=8.0,
|
|
||||||
memory=131072,
|
|
||||||
volumes=VOLUME_CONFIG,
|
|
||||||
)
|
|
||||||
def cleanup():
|
|
||||||
run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
|
|
||||||
|
|
||||||
|
|
||||||
@app.local_entrypoint()
|
|
||||||
def main():
|
|
||||||
cleanup.remote()
|
|
||||||
@@ -1,6 +0,0 @@
|
|||||||
#!/bin/bash
|
|
||||||
set -e
|
|
||||||
|
|
||||||
# cleanup old cache files for datasets processing and intermediate mappings
|
|
||||||
find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
|
|
||||||
find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
|
|
||||||
@@ -1,12 +1,75 @@
|
|||||||
"""Modal app to run axolotl GPU tests"""
|
"""Modal app to run axolotl GPU tests"""
|
||||||
|
|
||||||
from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import jinja2
|
||||||
|
import modal
|
||||||
|
from jinja2 import select_autoescape
|
||||||
|
from modal import App, Image
|
||||||
|
|
||||||
|
cicd_path = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||||
|
template_env = jinja2.Environment(
|
||||||
|
loader=template_loader, autoescape=select_autoescape()
|
||||||
|
)
|
||||||
|
df_template = template_env.get_template("Dockerfile.jinja")
|
||||||
|
|
||||||
|
df_args = {
|
||||||
|
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
||||||
|
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
||||||
|
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
|
||||||
|
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
|
||||||
|
"CUDA": os.environ.get("CUDA", "121"),
|
||||||
|
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
||||||
|
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
||||||
|
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
||||||
|
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
||||||
|
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
||||||
|
}
|
||||||
|
|
||||||
|
dockerfile_contents = df_template.render(**df_args)
|
||||||
|
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
||||||
|
f.write(dockerfile_contents)
|
||||||
|
|
||||||
|
cicd_image = Image.from_dockerfile(
|
||||||
|
pathlib.Path(temp_dir) / "Dockerfile",
|
||||||
|
context_mount=None,
|
||||||
|
force_build=True,
|
||||||
|
gpu="A10G",
|
||||||
|
).env(df_args)
|
||||||
|
|
||||||
|
app = App("Axolotl CI/CD", secrets=[])
|
||||||
|
|
||||||
|
hf_cache_volume = modal.Volume.from_name(
|
||||||
|
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
||||||
|
)
|
||||||
|
VOLUME_CONFIG = {
|
||||||
|
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
||||||
|
}
|
||||||
|
|
||||||
|
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
||||||
|
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
|
||||||
|
|
||||||
|
|
||||||
|
def run_cmd(cmd: str, run_folder: str):
|
||||||
|
import subprocess # nosec
|
||||||
|
|
||||||
|
# Propagate errors from subprocess.
|
||||||
|
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
||||||
|
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
||||||
|
|
||||||
|
|
||||||
@app.function(
|
@app.function(
|
||||||
image=cicd_image,
|
image=cicd_image,
|
||||||
gpu=GPU_CONFIG,
|
gpu=GPU_CONFIG,
|
||||||
timeout=90 * 60, # 90 min
|
timeout=60 * 60,
|
||||||
cpu=8.0,
|
cpu=8.0,
|
||||||
memory=131072,
|
memory=131072,
|
||||||
volumes=VOLUME_CONFIG,
|
volumes=VOLUME_CONFIG,
|
||||||
|
|||||||
@@ -1,66 +0,0 @@
|
|||||||
"""Modal app to run axolotl GPU tests"""
|
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
import jinja2
|
|
||||||
import modal
|
|
||||||
from jinja2 import select_autoescape
|
|
||||||
from modal import App, Image
|
|
||||||
|
|
||||||
cicd_path = pathlib.Path(__file__).parent.resolve()
|
|
||||||
|
|
||||||
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
|
||||||
template_env = jinja2.Environment(
|
|
||||||
loader=template_loader, autoescape=select_autoescape()
|
|
||||||
)
|
|
||||||
df_template = template_env.get_template("Dockerfile.jinja")
|
|
||||||
|
|
||||||
df_args = {
|
|
||||||
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
|
||||||
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
|
||||||
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
|
|
||||||
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
|
|
||||||
"CUDA": os.environ.get("CUDA", "121"),
|
|
||||||
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
|
||||||
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
|
||||||
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
|
||||||
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
|
||||||
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
|
||||||
}
|
|
||||||
|
|
||||||
dockerfile_contents = df_template.render(**df_args)
|
|
||||||
|
|
||||||
temp_dir = tempfile.mkdtemp()
|
|
||||||
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
|
||||||
f.write(dockerfile_contents)
|
|
||||||
|
|
||||||
cicd_image = Image.from_dockerfile(
|
|
||||||
pathlib.Path(temp_dir) / "Dockerfile",
|
|
||||||
context_mount=None,
|
|
||||||
force_build=True,
|
|
||||||
gpu="A10G",
|
|
||||||
).env(df_args)
|
|
||||||
|
|
||||||
app = App("Axolotl CI/CD", secrets=[])
|
|
||||||
|
|
||||||
hf_cache_volume = modal.Volume.from_name(
|
|
||||||
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
|
||||||
)
|
|
||||||
VOLUME_CONFIG = {
|
|
||||||
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
|
||||||
}
|
|
||||||
|
|
||||||
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
|
||||||
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
|
|
||||||
|
|
||||||
|
|
||||||
def run_cmd(cmd: str, run_folder: str):
|
|
||||||
import subprocess # nosec
|
|
||||||
|
|
||||||
# Propagate errors from subprocess.
|
|
||||||
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
|
||||||
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
|
||||||
@@ -19,7 +19,7 @@ coverage:
|
|||||||
if_no_uploads: error
|
if_no_uploads: error
|
||||||
if_not_found: success
|
if_not_found: success
|
||||||
if_ci_failed: error
|
if_ci_failed: error
|
||||||
only_pulls: true
|
only_pulls: false
|
||||||
flags: null
|
flags: null
|
||||||
paths: null
|
paths: null
|
||||||
patch:
|
patch:
|
||||||
|
|||||||
@@ -505,7 +505,6 @@ save_strategy: # Set to `"no"` to skip checkpoint saves, `"epoch"` at end of eac
|
|||||||
save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
|
save_steps: # Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps
|
||||||
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
|
saves_per_epoch: # number of times per epoch to save a checkpoint, mutually exclusive with save_steps
|
||||||
save_total_limit: # Checkpoints saved at a time
|
save_total_limit: # Checkpoints saved at a time
|
||||||
save_only_model: # Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints.
|
|
||||||
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
# Maximum number of iterations to train for. It precedes num_epochs which means that
|
||||||
# if both are set, num_epochs will not be guaranteed.
|
# if both are set, num_epochs will not be guaranteed.
|
||||||
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
|
# e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
|
||||||
@@ -539,7 +538,7 @@ train_on_inputs: false
|
|||||||
# Note that training loss may have an oscillating pattern with this enabled.
|
# Note that training loss may have an oscillating pattern with this enabled.
|
||||||
group_by_length: false
|
group_by_length: false
|
||||||
|
|
||||||
# Whether to use gradient checkpointing. Available options are: true, false, "offload", "offload_disk".
|
# Whether to use gradient checkpointing. Available options are: true, false, "offload".
|
||||||
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
# https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
|
||||||
gradient_checkpointing: false
|
gradient_checkpointing: false
|
||||||
# additional kwargs to pass to the trainer for gradient checkpointing
|
# additional kwargs to pass to the trainer for gradient checkpointing
|
||||||
|
|||||||
@@ -49,7 +49,8 @@ sections = [
|
|||||||
("Knowledge Distillation (KD)", "kd"),
|
("Knowledge Distillation (KD)", "kd"),
|
||||||
("Liger Kernels", "liger"),
|
("Liger Kernels", "liger"),
|
||||||
("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
|
("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
|
||||||
("Spectrum", "spectrum")
|
("Spectrum", "spectrum"),
|
||||||
|
("LLMCompressor", "llm_compressor")
|
||||||
]
|
]
|
||||||
|
|
||||||
for section_name, folder_name in sections:
|
for section_name, folder_name in sections:
|
||||||
|
|||||||
77
examples/llama-3/sparse-finetuning.yaml
Normal file
77
examples/llama-3/sparse-finetuning.yaml
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
||||||
|
|
||||||
|
load_in_8bit: false
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: tatsu-lab/alpaca
|
||||||
|
type: alpaca
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 8
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: paged_adamw_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 2e-5
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: auto
|
||||||
|
fp16:
|
||||||
|
tf32: false
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
early_stopping_patience:
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
xformers_attention:
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 100
|
||||||
|
evals_per_epoch: 2
|
||||||
|
eval_table_size:
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|end_of_text|>
|
||||||
|
|
||||||
|
llmcompressor:
|
||||||
|
recipe:
|
||||||
|
finetuning_stage:
|
||||||
|
finetuning_modifiers:
|
||||||
|
ConstantPruningModifier:
|
||||||
|
targets: [
|
||||||
|
're:.*q_proj.weight',
|
||||||
|
're:.*k_proj.weight',
|
||||||
|
're:.*v_proj.weight',
|
||||||
|
're:.*o_proj.weight',
|
||||||
|
're:.*gate_proj.weight',
|
||||||
|
're:.*up_proj.weight',
|
||||||
|
're:.*down_proj.weight',
|
||||||
|
]
|
||||||
|
start: 0
|
||||||
|
save_compressed: true
|
||||||
3
setup.py
3
setup.py
@@ -150,6 +150,9 @@ extras_require = {
|
|||||||
"vllm": [
|
"vllm": [
|
||||||
"vllm==0.7.2",
|
"vllm==0.7.2",
|
||||||
],
|
],
|
||||||
|
"llmcompressor": [
|
||||||
|
"llmcompressor==0.5.1",
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
install_requires, dependency_links, extras_require_build = parse_requirements(
|
install_requires, dependency_links, extras_require_build = parse_requirements(
|
||||||
|
|||||||
@@ -4,4 +4,4 @@ import pkgutil
|
|||||||
|
|
||||||
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
|
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
|
||||||
|
|
||||||
__version__ = "0.9.2"
|
__version__ = "0.10.0.dev0"
|
||||||
|
|||||||
@@ -82,12 +82,6 @@ class VllmServeCliArgs:
|
|||||||
"hardware support this feature."
|
"hardware support this feature."
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
serve_module: Optional[str] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "Module to serve. If not set, the default module will be used."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from pathlib import Path
|
|||||||
from typing import Union
|
from typing import Union
|
||||||
|
|
||||||
from trl.scripts.vllm_serve import ScriptArguments
|
from trl.scripts.vllm_serve import ScriptArguments
|
||||||
|
from trl.scripts.vllm_serve import main as vllm_serve_main
|
||||||
|
|
||||||
from axolotl.cli.config import load_cfg
|
from axolotl.cli.config import load_cfg
|
||||||
|
|
||||||
@@ -27,9 +28,6 @@ def do_vllm_serve(
|
|||||||
cfg = load_cfg(config)
|
cfg = load_cfg(config)
|
||||||
model = cfg.base_model
|
model = cfg.base_model
|
||||||
|
|
||||||
serve_module = cli_args.get("serve_module", "trl.scripts.vllm_serve")
|
|
||||||
vllm_serve_main = getattr(__import__(serve_module, fromlist=["main"]), "main")
|
|
||||||
|
|
||||||
tensor_parallel_size = (
|
tensor_parallel_size = (
|
||||||
cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
|
cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1057,8 +1057,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
# default to saving each epoch if not defined
|
# default to saving each epoch if not defined
|
||||||
training_args_kwargs["save_strategy"] = "epoch"
|
training_args_kwargs["save_strategy"] = "epoch"
|
||||||
|
|
||||||
training_args_kwargs["save_only_model"] = self.cfg.save_only_model
|
|
||||||
|
|
||||||
if self.cfg.dataset_processes:
|
if self.cfg.dataset_processes:
|
||||||
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
||||||
|
|
||||||
@@ -1188,10 +1186,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
|
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
|
||||||
|
|
||||||
if self.cfg.plugins:
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
|
||||||
trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
|
|
||||||
|
|
||||||
sig = inspect.signature(trainer_cls)
|
sig = inspect.signature(trainer_cls)
|
||||||
if "tokenizer" in sig.parameters.keys():
|
if "tokenizer" in sig.parameters.keys():
|
||||||
dpo_trainer_kwargs["tokenizer"] = self.tokenizer
|
dpo_trainer_kwargs["tokenizer"] = self.tokenizer
|
||||||
|
|||||||
@@ -114,8 +114,6 @@ class AxolotlTrainer(
|
|||||||
packing_efficiency_estimate=self.args.sample_packing_efficiency,
|
packing_efficiency_estimate=self.args.sample_packing_efficiency,
|
||||||
batch_max_len=batch_max_len,
|
batch_max_len=batch_max_len,
|
||||||
batch_size=batch_size,
|
batch_size=batch_size,
|
||||||
group_size=self.args.sample_packing_group_size,
|
|
||||||
bin_size=self.args.sample_packing_bin_size,
|
|
||||||
sequential=self.args.sample_packing_sequentially,
|
sequential=self.args.sample_packing_sequentially,
|
||||||
drop_last=True,
|
drop_last=True,
|
||||||
)
|
)
|
||||||
|
|||||||
108
src/axolotl/integrations/llm_compressor/README.md
Normal file
108
src/axolotl/integrations/llm_compressor/README.md
Normal file
@@ -0,0 +1,108 @@
|
|||||||
|
# LLMCompressor Integration
|
||||||
|
|
||||||
|
Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).
|
||||||
|
|
||||||
|
This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.
|
||||||
|
|
||||||
|
It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Requirements
|
||||||
|
|
||||||
|
- Axolotl with `llmcompressor` extras:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install "axolotl[llmcompressor]"
|
||||||
|
```
|
||||||
|
|
||||||
|
- Requires `llmcompressor >= 0.5.1`
|
||||||
|
|
||||||
|
This will install all necessary dependencies to fine-tune sparsified models using the integration.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Usage
|
||||||
|
|
||||||
|
To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
||||||
|
|
||||||
|
llmcompressor:
|
||||||
|
recipe:
|
||||||
|
finetuning_stage:
|
||||||
|
finetuning_modifiers:
|
||||||
|
ConstantPruningModifier:
|
||||||
|
targets: [
|
||||||
|
're:.*q_proj.weight',
|
||||||
|
're:.*k_proj.weight',
|
||||||
|
're:.*v_proj.weight',
|
||||||
|
're:.*o_proj.weight',
|
||||||
|
're:.*gate_proj.weight',
|
||||||
|
're:.*up_proj.weight',
|
||||||
|
're:.*down_proj.weight',
|
||||||
|
]
|
||||||
|
start: 0
|
||||||
|
save_compressed: true
|
||||||
|
# ... (other training arguments)
|
||||||
|
```
|
||||||
|
|
||||||
|
This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.
|
||||||
|
|
||||||
|
Pre-sparsified checkpoints can be:
|
||||||
|
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
|
||||||
|
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
|
||||||
|
- Any custom LLM with compatible sparsity patterns that you've created yourself
|
||||||
|
|
||||||
|
To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
|
||||||
|
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
|
||||||
|
|
||||||
|
### Storage Optimization with save_compressed
|
||||||
|
|
||||||
|
Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
|
||||||
|
- Reduces disk space usage by approximately 40%
|
||||||
|
- Maintains compatibility with vLLM for accelerated inference
|
||||||
|
- Maintains compatibility with llmcompressor for further optimization (example: quantization)
|
||||||
|
|
||||||
|
This option is highly recommended when working with sparse models to maximize the benefits of model compression.
|
||||||
|
|
||||||
|
### Example Config
|
||||||
|
|
||||||
|
See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Inference with vLLM
|
||||||
|
|
||||||
|
After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
|
||||||
|
You can also use LLMCompressor to apply additional quantization to your fine-tuned
|
||||||
|
sparse model before inference for even greater performance benefits. To run inference with vLLM:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from vllm import LLM, SamplingParams
|
||||||
|
|
||||||
|
prompts = [
|
||||||
|
"Hello, my name is",
|
||||||
|
"The president of the United States is",
|
||||||
|
"The capital of France is",
|
||||||
|
"The future of AI is",
|
||||||
|
]
|
||||||
|
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
|
||||||
|
llm = LLM("path/to/your/sparse/model")
|
||||||
|
outputs = llm.generate(prompts, sampling_params)
|
||||||
|
|
||||||
|
for output in outputs:
|
||||||
|
prompt = output.prompt
|
||||||
|
generated_text = output.outputs[0].text
|
||||||
|
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||||
|
```
|
||||||
|
|
||||||
|
For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).
|
||||||
|
|
||||||
|
## Learn More
|
||||||
|
|
||||||
|
For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
|
||||||
|
|
||||||
|
[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
|
||||||
5
src/axolotl/integrations/llm_compressor/__init__.py
Normal file
5
src/axolotl/integrations/llm_compressor/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""Integration entry point for the LLMCompressor plugin."""
|
||||||
|
|
||||||
|
from .plugin import LLMCompressorPlugin
|
||||||
|
|
||||||
|
__all__ = ["LLMCompressorPlugin"]
|
||||||
40
src/axolotl/integrations/llm_compressor/args.py
Normal file
40
src/axolotl/integrations/llm_compressor/args.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
"""
|
||||||
|
LLMCompressor and Sparse Finetuning config models.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
from typing_extensions import Annotated
|
||||||
|
|
||||||
|
|
||||||
|
class CompressionArgs(BaseModel):
    """Sparse Finetuning config for LLMCompressor.

    Declares two fields: the compression ``recipe`` and the
    ``save_compressed`` flag.
    """

    # Typing for recipe is set to Any due to:
    # https://github.com/vllm-project/llm-compressor/issues/1319
    # No default is declared for this field.
    recipe: Annotated[
        Any,
        Field(
            description="The recipe containing the compression algorithms and hyperparameters to apply."
        ),
    ]

    # Defaults to False when the key is omitted.
    save_compressed: Annotated[
        bool,
        Field(
            default=False,
            description="Whether to save the compressed model after training.",
        ),
    ]
|
||||||
|
|
||||||
|
|
||||||
|
class LLMCompressorArgs(BaseModel):
    """LLMCompressor configuration BaseModel.

    Exposes a single ``llmcompressor`` field holding a ``CompressionArgs``
    instance.
    """

    # No default is declared for this field.
    llmcompressor: Annotated[
        CompressionArgs,
        Field(
            description="Arguments enabling compression pathways through the LLM Compressor plugins"
        ),
    ]
|
||||||
171
src/axolotl/integrations/llm_compressor/plugin.py
Normal file
171
src/axolotl/integrations/llm_compressor/plugin.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
"""
|
||||||
|
Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
|
||||||
|
by maintaining masks for zero weights during training.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from functools import wraps
|
||||||
|
from typing import Any, Callable, Concatenate, ParamSpec, TypeVar
|
||||||
|
|
||||||
|
from llmcompressor import active_session, create_session
|
||||||
|
from llmcompressor.core import callbacks as session_callbacks
|
||||||
|
from llmcompressor.recipe import Recipe
|
||||||
|
from torch.nn import Module
|
||||||
|
from transformers.trainer import Trainer
|
||||||
|
from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
|
||||||
|
from transformers.training_args import TrainingArguments
|
||||||
|
|
||||||
|
from axolotl.integrations.base import BasePlugin
|
||||||
|
|
||||||
|
P = ParamSpec("P") # Params for generic function signatures
|
||||||
|
R = TypeVar("R") # Return type for generic function signatures
|
||||||
|
|
||||||
|
LOG = logging.getLogger("axolotl.integrations.llm_compressor")
|
||||||
|
|
||||||
|
|
||||||
|
class LLMCompressorCallbackHandler(TrainerCallback):
    """
    Trainer callback for Sparse Finetuning.

    Forwards Hugging Face Trainer events to the active LLMCompressor session
    so that the recipe's modifiers run at the matching points of the
    training loop.
    """

    def __init__(self, trainer: Trainer, recipe: Any):
        """
        Initialize the Sparse Finetuning callback handler.

        Wraps ``trainer.compute_loss`` so the session is notified of each
        computed loss, and creates a new LLMCompressor session.

        Args:
            trainer (Trainer): Huggingface Trainer instance.
            recipe (Recipe | dict): Sparse finetuning recipe to apply. Anything
                that is not already a ``Recipe`` is passed through
                ``Recipe.model_validate``.
        """
        super().__init__()
        self.trainer = trainer
        self.recipe = (
            Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
        )
        # Keep the unwrapped method so on_train_end can restore it.
        self.original_compute_loss = trainer.compute_loss
        self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
        create_session()

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of training. Initializes the compression session.

        The initialization is bracketed by ``wait_for_everyone()`` calls on
        the trainer's accelerator.

        Args:
            args (TrainingArguments): Training arguments.
            state (TrainerState): Trainer state.
            control (TrainerControl): Trainer control.
        """
        super().on_train_begin(args, state, control, **kwargs)
        self.trainer.accelerator.wait_for_everyone()
        active_session().initialize(
            model=self.trainer.model,
            optimizer=self.trainer.optimizer,
            start=state.epoch,
            recipe=self.recipe,
        )
        self.trainer.accelerator.wait_for_everyone()

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of a training step. Triggers batch_start callback.
        """
        super().on_step_begin(args, state, control, **kwargs)
        session_callbacks.batch_start()

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of a training step. Triggers optimizer and batch_end callbacks.
        """
        super().on_step_end(args, state, control, **kwargs)
        session_callbacks.optim_pre_step()
        session_callbacks.optim_post_step()
        session_callbacks.batch_end()

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of training. Finalizes the compression session and
        restores the trainer's original ``compute_loss``.
        """
        super().on_train_end(args, state, control, **kwargs)
        active_session().finalize()
        # Restore the same attribute that __init__ replaced. Assigning to
        # ``compute_loss_func`` instead would leave the wrapper installed and
        # put a bound ``compute_loss`` into the Trainer's custom-loss hook,
        # which is called with a different signature.
        self.trainer.compute_loss = self.original_compute_loss
|
||||||
|
|
||||||
|
|
||||||
|
class LLMCompressorPlugin(BasePlugin):
    """
    Axolotl plugin that attaches the Sparse Finetuning callback to a trainer.
    """

    def get_input_args(self) -> str:
        """
        Report where the plugin's config model lives.

        Returns:
            str: Dotted import path of the LLMCompressorArgs class.
        """
        return "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"

    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Build the Sparse Finetuning callback for an already-constructed trainer.

        Args:
            cfg (Any): Configuration object; ``cfg.llmcompressor.recipe`` is read.
            trainer (Trainer): Huggingface Trainer instance.

        Returns:
            list: Single-element list holding the new callback handler.
        """
        LOG.info("Adding Sparse Finetuning callback to the trainer")
        sparse_recipe = cfg.llmcompressor.recipe
        handler = LLMCompressorCallbackHandler(trainer=trainer, recipe=sparse_recipe)
        return [handler]
|
||||||
|
|
||||||
|
|
||||||
|
def compute_loss_wrapper(
    compute_loss_func: Callable[Concatenate[Module, P], R],
) -> Callable[Concatenate[Module, P], R]:
    """
    Decorate a loss function so the LLMCompressor session hears about each loss.

    Args:
        compute_loss_func (Callable): Loss function to wrap; its first
            positional argument is the model.

    Returns:
        Callable: Function with the same signature and return value. When the
        active session is initialized and ``model.training`` is truthy, it
        also fires the ``loss_calculated`` session callback with the result.
    """

    @wraps(compute_loss_func)
    def _notify_after_loss(model: Module, *args: P.args, **kwargs: P.kwargs) -> R:
        result = compute_loss_func(model, *args, **kwargs)
        # Session state is checked first; model.training is only read when
        # the session is initialized.
        session_ready = active_session().lifecycle.initialized_
        if session_ready and model.training:
            session_callbacks.loss_calculated(loss=result)
        return result

    return _notify_after_loss
|
||||||
40
src/axolotl/integrations/llm_compressor/utils.py
Normal file
40
src/axolotl/integrations/llm_compressor/utils.py
Normal file
@@ -0,0 +1,40 @@
|
|||||||
|
"""Utilities for llmcompressor integration with axolotl."""
|
||||||
|
|
||||||
|
from typing import Union
|
||||||
|
|
||||||
|
from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
|
||||||
|
modify_save_pretrained,
|
||||||
|
)
|
||||||
|
from transformers import PreTrainedModel, Trainer
|
||||||
|
|
||||||
|
|
||||||
|
def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    safe_serialization: bool = False,
    save_compressed: bool = False,
) -> None:
    """
    Wait for all processes, then have the main process save the model.

    Args:
        model (PreTrainedModel): The model to be saved.
        output_dir (str or bytes): Destination passed to ``save_pretrained``.
        trainer (Trainer): Trainer whose accelerator is used for the barrier
            and for the main-process check.
        safe_serialization (bool): Forwarded to ``save_pretrained``.
        save_compressed (bool): Forwarded to ``save_pretrained``; its negation
            is forwarded as ``skip_sparsity_compression_stats``.
    """
    accelerator = trainer.accelerator
    accelerator.wait_for_everyone()

    if accelerator.is_main_process:
        # Patch save_pretrained on the model before calling it.
        modify_save_pretrained(model)
        save_options = {
            "safe_serialization": safe_serialization,
            "save_compressed": save_compressed,
            "skip_sparsity_compression_stats": not save_compressed,
        }
        model.save_pretrained(output_dir, **save_options)
|
||||||
@@ -294,8 +294,23 @@ def save_trained_model(
|
|||||||
trainer.model.save_pretrained(
|
trainer.model.save_pretrained(
|
||||||
cfg.output_dir, safe_serialization=safe_serialization
|
cfg.output_dir, safe_serialization=safe_serialization
|
||||||
)
|
)
|
||||||
|
|
||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
|
|
||||||
|
if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
|
||||||
|
# TODO: add integration support so this can be implemented completely within the plugin
|
||||||
|
from axolotl.integrations.llm_compressor.utils import (
|
||||||
|
save_compressed_model,
|
||||||
|
)
|
||||||
|
|
||||||
|
save_compressed_model(
|
||||||
|
model=model,
|
||||||
|
output_dir=cfg.output_dir,
|
||||||
|
trainer=trainer,
|
||||||
|
safe_serialization=safe_serialization,
|
||||||
|
save_compressed=cfg.llmcompressor.save_compressed,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def create_model_card(cfg: DictDefault, trainer: Trainer):
|
def create_model_card(cfg: DictDefault, trainer: Trainer):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -5,11 +5,8 @@ from functools import partial
|
|||||||
|
|
||||||
from packaging import version
|
from packaging import version
|
||||||
|
|
||||||
from axolotl.utils.gradient_checkpointing.offload_cpu import (
|
from axolotl.utils.gradient_checkpointing.unsloth import (
|
||||||
CPU_Offloaded_Gradient_Checkpointer,
|
Unsloth_Offloaded_Gradient_Checkpointer,
|
||||||
)
|
|
||||||
from axolotl.utils.gradient_checkpointing.offload_disk import (
|
|
||||||
Disco,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
transformers_version = version.parse(importlib.metadata.version("transformers"))
|
transformers_version = version.parse(importlib.metadata.version("transformers"))
|
||||||
@@ -29,31 +26,12 @@ def hf_grad_checkpoint_offload_wrapper(
|
|||||||
decoder_layer, *args, use_reentrant=None
|
decoder_layer, *args, use_reentrant=None
|
||||||
): # pylint: disable=unused-argument
|
): # pylint: disable=unused-argument
|
||||||
if uses_gc_layers(decoder_layer):
|
if uses_gc_layers(decoder_layer):
|
||||||
return CPU_Offloaded_Gradient_Checkpointer.apply(
|
return Unsloth_Offloaded_Gradient_Checkpointer.apply(
|
||||||
decoder_layer,
|
decoder_layer,
|
||||||
*args,
|
*args,
|
||||||
)
|
)
|
||||||
|
|
||||||
return CPU_Offloaded_Gradient_Checkpointer.apply(
|
return Unsloth_Offloaded_Gradient_Checkpointer.apply(
|
||||||
(
|
|
||||||
decoder_layer.func.__self__
|
|
||||||
if isinstance(decoder_layer, partial)
|
|
||||||
else decoder_layer.__self__
|
|
||||||
),
|
|
||||||
*args,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def hf_grad_checkpoint_disk_offload_wrapper(
|
|
||||||
decoder_layer, *args, use_reentrant=None
|
|
||||||
): # pylint: disable=unused-argument
|
|
||||||
if uses_gc_layers(decoder_layer):
|
|
||||||
return Disco.apply(
|
|
||||||
decoder_layer,
|
|
||||||
*args,
|
|
||||||
)
|
|
||||||
|
|
||||||
return Disco.apply(
|
|
||||||
(
|
(
|
||||||
decoder_layer.func.__self__
|
decoder_layer.func.__self__
|
||||||
if isinstance(decoder_layer, partial)
|
if isinstance(decoder_layer, partial)
|
||||||
|
|||||||
@@ -1,531 +0,0 @@
|
|||||||
"""
|
|
||||||
DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Copyright 2025 Axolotl AI. All rights reserved.
|
|
||||||
#
|
|
||||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
||||||
# you may not use this file except in compliance with the License.
|
|
||||||
# You may obtain a copy of the License at
|
|
||||||
#
|
|
||||||
# http://www.apache.org/licenses/LICENSE-2.0
|
|
||||||
#
|
|
||||||
# Unless required by applicable law or agreed to in writing, software
|
|
||||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
||||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
||||||
# See the License for the specific language governing permissions and
|
|
||||||
# limitations under the License.
|
|
||||||
|
|
||||||
import atexit
|
|
||||||
import concurrent.futures
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import queue
|
|
||||||
import shutil
|
|
||||||
import tempfile
|
|
||||||
import threading
|
|
||||||
import time
|
|
||||||
import uuid
|
|
||||||
from collections import deque
|
|
||||||
from concurrent.futures import Future
|
|
||||||
from typing import Dict
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
|
|
||||||
torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")
|
|
||||||
|
|
||||||
# Setup logger
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class DiskOffloadManager:
|
|
||||||
"""
|
|
||||||
Manages offloaded tensors and handles prefetching in a separate thread.
|
|
||||||
Includes synchronization to prevent race conditions.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
    self,
    prefetch_size: int = 3,
    prefetch_to_gpu: bool = True,
    save_workers: int = 4,
):
    """
    Create the temp directory, bookkeeping structures and background threads.

    Args:
        prefetch_size: Maximum number of tensors to prefetch in the background.
        prefetch_to_gpu: Whether to prefetch tensors directly to GPU memory.
        save_workers: Maximum number of concurrent save operations.
    """
    self.temp_dir = tempfile.mkdtemp(prefix="disco_")

    # Track tensor paths and their status
    self.tensor_paths: deque = deque()  # Ordered history of tensor paths (LIFO)
    self.file_locks: Dict[str, threading.Lock] = (
        {}
    )  # Maps file_path -> threading.Lock()
    # Maps file_path -> status ("saving", "ready", "prefetching", "loaded", "deleted")
    self.file_status: Dict[str, str] = {}

    self.max_prefetch = prefetch_size
    self.prefetch_to_gpu = prefetch_to_gpu

    # Thread synchronization
    self.manager_lock = threading.RLock()  # Used for thread-safe operations

    # Prefetch queue and cache
    self.prefetch_queue: queue.Queue = queue.Queue()
    self.prefetch_cache: Dict[str, torch.Tensor] = {}  # Maps file_path -> tensor

    # Save queue and thread pool
    self.save_queue: queue.Queue = queue.Queue()
    self.save_pool = concurrent.futures.ThreadPoolExecutor(max_workers=save_workers)
    self.save_futures: Dict[str, Future] = {}
    # Created with save_workers * 2 permits.
    self.save_semaphore = threading.Semaphore(
        save_workers * 2
    )  # Limit concurrent save operations

    # Start prefetch worker thread
    self.stop_event = threading.Event()
    # start multiple threads for prefetching
    self.prefetch_worker_count = 2
    self.prefetch_workers = []
    for _ in range(self.prefetch_worker_count):
        worker = threading.Thread(target=self._prefetch_worker, daemon=True)
        worker.start()
        self.prefetch_workers.append(worker)

    # Start save worker thread
    self.save_worker = threading.Thread(target=self._save_worker, daemon=True)
    self.save_worker.start()
    # Counter used to number saved files.
    self.idx = 0

    # Run cleanup at interpreter exit.
    atexit.register(self.cleanup)
|
|
||||||
|
|
||||||
def _save_worker(self):
|
|
||||||
"""Background thread that processes the save queue"""
|
|
||||||
while not self.stop_event.is_set():
|
|
||||||
try:
|
|
||||||
save_item = self.save_queue.get(timeout=0.5)
|
|
||||||
if save_item is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
tensor, file_path = save_item
|
|
||||||
|
|
||||||
# Submit the save task to the thread pool
|
|
||||||
future = self.save_pool.submit(
|
|
||||||
self._save_tensor_to_disk, tensor, file_path
|
|
||||||
)
|
|
||||||
with self.manager_lock:
|
|
||||||
self.save_futures[file_path] = future
|
|
||||||
|
|
||||||
self.save_queue.task_done()
|
|
||||||
|
|
||||||
except queue.Empty:
|
|
||||||
time.sleep(0.01) # Small sleep to prevent CPU spinning
|
|
||||||
continue
|
|
||||||
|
|
||||||
def _save_tensor_to_disk(self, tensor: torch.Tensor, file_path: str):
|
|
||||||
"""Actually save the tensor to disk"""
|
|
||||||
try:
|
|
||||||
# Save tensor to disk
|
|
||||||
cpu_tensor = tensor.detach().cpu()
|
|
||||||
torch.save(cpu_tensor, file_path)
|
|
||||||
del cpu_tensor
|
|
||||||
|
|
||||||
with self.manager_lock:
|
|
||||||
# Mark file as ready
|
|
||||||
self.file_status[file_path] = "ready"
|
|
||||||
|
|
||||||
# Release semaphore
|
|
||||||
self.save_semaphore.release()
|
|
||||||
|
|
||||||
return True
|
|
||||||
except FileNotFoundError as e:
|
|
||||||
logger.error(f"Error saving tensor to {file_path}: {e}")
|
|
||||||
with self.manager_lock:
|
|
||||||
self.file_status[file_path] = "error"
|
|
||||||
|
|
||||||
# Release semaphore
|
|
||||||
self.save_semaphore.release()
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def _prefetch_worker(self):
|
|
||||||
"""Background thread that loads tensors from disk ahead of time"""
|
|
||||||
while not self.stop_event.is_set():
|
|
||||||
try:
|
|
||||||
file_path = self.prefetch_queue.get(timeout=0.5)
|
|
||||||
if file_path is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Check if file is available and not already in cache
|
|
||||||
with self.manager_lock:
|
|
||||||
if (
|
|
||||||
file_path not in self.file_status
|
|
||||||
or self.file_status[file_path] == "deleted"
|
|
||||||
):
|
|
||||||
self.prefetch_queue.task_done()
|
|
||||||
if file_path in self.prefetch_cache:
|
|
||||||
self.prefetch_queue.task_done()
|
|
||||||
continue
|
|
||||||
|
|
||||||
# If file is still being saved, wait for it
|
|
||||||
if (
|
|
||||||
self.file_status[file_path] == "saving"
|
|
||||||
and file_path in self.save_futures
|
|
||||||
):
|
|
||||||
# Re-queue this prefetch request with a little delay
|
|
||||||
self.prefetch_queue.task_done()
|
|
||||||
time.sleep(0.1)
|
|
||||||
self.prefetch_queue.put(file_path)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Mark file as being prefetched
|
|
||||||
self.file_status[file_path] = "prefetching"
|
|
||||||
|
|
||||||
# Load tensor from disk and store in cache
|
|
||||||
try:
|
|
||||||
if os.path.exists(file_path):
|
|
||||||
if self.prefetch_to_gpu:
|
|
||||||
tensor = torch.load(
|
|
||||||
file_path,
|
|
||||||
map_location=torch.device("cuda"),
|
|
||||||
weights_only=True,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
tensor = torch.load(file_path, weights_only=True)
|
|
||||||
|
|
||||||
with self.manager_lock:
|
|
||||||
self.prefetch_cache[file_path] = tensor
|
|
||||||
self.file_status[file_path] = "ready"
|
|
||||||
else:
|
|
||||||
with self.manager_lock:
|
|
||||||
if self.file_status.get(file_path) != "deleted":
|
|
||||||
logger.warning(
|
|
||||||
f"Prefetch error: File not found {file_path}"
|
|
||||||
)
|
|
||||||
self.file_status[file_path] = "missing"
|
|
||||||
|
|
||||||
except FileNotFoundError as e:
|
|
||||||
with self.manager_lock:
|
|
||||||
if self.file_status.get(file_path) != "deleted":
|
|
||||||
logger.warning(f"Prefetch error for {file_path}: {e}")
|
|
||||||
self.file_status[file_path] = "error"
|
|
||||||
|
|
||||||
self.prefetch_queue.task_done()
|
|
||||||
|
|
||||||
except queue.Empty:
|
|
||||||
time.sleep(0.01) # Small sleep to prevent CPU spinning
|
|
||||||
continue
|
|
||||||
|
|
||||||
def save_tensor(self, tensor: torch.Tensor):
|
|
||||||
"""Save tensor to disk asynchronously and return file path with thread-safe operations"""
|
|
||||||
# Generate unique file path
|
|
||||||
self.idx += 1
|
|
||||||
file_path: str = os.path.join(
|
|
||||||
self.temp_dir, f"{self.idx:06d}-{uuid.uuid4()}.pt"
|
|
||||||
)
|
|
||||||
|
|
||||||
with self.manager_lock:
|
|
||||||
# Mark file as being saved
|
|
||||||
self.file_locks[file_path] = threading.Lock()
|
|
||||||
self.file_status[file_path] = "saving"
|
|
||||||
# Add to history
|
|
||||||
self.tensor_paths.append(file_path)
|
|
||||||
|
|
||||||
# Acquire semaphore to limit concurrent save operations
|
|
||||||
self.save_semaphore.acquire() # pylint: disable=consider-using-with
|
|
||||||
# Queue tensor for saving in background
|
|
||||||
self.save_queue.put((tensor.detach(), file_path))
|
|
||||||
|
|
||||||
return file_path
|
|
||||||
|
|
||||||
def wait_for_save(self, file_path, timeout=None) -> None:
|
|
||||||
"""Wait for a tensor to be saved to disk"""
|
|
||||||
start_time = time.time()
|
|
||||||
while timeout is None or time.time() - start_time < timeout:
|
|
||||||
with self.manager_lock:
|
|
||||||
if self.file_status.get(file_path) == "ready":
|
|
||||||
return
|
|
||||||
if self.file_status.get(file_path) in ["error", "missing", "deleted"]:
|
|
||||||
return
|
|
||||||
|
|
||||||
if file_path in self.save_futures:
|
|
||||||
future = self.save_futures[file_path]
|
|
||||||
if future.done():
|
|
||||||
return
|
|
||||||
|
|
||||||
# Small sleep to prevent CPU spinning
|
|
||||||
time.sleep(0.01)
|
|
||||||
|
|
||||||
# Timeout
|
|
||||||
logger.warning(f"Timeout waiting for tensor to be saved: {file_path}")
|
|
||||||
return
|
|
||||||
|
|
||||||
def load_tensor(self, file_path, target_device="cuda"):
|
|
||||||
"""Load tensor from disk or prefetch cache with proper synchronization"""
|
|
||||||
# Wait for tensor to be saved if it's still in progress
|
|
||||||
self.wait_for_save(file_path)
|
|
||||||
|
|
||||||
tensor = None
|
|
||||||
|
|
||||||
# Try to get from cache first
|
|
||||||
with self.manager_lock:
|
|
||||||
# Check if tensor is already in cache
|
|
||||||
if file_path in self.prefetch_cache:
|
|
||||||
tensor = self.prefetch_cache[file_path]
|
|
||||||
del self.prefetch_cache[file_path]
|
|
||||||
self.file_status[file_path] = "loaded"
|
|
||||||
|
|
||||||
if tensor is not None:
|
|
||||||
# Ensure tensor is on correct device
|
|
||||||
if target_device != "cpu" and tensor.device.type == "cpu":
|
|
||||||
tensor = tensor.to(target_device, non_blocking=True)
|
|
||||||
return tensor
|
|
||||||
|
|
||||||
# If not in cache, load directly from disk
|
|
||||||
try:
|
|
||||||
if not os.path.exists(file_path):
|
|
||||||
logger.error(f"File not found for loading: {file_path}")
|
|
||||||
raise FileNotFoundError(f"File not found: {file_path}")
|
|
||||||
|
|
||||||
tensor = torch.load(file_path, weights_only=True)
|
|
||||||
|
|
||||||
with self.manager_lock:
|
|
||||||
self.file_status[file_path] = "loaded"
|
|
||||||
|
|
||||||
if target_device != "cpu":
|
|
||||||
tensor = tensor.to(target_device, non_blocking=True)
|
|
||||||
|
|
||||||
return tensor
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error loading tensor from {file_path}: {e}")
|
|
||||||
raise
|
|
||||||
|
|
||||||
def _safe_delete_file(self, file_path):
|
|
||||||
"""Safely delete a file with proper synchronization"""
|
|
||||||
with self.manager_lock:
|
|
||||||
# Make sure any save operation is completed
|
|
||||||
if file_path in self.save_futures:
|
|
||||||
future = self.save_futures[file_path]
|
|
||||||
try:
|
|
||||||
if not future.done():
|
|
||||||
future.cancel()
|
|
||||||
del self.save_futures[file_path]
|
|
||||||
except FileNotFoundError as e:
|
|
||||||
logger.warning(
|
|
||||||
f"Error canceling save operation for {file_path}: {e}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Only delete if file exists and is not being prefetched
|
|
||||||
status = self.file_status.get(file_path)
|
|
||||||
if status in ["ready", "loaded", "error", "missing"]:
|
|
||||||
try:
|
|
||||||
if os.path.exists(file_path):
|
|
||||||
os.remove(file_path)
|
|
||||||
self.file_status[file_path] = "deleted"
|
|
||||||
return True
|
|
||||||
except FileNotFoundError as e:
|
|
||||||
logger.warning(f"Error deleting file {file_path}: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
def trigger_prefetch(self, n=None):
|
|
||||||
"""Trigger prefetching of the next N tensors with proper synchronization"""
|
|
||||||
if n is None:
|
|
||||||
n = self.max_prefetch
|
|
||||||
|
|
||||||
prefetch_paths = []
|
|
||||||
with self.manager_lock:
|
|
||||||
# Find files that are ready to be prefetched (not already in cache or being prefetched)
|
|
||||||
for path in reversed(self.tensor_paths):
|
|
||||||
if (
|
|
||||||
path not in self.prefetch_cache
|
|
||||||
and self.file_status.get(path) == "ready"
|
|
||||||
):
|
|
||||||
prefetch_paths.append(path)
|
|
||||||
if len(prefetch_paths) >= n:
|
|
||||||
break
|
|
||||||
|
|
||||||
# Queue files for prefetching
|
|
||||||
for path in prefetch_paths:
|
|
||||||
self.prefetch_queue.put(path)
|
|
||||||
|
|
||||||
def cleanup_tensor(self, file_path: str):
    """Forget a tensor file that is no longer needed and try to delete it

    Under ``manager_lock`` the path is removed from ``tensor_paths``, from the
    prefetch cache and from the save futures (an unfinished future is
    cancelled). The file itself is then handed to ``_safe_delete_file``.
    """
    with self.manager_lock:
        try:
            self.tensor_paths.remove(file_path)
        except ValueError:
            # Path was not tracked; nothing to unregister
            pass

        # Drop any prefetched copy
        self.prefetch_cache.pop(file_path, None)

        # Drop (and cancel, if still pending) the save future
        pending_save = self.save_futures.pop(file_path, None)
        if pending_save is not None and not pending_save.done():
            pending_save.cancel()

    # Try to delete the file
    self._safe_delete_file(file_path)
|
|
||||||
|
|
||||||
def cleanup(self):
    """Shut the manager down: stop workers, drop caches and remove temp files

    Order of operations: set ``stop_event``; cancel and forget all save
    futures; empty the save queue; shut the save pool down without waiting;
    join the save worker and the prefetch workers (2 second timeout each);
    clear the prefetch cache and the tracked paths; delete every tracked file
    through ``_safe_delete_file``; finally remove ``temp_dir``.
    """
    self.stop_event.set()

    # Cancel all pending save operations
    with self.manager_lock:
        for pending_save in self.save_futures.values():
            if not pending_save.done():
                pending_save.cancel()
        self.save_futures.clear()

    # Drain the save queue, marking each discarded item as done
    while True:
        try:
            self.save_queue.get_nowait()
        except queue.Empty:
            break
        self.save_queue.task_done()

    # Shutdown the save pool
    self.save_pool.shutdown(wait=False)

    # Give every worker thread a bounded amount of time to exit
    if self.save_worker.is_alive():
        self.save_worker.join(timeout=2.0)
    for worker in self.prefetch_workers:
        if worker.is_alive():
            worker.join(timeout=2.0)

    # Snapshot the tracked paths under the lock, then reset the bookkeeping
    with self.manager_lock:
        self.prefetch_cache.clear()
        paths_to_delete = list(self.tensor_paths)
        self.tensor_paths.clear()

    # Delete all temporary files
    for stale_path in paths_to_delete:
        self._safe_delete_file(stale_path)

    # Remove temp directory
    try:
        if os.path.exists(self.temp_dir):
            shutil.rmtree(self.temp_dir, ignore_errors=True)
    except FileNotFoundError as e:
        logger.warning(f"Error removing temporary directory {self.temp_dir}: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
class Disco(torch.autograd.Function):
    """
    Disco: DIsk-based Storage and Checkpointing with Optimized prefetching

    Autograd function for gradient checkpointing that hands the block input to
    a disk offload manager in forward and reloads it in backward, where the
    block is re-run with gradients enabled.
    """

    # Offload manager shared by every call; built on first use by get_instance()
    _manager = None

    @staticmethod
    def get_instance(prefetch_size=1, prefetch_to_gpu=True, save_workers=4):
        """Return the shared offload manager, creating it if it does not exist yet

        The arguments are forwarded to ``DiskOffloadManager`` only when the
        manager is created; once one exists they are ignored.
        """
        if Disco._manager is not None:
            return Disco._manager

        Disco._manager = DiskOffloadManager(
            prefetch_size=prefetch_size,
            prefetch_to_gpu=prefetch_to_gpu,
            save_workers=save_workers,
        )
        return Disco._manager

    @staticmethod
    @torch_cuda_amp_custom_fwd
    def forward(
        ctx,
        forward_function,
        hidden_states,
        *args,
        prefetch_size=1,
        prefetch_to_gpu=True,
        save_workers=4,
    ):
        """Run ``forward_function`` without grad after handing the input to the manager

        Args:
            ctx: Autograd context; receives the file path, the function and its
                extra arguments for use in backward.
            forward_function: Callable invoked as ``forward_function(hidden_states, *args)``.
            hidden_states: Input tensor that is offloaded via ``save_tensor``.
            *args: Extra positional arguments passed through to ``forward_function``.
            prefetch_size, prefetch_to_gpu, save_workers: Forwarded to ``get_instance``.

        Returns:
            Whatever ``forward_function`` returns.
        """
        offload_manager = Disco.get_instance(
            prefetch_size=prefetch_size,
            prefetch_to_gpu=prefetch_to_gpu,
            save_workers=save_workers,
        )

        # Hand the input to the manager; backward calls wait_for_save() on the
        # returned path before reading it back.
        saved_path = offload_manager.save_tensor(hidden_states)

        # Run forward pass immediately, without building an autograd graph
        with torch.no_grad():
            result = forward_function(hidden_states, *args)

        # Everything backward needs to recompute the block
        ctx.forward_function = forward_function
        ctx.args = args
        ctx.file_path = saved_path
        ctx.save_for_backward(torch.tensor([0]))  # Dummy tensor

        return result

    @staticmethod
    @torch_cuda_amp_custom_bwd
    def backward(ctx, *grad_outputs):
        """Reload the offloaded input, recompute the block and backpropagate

        Returns a tuple with ``None`` for ``forward_function``, the gradient of
        the reloaded hidden states, ``None`` for every extra argument and three
        trailing ``None`` entries for the keyword options of ``forward``.
        The offloaded file is cleaned up on success and on error; errors are
        logged and re-raised.
        """
        offload_manager = Disco._manager

        # Kick off prefetching for tensors that will be needed later in backward
        offload_manager.trigger_prefetch()

        file_path = ctx.file_path
        try:
            # Ensure the file is saved before we try to load it
            offload_manager.wait_for_save(file_path)

            hidden_states = offload_manager.load_tensor(file_path)
            hidden_states.requires_grad = True

            # Recompute the block with autograd enabled
            with torch.enable_grad():
                recomputed = ctx.forward_function(hidden_states, *ctx.args)

            # A tuple output takes the full grad tuple when the lengths line
            # up; every other case backpropagates the first incoming gradient.
            if isinstance(recomputed, tuple) and len(grad_outputs) == len(recomputed):
                incoming_grads = grad_outputs
            else:
                incoming_grads = grad_outputs[0]
            torch.autograd.backward(recomputed, incoming_grads)

            # Clean up the file after we're done with it
            offload_manager.cleanup_tensor(file_path)

            return (
                None,  # forward_function
                hidden_states.grad,  # hidden_states grad
                *((None,) * len(ctx.args)),  # for each arg
                None,  # prefetch_size
                None,  # prefetch_to_gpu
                None,  # save_workers
            )

        except Exception as e:
            logger.error(f"Error in backward pass: {e}")
            # Clean up the file even on error
            offload_manager.cleanup_tensor(file_path)
            raise
|
|
||||||
@@ -1,4 +1,4 @@
|
|||||||
"""CPU offloaded checkpointing"""
|
"""Unsloth checkpointing"""
|
||||||
|
|
||||||
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
|
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
|
||||||
#
|
#
|
||||||
@@ -26,7 +26,7 @@ else:
|
|||||||
torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")
|
torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")
|
||||||
|
|
||||||
|
|
||||||
class CPU_Offloaded_Gradient_Checkpointer( # pylint: disable=invalid-name
|
class Unsloth_Offloaded_Gradient_Checkpointer( # pylint: disable=invalid-name
|
||||||
torch.autograd.Function
|
torch.autograd.Function
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -70,10 +70,7 @@ from axolotl.utils.distributed import (
|
|||||||
is_local_main_process,
|
is_local_main_process,
|
||||||
is_main_process,
|
is_main_process,
|
||||||
)
|
)
|
||||||
from axolotl.utils.gradient_checkpointing import (
|
from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrapper
|
||||||
hf_grad_checkpoint_disk_offload_wrapper,
|
|
||||||
hf_grad_checkpoint_offload_wrapper,
|
|
||||||
)
|
|
||||||
from axolotl.utils.lora_embeddings import get_linear_embedding_layers
|
from axolotl.utils.lora_embeddings import get_linear_embedding_layers
|
||||||
from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant
|
from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant
|
||||||
|
|
||||||
@@ -144,6 +141,22 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
|
|||||||
hasattr(model_config, "quantization_config")
|
hasattr(model_config, "quantization_config")
|
||||||
and model_config.quantization_config
|
and model_config.quantization_config
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Detect compressed-tensors config
|
||||||
|
is_compressed_tensors_config = (
|
||||||
|
quant_config_exists
|
||||||
|
and model_config.quantization_config.get("quant_method") == "compressed-tensors"
|
||||||
|
)
|
||||||
|
|
||||||
|
if is_compressed_tensors_config:
|
||||||
|
if model_config.quantization_config.get("config_groups"):
|
||||||
|
LOG.warning(
|
||||||
|
"Found `config_groups` in a compressed-tensors config. "
|
||||||
|
"QAT integration with llmcompressor is not tested."
|
||||||
|
)
|
||||||
|
# Skip further quant checks for compressed-tensors
|
||||||
|
return
|
||||||
|
|
||||||
quant_config_method_is_gptq = (
|
quant_config_method_is_gptq = (
|
||||||
quant_config_exists
|
quant_config_exists
|
||||||
and "quant_method" in model_config.quantization_config
|
and "quant_method" in model_config.quantization_config
|
||||||
@@ -606,10 +619,6 @@ class ModelLoader:
|
|||||||
|
|
||||||
if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
|
if self.cfg.gradient_checkpointing in ["unsloth", "offload"]:
|
||||||
transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
|
transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
|
||||||
if self.cfg.gradient_checkpointing == "offload_disk":
|
|
||||||
transformers.modeling_utils.checkpoint = (
|
|
||||||
hf_grad_checkpoint_disk_offload_wrapper
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.cfg.flash_attention:
|
if self.cfg.flash_attention:
|
||||||
self.patch_attention()
|
self.patch_attention()
|
||||||
|
|||||||
@@ -1,13 +1,10 @@
|
|||||||
|
# pylint: skip-file
|
||||||
"""
|
"""
|
||||||
Multipack Batch Sampler - An efficient batch sampler for packing variable-length sequences
|
Multipack Batch Sampler
|
||||||
into fixed-capacity batches to optimize memory usage and training throughput.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
from typing import Any, Iterable, List, Union
|
||||||
from multiprocessing import cpu_count, get_context
|
|
||||||
from typing import Iterable, Union
|
|
||||||
|
|
||||||
import numba
|
import numba
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -16,39 +13,26 @@ from torch.utils.data import BatchSampler, Sampler, SequentialSampler
|
|||||||
from axolotl.utils.distributed import reduce_and_broadcast
|
from axolotl.utils.distributed import reduce_and_broadcast
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
LOG.setLevel(logging.INFO)
|
LOG.setLevel(logging.INFO)
|
||||||
|
|
||||||
|
|
||||||
@numba.njit
|
@numba.njit
|
||||||
def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int):
|
def ffd_check(a: np.ndarray, c: int, n: int):
|
||||||
"""
|
# First-fit-decreasing bin packing
|
||||||
First-fit-decreasing bin packing algorithm check
|
# Check if a[] could fit in n bins with capacity c
|
||||||
|
# https://en.wikipedia.org/wiki/First-fit-decreasing_bin_packing
|
||||||
|
|
||||||
Checks if sequences with the given lengths could fit in the specified number of bins
|
a = np.sort(a)[::-1]
|
||||||
|
bins = np.full((n,), c, dtype=a.dtype)
|
||||||
Args:
|
for size in a:
|
||||||
sequence_lengths: Array of sequence lengths
|
|
||||||
bin_capacity: Maximum capacity of each bin
|
|
||||||
num_bins: Number of bins available
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if all sequences can be packed, False otherwise
|
|
||||||
"""
|
|
||||||
# Sort sequence lengths in descending order for optimal packing
|
|
||||||
sequence_lengths = np.sort(sequence_lengths)[::-1]
|
|
||||||
# Initialize all bins with full capacity
|
|
||||||
bins = np.full((num_bins,), bin_capacity, dtype=sequence_lengths.dtype)
|
|
||||||
|
|
||||||
# Try to place each sequence in the first bin it fits
|
|
||||||
for size in sequence_lengths:
|
|
||||||
not_found = True
|
not_found = True
|
||||||
for idx in range(num_bins):
|
for idx in range(n):
|
||||||
if bins[idx] >= size:
|
if bins[idx] >= size:
|
||||||
bins[idx] -= size
|
bins[idx] -= size
|
||||||
not_found = False
|
not_found = False
|
||||||
break
|
break
|
||||||
|
|
||||||
# If no bin could fit this sequence, packing failed
|
|
||||||
if not_found:
|
if not_found:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@@ -56,155 +40,86 @@ def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int):
|
|||||||
|
|
||||||
|
|
||||||
@numba.njit
|
@numba.njit
|
||||||
def pack_group(
|
def ffd_with_result(a: np.ndarray, c: int, start_index: int):
|
||||||
sequence_lengths: np.ndarray,
|
# First-fit-decreasing bin packing (with result return)
|
||||||
group_offset: int,
|
|
||||||
bin_capacity: int,
|
|
||||||
max_bins: int,
|
|
||||||
bin_size: int,
|
|
||||||
safe_mode: bool = True,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Pack a group of sequences into bins using First-Fit Decreasing algorithm
|
|
||||||
|
|
||||||
Args:
|
indices = np.argsort(a)[::-1]
|
||||||
sequence_lengths: Array of sequence lengths
|
a = a[indices]
|
||||||
group_offset: Offset to apply to indices when returning results
|
|
||||||
bin_capacity: Maximum capacity of each bin
|
|
||||||
max_bins: Maximum number of bins to use
|
|
||||||
bin_size: Maximum number of sequences per bin
|
|
||||||
safe_mode: If True, use a more conservative packing approach
|
|
||||||
|
|
||||||
Returns:
|
bins: List[Any] = []
|
||||||
List of bins, where each bin contains indices of sequences assigned to it
|
bins_result: List[Any] = []
|
||||||
"""
|
for a_id, size in enumerate(a):
|
||||||
bins_remaining_space: list = [] # Tracks remaining capacity in each bin
|
add_new = True
|
||||||
bins_assigned_sequences: list = [] # Tracks sequence indices assigned to each bin
|
for idx in range(len(bins)):
|
||||||
|
if bins[idx] >= size:
|
||||||
for seq_id, size in enumerate(sequence_lengths):
|
bins[idx] -= size
|
||||||
global_idx = seq_id + group_offset
|
bins_result[idx].append(indices[a_id] + start_index)
|
||||||
|
add_new = False
|
||||||
# Try to place sequence in existing bins
|
|
||||||
add_new_bin = True
|
|
||||||
for bin_idx, _ in enumerate(bins_remaining_space):
|
|
||||||
if (
|
|
||||||
bins_remaining_space[bin_idx] >= size
|
|
||||||
and len(bins_assigned_sequences[bin_idx]) < bin_size
|
|
||||||
):
|
|
||||||
bins_remaining_space[bin_idx] -= size
|
|
||||||
bins_assigned_sequences[bin_idx].append(global_idx)
|
|
||||||
add_new_bin = False
|
|
||||||
break
|
break
|
||||||
|
|
||||||
# Create a new bin if needed and if we haven't reached the limit
|
if add_new:
|
||||||
if add_new_bin:
|
bins.append(c - size)
|
||||||
if len(bins_remaining_space) >= max_bins and safe_mode:
|
bins_result.append([indices[a_id] + start_index])
|
||||||
# In safe mode, skip items that would exceed max_bins
|
|
||||||
continue
|
|
||||||
bins_remaining_space.append(bin_capacity - size)
|
|
||||||
bins_assigned_sequences.append([global_idx])
|
|
||||||
|
|
||||||
# Safety check to avoid infinite bins
|
return bins_result
|
||||||
if len(bins_remaining_space) > len(sequence_lengths):
|
|
||||||
break
|
|
||||||
|
|
||||||
return bins_assigned_sequences
|
|
||||||
|
|
||||||
|
|
||||||
# Define a standalone function for multiprocessing
|
|
||||||
def _process_group(args):
    """Unpack one task tuple and pack that group with ``pack_group``

    Kept as a plain module-level function so it can be handed to a process
    pool's ``map``.
    """
    (
        lengths_in_group,
        group_offset,
        capacity,
        bin_limit,
        per_bin_limit,
        conservative,
    ) = args
    return pack_group(
        lengths_in_group,
        group_offset,
        capacity,
        bin_limit,
        per_bin_limit,
        conservative,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def pack_parallel(
    sequence_lengths: np.ndarray,
    bin_capacity: int,
    group_size: int,
    bin_size: int,
    num_processes: int | None = None,
    safe_mode: bool = True,
    mp_start_method: str | None = "spawn",
):
    """
    Split the sequences into groups and pack each group, optionally in parallel

    Args:
        sequence_lengths: Array of sequence lengths
        bin_capacity: Maximum total number of tokens per bin
        group_size: Number of consecutive sequences packed together as one group
        bin_size: Maximum number of sequences placed in a single bin
        num_processes: Number of worker processes; when None it is derived from
            the number of groups and the CPU count (at least 1)
        safe_mode: Passed through to the per-group packing
        mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver');
            a falsy value, or a method that is not available, uses the default context

    Returns:
        List of bins, where each bin contains indices of sequences assigned to it
    """
    num_items = len(sequence_lengths)
    if num_processes is None:
        num_processes = max(1, min(num_items // group_size, cpu_count()))

    # One task per group; each group may use as many bins as it has items
    tasks = [
        (
            sequence_lengths[start : start + group_size],
            start,
            bin_capacity,
            len(sequence_lengths[start : start + group_size]),
            bin_size,
            safe_mode,
        )
        for start in range(0, num_items, group_size)
    ]

    # Resolve the multiprocessing context, falling back to the default one
    mp_ctx = None
    if mp_start_method:
        try:
            mp_ctx = get_context(mp_start_method)
        except ValueError:
            LOG.warning(
                f"Failed to get multiprocessing context '{mp_start_method}'. "
                f"Falling back to default. Available: {get_context().get_all_start_methods()}"
            )
            mp_ctx = None

    if num_processes == 1:
        LOG.debug("Using single process for pack_parallel, running sequentially.")
        return [
            packed_bin
            for task_args in tasks
            for packed_bin in _process_group(task_args)
        ]

    # More than one process requested: fan the groups out over a process pool
    with ProcessPoolExecutor(
        max_workers=num_processes, mp_context=mp_ctx
    ) as executor:
        return [
            packed_bin
            for group_bins in executor.map(_process_group, tasks)
            for packed_bin in group_bins
        ]
|
|
||||||
|
|
||||||
|
|
||||||
@numba.njit
|
@numba.njit
|
||||||
def allocate_sequentially(
|
def allocate(
|
||||||
sequence_lengths: np.ndarray, rank: int, bin_capacity: int, num_ranks: int
|
lengths: np.ndarray, lengths_cumsum: np.ndarray, rank: int, c: int, n: int
|
||||||
):
|
):
|
||||||
|
# Dynamic batch allocator, similar to Multifit
|
||||||
|
# https://en.wikipedia.org/wiki/Multifit_algorithm
|
||||||
|
# ~99.5% efficiency on OpenChat training set (12 * 2048 ctx len)
|
||||||
|
|
||||||
|
s = 0
|
||||||
|
start_index = 0
|
||||||
|
result = []
|
||||||
|
|
||||||
|
while True:
|
||||||
|
# binary search [l, r)
|
||||||
|
left = 1
|
||||||
|
right = 1 + np.searchsorted(lengths_cumsum[start_index:], s + c * n, "right")
|
||||||
|
|
||||||
|
while right - left > 1:
|
||||||
|
mid = (left + right) // 2
|
||||||
|
if ffd_check(lengths[start_index : start_index + mid], c, n):
|
||||||
|
left = mid
|
||||||
|
else:
|
||||||
|
right = mid
|
||||||
|
|
||||||
|
# use length l
|
||||||
|
batch = ffd_with_result(
|
||||||
|
lengths[start_index : start_index + left], c, start_index
|
||||||
|
)
|
||||||
|
assert len(batch) <= n
|
||||||
|
if len(batch) < n:
|
||||||
|
break
|
||||||
|
|
||||||
|
start_index += left
|
||||||
|
s = lengths_cumsum[start_index - 1]
|
||||||
|
|
||||||
|
# add local rank
|
||||||
|
result.append(batch[rank])
|
||||||
|
|
||||||
|
return result, s, len(result) * c * n
|
||||||
|
|
||||||
|
|
||||||
|
@numba.njit
|
||||||
|
def allocate_sequentially(lengths: np.ndarray, rank: int, c: int, n: int):
|
||||||
"""
|
"""
|
||||||
Sequential allocator that preserves example order
|
Sequential allocator that preserves example order
|
||||||
|
|
||||||
Args:
|
Parameters:
|
||||||
sequence_lengths: The lengths of all examples
|
- lengths: The lengths of all examples
|
||||||
rank: The current rank (for distributed training)
|
- rank: The current rank (for distributed training)
|
||||||
bin_capacity: The capacity of each bin (maximum sequence length)
|
- c: The capacity of each bin (maximum sequence length)
|
||||||
num_ranks: Number of ranks (processes/GPUs)
|
- n: Number of ranks
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
rank_batches: List of batches for the current rank
|
- result: List of batches for the current rank
|
||||||
total_tokens_used: Number of actual example tokens
|
- total_used: Number of actual example tokens
|
||||||
total_token_slots: Maximum theoretical number of example tokens (number of bins * bin capacity)
|
- total_slots: Maximum theoretical number of example tokens (number of bins * bin capacity)
|
||||||
"""
|
"""
|
||||||
result = []
|
result = []
|
||||||
total_used = 0
|
total_used = 0
|
||||||
@@ -212,9 +127,9 @@ def allocate_sequentially(
|
|||||||
# First, do sequential packing into bins
|
# First, do sequential packing into bins
|
||||||
all_bins = []
|
all_bins = []
|
||||||
current_bin = [0 for i in range(0)] # numba hint
|
current_bin = [0 for i in range(0)] # numba hint
|
||||||
remaining_capacity = bin_capacity
|
remaining_capacity = c
|
||||||
|
|
||||||
for idx, size in enumerate(sequence_lengths):
|
for idx, size in enumerate(lengths):
|
||||||
if size <= remaining_capacity:
|
if size <= remaining_capacity:
|
||||||
# Example fits in current bin
|
# Example fits in current bin
|
||||||
current_bin.append(idx)
|
current_bin.append(idx)
|
||||||
@@ -225,7 +140,7 @@ def allocate_sequentially(
|
|||||||
if current_bin: # Add non-empty bin to all_bins
|
if current_bin: # Add non-empty bin to all_bins
|
||||||
all_bins.append(current_bin)
|
all_bins.append(current_bin)
|
||||||
current_bin = [idx]
|
current_bin = [idx]
|
||||||
remaining_capacity = bin_capacity - size
|
remaining_capacity = c - size
|
||||||
total_used += size
|
total_used += size
|
||||||
|
|
||||||
# Add the last bin if not empty
|
# Add the last bin if not empty
|
||||||
@@ -233,227 +148,132 @@ def allocate_sequentially(
|
|||||||
all_bins.append(current_bin)
|
all_bins.append(current_bin)
|
||||||
|
|
||||||
# Assign bins to ranks - each rank gets every n-th bin
|
# Assign bins to ranks - each rank gets every n-th bin
|
||||||
for bin_idx in range(rank, len(all_bins), num_ranks):
|
for bin_idx in range(rank, len(all_bins), n):
|
||||||
result.append(all_bins[bin_idx])
|
result.append(all_bins[bin_idx])
|
||||||
|
|
||||||
return result, total_used, len(all_bins) * bin_capacity
|
return result, total_used, len(all_bins) * c
|
||||||
|
|
||||||
|
|
||||||
class MultipackBatchSampler(BatchSampler):
|
class MultipackBatchSampler(BatchSampler):
|
||||||
"""
|
"""Batch sampler class for multipack"""
|
||||||
Batch sampler class for efficient packing of variable-length sequences
|
|
||||||
|
|
||||||
This sampler packs sequences into fixed-capacity bins (batches) to maximize
|
|
||||||
GPU memory utilization and training throughput by reducing padding.
|
|
||||||
|
|
||||||
It supports both parallel packing (using FFD algorithm) and
|
|
||||||
sequential packing (preserving original sequence order).
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
sampler: Union[Sampler[int], Iterable[int]],
|
sampler: Union[Sampler[int], Iterable[int]],
|
||||||
batch_size: int, # Number of bins per batch
|
batch_size: int,
|
||||||
batch_max_len: int, # Maximum sequence length (bin capacity)
|
batch_max_len: int,
|
||||||
lengths: np.ndarray, # Sequence lengths
|
lengths: np.ndarray,
|
||||||
packing_efficiency_estimate: float = 1.0, # Initial efficiency estimate
|
packing_efficiency_estimate: float = 1.0,
|
||||||
drop_last: bool = False, # Whether to drop final batches (might be incomplete)
|
drop_last: bool = False,
|
||||||
num_count_samples: int = 16, # Number of times to estimate batch count
|
num_count_samples: int = 16,
|
||||||
sequential: bool = False, # Whether to use sequential packing
|
sequential: bool = False,
|
||||||
group_size: int = 100_000, # Size of groups for parallel packing
|
**kwargs,
|
||||||
bin_size: int = 200, # The max number of samples that can be packed in a single bin
|
|
||||||
num_processes: int | None = None, # Number of processes for parallel packing
|
|
||||||
safe_mode: bool = True, # Conservative packing to prevent training instability
|
|
||||||
**kwargs, # pylint: disable=unused-argument
|
|
||||||
):
|
):
|
||||||
super().__init__(sampler, batch_size, drop_last)
|
super().__init__(sampler, batch_size, drop_last)
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
self.batch_max_len = batch_max_len
|
self.batch_max_len = batch_max_len
|
||||||
self.lengths = np.array(lengths, dtype=np.int32)
|
self.lengths: np.ndarray = lengths
|
||||||
self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
|
self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
|
||||||
self.sequential = sequential
|
self.sequential = sequential
|
||||||
self.group_size = group_size
|
|
||||||
self.bin_size = bin_size
|
|
||||||
self.num_processes = num_processes
|
|
||||||
self.safe_mode = safe_mode
|
|
||||||
|
|
||||||
assert isinstance(self.lengths, np.ndarray)
|
assert isinstance(self.lengths, np.ndarray)
|
||||||
|
|
||||||
self.epoch = 0
|
self.epoch = 0
|
||||||
|
|
||||||
# Efficiency statistics tracking
|
# statistics
|
||||||
self.total_tokens_used = 0
|
self.eff_total_used = 0
|
||||||
self.total_token_slots = 0
|
self.eff_total_slots = 0
|
||||||
|
|
||||||
# The number of times to calculate batches to determine minimum packed dataset length
|
# The number of times to calculate the batches to determine the minimum packed dataset length for the local rank
|
||||||
self.num_count_samples = num_count_samples
|
self.num_count_samples = num_count_samples
|
||||||
# Minimum packed dataset length across all ranks (determined by gather/broadcast)
|
# the minimum packed dataset length across all ranks determined by a gather/broadcast
|
||||||
self.len_across_ranks = None
|
self.len_across_ranks = None
|
||||||
|
|
||||||
# Cache for batches
|
|
||||||
self._batches = None
|
|
||||||
|
|
||||||
if self.sequential and not isinstance(sampler, SequentialSampler):
|
if self.sequential and not isinstance(sampler, SequentialSampler):
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
|
"using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
|
||||||
)
|
)
|
||||||
|
|
||||||
def set_epoch(self, epoch: int):
|
def set_epoch(self, epoch: int):
|
||||||
"""Set the epoch number, used for reproducible shuffling across epochs"""
|
|
||||||
self.epoch = epoch
|
self.epoch = epoch
|
||||||
self._batches = None # Invalidate batch cache
|
|
||||||
|
|
||||||
def generate_batches(self, set_stats=False):
|
def generate_batches(self, set_stats=False):
|
||||||
"""
|
indices = [idx for idx in self.sampler]
|
||||||
Generate packed batches for training
|
|
||||||
|
|
||||||
Args:
|
|
||||||
set_stats: Whether to update efficiency statistics
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of batches, where each batch contains multiple bins,
|
|
||||||
and each bin contains multiple sequence indices
|
|
||||||
"""
|
|
||||||
if self._batches is not None:
|
|
||||||
return self._batches
|
|
||||||
|
|
||||||
# Get indices from the sampler
|
|
||||||
indices = [ # pylint: disable=unnecessary-comprehension
|
|
||||||
idx for idx in self.sampler
|
|
||||||
]
|
|
||||||
|
|
||||||
# Get lengths of the selected sequences
|
|
||||||
lengths = self.lengths[indices]
|
lengths = self.lengths[indices]
|
||||||
|
lengths_cumsum = np.cumsum(lengths)
|
||||||
|
|
||||||
# Pack sequences into bins using either sequential or parallel packing
|
|
||||||
if self.sequential:
|
if self.sequential:
|
||||||
bins, total_used, total_slots = allocate_sequentially(
|
batches, total_used, total_slots = allocate_sequentially(
|
||||||
lengths,
|
lengths=lengths,
|
||||||
rank=0,
|
rank=0,
|
||||||
bin_capacity=self.batch_max_len,
|
c=self.batch_max_len,
|
||||||
num_ranks=1,
|
n=1,
|
||||||
)
|
)
|
||||||
# Map bin indices back to original indices
|
|
||||||
bins = [[indices[b_idx] for b_idx in bin_indices] for bin_indices in bins]
|
|
||||||
else:
|
else:
|
||||||
# Use parallel packing
|
batches, total_used, total_slots = allocate(
|
||||||
all_bins = pack_parallel(
|
lengths=lengths,
|
||||||
lengths,
|
lengths_cumsum=lengths_cumsum,
|
||||||
bin_capacity=self.batch_max_len,
|
rank=0,
|
||||||
group_size=self.group_size,
|
c=self.batch_max_len,
|
||||||
bin_size=self.bin_size,
|
n=1,
|
||||||
num_processes=self.num_processes,
|
|
||||||
safe_mode=self.safe_mode,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Map bin indices back to original indices
|
|
||||||
bins = [
|
|
||||||
[indices[b_idx] for b_idx in bin_indices] for bin_indices in all_bins
|
|
||||||
]
|
|
||||||
|
|
||||||
# Calculate efficiency statistics
|
|
||||||
total_used = lengths.sum()
|
|
||||||
total_slots = len(all_bins) * self.batch_max_len
|
|
||||||
|
|
||||||
# Group bins into batches (each batch contains batch_size bins)
|
|
||||||
batches = [
|
batches = [
|
||||||
bins[i : i + self.batch_size] for i in range(0, len(bins), self.batch_size)
|
[
|
||||||
|
[indices[b_idx] for b_idx in batch]
|
||||||
|
for batch in batches[i : i + self.batch_size]
|
||||||
|
]
|
||||||
|
for i in range(0, len(batches), self.batch_size)
|
||||||
]
|
]
|
||||||
|
|
||||||
# Drop last batch if requested and it's incomplete
|
# statistics
|
||||||
if self.drop_last and len(batches[-1]) < self.batch_size:
|
|
||||||
batches = batches[:-1]
|
|
||||||
# Adjust total_slots if we dropped a batch
|
|
||||||
if not self.sequential:
|
|
||||||
total_slots -= (self.batch_size - len(batches[-1])) * self.batch_max_len
|
|
||||||
|
|
||||||
# Update statistics if requested
|
|
||||||
if set_stats:
|
if set_stats:
|
||||||
self.total_tokens_used += total_used
|
self.eff_total_used += total_used
|
||||||
self.total_token_slots += total_slots
|
self.eff_total_slots += total_slots
|
||||||
|
|
||||||
self._batches = batches
|
|
||||||
return batches
|
return batches
|
||||||
|
|
||||||
def __iter__(self):
|
def __iter__(self):
|
||||||
"""
|
|
||||||
Return an iterator over batches
|
|
||||||
|
|
||||||
The batches are truncated to match the minimum number of batches across all ranks
|
|
||||||
to ensure distributed training balance
|
|
||||||
"""
|
|
||||||
batches = self.generate_batches(set_stats=True)
|
batches = self.generate_batches(set_stats=True)
|
||||||
if self.len_across_ranks:
|
if self.len_across_ranks:
|
||||||
# Truncate batches to ensure all ranks have the same number of batches
|
# make sure the batches we iterate over is truncated to the same min length across all ranks
|
||||||
batches = batches[: self.len_across_ranks]
|
batches = batches[: self.len_across_ranks]
|
||||||
return iter(batches)
|
return iter(batches)
|
||||||
|
|
||||||
|
def num_batches(self):
|
||||||
|
batches = self.generate_batches(set_stats=True)
|
||||||
|
return len(batches)
|
||||||
|
|
||||||
def efficiency(self):
|
def efficiency(self):
|
||||||
"""
|
return self.eff_total_used / self.eff_total_slots
|
||||||
Calculate the packing efficiency (ratio of tokens used to total token slots)
|
|
||||||
Higher is better - 1.0 would mean perfect packing with no wasted space
|
|
||||||
"""
|
|
||||||
if self.total_token_slots == 0:
|
|
||||||
self.generate_batches(set_stats=True)
|
|
||||||
if self.total_token_slots == 0:
|
|
||||||
return 0.0
|
|
||||||
# Return a Python float instead of potentially a numpy float
|
|
||||||
return float(self.total_tokens_used / self.total_token_slots)
|
|
||||||
|
|
||||||
def gather_efficiency(self):
|
def gather_efficiency(self):
|
||||||
"""
|
def calc_sample_packing_eff_est(estimates: List[float]):
|
||||||
Gather and synchronize packing efficiency estimates across all distributed ranks
|
|
||||||
Returns a conservative efficiency estimate based on the measurements
|
|
||||||
"""
|
|
||||||
|
|
||||||
def calc_sample_packing_eff_est(estimates: list[float]):
|
|
||||||
LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}")
|
LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}")
|
||||||
# Use 99.7% of max observed efficiency as a safe estimate
|
return math.floor(0.997 * max(estimates))
|
||||||
max_eff = max(float(eff) for eff in estimates)
|
|
||||||
return math.floor(0.997 * max_eff)
|
|
||||||
|
|
||||||
# Gather efficiency from all ranks and apply the calculation function
|
|
||||||
sample_packing_actual_eff_all = reduce_and_broadcast(
|
sample_packing_actual_eff_all = reduce_and_broadcast(
|
||||||
lambda: float(self.efficiency()), # pylint: disable=unnecessary-lambda
|
lambda: self.efficiency(), # pylint: disable=unnecessary-lambda
|
||||||
calc_sample_packing_eff_est,
|
calc_sample_packing_eff_est,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Quantize to 0.5% intervals for stability
|
|
||||||
sample_packing_eff_est = (
|
sample_packing_eff_est = (
|
||||||
math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0
|
math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0
|
||||||
)
|
)
|
||||||
return sample_packing_eff_est
|
return sample_packing_eff_est
|
||||||
|
|
||||||
def gather_len_batches(self, num):
|
def gather_len_batches(self, num):
|
||||||
"""
|
|
||||||
Gather and synchronize batch counts across all distributed ranks
|
|
||||||
Returns the minimum number of batches available on any rank
|
|
||||||
"""
|
|
||||||
|
|
||||||
def calc_min_len(estimates: list[(int, float)]):
|
def calc_min_len(estimates: list[(int, float)]):
|
||||||
LOG.info(f"gather_len_batches: {repr(estimates)}")
|
LOG.info(f"gather_len_batches: {repr(estimates)}")
|
||||||
return math.floor(min(estimates))
|
return math.floor(min(estimates))
|
||||||
|
|
||||||
# Find minimum batch count across ranks to ensure balance
|
|
||||||
min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len)
|
min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len)
|
||||||
return min_len_batches
|
return min_len_batches
|
||||||
|
|
||||||
def __len__(self):
|
def __len__(self):
|
||||||
"""
|
if not self.len_across_ranks:
|
||||||
Return the total number of batches that will be yielded by this sampler
|
len_batches = min(
|
||||||
|
[self.num_batches() for _ in range(self.num_count_samples)]
|
||||||
This is calculated as the minimum number of batches available on any rank
|
|
||||||
to ensure balanced distributed training
|
|
||||||
"""
|
|
||||||
if self._batches is None:
|
|
||||||
self._batches = self.generate_batches(set_stats=True)
|
|
||||||
|
|
||||||
if self.len_across_ranks is None:
|
|
||||||
# Sample multiple times to get stable estimate
|
|
||||||
len_batches = min( # pylint: disable=consider-using-generator
|
|
||||||
[len(self._batches) for _ in range(self.num_count_samples)]
|
|
||||||
)
|
)
|
||||||
# Gather minimum across all ranks
|
|
||||||
self.len_across_ranks = self.gather_len_batches(len_batches)
|
self.len_across_ranks = self.gather_len_batches(len_batches)
|
||||||
|
|
||||||
return self.len_across_ranks
|
return self.len_across_ranks
|
||||||
|
|||||||
@@ -178,7 +178,7 @@ class AxolotlInputConfig(
|
|||||||
|
|
||||||
# torch_dtype: torch.dtype | None
|
# torch_dtype: torch.dtype | None
|
||||||
|
|
||||||
gradient_checkpointing: Literal["offload", "offload_disk"] | bool | None = Field(
|
gradient_checkpointing: Literal["unsloth", "offload"] | bool | None = Field(
|
||||||
default=False
|
default=False
|
||||||
)
|
)
|
||||||
gradient_checkpointing_kwargs: dict[str, Any] | None = None
|
gradient_checkpointing_kwargs: dict[str, Any] | None = None
|
||||||
@@ -1149,28 +1149,16 @@ class AxolotlInputConfig(
|
|||||||
|
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# @model_validator(mode="before")
|
|
||||||
# @classmethod
|
|
||||||
# def check_grpo_peft_liger(cls, data):
|
|
||||||
# if (
|
|
||||||
# data.get("rl") == "grpo"
|
|
||||||
# and data.get("trl", {})
|
|
||||||
# and data.get("trl").get("use_liger_loss")
|
|
||||||
# and data.get("adapter")
|
|
||||||
# ):
|
|
||||||
# raise ValueError("PEFT + GRPO + Liger is not yet supported")
|
|
||||||
# return data
|
|
||||||
#
|
|
||||||
@model_validator(mode="before")
|
@model_validator(mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
def check_grpo_liger_sequence_parallel(cls, data):
|
def check_grpo_peft_liger(cls, data):
|
||||||
if (
|
if (
|
||||||
data.get("rl") == "grpo"
|
data.get("rl") == "grpo"
|
||||||
and data.get("trl", {})
|
and data.get("trl", {})
|
||||||
and data.get("trl").get("use_liger_loss")
|
and data.get("trl").get("use_liger_loss")
|
||||||
and data.get("sequence_parallel_degree", 1) > 1
|
and data.get("adapter")
|
||||||
):
|
):
|
||||||
raise ValueError("GRPO + SP + Liger not currently supported")
|
raise ValueError("PEFT + GRPO + Liger is not yet supported")
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@model_validator(mode="after")
|
@model_validator(mode="after")
|
||||||
@@ -1357,10 +1345,6 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
|
|||||||
):
|
):
|
||||||
return data
|
return data
|
||||||
|
|
||||||
# Skip if dropout is not 0, as auto enabling it would just disable it during runtime patch checks
|
|
||||||
if data.get("lora_dropout") != 0:
|
|
||||||
return data
|
|
||||||
|
|
||||||
# Check multi-GPU compatibility
|
# Check multi-GPU compatibility
|
||||||
capabilities = data.get("capabilities")
|
capabilities = data.get("capabilities")
|
||||||
is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
|
is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ class TestKnowledgeDistillation:
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "model.safetensors").exists()
|
assert (Path(temp_dir) / "model.safetensors").exists()
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
@@ -121,5 +121,5 @@ class TestKnowledgeDistillation:
|
|||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "adapter_model.safetensors").exists()
|
assert (Path(temp_dir) / "adapter_model.safetensors").exists()
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
|
||||||
)
|
)
|
||||||
|
|||||||
111
tests/e2e/integrations/test_llm_compressor.py
Normal file
111
tests/e2e/integrations/test_llm_compressor.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
"""
|
||||||
|
E2E smoke tests for LLMCompressorPlugin integration
|
||||||
|
"""
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from axolotl.cli.args import TrainerCliArgs
|
||||||
|
from axolotl.common.datasets import load_datasets
|
||||||
|
from axolotl.train import train
|
||||||
|
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
|
||||||
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
|
from tests.e2e.utils import (
|
||||||
|
check_model_output_exists,
|
||||||
|
require_llmcompressor,
|
||||||
|
require_torch_2_4_1,
|
||||||
|
)
|
||||||
|
|
||||||
|
MODELS = [
|
||||||
|
"nm-testing/llama2.c-stories42M-pruned2.4-compressed",
|
||||||
|
"nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
|
||||||
|
)
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
|
||||||
|
)
|
||||||
|
class TestLLMCompressorIntegration:
|
||||||
|
"""
|
||||||
|
e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
|
||||||
|
"""
|
||||||
|
|
||||||
|
@require_llmcompressor
|
||||||
|
@require_torch_2_4_1
|
||||||
|
def test_llmcompressor_plugin(
|
||||||
|
self, temp_dir, base_model: str, save_compressed: bool
|
||||||
|
):
|
||||||
|
from llmcompressor import active_session
|
||||||
|
|
||||||
|
# core cfg
|
||||||
|
cfg = DictDefault(
|
||||||
|
{
|
||||||
|
"base_model": base_model,
|
||||||
|
"plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
|
||||||
|
"sequence_len": 1024,
|
||||||
|
"val_set_size": 0.05,
|
||||||
|
"special_tokens": {"pad_token": "<|endoftext|>"},
|
||||||
|
"datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
|
||||||
|
"num_epochs": 1,
|
||||||
|
"micro_batch_size": 2,
|
||||||
|
"gradient_accumulation_steps": 2,
|
||||||
|
"output_dir": temp_dir,
|
||||||
|
"learning_rate": 1e-5,
|
||||||
|
"optimizer": "adamw_torch_fused",
|
||||||
|
"lr_scheduler": "cosine",
|
||||||
|
"save_safetensors": True,
|
||||||
|
"bf16": "auto",
|
||||||
|
"max_steps": 5,
|
||||||
|
"llmcompressor": {
|
||||||
|
"recipe": {
|
||||||
|
"finetuning_stage": {
|
||||||
|
"finetuning_modifiers": {
|
||||||
|
"ConstantPruningModifier": {
|
||||||
|
"targets": [
|
||||||
|
"re:.*q_proj.weight",
|
||||||
|
"re:.*k_proj.weight",
|
||||||
|
"re:.*v_proj.weight",
|
||||||
|
"re:.*o_proj.weight",
|
||||||
|
"re:.*gate_proj.weight",
|
||||||
|
"re:.*up_proj.weight",
|
||||||
|
"re:.*down_proj.weight",
|
||||||
|
],
|
||||||
|
"start": 0,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
"save_compressed": save_compressed,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
prepare_plugins(cfg)
|
||||||
|
cfg = validate_config(cfg)
|
||||||
|
normalize_config(cfg)
|
||||||
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
|
try:
|
||||||
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
_check_llmcompressor_model_outputs(temp_dir, save_compressed)
|
||||||
|
finally:
|
||||||
|
active_session().reset()
|
||||||
|
|
||||||
|
|
||||||
|
def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
|
||||||
|
if save_compressed:
|
||||||
|
assert (Path(temp_dir) / "recipe.yaml").exists()
|
||||||
|
|
||||||
|
from compressed_tensors import ModelCompressor
|
||||||
|
from compressed_tensors.config import Sparse24BitMaskConfig
|
||||||
|
|
||||||
|
compressor = ModelCompressor.from_pretrained(temp_dir)
|
||||||
|
assert compressor is not None
|
||||||
|
assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)
|
||||||
@@ -166,7 +166,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
|||||||
"""
|
"""
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.skip(reason="flaky test")
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"num_gpus",
|
"num_gpus",
|
||||||
[1, 2],
|
[1, 2],
|
||||||
@@ -228,7 +227,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
|||||||
|
|
||||||
current_env = os.environ.copy()
|
current_env = os.environ.copy()
|
||||||
env = {
|
env = {
|
||||||
"NCCL_P2P_LEVEL": "LOC",
|
"NCCL_P2P_LEVEL": "NVL",
|
||||||
**current_env,
|
**current_env,
|
||||||
"CUDA_VISIBLE_DEVICES": "1",
|
"CUDA_VISIBLE_DEVICES": "1",
|
||||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||||
@@ -258,7 +257,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
|||||||
f"{get_torch_dist_unique_port()}",
|
f"{get_torch_dist_unique_port()}",
|
||||||
],
|
],
|
||||||
env={
|
env={
|
||||||
"NCCL_P2P_LEVEL": "LOC",
|
"NCCL_P2P_LEVEL": "NVL",
|
||||||
"NCCL_DEBUG": "INFO",
|
"NCCL_DEBUG": "INFO",
|
||||||
**current_env,
|
**current_env,
|
||||||
},
|
},
|
||||||
@@ -266,7 +265,6 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
|||||||
finally:
|
finally:
|
||||||
recursive_kill(vllm_process)
|
recursive_kill(vllm_process)
|
||||||
|
|
||||||
@pytest.mark.skip(reason="flaky test")
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"num_gpus",
|
"num_gpus",
|
||||||
[1, 2],
|
[1, 2],
|
||||||
@@ -322,7 +320,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
|||||||
|
|
||||||
current_env = os.environ.copy()
|
current_env = os.environ.copy()
|
||||||
env = {
|
env = {
|
||||||
"NCCL_P2P_LEVEL": "LOC", # nccl can be brittle, assume P2P isn't reliable
|
"NCCL_P2P_LEVEL": "NVL", # nccl can be brittle, assume P2P isn't reliable
|
||||||
**current_env,
|
**current_env,
|
||||||
"CUDA_VISIBLE_DEVICES": "1",
|
"CUDA_VISIBLE_DEVICES": "1",
|
||||||
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
"VLLM_DISABLE_COMPILE_CACHE": "1",
|
||||||
@@ -352,7 +350,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
|
|||||||
f"{get_torch_dist_unique_port()}",
|
f"{get_torch_dist_unique_port()}",
|
||||||
],
|
],
|
||||||
env={
|
env={
|
||||||
"NCCL_P2P_LEVEL": "LOC",
|
"NCCL_P2P_LEVEL": "NVL",
|
||||||
"NCCL_DEBUG": "INFO",
|
"NCCL_DEBUG": "INFO",
|
||||||
**current_env,
|
**current_env,
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -57,9 +57,9 @@ class Test4dMultipackLlama(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"save_steps": 3,
|
"save_steps": 10,
|
||||||
"eval_steps": 4,
|
"eval_steps": 10,
|
||||||
"fp16": True,
|
"fp16": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -105,9 +105,9 @@ class Test4dMultipackLlama(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"save_steps": 3,
|
"save_steps": 10,
|
||||||
"eval_steps": 4,
|
"eval_steps": 10,
|
||||||
"fp16": True,
|
"fp16": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -26,15 +26,10 @@ class TestActivationCheckpointing:
|
|||||||
E2E tests for activation checkpointing
|
E2E tests for activation checkpointing
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
|
||||||
"gradient_checkpointing",
|
|
||||||
["offload", "offload_disk"],
|
|
||||||
)
|
|
||||||
def test_activation_checkpointing_offload(
|
def test_activation_checkpointing_offload(
|
||||||
self,
|
self,
|
||||||
temp_dir,
|
temp_dir,
|
||||||
fix_checkpoint_after_test, # pylint: disable=unused-argument,redefined-outer-name
|
fix_checkpoint_after_test, # pylint: disable=unused-argument,redefined-outer-name
|
||||||
gradient_checkpointing,
|
|
||||||
):
|
):
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
@@ -69,7 +64,7 @@ class TestActivationCheckpointing:
|
|||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
"bf16": True,
|
"bf16": True,
|
||||||
"save_safetensors": True,
|
"save_safetensors": True,
|
||||||
"gradient_checkpointing": gradient_checkpointing,
|
"gradient_checkpointing": "offload",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -57,9 +57,9 @@ class TestMistral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"save_steps": 3,
|
"save_steps": 10,
|
||||||
"eval_steps": 4,
|
"eval_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -99,9 +99,9 @@ class TestMistral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"save_steps": 3,
|
"save_steps": 10,
|
||||||
"eval_steps": 4,
|
"eval_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -54,9 +54,9 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"save_steps": 3,
|
"save_steps": 10,
|
||||||
"eval_steps": 4,
|
"eval_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -93,9 +93,9 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"save_steps": 3,
|
"save_steps": 10,
|
||||||
"eval_steps": 4,
|
"eval_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -56,9 +56,9 @@ class TestPhiMultipack(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"eval_steps": 3,
|
"eval_steps": 10,
|
||||||
"save_steps": 4,
|
"save_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -108,9 +108,9 @@ class TestPhiMultipack(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 5,
|
"max_steps": 20,
|
||||||
"eval_steps": 3,
|
"eval_steps": 10,
|
||||||
"save_steps": 4,
|
"save_steps": 10,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -105,7 +105,25 @@ def require_vllm(test_case):
|
|||||||
return False
|
return False
|
||||||
|
|
||||||
return unittest.skipUnless(
|
return unittest.skipUnless(
|
||||||
is_vllm_installed(), "test requires a vllm to be installed"
|
is_vllm_installed(), "test requires vllm to be installed"
|
||||||
|
)(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
def require_llmcompressor(test_case):
|
||||||
|
"""
|
||||||
|
Decorator marking a test that requires a llmcompressor to be installed
|
||||||
|
"""
|
||||||
|
|
||||||
|
def is_llmcompressor_installed():
|
||||||
|
try:
|
||||||
|
import llmcompressor # pylint: disable=unused-import # noqa: F401
|
||||||
|
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return unittest.skipUnless(
|
||||||
|
is_llmcompressor_installed(), "test requires llmcompressor to be installed"
|
||||||
)(test_case)
|
)(test_case)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -106,4 +106,3 @@ class TestBatchedSamplerPacking:
|
|||||||
|
|
||||||
original_idxs = set(range(len(train_dataset)))
|
original_idxs = set(range(len(train_dataset)))
|
||||||
assert original_idxs == set(batch_idxs)
|
assert original_idxs == set(batch_idxs)
|
||||||
assert len(batch_idxs) == len(set(batch_idxs))
|
|
||||||
|
|||||||
Reference in New Issue
Block a user