Compare commits

6 Commits: v0.15.0 ... fix/merge-

| Author | SHA1 | Date |
|---|---|---|
|  | dce5bed379 |  |
|  | cf4d550c88 |  |
|  | 43b1c80aa6 |  |
|  | a36aaa70ce |  |
|  | 80f7088ad1 |  |
|  | 46b9f40f2a |  |
.github/workflows/multi-gpu-e2e.yml (vendored), 15 lines changed

```diff
@@ -8,6 +8,7 @@ on:
       - 'setup.py'
       - 'pyproject.toml'
       - '.github/workflows/multi-gpu-e2e.yml'
       - 'scripts/cutcrossentropy_install.py'
       - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
       - 'src/axolotl/utils/distributed.py'
   workflow_dispatch:
@@ -35,13 +36,13 @@ jobs:
             pytorch: 2.8.0
             axolotl_extras: fbgemm-gpu
             num_gpus: 2
-          - cuda: 129
-            cuda_version: 12.9.1
-            python_version: "3.12"
-            pytorch: 2.9.1
-            axolotl_extras: "fbgemm-gpu"
-            num_gpus: 2
-            dockerfile: "Dockerfile-uv.jinja"
+          # - cuda: 129
+          #   cuda_version: 12.9.1
+          #   python_version: "3.12"
+          #   pytorch: 2.9.1
+          #   axolotl_extras: "fbgemm-gpu"
+          #   num_gpus: 2
+          #   dockerfile: "Dockerfile-uv.jinja"
           - cuda: 130
             cuda_version: 13.0.0
             python_version: "3.11"
```
.github/workflows/preview-docs.yml (vendored), 8 lines changed

```diff
@@ -14,14 +14,8 @@ on:
       - .github/workflows/preview-docs.yml
 
 permissions:
-  checks: write
-  contents: write
-  deployments: write
-  issues: write
-  discussions: write
-  pages: write
+  contents: read
   pull-requests: write
-  statuses: write
 
 jobs:
   preview:
```
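The `permissions` block shrinks to the two scopes the preview job actually needs, `contents: read` and `pull-requests: write`, dropping the broad write grants in line with least-privilege defaults for the workflow's `GITHUB_TOKEN`.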
.github/workflows/tests-nightly.yml (vendored), 9 lines changed

```diff
@@ -3,6 +3,10 @@ on:
   workflow_dispatch:
   schedule:
     - cron: '0 0 * * *' # Runs at 00:00 UTC every day
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - '.github/workflows/tests-nightly.yml'
 
 jobs:
   pre-commit:
```
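Scoping the new `pull_request` trigger to `.github/workflows/tests-nightly.yml` means edits to the nightly workflow are exercised on the PR that changes them, rather than first failing on the next scheduled run.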
```diff
@@ -27,7 +31,7 @@ jobs:
       - name: Restore Cache from S3
         id: hf-cache-restore-s3
         run: |
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
 
   pytest:
     name: PyTest
```
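The replacement command requests only the first 1 KiB (`Range: bytes=0-1023`) with verbose output, enough to hit the CDN edge that the `prime-cdn-s3-cache` job warms without downloading the entire cache archive on every run.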
```diff
@@ -35,7 +39,6 @@ jobs:
     needs: [prime-cdn-s3-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
       matrix:
         python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
         pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
@@ -60,7 +63,7 @@ jobs:
       - name: upgrade pip
         run: |
           pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
 
       - name: Install PyTorch
         run: |
```
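The `setuptools` pin moves from 75.8.0 to 78.1.1 here and again in the two Dockerfile hunks below; 78.1.1 is a security-fix release (it patched a path-traversal issue in setuptools' `PackageIndex`), which is the likely motivation for the bump.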
.github/workflows/tests.yml (vendored), 2 lines changed

```diff
@@ -55,7 +55,7 @@ jobs:
       - name: Restore Cache from S3
         id: hf-cache-restore-s3
         run: |
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
 
   pytest:
     name: PyTest
```
```diff
@@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
     sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
     fi
 
-RUN uv pip install packaging==26.0 setuptools==75.8.0
+RUN uv pip install packaging==26.0 setuptools==78.1.1
 RUN uv pip install torchvision
 RUN uv pip uninstall causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
```
```diff
@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
     sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
     fi
 
-RUN pip install packaging==26.0 setuptools==75.8.0 psutil
+RUN pip install packaging==26.0 setuptools==78.1.1 psutil
 RUN pip uninstall -y causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
     pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
```
```diff
@@ -26,7 +26,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
     model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
 
     LOG.info("Running merge of LoRA with base model...")
-    model = model.merge_and_unload(progressbar=True)
+    model = model.merge_and_unload(progressbar=True, safe_merge=True)
     try:
         model.to(dtype=cfg.torch_dtype)
     except ValueError as e:
```
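Every `merge_and_unload` call in this compare gains `safe_merge=True`. In PEFT, that flag makes the merge build the fused weight on a copy and check that it is finite before writing it back, so a broken adapter raises instead of silently corrupting the base model. A minimal standalone sketch of the idea (not PEFT's actual code; `lora_A`, `lora_B`, and `scaling` follow the usual LoRA convention):

```python
import torch

def merge_lora_safely(base_weight: torch.Tensor,
                      lora_A: torch.Tensor,
                      lora_B: torch.Tensor,
                      scaling: float) -> torch.Tensor:
    """PEFT-style safe merge: compute the merged weight on a copy and
    verify it is finite before committing it to the base layer."""
    merged = base_weight.clone() + (lora_B @ lora_A) * scaling
    if not torch.isfinite(merged).all():
        # Without safe_merge, NaN/Inf values would be written silently.
        raise ValueError("NaNs detected in the merged weights; aborting merge")
    return merged
```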
```diff
@@ -226,7 +226,7 @@ class ModelLoader:
             isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
             and not self.is_qlora_and_fsdp_enabled
         ):
-            self.model = self.model.merge_and_unload()
+            self.model = self.model.merge_and_unload(safe_merge=True)
 
         self._configure_experts_implementation()
         self._apply_activation_checkpointing()
```
```diff
@@ -7,6 +7,8 @@ on-the-fly (4-bit via bitsandbytes parametrize, 8-bit via custom int8 parametrize)
 reducing peak VRAM from "all experts in bf16" to "one expert at a time."
 """
 
+import os
+
 import bitsandbytes as bnb
 import torch
 import torch.nn.utils.parametrize as P
```
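The docstring above describes quantizing each expert on the fly as it is loaded. As a toy illustration of the `torch.nn.utils.parametrize` mechanism it names (a sketch under assumptions, not the patch itself: the class and helper names are mine, `nf4` is an assumed default, and the weight is assumed to already be on a CUDA device):

```python
import bitsandbytes as bnb
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as P

class DequantOnAccess(nn.Module):
    """Parametrization holding a 4-bit packed weight; dequantizes it
    whenever `module.weight` is read."""

    def __init__(self, weight: torch.Tensor, quant_type: str = "nf4"):
        super().__init__()
        packed, self.quant_state = bnb.functional.quantize_4bit(
            weight, quant_type=quant_type
        )
        self.register_buffer("packed", packed)

    def forward(self, _original: torch.Tensor) -> torch.Tensor:
        # The retained "original" tensor is ignored (and shrunk below),
        # so only the 4-bit copy occupies VRAM between accesses.
        return bnb.functional.dequantize_4bit(self.packed, self.quant_state)

def quantize_linear_inplace(linear: nn.Linear) -> None:
    """Swap a CUDA Linear's bf16 weight for a dequantize-on-access view."""
    P.register_parametrization(
        linear, "weight", DequantOnAccess(linear.weight.data), unsafe=True
    )
    # Replace the retained full-precision original with an empty stub
    # to actually reclaim the bf16 memory.
    linear.parametrizations["weight"].original = nn.Parameter(
        torch.empty(0, device=linear.weight.device), requires_grad=False
    )
```

Per the docstring, the real module also covers an 8-bit int8 parametrization and hooks the load path so experts are converted one at a time.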
```diff
@@ -101,6 +103,14 @@ def patch_moe_quantization_on_load(cfg):
     _moe_load_state["quant_type"] = quant_type
     _moe_load_state["compress_statistics"] = compress_statistics
 
+    # Disable async tensor loading. Transformers' convert_and_load_state_dict_in_model
+    # uses a ThreadPoolExecutor to materialise tensors (move from safetensors → CUDA)
+    # ahead of time. With MoE models this pre-fetches many large bf16 expert tensors
+    # onto the GPU simultaneously — long before our set_param_for_module patch can
+    # quantise and free them one-by-one — causing OOM even at <5 % of weights loaded.
+    # Sequential loading ensures only ONE bf16 expert tensor is on-GPU at a time.
+    os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
+
     # Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
     # size for all params, defeating our on-load quantization VRAM savings.
     def _noop_warmup(*args, **kwargs):
```
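The hunk cuts off at `_noop_warmup`'s definition; the continuation presumably just swaps transformers' warmup for a no-op. A rough sketch, assuming the patch point is `transformers.modeling_utils.caching_allocator_warmup` (where that helper lives in recent transformers releases; the PR's own code may attach it differently):

```python
import transformers.modeling_utils as modeling_utils

def _noop_warmup(*args, **kwargs):
    # Skip the warmup's bf16-sized pre-allocation: weights are quantized
    # as they load, so warming the allocator at full precision only
    # defeats the on-load VRAM savings.
    return None

# Hypothetical patch point (an assumption, see the note above).
modeling_utils.caching_allocator_warmup = _noop_warmup
```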
```diff
@@ -257,7 +257,7 @@ def save_trained_model(
     # Handle ReLoRA early return case
     if cfg.relora:
         if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
-            model = model.merge_and_unload()
+            model = model.merge_and_unload(safe_merge=True)
         else:
             # final model weights have already been saved by `ReLoRACallback.on_train_end`
             return
```
tests/e2e/integrations/test_scattermoe_lora_kernels.py (new file, 1478 lines)
File diff suppressed because it is too large

tests/e2e/integrations/test_scattermoe_lora_olmoe.py (new file, 1255 lines)
File diff suppressed because it is too large
```diff
@@ -69,7 +69,7 @@ class TestAdapterMergeUnmerge:
 
         self.scaling = alpha / r
 
-        def mock_merge_and_unload(progressbar=False):
+        def mock_merge_and_unload(progressbar=False, safe_merge=False):
             """Simulate the actual merge operation"""
             # Apply LoRA delta to base weights: W_new = W_base + (B @ A) * scaling
             delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
```
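The mock gains the same `safe_merge` keyword so its signature keeps matching the patched call sites; the delta it applies, `W_new = W_base + (B @ A) * scaling` with `scaling = alpha / r`, is the standard LoRA merge that the safe-merge guard wraps.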