Compare commits
6 Commits
v0.15.0
...
fix/merge-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dce5bed379 | ||
|
|
cf4d550c88 | ||
|
|
43b1c80aa6 | ||
|
|
a36aaa70ce | ||
|
|
80f7088ad1 | ||
|
|
46b9f40f2a |
15
.github/workflows/multi-gpu-e2e.yml
vendored
15
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -8,6 +8,7 @@ on:
|
|||||||
- 'setup.py'
|
- 'setup.py'
|
||||||
- 'pyproject.toml'
|
- 'pyproject.toml'
|
||||||
- '.github/workflows/multi-gpu-e2e.yml'
|
- '.github/workflows/multi-gpu-e2e.yml'
|
||||||
|
- 'scripts/cutcrossentropy_install.py'
|
||||||
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
|
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
|
||||||
- 'src/axolotl/utils/distributed.py'
|
- 'src/axolotl/utils/distributed.py'
|
||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
@@ -35,13 +36,13 @@ jobs:
|
|||||||
pytorch: 2.8.0
|
pytorch: 2.8.0
|
||||||
axolotl_extras: fbgemm-gpu
|
axolotl_extras: fbgemm-gpu
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
- cuda: 129
|
# - cuda: 129
|
||||||
cuda_version: 12.9.1
|
# cuda_version: 12.9.1
|
||||||
python_version: "3.12"
|
# python_version: "3.12"
|
||||||
pytorch: 2.9.1
|
# pytorch: 2.9.1
|
||||||
axolotl_extras: "fbgemm-gpu"
|
# axolotl_extras: "fbgemm-gpu"
|
||||||
num_gpus: 2
|
# num_gpus: 2
|
||||||
dockerfile: "Dockerfile-uv.jinja"
|
# dockerfile: "Dockerfile-uv.jinja"
|
||||||
- cuda: 130
|
- cuda: 130
|
||||||
cuda_version: 13.0.0
|
cuda_version: 13.0.0
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
|
|||||||
8
.github/workflows/preview-docs.yml
vendored
8
.github/workflows/preview-docs.yml
vendored
@@ -14,14 +14,8 @@ on:
|
|||||||
- .github/workflows/preview-docs.yml
|
- .github/workflows/preview-docs.yml
|
||||||
|
|
||||||
permissions:
|
permissions:
|
||||||
checks: write
|
contents: read
|
||||||
contents: write
|
|
||||||
deployments: write
|
|
||||||
issues: write
|
|
||||||
discussions: write
|
|
||||||
pages: write
|
|
||||||
pull-requests: write
|
pull-requests: write
|
||||||
statuses: write
|
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
preview:
|
preview:
|
||||||
|
|||||||
9
.github/workflows/tests-nightly.yml
vendored
9
.github/workflows/tests-nightly.yml
vendored
@@ -3,6 +3,10 @@ on:
|
|||||||
workflow_dispatch:
|
workflow_dispatch:
|
||||||
schedule:
|
schedule:
|
||||||
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
|
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
|
||||||
|
pull_request:
|
||||||
|
types: [opened, synchronize, reopened, ready_for_review]
|
||||||
|
paths:
|
||||||
|
- '.github/workflows/tests-nightly.yml'
|
||||||
|
|
||||||
jobs:
|
jobs:
|
||||||
pre-commit:
|
pre-commit:
|
||||||
@@ -27,7 +31,7 @@ jobs:
|
|||||||
- name: Restore Cache from S3
|
- name: Restore Cache from S3
|
||||||
id: hf-cache-restore-s3
|
id: hf-cache-restore-s3
|
||||||
run: |
|
run: |
|
||||||
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
|
curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
|
||||||
|
|
||||||
pytest:
|
pytest:
|
||||||
name: PyTest
|
name: PyTest
|
||||||
@@ -35,7 +39,6 @@ jobs:
|
|||||||
needs: [prime-cdn-s3-cache]
|
needs: [prime-cdn-s3-cache]
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
max-parallel: 2
|
|
||||||
matrix:
|
matrix:
|
||||||
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
|
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
|
||||||
pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
|
pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
|
||||||
@@ -60,7 +63,7 @@ jobs:
|
|||||||
- name: upgrade pip
|
- name: upgrade pip
|
||||||
run: |
|
run: |
|
||||||
pip3 install --upgrade pip
|
pip3 install --upgrade pip
|
||||||
pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
|
pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
|
||||||
|
|
||||||
- name: Install PyTorch
|
- name: Install PyTorch
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
2
.github/workflows/tests.yml
vendored
2
.github/workflows/tests.yml
vendored
@@ -55,7 +55,7 @@ jobs:
|
|||||||
- name: Restore Cache from S3
|
- name: Restore Cache from S3
|
||||||
id: hf-cache-restore-s3
|
id: hf-cache-restore-s3
|
||||||
run: |
|
run: |
|
||||||
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
|
curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
|
||||||
|
|
||||||
pytest:
|
pytest:
|
||||||
name: PyTest
|
name: PyTest
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
|||||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RUN uv pip install packaging==26.0 setuptools==75.8.0
|
RUN uv pip install packaging==26.0 setuptools==78.1.1
|
||||||
RUN uv pip install torchvision
|
RUN uv pip install torchvision
|
||||||
RUN uv pip uninstall causal_conv1d
|
RUN uv pip uninstall causal_conv1d
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
|
|||||||
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
RUN pip install packaging==26.0 setuptools==75.8.0 psutil
|
RUN pip install packaging==26.0 setuptools==78.1.1 psutil
|
||||||
RUN pip uninstall -y causal_conv1d
|
RUN pip uninstall -y causal_conv1d
|
||||||
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
|
||||||
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
|
||||||
|
|||||||
@@ -26,7 +26,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
|
|||||||
model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
|
model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
|
||||||
|
|
||||||
LOG.info("Running merge of LoRA with base model...")
|
LOG.info("Running merge of LoRA with base model...")
|
||||||
model = model.merge_and_unload(progressbar=True)
|
model = model.merge_and_unload(progressbar=True, safe_merge=True)
|
||||||
try:
|
try:
|
||||||
model.to(dtype=cfg.torch_dtype)
|
model.to(dtype=cfg.torch_dtype)
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
|
|||||||
@@ -226,7 +226,7 @@ class ModelLoader:
|
|||||||
isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
|
isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
|
||||||
and not self.is_qlora_and_fsdp_enabled
|
and not self.is_qlora_and_fsdp_enabled
|
||||||
):
|
):
|
||||||
self.model = self.model.merge_and_unload()
|
self.model = self.model.merge_and_unload(safe_merge=True)
|
||||||
|
|
||||||
self._configure_experts_implementation()
|
self._configure_experts_implementation()
|
||||||
self._apply_activation_checkpointing()
|
self._apply_activation_checkpointing()
|
||||||
|
|||||||
@@ -7,6 +7,8 @@ on-the-fly (4-bit via bitsandbytes parametrize, 8-bit via custom int8 parametriz
|
|||||||
reducing peak VRAM from "all experts in bf16" to "one expert at a time."
|
reducing peak VRAM from "all experts in bf16" to "one expert at a time."
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
|
||||||
import bitsandbytes as bnb
|
import bitsandbytes as bnb
|
||||||
import torch
|
import torch
|
||||||
import torch.nn.utils.parametrize as P
|
import torch.nn.utils.parametrize as P
|
||||||
@@ -101,6 +103,14 @@ def patch_moe_quantization_on_load(cfg):
|
|||||||
_moe_load_state["quant_type"] = quant_type
|
_moe_load_state["quant_type"] = quant_type
|
||||||
_moe_load_state["compress_statistics"] = compress_statistics
|
_moe_load_state["compress_statistics"] = compress_statistics
|
||||||
|
|
||||||
|
# Disable async tensor loading. Transformers' convert_and_load_state_dict_in_model
|
||||||
|
# uses a ThreadPoolExecutor to materialise tensors (move from safetensors → CUDA)
|
||||||
|
# ahead of time. With MoE models this pre-fetches many large bf16 expert tensors
|
||||||
|
# onto the GPU simultaneously — long before our set_param_for_module patch can
|
||||||
|
# quantise and free them one-by-one — causing OOM even at <5 % of weights loaded.
|
||||||
|
# Sequential loading ensures only ONE bf16 expert tensor is on-GPU at a time.
|
||||||
|
os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
|
||||||
|
|
||||||
# Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
|
# Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
|
||||||
# size for all params, defeating our on-load quantization VRAM savings.
|
# size for all params, defeating our on-load quantization VRAM savings.
|
||||||
def _noop_warmup(*args, **kwargs):
|
def _noop_warmup(*args, **kwargs):
|
||||||
|
|||||||
@@ -257,7 +257,7 @@ def save_trained_model(
|
|||||||
# Handle ReLoRA early return case
|
# Handle ReLoRA early return case
|
||||||
if cfg.relora:
|
if cfg.relora:
|
||||||
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
|
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
|
||||||
model = model.merge_and_unload()
|
model = model.merge_and_unload(safe_merge=True)
|
||||||
else:
|
else:
|
||||||
# final model weights have already been saved by `ReLoRACallback.on_train_end`
|
# final model weights have already been saved by `ReLoRACallback.on_train_end`
|
||||||
return
|
return
|
||||||
|
|||||||
1478
tests/e2e/integrations/test_scattermoe_lora_kernels.py
Normal file
1478
tests/e2e/integrations/test_scattermoe_lora_kernels.py
Normal file
File diff suppressed because it is too large
Load Diff
1255
tests/e2e/integrations/test_scattermoe_lora_olmoe.py
Normal file
1255
tests/e2e/integrations/test_scattermoe_lora_olmoe.py
Normal file
File diff suppressed because it is too large
Load Diff
@@ -69,7 +69,7 @@ class TestAdapterMergeUnmerge:
|
|||||||
|
|
||||||
self.scaling = alpha / r
|
self.scaling = alpha / r
|
||||||
|
|
||||||
def mock_merge_and_unload(progressbar=False):
|
def mock_merge_and_unload(progressbar=False, safe_merge=False):
|
||||||
"""Simulate the actual merge operation"""
|
"""Simulate the actual merge operation"""
|
||||||
# Apply LoRA delta to base weights: W_new = W_base + (B @ A) * scaling
|
# Apply LoRA delta to base weights: W_new = W_base + (B @ A) * scaling
|
||||||
delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
|
delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
|
||||||
|
|||||||
Reference in New Issue
Block a user