Compare commits


6 Commits

Author SHA1 Message Date
NanoCode012
dce5bed379 feat: merge adapter in fp32 2026-03-14 00:20:59 +07:00
NanoCode012
cf4d550c88 fix: reduce permissions for preview docs CI (#3480) [skip ci] 2026-03-09 08:04:31 -04:00
Wing Lian
43b1c80aa6 load weights synchronously so they can be converted and not OOM: (#3477) 2026-03-07 07:09:24 -05:00
Wing Lian
a36aaa70ce add gpu tests for scattermoe (#3474) [skip ci] 2026-03-07 00:00:48 -05:00
Wing Lian
80f7088ad1 update setuptools so trl can be installed from main for nightlies (#3471) 2026-03-06 14:59:25 -05:00

* update setuptools so trl can be installed from main for nightlies
* run the nightly in the PR CI on change
* use range request, don't use cu129 in CI since it's not supported with AO
* run multigpu ci if CCE install script changes
Wing Lian
46b9f40f2a bump dev version to 0.16.0.dev0 (#3472) [skip ci] 2026-03-06 14:59:00 -05:00
14 changed files with 2766 additions and 25 deletions

View File

@@ -8,6 +8,7 @@ on:
- 'setup.py'
- 'pyproject.toml'
- '.github/workflows/multi-gpu-e2e.yml'
- 'scripts/cutcrossentropy_install.py'
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
- 'src/axolotl/utils/distributed.py'
workflow_dispatch:
@@ -35,13 +36,13 @@ jobs:
pytorch: 2.8.0
axolotl_extras: fbgemm-gpu
num_gpus: 2
- cuda: 129
cuda_version: 12.9.1
python_version: "3.12"
pytorch: 2.9.1
axolotl_extras: "fbgemm-gpu"
num_gpus: 2
dockerfile: "Dockerfile-uv.jinja"
# - cuda: 129
# cuda_version: 12.9.1
# python_version: "3.12"
# pytorch: 2.9.1
# axolotl_extras: "fbgemm-gpu"
# num_gpus: 2
# dockerfile: "Dockerfile-uv.jinja"
- cuda: 130
cuda_version: 13.0.0
python_version: "3.11"

View File

@@ -14,14 +14,8 @@ on:
- .github/workflows/preview-docs.yml
permissions:
checks: write
contents: write
deployments: write
issues: write
discussions: write
pages: write
contents: read
pull-requests: write
statuses: write
jobs:
preview:

View File

@@ -3,6 +3,10 @@ on:
workflow_dispatch:
schedule:
- cron: '0 0 * * *' # Runs at 00:00 UTC every day
pull_request:
types: [opened, synchronize, reopened, ready_for_review]
paths:
- '.github/workflows/tests-nightly.yml'
jobs:
pre-commit:
@@ -27,7 +31,7 @@ jobs:
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
pytest:
name: PyTest
@@ -35,7 +39,6 @@ jobs:
needs: [prime-cdn-s3-cache]
strategy:
fail-fast: false
max-parallel: 2
matrix:
python_version: ["3.12"] # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
@@ -60,7 +63,7 @@ jobs:
- name: upgrade pip
run: |
pip3 install --upgrade pip
pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
- name: Install PyTorch
run: |

View File

@@ -55,7 +55,7 @@ jobs:
- name: Restore Cache from S3
id: hf-cache-restore-s3
run: |
curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
pytest:
name: PyTest
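
Both curl steps above were changed to fetch only the first KiB of hf-cache.tar.zst via an HTTP Range header (the "use range request" bullet in commit 80f7088ad1), presumably so the CDN object is touched without transferring the full archive on every run. A rough Python sketch of the same idea, assuming nothing beyond the URL shown in the workflow; the helper name and 1 KiB window are illustrative:

import requests

# URL taken from the workflow step above; everything else is hypothetical.
CACHE_URL = "https://axolotl-ci.b-cdn.net/hf-cache.tar.zst"

def prime_cdn_cache(url: str = CACHE_URL, nbytes: int = 1024) -> int:
    """Request only the first `nbytes` of the object instead of the whole file."""
    resp = requests.get(url, headers={"Range": f"bytes=0-{nbytes - 1}"}, timeout=30)
    resp.raise_for_status()
    # 206 Partial Content means the range was honoured; 200 means the server
    # ignored the header and returned the full body.
    return resp.status_code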

View File

@@ -1 +1 @@
0.15.0
0.16.0.dev0

View File

@@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi
RUN uv pip install packaging==26.0 setuptools==75.8.0
RUN uv pip install packaging==26.0 setuptools==78.1.1
RUN uv pip install torchvision
RUN uv pip uninstall causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \

View File

@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
fi
RUN pip install packaging==26.0 setuptools==75.8.0 psutil
RUN pip install packaging==26.0 setuptools==78.1.1 psutil
RUN pip uninstall -y causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \

View File

@@ -26,7 +26,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
LOG.info("Running merge of LoRA with base model...")
model = model.merge_and_unload(progressbar=True)
model = model.merge_and_unload(progressbar=True, safe_merge=True)
try:
model.to(dtype=cfg.torch_dtype)
except ValueError as e:
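
This call site, and the two further down in this changeset, now pass safe_merge=True to PEFT's merge_and_unload. As a rough illustration of what that flag adds on top of a plain merge (based on PEFT's documented behaviour, not this repo's code): the LoRA delta is applied to a copy of the base weight and validated before the real weight is overwritten, so a corrupted adapter fails loudly instead of silently producing a broken merged model:

import torch

def merge_lora(base_w: torch.Tensor, lora_A: torch.Tensor, lora_B: torch.Tensor,
               scaling: float, safe_merge: bool = False) -> torch.Tensor:
    """Toy stand-in for a LoRA merge; not PEFT's implementation."""
    delta = (lora_B @ lora_A) * scaling  # W_new = W_base + (B @ A) * scaling
    if safe_merge:
        merged = base_w.clone() + delta
        if not torch.isfinite(merged).all():
            raise ValueError("NaNs/Infs detected in merged weights; adapter appears broken")
        return merged
    return base_w + delta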

View File

@@ -226,7 +226,7 @@ class ModelLoader:
isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
and not self.is_qlora_and_fsdp_enabled
):
self.model = self.model.merge_and_unload()
self.model = self.model.merge_and_unload(safe_merge=True)
self._configure_experts_implementation()
self._apply_activation_checkpointing()

View File

@@ -7,6 +7,8 @@ on-the-fly (4-bit via bitsandbytes parametrize, 8-bit via custom int8 parametriz
reducing peak VRAM from "all experts in bf16" to "one expert at a time."
"""
import os
import bitsandbytes as bnb
import torch
import torch.nn.utils.parametrize as P
@@ -101,6 +103,14 @@ def patch_moe_quantization_on_load(cfg):
_moe_load_state["quant_type"] = quant_type
_moe_load_state["compress_statistics"] = compress_statistics
# Disable async tensor loading. Transformers' convert_and_load_state_dict_in_model
# uses a ThreadPoolExecutor to materialise tensors (move from safetensors → CUDA)
# ahead of time. With MoE models this pre-fetches many large bf16 expert tensors
# onto the GPU simultaneously — long before our set_param_for_module patch can
# quantise and free them one-by-one — causing OOM even at <5 % of weights loaded.
# Sequential loading ensures only ONE bf16 expert tensor is on-GPU at a time.
os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
# Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
# size for all params, defeating our on-load quantization VRAM savings.
def _noop_warmup(*args, **kwargs):
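
The comments above describe the load-time strategy: keep the packed low-bit tensor as the stored parameter and dequantize lazily on access, so at most one expert is materialised in bf16 at a time. A minimal sketch of that parametrize-based pattern using bitsandbytes' NF4 routines; the class and function names are invented for illustration and this is not the repo's implementation:

import bitsandbytes as bnb
import torch
import torch.nn as nn
import torch.nn.utils.parametrize as P

class DequantizeNF4(nn.Module):
    """Parametrization whose forward() unpacks a 4-bit tensor on access."""

    def __init__(self, quant_state):
        super().__init__()
        self.quant_state = quant_state

    def forward(self, packed: torch.Tensor) -> torch.Tensor:
        return bnb.functional.dequantize_4bit(packed, self.quant_state)

def quantize_weight_on_load(module: nn.Module) -> None:
    """Replace module.weight with its NF4-packed form plus a lazy dequant."""
    with torch.no_grad():
        packed, quant_state = bnb.functional.quantize_4bit(
            module.weight.data.cuda(), quant_type="nf4", compress_statistics=True
        )
    # The stored parameter is now the small packed tensor; the original bf16
    # tensor can be freed right away, which is what keeps peak VRAM low.
    module.weight = nn.Parameter(packed, requires_grad=False)
    # unsafe=True because the packed shape differs from the dequantized shape.
    P.register_parametrization(module, "weight", DequantizeNF4(quant_state), unsafe=True)

Combined with the sequential loading forced by HF_DEACTIVATE_ASYNC_LOAD=1 above, this keeps only one unquantized expert tensor resident on the GPU at any point during load.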

View File

@@ -257,7 +257,7 @@ def save_trained_model(
# Handle ReLoRA early return case
if cfg.relora:
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
model = model.merge_and_unload()
model = model.merge_and_unload(safe_merge=True)
else:
# final model weights have already been saved by `ReLoRACallback.on_train_end`
return

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -69,7 +69,7 @@ class TestAdapterMergeUnmerge:
self.scaling = alpha / r
def mock_merge_and_unload(progressbar=False):
def mock_merge_and_unload(progressbar=False, safe_merge=False):
"""Simulate the actual merge operation"""
# Apply LoRA delta to base weights: W_new = W_base + (B @ A) * scaling
delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling