feat: merge adapter in fp32

fix: reduce permissions for preview docs CI (#3480) [skip ci]
load weights synchronously so they can be converted and not OOM (#3477)
2026-03-14 00:20:59 +07:00 · 2026-03-09 08:04:31 -04:00 · 2026-03-07 07:09:24 -05:00 · 2026-03-07 00:00:48 -05:00 · 2026-03-06 14:59:25 -05:00 · 2026-03-06 14:59:00 -05:00
14 changed files with 2766 additions and 25 deletions
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -8,6 +8,7 @@ on:
      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
+      - 'scripts/cutcrossentropy_install.py'
      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
@@ -35,13 +36,13 @@ jobs:
            pytorch: 2.8.0
            axolotl_extras: fbgemm-gpu
            num_gpus: 2
-          - cuda: 129
+#          - cuda: 129
-            cuda_version: 12.9.1
+#            cuda_version: 12.9.1
-            python_version: "3.12"
+#            python_version: "3.12"
-            pytorch: 2.9.1
+#            pytorch: 2.9.1
-            axolotl_extras: "fbgemm-gpu"
+#            axolotl_extras: "fbgemm-gpu"
-            num_gpus: 2
+#            num_gpus: 2
-            dockerfile: "Dockerfile-uv.jinja"
+#            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
--- a/.github/workflows/preview-docs.yml
+++ b/.github/workflows/preview-docs.yml
@@ -14,14 +14,8 @@ on:
      - .github/workflows/preview-docs.yml
 permissions:
-  checks: write
+  contents: read
  contents: write
  deployments: write
  issues: write
  discussions: write
  pages: write
  pull-requests: write
  statuses: write
 jobs:
  preview:
--- a/.github/workflows/tests-nightly.yml
+++ b/.github/workflows/tests-nightly.yml
@@ -3,6 +3,10 @@ on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
+  pull_request:
+    types: [opened, synchronize, reopened, ready_for_review]
+    paths:
+      - '.github/workflows/tests-nightly.yml'
 jobs:
  pre-commit:
@@ -27,7 +31,7 @@ jobs:
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
  pytest:
    name: PyTest
@@ -35,7 +39,6 @@ jobs:
    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      max-parallel: 2
      matrix:
        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
@@ -60,7 +63,7 @@ jobs:
      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel
+          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel
      - name: Install PyTorch
        run: |
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -55,7 +55,7 @@ jobs:
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
-          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
+          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null
  pytest:
    name: PyTest
--- a/2
+++ b/2
@@ -1 +1 @@
-0.15.0
+0.16.0.dev0
--- a/cicd/Dockerfile-uv.jinja
+++ b/cicd/Dockerfile-uv.jinja
@@ -31,7 +31,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi
-RUN uv pip install packaging==26.0 setuptools==75.8.0
+RUN uv pip install packaging==26.0 setuptools==78.1.1
 RUN uv pip install torchvision
 RUN uv pip uninstall causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
--- a/cicd/Dockerfile.jinja
+++ b/cicd/Dockerfile.jinja
@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi
-RUN pip install packaging==26.0 setuptools==75.8.0 psutil
+RUN pip install packaging==26.0 setuptools==78.1.1 psutil
 RUN pip uninstall -y causal_conv1d
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
--- a/src/axolotl/cli/merge_lora.py
+++ b/src/axolotl/cli/merge_lora.py
@@ -26,7 +26,7 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
    model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
    LOG.info("Running merge of LoRA with base model...")
-    model = model.merge_and_unload(progressbar=True)
+    model = model.merge_and_unload(progressbar=True, safe_merge=True)
    try:
        model.to(dtype=cfg.torch_dtype)
    except ValueError as e:
--- a/src/axolotl/loaders/model.py
+++ b/src/axolotl/loaders/model.py
@@ -226,7 +226,7 @@ class ModelLoader:
            isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
            and not self.is_qlora_and_fsdp_enabled
        ):
-            self.model = self.model.merge_and_unload()
+            self.model = self.model.merge_and_unload(safe_merge=True)
        self._configure_experts_implementation()
        self._apply_activation_checkpointing()
--- a/src/axolotl/monkeypatch/moe_quant.py
+++ b/src/axolotl/monkeypatch/moe_quant.py
@@ -7,6 +7,8 @@ on-the-fly (4-bit via bitsandbytes parametrize, 8-bit via custom int8 parametriz
 reducing peak VRAM from "all experts in bf16" to "one expert at a time."
 """
 import os
 import bitsandbytes as bnb
 import torch
 import torch.nn.utils.parametrize as P
@@ -101,6 +103,14 @@ def patch_moe_quantization_on_load(cfg):
        _moe_load_state["quant_type"] = quant_type
        _moe_load_state["compress_statistics"] = compress_statistics
+    # Disable async tensor loading.  Transformers' convert_and_load_state_dict_in_model
+    # uses a ThreadPoolExecutor to materialise tensors (move from safetensors → CUDA)
+    # ahead of time.  With MoE models this pre-fetches many large bf16 expert tensors
+    # onto the GPU simultaneously — long before our set_param_for_module patch can
+    # quantise and free them one-by-one — causing OOM even at <5 % of weights loaded.
+    # Sequential loading ensures only ONE bf16 expert tensor is on-GPU at a time.
+    os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"
    # Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
    # size for all params, defeating our on-load quantization VRAM savings.
    def _noop_warmup(*args, **kwargs):
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -257,7 +257,7 @@ def save_trained_model(
    # Handle ReLoRA early return case
    if cfg.relora:
        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
-            model = model.merge_and_unload()
+            model = model.merge_and_unload(safe_merge=True)
        else:
            # final model weights have already been saved by `ReLoRACallback.on_train_end`
            return
--- a/tests/e2e/integrations/test_scattermoe_lora_kernels.py
+++ b/tests/e2e/integrations/test_scattermoe_lora_kernels.py
--- a/tests/e2e/integrations/test_scattermoe_lora_olmoe.py
+++ b/tests/e2e/integrations/test_scattermoe_lora_olmoe.py
--- a/tests/utils/lora/test_merge_lora.py
+++ b/tests/utils/lora/test_merge_lora.py
@@ -69,7 +69,7 @@ class TestAdapterMergeUnmerge:
        self.scaling = alpha / r
-        def mock_merge_and_unload(progressbar=False):
+        def mock_merge_and_unload(progressbar=False, safe_merge=False):
            """Simulate the actual merge operation"""
            # Apply LoRA delta to base weights: W_new = W_base + (B @ A) * scaling
            delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
Author	SHA1	Message	Date
NanoCode012	dce5bed379	feat: merge adapter in fp32	2026-03-14 00:20:59 +07:00
NanoCode012	cf4d550c88	fix: reduce permissions for preview docs CI (#3480) [skip ci]	2026-03-09 08:04:31 -04:00
Wing Lian	43b1c80aa6	load weights synchronously so they can be converted and not OOM (#3477)	2026-03-07 07:09:24 -05:00
Wing Lian	a36aaa70ce	add gpu tests for scattermoe (#3474) [skip ci]	2026-03-07 00:00:48 -05:00
Wing Lian	80f7088ad1	update setuptools so trl can be installed from main for nightlies (#3471) * update setuptools so trl can be installed from main for nightlies * run the nightly in the PR CI on change * use range request, don't use cu129 in CI since it's not supported with AO * run multigpu ci if CCE install script changes	2026-03-06 14:59:25 -05:00
Wing Lian	46b9f40f2a	bump dev version to 0.16.0.dev0 (#3472) [skip ci]	2026-03-06 14:59:00 -05:00