Merge branch 'main' into testingci

wip
2025-07-24 09:13:15 +01:00 · 2025-07-24 09:12:55 +01:00
1 changed files with 0 additions and 351 deletions
--- a/src/axolotl/monkeypatch/accelerate/fsdp2.py
+++ b/src/axolotl/monkeypatch/accelerate/fsdp2.py
@@ -1,351 +0,0 @@
-"""
-monkeypatch for accelerate fsdp2 fix when modifying ordereddict during interation, and saving full state dicts
-"""
-
-import copy
-import functools
-import sys
-
-import torch
-from torch import nn
-
-from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.logging import get_logger
-
-LOG = get_logger(__name__)
-
-
-def fsdp2_load_full_state_dict(
-    _accelerator, model: torch.nn.Module, full_sd: dict, offload_to_cpu: bool = False
-):
-    """
-    Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
-    parameters from rank 0 to all other ranks. This function modifies the model in-place.
-    Args:
-        accelerator (`Accelerator`): The accelerator instance
-        model (`torch.nn.Module`):
-            The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
-        full_sd (`dict`): The full state dict to load, can only be on rank 0
-    """
-    from torch.distributed.tensor import distribute_tensor
-
-    LOG.info("Broadcasting full state dict to all ranks...")
-    import time
-
-    start_time = time.time()
-
-    meta_sharded_sd = model.state_dict()
-    sharded_sd = {}
-    for param_name, full_tensor in full_sd.items():
-        sharded_meta_param = meta_sharded_sd.get(param_name)
-        full_tensor = full_tensor.to(sharded_meta_param.dtype).to(torch.device("cuda"))
-        if hasattr(sharded_meta_param, "device_mesh"):
-            sharded_param = distribute_tensor(
-                full_tensor,
-                sharded_meta_param.device_mesh,
-                sharded_meta_param.placements,
-                src_data_rank=0,
-            )
-        else:
-            sharded_param = full_tensor
-
-        if offload_to_cpu:
-            sharded_param = sharded_param.cpu()
-
-        sharded_sd[param_name] = nn.Parameter(sharded_param)
-        del full_tensor
-        full_sd[param_name] = None
-    model.load_state_dict(sharded_sd, assign=True, strict=True)
-    end_time = time.time()
-    LOG.debug(
-        f"Time taken to load full state dict: {(end_time - start_time):.2f} seconds"
-    )
-    log_gpu_memory_usage(LOG, "Memory usage after broadcasting full state dict", 0)
-    return model
-
-
-def get_state_dict(self, model, unwrap=True):
-    """
-    Returns the state dictionary of a model sent through [`Accelerator.prepare`] potentially without full
-    precision.
-
-    Args:
-        model (`torch.nn.Module`):
-            A PyTorch model sent through [`Accelerator.prepare`]
-        unwrap (`bool`, *optional*, defaults to `True`):
-            Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict
-
-    Returns:
-        `dict`: The state dictionary of the model potentially without full precision.
-
-    Example:
-
-    ```python
-    >>> import torch
-    >>> from accelerate import Accelerator
-
-    >>> accelerator = Accelerator()
-    >>> net = torch.nn.Linear(2, 2)
-    >>> net = accelerator.prepare(net)
-    >>> state_dict = accelerator.get_state_dict(net)
-    ```
-    """
-    from accelerate import DistributedType
-    from accelerate.utils import compare_versions
-
-    if self.distributed_type == DistributedType.DEEPSPEED:
-        zero3_sharding = self.deepspeed_config["zero_optimization"]["stage"] == 3
-        tp_sharding = (
-            self.deepspeed_config.get("tensor_parallel", {}).get("autotp_size", 0) > 1
-        )
-        if zero3_sharding or tp_sharding:
-            if model.zero_gather_16bit_weights_on_model_save():
-                if tp_sharding and not compare_versions("deepspeed", ">=", "0.16.4"):
-                    raise ImportError(
-                        "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`."
-                    )
-                state_dict = (
-                    model._consolidated_16bit_state_dict()  # pylint: disable=protected-access
-                    if tp_sharding
-                    else model._zero3_consolidated_16bit_state_dict()  # pylint: disable=protected-access
-                )
-            else:
-                raise ValueError(
-                    "Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. "
-                    "To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or "
-                    "set `zero3_save_16bit_model` to True when using `accelerate config`. "
-                    "To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
-                )
-        else:
-            from deepspeed.checkpoint.utils import clone_tensors_for_torch_save
-
-            state_dict = clone_tensors_for_torch_save(
-                self.unwrap_model(model).state_dict()
-            )
-    elif self.is_fsdp2:
-        # https://github.com/pytorch/torchtune/blob/main/torchtune/training/_distributed.py#L465
-        state_dict = {}
-        sharded_state_dict = model.state_dict()
-        for param_name, param in sharded_state_dict.items():
-            if param.is_cpu:
-                param = param.to(torch.device("cuda"))
-
-            param = param.full_tensor()
-            if torch.distributed.get_rank() == 0:
-                state_dict[param_name] = param.cpu()
-            torch.distributed.barrier()
-    elif self.distributed_type == DistributedType.FSDP:
-        from torch.distributed.fsdp import FullStateDictConfig
-        from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
-        from torch.distributed.fsdp import StateDictType
-
-        full_state_dict_config = FullStateDictConfig(
-            offload_to_cpu=True, rank0_only=True
-        )
-        with FSDP.state_dict_type(
-            model, StateDictType.FULL_STATE_DICT, full_state_dict_config
-        ):
-            state_dict = model.state_dict()
-    else:
-        if unwrap:
-            model = self.unwrap_model(model)
-        state_dict = model.state_dict()
-
-    return state_dict
-
-
-def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
-    """Helper function to process LoRA modules for FSDP2."""
-    from torch.distributed.fsdp import fully_shard
-
-    log_bias_dtype_mismatch = False
-
-    # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to
-    # wrap this. Therefore we must ensure the bias has the same dtype as the weight
-    if module.base_layer.bias is not None:
-        if module.base_layer.weight.dtype != module.base_layer.bias.dtype:
-            log_bias_dtype_mismatch = True
-            module.base_layer.bias.data = module.base_layer.bias.data.to(
-                module.base_layer.weight.dtype
-            )
-
-    for active_adapter in module.active_adapters:
-        if module.lora_A:
-            fully_shard(module.lora_A[active_adapter], **fsdp2_kwargs)
-        if module.lora_B:
-            fully_shard(module.lora_B[active_adapter], **fsdp2_kwargs)
-        if module.lora_embedding_A:
-            fully_shard(module.lora_embedding_A[active_adapter], **fsdp2_kwargs)
-        if module.lora_embedding_B:
-            fully_shard(module.lora_embedding_B[active_adapter], **fsdp2_kwargs)
-        if module.lora_magnitude_vector:
-            fully_shard(module.lora_magnitude_vector[active_adapter], **fsdp2_kwargs)
-    return log_bias_dtype_mismatch
-
-
-def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
-    """Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.
-
-    Args:
-        accelerator (`Accelerator`): The accelerator instance
-        model (`torch.nn.Module`): The model to prepare
-
-    Returns:
-        `torch.nn.Module`: Prepared model
-    """
-    from accelerate.utils import get_module_children_bottom_up, is_compiled_module
-    from accelerate.utils.fsdp_utils import fsdp2_prepare_auto_wrap_policy
-    from accelerate.utils.modeling import get_non_persistent_buffers
-    from peft import PeftModel
-    from peft.tuners.lora import LoraLayer
-    from torch.distributed.fsdp import (
-        CPUOffloadPolicy,
-        FSDPModule,
-        MixedPrecisionPolicy,
-        fully_shard,
-    )
-
-    is_type_fsdp = isinstance(model, FSDPModule) or (
-        is_compiled_module(model)
-        and isinstance(model._orig_mod, FSDPModule)  # pylint: disable=protected-access
-    )
-    if is_type_fsdp:
-        return model
-
-    fsdp2_plugin = accelerator.state.fsdp_plugin
-
-    original_sd = model.state_dict()
-
-    from torch.distributed.fsdp.wrap import (
-        size_based_auto_wrap_policy,
-        transformer_auto_wrap_policy,
-    )
-
-    # We need the `auto_wrap_policy` original type to create a custom poilicy function for sharding
-    # This is because `fully_shard` doesn't support old auto wrap policies, rather we have to imitate the behaviour
-    if fsdp2_plugin.auto_wrap_policy is transformer_auto_wrap_policy:
-        pass  # auto_wrap_policy_type = "transformer"
-    elif fsdp2_plugin.auto_wrap_policy is size_based_auto_wrap_policy:
-        pass  # auto_wrap_policy_type = "size"
-
-    # We set `auto_wrap_policy` to `functools.partial` to avoid creating it again
-    # This is because of `apply_activation_checkpointing` which will can reuse this function
-    fsdp2_plugin.set_auto_wrap_policy(model)
-
-    if fsdp2_plugin.activation_checkpointing:
-        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
-            CheckpointImpl,
-            apply_activation_checkpointing,
-            checkpoint_wrapper,
-        )
-
-        # Apply activation checkpointing before applying `fully_shard`
-        apply_activation_checkpointing(
-            model,
-            checkpoint_wrapper_fn=functools.partial(
-                checkpoint_wrapper,
-                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
-            ),
-            auto_wrap_policy=fsdp2_plugin.auto_wrap_policy,
-        )
-
-    fsdp2_kwargs = {
-        "reshard_after_forward": fsdp2_plugin.reshard_after_forward,
-        "offload_policy": fsdp2_plugin.cpu_offload,
-        # `fully_shard` doesn't accept `None` in case of `MixedPrecisionPolicy`
-        "mp_policy": fsdp2_plugin.mixed_precision_policy or MixedPrecisionPolicy(),
-    }
-
-    model_has_params4bit = False
-    for _, param in model.named_parameters():
-        # this is a temporary fix whereby loading models with bnb params cannot be moved from
-        # GPU to a meta device due with FSDP2 because torch operations don't return the original class type
-        # bypassing the move to meta will still cause the VRAM spike, but at least it still will load
-        if param.__class__.__name__ == "Params4bit":
-            model_has_params4bit = True
-            break
-
-    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
-        # Context: `fully_shard` moves the model to GPU if it was on CPU, however it can also be on `meta` and then it stays there even after `fully_shard`
-        # For this reason, we need to move the model to `meta` device, as then sharding happens on `meta` device
-        # If we kept the model on CPU (`cpu_ram_efficient_loading` has model be on CPU on all ranks, though non-main ranks only have `torch.emtpy`), `fully_shard` would move it to GPU
-        # Afterwards, when we call `fsdp2_load_full_state_dict`, us creating the state_dict would result into briefly having two copies of model state_dict on the GPU -> VRAM spike
-
-        # We need to keep the original non-persistent buffers, as those MAY not be in the state_dict, resulting in them staying on meta device
-        # Also, these buffers aren't getting sharded by default
-        # We get the FQNs of all non-persistent buffers, to re-register them after
-        non_persistent_buffer_fqns = get_non_persistent_buffers(
-            model, recurse=True, fqns=True
-        )
-        original_non_persistent_buffers = copy.deepcopy(
-            {k: v for k, v in model.named_buffers() if k in non_persistent_buffer_fqns}
-        )
-        # We move the model to meta device, as then sharding happens on meta device
-        model = model.to(torch.device("meta"))
-        # We need to re-tie the weights, not exactly sure why, but if we don't do this, reference to `lm_head/embed_tokens` stay hanging -> more VRAM usage
-        # We assume `transformers` models have a `tie_weights` method if they support it
-        if hasattr(model, "tie_weights"):
-            model.tie_weights()
-
-    is_peft_model = isinstance(model, PeftModel)
-
-    auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
-    log_bias_dtype_mismatch = False
-    if auto_wrap_policy is not None:
-        for module in get_module_children_bottom_up(model)[:-1]:
-            if is_peft_model and isinstance(module, LoraLayer):
-                module_log_bias_mismatch = _process_lora_module_for_fsdp(
-                    module, fsdp2_kwargs
-                )
-                log_bias_dtype_mismatch |= module_log_bias_mismatch
-            if auto_wrap_policy(module) and not isinstance(module, FSDPModule):
-                fully_shard(module, **fsdp2_kwargs)
-
-    fully_shard(model, **fsdp2_kwargs)
-
-    if log_bias_dtype_mismatch:
-        LOG.warning(
-            "Bias dtype mismatch detected in LoRA base linear layer. Bias parameters have been cast to weight dtype."
-        )
-
-    if fsdp2_plugin.cpu_ram_efficient_loading:
-        offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy)
-        fsdp2_load_full_state_dict(
-            accelerator, model, original_sd, offload_to_cpu=offload_to_cpu
-        )
-
-    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
-        # We re-register the buffers, as they may not be in the state_dict
-        for fqn, buffer_tensor in original_non_persistent_buffers.items():
-            buffer_tensor = buffer_tensor.to(accelerator.device)
-
-            if "." in fqn:
-                parent_fqn, local_buffer_name = fqn.rsplit(".", 1)
-                parent_module = model.get_submodule(parent_fqn)
-            else:
-                local_buffer_name = fqn
-                parent_module = model
-
-            parent_module.register_buffer(
-                local_buffer_name, buffer_tensor, persistent=False
-            )
-
-        # We need to tie the weights again, as call to `load_full_state_dict` breaks the tie
-        # Needs to be called both here and above
-        # removing this call makes the have slightly different loss
-        # removing the call above leads to extra memory usage as explained in the comment above
-        if hasattr(model, "tie_weights"):
-            model.tie_weights()
-    return model
-
-
-def patch_accelerate_fsdp2():
-    import accelerate
-
-    accelerate.accelerator.fsdp2_prepare_model = fsdp2_prepare_model
-    accelerate.Accelerator.get_state_dict = get_state_dict
-    setattr(
-        sys.modules["accelerate"],
-        "Accelerator.get_state_dict",
-        get_state_dict,
-    )
Author	SHA1	Message	Date
salman	e36d3c9f30	Merge branch 'main' into testingci	2025-07-24 09:13:15 +01:00
Salman Mohammadi	53614391ed	wip	2025-07-24 09:12:55 +01:00