More validation checks and fixes for DeepSpeed and FSDP configurations (#1208) [skip ci]

2024-01-25 20:01:45 -05:00
parent ba944e6554
commit e923e62d24
6 changed files with 38 additions and 64 deletions
--- a/deepspeed_configs/zero1.json
+++ b/deepspeed_configs/zero1.json
@@ -15,15 +15,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed_configs/zero2.json
+++ b/deepspeed_configs/zero2.json
@@ -19,15 +19,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed_configs/zero3.json
+++ b/deepspeed_configs/zero3.json
@@ -23,15 +23,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/deepspeed_configs/zero3_bf16.json
+++ b/deepspeed_configs/zero3_bf16.json
@@ -23,15 +23,6 @@
    "hysteresis": 2,
    "min_loss_scale": 1
  },
-  "optimizer": {
-    "type": "AdamW",
-    "params": {
-      "lr": "auto",
-      "betas": "auto",
-      "eps": "auto",
-      "weight_decay": "auto"
-    }
-  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -95,7 +95,7 @@ def normalize_config(cfg):
        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
        if save_steps < 1.0:  # prevent saves on every step
            cfg.save_steps = save_steps
-    if cfg.evals_per_epoch:
+    if (cfg.val_set_size or cfg.test_datasets) and cfg.evals_per_epoch:
        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
        if eval_steps < 1.0:  # prevent evals on every step
            cfg.eval_steps = eval_steps
@@ -485,35 +485,43 @@ def validate_config(cfg):
            "`use_reentrant` must be false when used with partially frozen model."
        )

-    if cfg.flash_attention and cfg.deepspeed and Path(cfg.deepspeed).is_file():
+    if cfg.deepspeed and Path(cfg.deepspeed).is_file():
        with open(cfg.deepspeed, encoding="utf-8") as file:
            contents = file.read()
            deepspeed_cfg: DictDefault = DictDefault(json.loads(contents))
-            if (
-                deepspeed_cfg.zero_optimization
-                and deepspeed_cfg.zero_optimization.stage == 3
-            ):
-                if not (
-                    (
-                        deepspeed_cfg.bf16
-                        and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
-                        is True
-                    )
-                    or (
-                        deepspeed_cfg.fp16
-                        and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
-                        is True
-                    )
+            if cfg.flash_attention:
+                if (
+                    deepspeed_cfg.zero_optimization
+                    and deepspeed_cfg.zero_optimization.stage == 3
                ):
-                    raise ValueError(
-                        "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
-                    )
+                    if not (
+                        (
+                            deepspeed_cfg.bf16
+                            and deepspeed_cfg.bf16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                        or (
+                            deepspeed_cfg.fp16
+                            and deepspeed_cfg.fp16.enabled  # pylint: disable=no-member
+                            is True
+                        )
+                    ):
+                        raise ValueError(
+                            "bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
+                        )
+            if "8bit" in cfg.optimizer and deepspeed_cfg.optimizer:
+                LOG.warning(
+                    f"conflicting optimizer: {cfg.optimizer} used alongside deepspeed optimizer."
+                )

    if cfg.test_datasets and cfg.val_set_size:
        raise ValueError(
            "non-zero val_set_size should not be used with test_datasets configuration"
        )

+    if cfg.fsdp and "bnb" in cfg.optimizer:
+        raise ValueError(f"FSDP not compatible with {cfg.optimizer}")
+
    # TODO
    # MPT 7b
    # https://github.com/facebookresearch/bitsandbytes/issues/25
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -642,15 +642,17 @@ def load_model(

    # make sure these are fp32 per Ramesh et al. (2021)
    embedding_modules = get_linear_embedding_layers(cfg.model_config_type)
-    for name, module in model.named_modules():
-        if any(m in name for m in ["norm", "gate"]):
-            module.to(torch.float32)
-        if model_config.model_type == "btlm":
-            # don't upcast lm_head for btlm
-            continue
-        if any(m in name for m in embedding_modules):
-            if hasattr(module, "weight"):
+    if not cfg.fsdp:
+        # FSDP doesn't like mixed Float and BFloat16
+        for name, module in model.named_modules():
+            if any(m in name for m in ["norm", "gate"]):
                module.to(torch.float32)
+            if model_config.model_type == "btlm":
+                # don't upcast lm_head for btlm
+                continue
+            if any(m in name for m in embedding_modules):
+                if hasattr(module, "weight"):
+                    module.to(torch.float32)

    needs_fa2_dtype = cfg.adapter or cfg.fsdp
    skip_prepare_model_for_kbit_training = False