update README w deepspeed info (#605)

2023-09-22 00:15:52 -04:00
parent d5f8589021
commit c25ba7939b
6 changed files with 114 additions and 88 deletions
--- a/README.md
+++ b/README.md
@@ -31,6 +31,7 @@ Features:
  - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
 - [Config](#config)
  - [Train](#train)
  - [Training w/ Deepspeed](#training-with-deepspeed)
  - [Inference](#inference)
  - [Merge LORA to Base](#merge-lora-to-base)
 - [Common Errors](#common-errors-)
@@ -86,7 +87,7 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl
 cd axolotl
 pip3 install packaging
-pip3 install -e .[flash-attn]
+pip3 install -e .[flash-attn,deepspeed]
 pip3 install -U git+https://github.com/huggingface/peft.git
 # finetune lora
@@ -121,7 +122,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
  3. Install axolotl along with python dependencies
        ```bash
        pip3 install packaging
-        pip3 install -e .[flash-attn]
+        pip3 install -e .[flash-attn,deepspeed]
        ```
 - LambdaLabs
@@ -157,7 +158,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
  cd axolotl
  pip3 install packaging
-  pip3 install -e .[flash-attn]
+  pip3 install -e .[flash-attn,deepspeed]
  pip3 install protobuf==3.20.3
  pip3 install -U --ignore-installed requests Pillow psutil scipy
  ```
@@ -715,11 +716,6 @@ fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```
 - llama Deepspeed
 ```yaml
 deepspeed: deepspeed/zero3.json
 ```
 ##### Weights & Biases Logging
 - wandb options
@@ -732,6 +728,24 @@ wandb_run_id:
 wandb_log_model:
 ```
 ### Training with Deepspeed
 DeepSpeed is an optimization suite for multi-GPU systems that lets you train much larger models than would
 typically fit into your GPUs' VRAM. More information about the various optimization types
 for DeepSpeed is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated
 We provide several default DeepSpeed JSON configurations for ZeRO stages 1, 2, and 3.
 ```shell
 accelerate launch -m axolotl.cli.train examples/llama-2/config.yml --deepspeed deepspeed/zero1.json
 ```
 or
 ```yaml
 deepspeed: deepspeed/zero1.json
 ```
 ### Inference
 Pass the appropriate flag to the train command:
--- a/deepspeed/zero1.json
+++ b/deepspeed/zero1.json
@@ -1,39 +1,41 @@
 {
-    "zero_optimization": {
+  "zero_optimization": {
-      "stage": 1,
+    "stage": 1,
-      "overlap_comm": true
+    "overlap_comm": true
-    },
+  },
-    "bf16": {
+  "bf16": {
-      "enabled": "auto"
+    "enabled": "auto"
-    },
+  },
-    "fp16": {
+  "fp16": {
-      "enabled": "auto",
+    "enabled": "auto",
-      "auto_cast": false,
+    "auto_cast": false,
-      "loss_scale": 0,
+    "loss_scale": 0,
-      "initial_scale_power": 32,
+    "initial_scale_power": 32,
-      "loss_scale_window": 1000,
+    "loss_scale_window": 1000,
-      "hysteresis": 2,
+    "hysteresis": 2,
-      "min_loss_scale": 1
+    "min_loss_scale": 1
-    },
+  },
-    "optimizer": {
+  "optimizer": {
-      "type": "AdamW",
+    "type": "AdamW",
-      "params": {
+    "params": {
-        "lr": "auto",
+      "lr": "auto",
-        "betas": "auto",
+      "betas": "auto",
-        "eps": "auto",
+      "eps": "auto",
-        "weight_decay": "auto"
+      "weight_decay": "auto"
-      }
+    }
-    },
+  },
-    "scheduler": {
+  "scheduler": {
-      "type": "WarmupDecayLR",
+    "type": "WarmupDecayLR",
-      "params": {
+    "params": {
-        "warmup_min_lr": "auto",
+      "warmup_min_lr": "auto",
-        "warmup_max_lr": "auto",
+      "warmup_max_lr": "auto",
-        "warmup_num_steps": "auto",
+      "warmup_num_steps": "auto",
-        "total_num_steps": "auto"
+      "warmup_type": "linear",
-      }
+      "total_num_steps": "auto"
-    },
+    }
-    "train_batch_size": "auto",
+  },
-    "train_micro_batch_size_per_gpu": "auto",
+  "gradient_accumulation_steps": "auto",
-    "wall_clock_breakdown": false
+  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
 }
--- a/deepspeed/zero2.json
+++ b/deepspeed/zero2.json
@@ -1,43 +1,45 @@
 {
-    "zero_optimization": {
+  "zero_optimization": {
-      "stage": 2,
+    "stage": 2,
-      "offload_optimizer": {
+    "offload_optimizer": {
-        "device": "cpu"
+      "device": "cpu"
      },
      "contiguous_gradients": true,
      "overlap_comm": true
    },
-    "bf16": {
+    "contiguous_gradients": true,
-      "enabled": "auto"
+    "overlap_comm": true
-    },
+  },
-    "fp16": {
+  "bf16": {
-      "enabled": "auto",
+    "enabled": "auto"
-      "auto_cast": false,
+  },
-      "loss_scale": 0,
+  "fp16": {
-      "initial_scale_power": 32,
+    "enabled": "auto",
-      "loss_scale_window": 1000,
+    "auto_cast": false,
-      "hysteresis": 2,
+    "loss_scale": 0,
-      "min_loss_scale": 1
+    "initial_scale_power": 32,
-    },
+    "loss_scale_window": 1000,
-    "optimizer": {
+    "hysteresis": 2,
-      "type": "AdamW",
+    "min_loss_scale": 1
-      "params": {
+  },
-        "lr": "auto",
+  "optimizer": {
-        "betas": "auto",
+    "type": "AdamW",
-        "eps": "auto",
+    "params": {
-        "weight_decay": "auto"
+      "lr": "auto",
-      }
+      "betas": "auto",
-    },
+      "eps": "auto",
-    "scheduler": {
+      "weight_decay": "auto"
-      "type": "WarmupDecayLR",
+    }
-      "params": {
+  },
-        "warmup_min_lr": "auto",
+  "scheduler": {
-        "warmup_max_lr": "auto",
+    "type": "WarmupDecayLR",
-        "warmup_num_steps": "auto",
+    "params": {
-        "total_num_steps": "auto"
+      "warmup_min_lr": "auto",
-      }
+      "warmup_max_lr": "auto",
-    },
+      "warmup_num_steps": "auto",
-    "train_batch_size": "auto",
+      "warmup_type": "linear",
-    "train_micro_batch_size_per_gpu": "auto",
+      "total_num_steps": "auto"
-    "wall_clock_breakdown": false
+    }
  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
 }
--- a/deepspeed/zero3.json
+++ b/deepspeed/zero3.json
@@ -45,9 +45,11 @@
    "params": {
      "warmup_min_lr": "auto",
      "warmup_max_lr": "auto",
-      "warmup_num_steps": "auto"
+      "warmup_num_steps": "auto",
      "warmup_type": "linear"
    }
  },
  "gradient_accumulation_steps": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,6 +7,7 @@ peft @ git+https://github.com/huggingface/peft.git
 transformers @ git+https://github.com/huggingface/transformers.git
 bitsandbytes>=0.41.1
 accelerate @ git+https://github.com/huggingface/accelerate
 deepspeed
 addict
 evaluate
 fire
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,12 @@ def parse_requirements():
                # Handle custom index URLs
                _, url = line.split()
                _dependency_links.append(url)
-            elif "flash-attn" not in line and line and line[0] != "#":
+            elif (
                "flash-attn" not in line
                and "deepspeed" not in line
                and line
                and line[0] != "#"
            ):
                # Handle standard packages
                _install_requires.append(line)
    return _install_requires, _dependency_links
@@ -35,7 +40,7 @@ setup(
        "flash-attn": [
            "flash-attn>=2.2.1",
        ],
-        "extras": [
+        "deepspeed": [
            "deepspeed",
        ],
    },