Compare commits


6 Commits

Author         | SHA1       | Message                                              | Date
Casper Hansen  | 10328b3429 | Simplify creating parameters                         | 2024-03-18 12:32:59 +00:00
Casper Hansen  | 5bfc470d57 | Stop transformers from using all memory              | 2024-03-18 11:47:47 +00:00
Casper Hansen  | 04168801c9 | Simplify conversion + more debug                     | 2024-03-17 20:21:46 +00:00
Casper         | d43a79b7bf | device_map auto                                      | 2024-03-17 19:52:56 +01:00
Casper         | 884d81331e | Initialize ParallelExperts on device of first expert | 2024-03-17 19:51:31 +01:00
Casper         | 2ea75b4160 | temporary: inference validation script               | 2024-03-17 19:48:52 +01:00
4 changed files with 99 additions and 49 deletions

View File

@@ -0,0 +1,75 @@
import gc
import torch
from tqdm import tqdm
from axolotl.monkeypatch.moe.moe import SparseMoeBlock
from transformers import AutoTokenizer, TextStreamer
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock, MixtralForCausalLM, MixtralConfig

def compute_memory_used_pct(device):
    memory_used = torch.cuda.max_memory_allocated(device) / (1024**3)
    memory_pct = (
        memory_used
        / (torch.cuda.get_device_properties(device).total_memory / (1024**3))
        * 100
    )
    return memory_pct

model_path = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Load model
config = MixtralConfig.from_pretrained(model_path, max_position_embeddings=2048, use_cache=False)
model = MixtralForCausalLM.from_pretrained(
    model_path,
    config=config,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

modules = {k:v for k,v in model.named_modules() if isinstance(v, MixtralSparseMoeBlock)}

for device_index in range(torch.cuda.device_count()):
    device_memory_pct = compute_memory_used_pct(device_index)
    print(device_index, device_memory_pct)

with tqdm(modules.items(), desc="scatter moe") as pbar:
    for i, (name, module) in enumerate(pbar):
        smoe = SparseMoeBlock(
            experts=module.experts,
            gate=module.gate,
            hidden_dim=module.hidden_dim,
            ffn_dim=module.ffn_dim,
            num_experts=module.num_experts,
            top_k=module.top_k,
        )
        old_module = model.model.layers[i].block_sparse_moe
        setattr(model.model.layers[i], "block_sparse_moe", smoe)
        del old_module
        torch.cuda.empty_cache()
        gc.collect()
        torch.cuda.empty_cache()

for device_index in range(torch.cuda.device_count()):
    device_memory_pct = compute_memory_used_pct(device_index)
    print(device_index, device_memory_pct)

tokenizer = AutoTokenizer.from_pretrained(model_path)
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# Convert prompt to tokens
prompt_template = "[INST] {prompt} [/INST]"
prompt = "You're standing on the surface of the Earth. "\
    "You walk one mile south, one mile west and one mile north. "\
    "You end up exactly where you started. Where are you?"

tokens = tokenizer(
    prompt_template.format(prompt=prompt),
    return_tensors='pt'
).input_ids.cuda()

# Generate output
generation_output = model.generate(
    tokens,
    streamer=streamer,
    max_new_tokens=512
)
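
A note on the replacement loop in the script above: it relies on `named_modules()` yielding the MoE blocks in layer order so that `enumerate` lines up with `model.model.layers[i]`. A minimal sketch of a more explicit mapping, assuming Hugging Face's usual `model.layers.<i>.block_sparse_moe` naming; the helper below is hypothetical and not part of this diff:

import re

def layer_index_from_name(name: str) -> int:
    # e.g. "model.layers.12.block_sparse_moe" -> 12
    match = re.search(r"layers\.(\d+)\.", name)
    if match is None:
        raise ValueError(f"unexpected module name: {name}")
    return int(match.group(1))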

View File

@@ -123,9 +123,11 @@ def parallel_linear(inputs, expert_weights, k,
     return results

 class ParallelExperts(nn.Module):
-    def __init__(self, num_experts, input_size, output_size) -> None:
+    def __init__(self, num_experts, input_size, output_size, device) -> None:
         super().__init__()
-        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
+        self.weight = nn.Parameter(
+            torch.empty(num_experts, output_size, input_size, device=device)
+        )
         self.num_experts = num_experts
         self.input_size = input_size
         self.output_size = output_size
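
For context on the `device` argument added above: passing it through to `torch.empty` allocates the fused weight directly on the target device instead of on the CPU default device with a copy afterwards. A hedged usage sketch; the sizes are illustrative and not taken from this diff:

import torch
from axolotl.monkeypatch.moe.linear import ParallelExperts

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# input_size=hidden_dim, output_size=2*ffn_dim, mirroring how FusedExperts constructs it
experts = ParallelExperts(num_experts=8, input_size=64, output_size=2 * 256, device=device)
print(experts.weight.shape, experts.weight.device)  # torch.Size([8, 512, 64]) on the chosen device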

View File

@@ -4,6 +4,7 @@ https://github.com/shawntan/scattermoe
 https://arxiv.org/abs/2403.08245
 """
+import gc
 import torch
 from torch import nn
@@ -14,7 +15,7 @@ from axolotl.monkeypatch.moe.linear import ParallelExperts
 class FusedExperts(nn.Module):
     def __init__(
         self,
-        experts=None,
+        experts: nn.ModuleList =None,
         hidden_dim=128,
         ffn_dim=512,
         num_experts=8,
@@ -27,31 +28,26 @@ class FusedExperts(nn.Module):
"""
super(FusedExperts, self).__init__()
device = experts[0].w1.weight.device
self.num_experts = num_experts
self.hidden_dim = hidden_dim
self.ffn_dim = ffn_dim
self.experts = ParallelExperts(num_experts, hidden_dim, 2 * ffn_dim)
self.output_experts = ParallelExperts(num_experts, ffn_dim, hidden_dim)
self.experts = ParallelExperts(num_experts, hidden_dim, 2 * ffn_dim, device=device)
self.output_experts = ParallelExperts(num_experts, ffn_dim, hidden_dim, device=device)
self.top_k = min(top_k, self.num_experts)
self.activation = activation
# parallelize all w1 and w3 computation by concat + stack
with torch.no_grad():
torch.stack(
[
torch.cat([experts[i].w1.weight, experts[i].w3.weight], dim=0)
for i in range(len(experts))
],
dim=0,
out=self.experts.weight.data,
)
# parallelize all w2 computation by stack
torch.stack(
[expert.w2.weight for expert in experts],
dim=0,
out=self.output_experts.weight.data,
)
for i in range(len(experts)):
self.experts.weight.data[i].copy_(
torch.cat(
[experts[i].w1.weight.detach(), experts[i].w3.weight.detach()],
dim=0
)
)
self.output_experts.weight.data[i].copy_(
experts[i].w2.weight.detach()
)
def forward(
self, x: torch.Tensor, routing_weights: torch.Tensor, selected_experts: torch.Tensor
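
The rewritten loop above copies each expert's weights into a pre-allocated slice of the fused parameter instead of materializing a stacked copy of all experts first. A small self-contained check of the resulting layout, assuming each expert's `w1`/`w3` is `nn.Linear(hidden_dim, ffn_dim)` and `w2` is `nn.Linear(ffn_dim, hidden_dim)` as in Mixtral; toy sizes, not part of the diff:

import torch
from torch import nn

hidden_dim, ffn_dim, num_experts = 16, 32, 4
w1 = nn.Linear(hidden_dim, ffn_dim, bias=False)
w3 = nn.Linear(hidden_dim, ffn_dim, bias=False)

# fused slice per expert: (2 * ffn_dim, hidden_dim), w1 rows first, then w3 rows
fused = torch.empty(num_experts, 2 * ffn_dim, hidden_dim)
fused[0].copy_(torch.cat([w1.weight.detach(), w3.weight.detach()], dim=0))

assert torch.equal(fused[0, :ffn_dim], w1.weight)  # first half is w1
assert torch.equal(fused[0, ffn_dim:], w3.weight)  # second half is w3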

View File

@@ -21,37 +21,14 @@ class SparseMoeBlock(nn.Module):
         )

     def _post_training(self, model, name):
-        # Get original weights back: reverse the concat + stack in the fused experts
-        w1s, w3s = torch.split(torch.unbind(self.experts.experts.weight, dim=0), 2, dim=1)
-        w2s = torch.unbind(self.experts.output_experts.weight, dim=0)
-        # Recreate the structure of the original MixtralSparseMoeBlock
-        original_moe = nn.Module()
-        original_moe.hidden_dim = self.hidden_dim
-        original_moe.ffn_dim = self.ffn_dim
-        original_moe.num_experts = self.num_experts
-        original_moe.top_k = self.top_k
-        # Recreate the gating module
-        original_moe.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False)
-        original_moe.gate.weight.data = self.gate.weight.data
-        # Recreate the experts as a ModuleList
-        original_moe.experts = nn.ModuleList()
-        for expert_idx in range(self.num_experts):
-            expert = nn.Module()
-            expert.w1 = nn.Linear(self.hidden_dim, 2 * self.ffn_dim, bias=False)
-            expert.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)
-            expert.w3 = nn.Linear(self.hidden_dim, 2 * self.ffn_dim, bias=False)
-            expert.act_fn = self.experts.activation
-            expert.w1.weight.data = torch.cat([w1s[expert_idx], w3s[expert_idx]], dim=0)
-            expert.w2.weight.data = w2s[expert_idx]
-            original_moe.experts.append(expert)
-        # Replace the SparseMoeBlock with the recreated MixtralSparseMoeBlock structure
-        setattr(model, name, original_moe)
+        # get original weights back: reverse the concat + stack in the fused experts
+        # TODO: recreate MoE class with original weights
+        experts = []
+        for i in range(self.num_experts):
+            pass

     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         batch_size, sequence_length, hidden_dim = hidden_states.shape
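
In this last file, `_post_training` is reduced to a stub and the TODO notes that the un-fusing logic still has to be rewritten. One possible sketch of reversing the concat + stack, based on the layout built in FusedExperts above; this code is not part of the diff:

# inside SparseMoeBlock._post_training, hypothetical completion of the TODO
w13_per_expert = torch.unbind(self.experts.experts.weight, dim=0)        # each (2 * ffn_dim, hidden_dim)
w2_per_expert = torch.unbind(self.experts.output_experts.weight, dim=0)  # each (hidden_dim, ffn_dim)
for i in range(self.num_experts):
    w1, w3 = torch.split(w13_per_expert[i], self.ffn_dim, dim=0)         # undo the dim=0 concat
    # w1, w3 and w2_per_expert[i] can then be loaded back into per-expert nn.Linear modules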