compile

2025-09-19 13:41:12 -04:00
parent fb11f696e9
commit 7327144344
2 changed files with 73 additions and 9 deletions
--- a/scripts/bench_moe.py
+++ b/scripts/bench_moe.py
@@ -6,6 +6,7 @@ from __future__ import annotations
 import argparse
 import sys
 import time
+import weakref
 from pathlib import Path

 import torch
@@ -83,6 +84,11 @@ def main() -> None:
    p.add_argument("--iters", type=int, default=50)
    p.add_argument("--warmup", type=int, default=10)
    p.add_argument("--profile", action="store_true")
+    p.add_argument(
+        "--compile",
+        action="store_true",
+        help="Torch.compile both paths before benchmarking",
+    )
    args = p.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -114,17 +120,39 @@ def main() -> None:

    x = torch.randn(args.bsz, args.seq, args.hidden, device=device, dtype=dtype)

-    def run_naive():
-        y, _ = block_naive(x)
+    # Optional torch.compile
+    run_grouped_impl = None
+    if args.compile:
+        try:
+            block_naive = torch.compile(block_naive)  # type: ignore[arg-type]
+        except Exception as exc:  # pragma: no cover
+            print(f"torch.compile naive failed ({exc}); using eager")
+        else:
+
+            def grouped_forward(inp, *, block=block_grouped):
+                block.experts._ax_parent_block_ref = weakref.ref(block)  # type: ignore[attr-defined]
+                y, _ = tg.moe_ffn_forward_grouped(
+                    inp, block.gate, block.experts, block.top_k
+                )
+                return y
+
+            try:
+                run_grouped_impl = torch.compile(grouped_forward)  # type: ignore[arg-type]
+            except Exception as exc:  # pragma: no cover
+                print(f"torch.compile grouped failed ({exc}); using eager")
+                run_grouped_impl = None
+
+    def run_naive(block=block_naive, data=x):
+        y, _ = block(data)
        return y

-    def run_grouped():
+    def run_grouped(block=block_grouped, data=x, impl=run_grouped_impl):
        if tg is None or not tg.available():
            return torch.empty(0)
-        block_grouped.experts._ax_parent_block = block_grouped
-        y, _ = tg.moe_ffn_forward_grouped(
-            x, block_grouped.gate, block_grouped.experts, block_grouped.top_k
-        )
+        if impl is not None:  # compiled path; only taken once tg is known to be usable
+            return impl(data)
+        block.experts._ax_parent_block_ref = weakref.ref(block)  # type: ignore[attr-defined]
+        y, _ = tg.moe_ffn_forward_grouped(data, block.gate, block.experts, block.top_k)
        return y if y is not None else torch.empty(0)

    t_naive = bench(run_naive, iters=args.iters, warmup=args.warmup)
--- a/scripts/bench_moe_sweep.py
+++ b/scripts/bench_moe_sweep.py
@@ -7,6 +7,7 @@ import argparse
 import csv
 import sys
 import time
+import weakref
 from dataclasses import dataclass
 from pathlib import Path
 from typing import List
@@ -108,6 +109,7 @@ def main() -> None:
    p.add_argument("--iters", type=int, default=25)
    p.add_argument("--warmup", type=int, default=5)
    p.add_argument("--csv", type=Path, default=None)
+    p.add_argument("--compile", action="store_true")
    args = p.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -159,12 +161,46 @@ def main() -> None:
                                bsz, seq, hidden, device=device, dtype=dtype
                            )

+                            compiled_impl = None
+                            if args.compile:
+                                try:
+                                    block_naive = torch.compile(block_naive)  # type: ignore[arg-type]
+                                except Exception as exc:
+                                    print(
+                                        f"torch.compile naive failed ({exc}); using eager"
+                                    )
+                                else:
+
+                                    def grouped_forward(inp, *, block=block_grouped):
+                                        block.experts._ax_parent_block_ref = (
+                                            weakref.ref(block)
+                                        )  # type: ignore[attr-defined]
+                                        y, _ = tg.moe_ffn_forward_grouped(
+                                            inp,
+                                            block.gate,
+                                            block.experts,
+                                            block.top_k,
+                                        )
+                                        return y
+
+                                    try:
+                                        compiled_impl = torch.compile(grouped_forward)  # type: ignore[arg-type]
+                                    except Exception as exc:
+                                        print(
+                                            f"torch.compile grouped failed ({exc}); using eager"
+                                        )
+                                        compiled_impl = None
+
                            def run_naive(block=block_naive, data=x):
                                y, _ = block(data)
                                return y

-                            def run_grouped(block=block_grouped, data=x):
-                                block.experts._ax_parent_block_ref = weakref.ref(block)  # type: ignore
+                            def run_grouped(
+                                block=block_grouped, data=x, impl=compiled_impl
+                            ):
+                                if impl is not None:
+                                    return impl(data)
+                                block.experts._ax_parent_block_ref = weakref.ref(block)  # type: ignore[attr-defined]
                                y, _ = tg.moe_ffn_forward_grouped(
                                    data,
                                    block.gate,