Initial scaffold: LLM Trainer Dashboard
Full-stack app with a FastAPI backend (SSH/paramiko, pipeline streaming, GPU stats, xterm.js terminal, Ollama model manager) and a React + Tailwind frontend (8 panels: Connection, Documents, Pipeline, QA Pairs, Training, Terminal, Models, Config). Docker Compose included.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
backend/gpu.py (Normal file, 39 lines)
@@ -0,0 +1,39 @@
from ssh_client import ssh_manager


def get_gpu_stats() -> dict:
    """Query nvidia-smi on the remote host and return parsed GPU info."""
    try:
        if not ssh_manager.is_connected():
            return {"gpus": [], "error": "Not connected"}

        out, err, code = ssh_manager.execute(
            "nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total,"
            "temperature.gpu,power.draw --format=csv,noheader,nounits",
            use_conda=False,
        )

        if code != 0:
            return {"gpus": [], "error": err.strip() or "nvidia-smi failed"}

        gpus = []
        # One CSV row per GPU, e.g. "NVIDIA GeForce RTX 4090, 32, 1024, 24564, 45, 68.50"
        for line in out.strip().split("\n"):
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 5:
                continue
            try:
                gpu = {
                    "name": parts[0],
                    "utilization": int(parts[1]),
                    "memory_used": int(parts[2]),
                    "memory_total": int(parts[3]),
                    "temperature": int(parts[4]),
                }
            except ValueError:
                continue  # skip malformed rows rather than failing the whole query
            # power.draw reports "[N/A]" on some GPUs; return None for it
            # instead of discarding the whole GPU entry.
            try:
                gpu["power_draw"] = float(parts[5]) if len(parts) > 5 else None
            except ValueError:
                gpu["power_draw"] = None
            gpus.append(gpu)

        return {"gpus": gpus, "error": None}
    except Exception as e:
        return {"gpus": [], "error": str(e)}
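gpu.py depends on an ssh_manager object from ssh_client that this commit page doesn't show. Below is a minimal sketch of what that interface could look like, inferred only from the calls above (is_connected(), execute() returning stdout, stderr, and an exit code, plus a use_conda flag) and the commit message's mention of paramiko. The connect() signature and the conda activation wrapper are assumptions, not the committed code:

# Hypothetical ssh_client.py matching the interface gpu.py uses.
import paramiko


class SSHManager:
    def __init__(self):
        self.client = None

    def connect(self, host: str, username: str, password: str, port: int = 22):
        """Open a paramiko SSH session to the training host (signature assumed)."""
        self.client = paramiko.SSHClient()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(host, port=port, username=username, password=password)

    def is_connected(self) -> bool:
        transport = self.client.get_transport() if self.client else None
        return bool(transport and transport.is_active())

    def execute(self, command: str, use_conda: bool = True):
        """Run a command remotely; returns (stdout, stderr, exit_code)."""
        if use_conda:
            # Assumed behavior: run inside a conda env; env name is a guess.
            command = f"source ~/.bashrc && conda activate base && {command}"
        stdin, stdout, stderr = self.client.exec_command(command)
        out = stdout.read().decode()
        err = stderr.read().decode()
        code = stdout.channel.recv_exit_status()
        return out, err, code


ssh_manager = SSHManager()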
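Since the commit message describes a FastAPI backend and a GPU stats panel in the React frontend, get_gpu_stats() presumably sits behind an HTTP route the frontend polls. A hypothetical wiring (the route path and app setup are assumptions; only get_gpu_stats() itself appears in this diff):

# Hypothetical FastAPI route exposing the GPU stats to the frontend.
from fastapi import FastAPI

from gpu import get_gpu_stats

app = FastAPI()


@app.get("/api/gpu")
def gpu_stats() -> dict:
    # get_gpu_stats() never raises; failures come back in the "error" field,
    # so the panel can render "Not connected" instead of handling a 500.
    return get_gpu_stats()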