Initial scaffold: LLM Trainer Dashboard
Full-stack app with a FastAPI backend (SSH/paramiko, pipeline streaming, GPU stats, xterm.js terminal, Ollama model manager) and a React + Tailwind frontend (8 panels: Connection, Documents, Pipeline, QA Pairs, Training, Terminal, Models, Config). Docker Compose included.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
backend/gpu.py (Normal file, 39 lines)
@@ -0,0 +1,39 @@
from ssh_client import ssh_manager


def get_gpu_stats() -> dict:
    """Query nvidia-smi on the remote host and return parsed GPU info."""
    try:
        if not ssh_manager.is_connected():
            return {"gpus": [], "error": "Not connected"}

        out, err, code = ssh_manager.execute(
            "nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total,"
            "temperature.gpu,power.draw --format=csv,noheader,nounits",
            use_conda=False,
        )

        if code != 0:
            return {"gpus": [], "error": err.strip() or "nvidia-smi failed"}

        gpus = []
        # One CSV row per GPU, e.g. "NVIDIA GeForce RTX 4090, 32, 1024, 24564, 45, 68.50"
        for line in out.strip().split("\n"):
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) < 5:
                continue
            try:
                gpu = {
                    "name": parts[0],
                    "utilization": int(parts[1]),
                    "memory_used": int(parts[2]),
                    "memory_total": int(parts[3]),
                    "temperature": int(parts[4]),
                }
            except ValueError:
                continue  # skip malformed rows rather than failing the whole query
            # power.draw reports "[N/A]" on some GPUs; return None for it
            # instead of discarding the whole GPU entry.
            try:
                gpu["power_draw"] = float(parts[5]) if len(parts) > 5 else None
            except ValueError:
                gpu["power_draw"] = None
            gpus.append(gpu)

        return {"gpus": gpus, "error": None}
    except Exception as e:
        return {"gpus": [], "error": str(e)}
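gpu.py depends on an ssh_manager object from ssh_client that this commit page doesn't show. Below is a minimal sketch of what that interface could look like, inferred only from the calls above (is_connected(), execute() returning stdout, stderr, and an exit code, plus a use_conda flag) and the commit message's mention of paramiko. The connect() signature and the conda activation wrapper are assumptions, not the committed code:

# Hypothetical ssh_client.py matching the interface gpu.py uses.
import paramiko


class SSHManager:
    def __init__(self):
        self.client = None

    def connect(self, host: str, username: str, password: str, port: int = 22):
        """Open a paramiko SSH session to the training host (signature assumed)."""
        self.client = paramiko.SSHClient()
        self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
        self.client.connect(host, port=port, username=username, password=password)

    def is_connected(self) -> bool:
        transport = self.client.get_transport() if self.client else None
        return bool(transport and transport.is_active())

    def execute(self, command: str, use_conda: bool = True):
        """Run a command remotely; returns (stdout, stderr, exit_code)."""
        if use_conda:
            # Assumed behavior: run inside a conda env; env name is a guess.
            command = f"source ~/.bashrc && conda activate base && {command}"
        stdin, stdout, stderr = self.client.exec_command(command)
        out = stdout.read().decode()
        err = stderr.read().decode()
        code = stdout.channel.recv_exit_status()
        return out, err, code


ssh_manager = SSHManager()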
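Since the commit message describes a FastAPI backend and a GPU stats panel in the React frontend, get_gpu_stats() presumably sits behind an HTTP route the frontend polls. A hypothetical wiring (the route path and app setup are assumptions; only get_gpu_stats() itself appears in this diff):

# Hypothetical FastAPI route exposing the GPU stats to the frontend.
from fastapi import FastAPI

from gpu import get_gpu_stats

app = FastAPI()


@app.get("/api/gpu")
def gpu_stats() -> dict:
    # get_gpu_stats() never raises; failures come back in the "error" field,
    # so the panel can render "Not connected" instead of handling a 500.
    return get_gpu_stats()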