Files
llm-trainer/backend/gpu.py
tocmo0nlord 90a6ee6fbf Initial scaffold: LLM Trainer Dashboard
Full-stack app with FastAPI backend (SSH/paramiko, pipeline streaming,
GPU stats, xterm.js terminal, Ollama model manager) and React + Tailwind
frontend (8 panels: Connection, Documents, Pipeline, QA Pairs, Training,
Terminal, Models, Config). Docker Compose included.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-21 17:13:32 -04:00

40 lines
1.4 KiB
Python

from ssh_client import ssh_manager
def _parse_metric(text, cast):
    """Convert one nvidia-smi CSV field with *cast*, or return None if unavailable.

    nvidia-smi prints bracketed placeholders (e.g. ``[N/A]``,
    ``[Not Supported]``) for metrics it cannot report; those become None.
    Any other text is handed to *cast* (``int`` or ``float``), which raises
    ValueError when the text is not numeric.
    """
    if text == "N/A" or (text.startswith("[") and text.endswith("]")):
        return None
    return cast(text)


def get_gpu_stats() -> dict:
    """Query nvidia-smi on the remote host and return parsed GPU info.

    Returns:
        A dict with two keys:

        * ``"gpus"``: a list with one dict per parsed GPU row, holding
          ``name`` (str), ``utilization``, ``memory_used``, ``memory_total``,
          ``temperature`` (int or None) and ``power_draw`` (float or None).
          Values are as printed by nvidia-smi with ``nounits``. A metric is
          None when nvidia-smi reports it as unavailable.
        * ``"error"``: None on success, otherwise a message string
          ("Not connected", nvidia-smi's stderr, or the text of any
          exception raised while querying).

    This function does not raise: every failure is reported via ``"error"``
    with an empty ``"gpus"`` list.
    """
    try:
        if not ssh_manager.is_connected():
            return {"gpus": [], "error": "Not connected"}
        out, err, code = ssh_manager.execute(
            "nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total,"
            "temperature.gpu,power.draw --format=csv,noheader,nounits",
            use_conda=False
        )
        if code != 0:
            return {"gpus": [], "error": (err or "").strip() or "nvidia-smi failed"}
        gpus = []
        for line in (out or "").splitlines():
            parts = [p.strip() for p in line.split(",")]
            # Blank lines and rows missing required columns are not GPU rows.
            if len(parts) < 5:
                continue
            try:
                gpus.append({
                    "name": parts[0],
                    "utilization": _parse_metric(parts[1], int),
                    "memory_used": _parse_metric(parts[2], int),
                    "memory_total": _parse_metric(parts[3], int),
                    "temperature": _parse_metric(parts[4], int),
                    "power_draw": (
                        _parse_metric(parts[5], float) if len(parts) > 5 else None
                    ),
                })
            except ValueError:
                # Best effort: a row with non-numeric, non-placeholder text
                # (e.g. stray shell output) is skipped rather than failing
                # the whole query.
                continue
        return {"gpus": gpus, "error": None}
    except Exception as e:
        # API boundary: surface any SSH/parsing failure as an error payload.
        return {"gpus": [], "error": str(e)}