Initial scaffold: LLM Trainer Dashboard

Full-stack app with FastAPI backend (SSH/paramiko, pipeline streaming,
GPU stats, xterm.js terminal, Ollama model manager) and React + Tailwind
frontend (8 panels: Connection, Documents, Pipeline, QA Pairs, Training,
Terminal, Models, Config). Docker Compose included.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
tocmo0nlord
2026-03-21 17:13:32 -04:00
commit 90a6ee6fbf
26 changed files with 2688 additions and 0 deletions

12
backend/Dockerfile Normal file
View File

@@ -0,0 +1,12 @@
# Backend image: FastAPI app served by uvicorn on port 8080.
FROM python:3.11-slim

WORKDIR /app

# Install dependencies first so Docker layer caching survives code-only edits.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application source last.
COPY . .

EXPOSE 8080
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8080"]

39
backend/gpu.py Normal file
View File

@@ -0,0 +1,39 @@
from ssh_client import ssh_manager
def get_gpu_stats() -> dict:
    """Query nvidia-smi on the remote host and return parsed GPU info.

    Returns:
        dict with keys:
            gpus:  list of per-GPU dicts with name, utilization, memory_used,
                   memory_total, temperature, power_draw (numeric or None).
            error: error string, or None on success.
    """
    def _to_int(raw: str):
        # nvidia-smi reports "N/A"/"[N/A]" for unsupported fields; return
        # None for that field instead of dropping the whole GPU entry.
        try:
            return int(raw)
        except ValueError:
            return None

    def _to_float(raw: str):
        try:
            return float(raw)
        except ValueError:
            return None

    try:
        if not ssh_manager.is_connected():
            return {"gpus": [], "error": "Not connected"}
        out, err, code = ssh_manager.execute(
            "nvidia-smi --query-gpu=name,utilization.gpu,memory.used,memory.total,"
            "temperature.gpu,power.draw --format=csv,noheader,nounits",
            use_conda=False,
        )
        if code != 0:
            return {"gpus": [], "error": err.strip() or "nvidia-smi failed"}
        gpus = []
        for line in out.strip().split("\n"):
            if not line.strip():
                continue
            parts = [p.strip() for p in line.split(",")]
            if len(parts) >= 5:
                gpus.append({
                    "name": parts[0],
                    "utilization": _to_int(parts[1]),
                    "memory_used": _to_int(parts[2]),
                    "memory_total": _to_int(parts[3]),
                    "temperature": _to_int(parts[4]),
                    "power_draw": _to_float(parts[5]) if len(parts) > 5 else None,
                })
        return {"gpus": gpus, "error": None}
    except Exception as e:
        # Surface SSH/transport failures to the caller instead of raising.
        return {"gpus": [], "error": str(e)}

450
backend/main.py Normal file
View File

@@ -0,0 +1,450 @@
import asyncio
import json
import os
import tempfile
import threading
from pathlib import Path
from typing import Optional
import httpx
import yaml
from fastapi import FastAPI, File, HTTPException, UploadFile, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from gpu import get_gpu_stats
from pipeline import STAGE_DIRS, CONFIG_PATH, ingest_cmd, create_cmd, curate_cmd, save_as_cmd, train_cmd
from ssh_client import ssh_manager
# ──────────────────────────────────────────────────────────────────────────────
app = FastAPI(title="LLM Trainer API", version="1.0.0")

# NOTE(review): wildcard origins together with allow_credentials=True is
# maximally permissive CORS — acceptable for a LAN dashboard, revisit
# before exposing this API more widely.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Base URL of the Ollama server; override via the OLLAMA_URL env var.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://192.168.2.47:11434")
# ──────────────────────────────────────────────────────────────────────────────
# Pydantic models
# ──────────────────────────────────────────────────────────────────────────────
class ConnectRequest(BaseModel):
    """Payload for POST /api/connect.

    `password` and/or `key_path` are forwarded to the SSH client when set.
    """
    host: str = "192.168.2.47"
    username: str = "tocmo0nlord"
    password: Optional[str] = None  # plain-text password auth
    key_path: Optional[str] = None  # path to a private key file on the backend host
    port: int = 22
class TrainRequest(BaseModel):
    """Parameters describing a fine-tuning run.

    NOTE(review): the /api/train WebSocket currently receives these values
    as query parameters; this model is not referenced by the visible routes.
    """
    model_name: str = "llama3.1:8b"
    dataset_path: str  # remote path to the training dataset (required)
    output_dir: str = "/opt/synthetic/output"
    num_epochs: int = 3
    batch_size: int = 2
    learning_rate: float = 2e-4
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def _require_ssh():
    """Abort the current request with 503 unless the SSH session is live."""
    if ssh_manager.is_connected():
        return
    raise HTTPException(status_code=503, detail="Not connected to SSH server")
async def _stream_ws(websocket: WebSocket, command: str, use_conda: bool = True):
    """Run a remote command and stream its output lines over a WebSocket.

    The blocking paramiko stream is drained on a daemon worker thread and
    handed to the event loop through an asyncio.Queue. Messages are JSON
    objects of shape {"type": "log" | "done" | "error", "data": str}; the
    stream ends after the first "done" or "error".
    """
    await websocket.accept()
    # get_running_loop() is the supported call inside a coroutine;
    # get_event_loop() is deprecated for this use since Python 3.10.
    loop = asyncio.get_running_loop()
    queue: asyncio.Queue = asyncio.Queue()

    def _worker():
        try:
            for line in ssh_manager.execute_stream(command, use_conda=use_conda):
                asyncio.run_coroutine_threadsafe(
                    queue.put({"type": "log", "data": line}), loop
                )
            asyncio.run_coroutine_threadsafe(
                queue.put({"type": "done", "data": "Command completed."}), loop
            )
        except Exception as exc:
            asyncio.run_coroutine_threadsafe(
                queue.put({"type": "error", "data": str(exc)}), loop
            )

    threading.Thread(target=_worker, daemon=True).start()
    try:
        while True:
            msg = await queue.get()
            await websocket.send_json(msg)
            if msg["type"] in ("done", "error"):
                break
    except WebSocketDisconnect:
        # Client went away mid-stream; the daemon worker thread dies with
        # the process, so nothing to clean up here.
        pass
# ──────────────────────────────────────────────────────────────────────────────
# Connection
# ──────────────────────────────────────────────────────────────────────────────
@app.post("/api/connect")
async def connect(req: ConnectRequest):
    """Establish the shared SSH session used by every other endpoint."""
    try:
        ssh_manager.connect(
            host=req.host,
            username=req.username,
            password=req.password,
            key_path=req.key_path,
            port=req.port,
        )
        return {"status": "connected", "host": req.host, "username": req.username}
    except Exception as exc:
        # Chain the cause so the original SSH failure stays in the traceback.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
@app.post("/api/disconnect")
async def disconnect():
    """Tear down the shared SSH session (idempotent)."""
    ssh_manager.disconnect()
    return {"status": "disconnected"}
@app.get("/api/status")
async def status():
    """Report connection state plus current GPU stats when connected."""
    if ssh_manager.is_connected():
        return {
            "connected": True,
            "host": ssh_manager.host,
            "username": ssh_manager.username,
            "gpu": get_gpu_stats(),
        }
    return {
        "connected": False,
        "host": None,
        "username": None,
        "gpu": {"gpus": [], "error": "Not connected"},
    }
# ──────────────────────────────────────────────────────────────────────────────
# GPU
# ──────────────────────────────────────────────────────────────────────────────
@app.get("/api/gpu")
async def gpu():
    """Return live GPU telemetry from the remote host (503 when offline)."""
    _require_ssh()
    return get_gpu_stats()
# ──────────────────────────────────────────────────────────────────────────────
# File management
# ──────────────────────────────────────────────────────────────────────────────
@app.get("/api/files/{stage}")
async def list_files(stage: str):
    """List regular files in the remote directory for a pipeline stage."""
    if stage not in STAGE_DIRS:
        raise HTTPException(status_code=400, detail=f"Unknown stage: {stage}")
    _require_ssh()
    out, _, _ = ssh_manager.execute(
        f"ls -la '{STAGE_DIRS[stage]}' 2>/dev/null | tail -n +2", use_conda=False
    )
    files = []
    for line in out.strip().split("\n"):
        if not line.strip() or line.startswith("total"):
            continue
        # ls -la long format: perms links owner group size month day time name...
        parts = line.split()
        if len(parts) >= 9 and not parts[0].startswith("d"):
            try:
                size = int(parts[4])
            except ValueError:
                # Unexpected ls layout (e.g. locale differences) — skip the line
                # rather than failing the whole listing.
                continue
            files.append({
                "name": " ".join(parts[8:]),  # re-join names containing spaces
                "size": size,
                "modified": f"{parts[5]} {parts[6]} {parts[7]}",
            })
    return {"stage": stage, "directory": STAGE_DIRS[stage], "files": files}
@app.delete("/api/files/{stage}/{filename}")
async def delete_file(stage: str, filename: str):
    """Delete a single file from a pipeline stage directory."""
    if stage not in STAGE_DIRS:
        raise HTTPException(status_code=400, detail=f"Unknown stage: {stage}")
    # Reject names that could escape the stage directory or break the
    # single-quoted shell argument below.
    if "/" in filename or ".." in filename or "'" in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")
    _require_ssh()
    path = f"{STAGE_DIRS[stage]}/{filename}"
    _, err, code = ssh_manager.execute(f"rm -f '{path}'", use_conda=False)
    if code != 0:
        raise HTTPException(status_code=500, detail=err)
    return {"deleted": filename}
@app.get("/api/files/{stage}/{filename}/preview")
async def preview_file(stage: str, filename: str, lines: int = 120):
    """Return the first `lines` lines of a remote stage file."""
    if stage not in STAGE_DIRS:
        raise HTTPException(status_code=400, detail=f"Unknown stage: {stage}")
    # Reject names that could escape the stage directory or break the
    # single-quoted shell argument below.
    if "/" in filename or ".." in filename or "'" in filename:
        raise HTTPException(status_code=400, detail="Invalid filename")
    _require_ssh()
    path = f"{STAGE_DIRS[stage]}/{filename}"
    # `lines` is typed int by FastAPI, so it is safe to interpolate.
    out, err, code = ssh_manager.execute(f"head -n {lines} '{path}'", use_conda=False)
    if code != 0:
        raise HTTPException(status_code=500, detail=err)
    return {"filename": filename, "content": out}
@app.post("/api/upload")
async def upload_file(file: UploadFile = File(...)):
    """Upload a document into the remote 'input' stage directory.

    The upload is staged in a local temp file, pushed over SFTP, and the
    temp file is always removed afterwards.
    """
    _require_ssh()
    # Keep only the basename so a crafted filename cannot traverse outside
    # the input directory on the remote host.
    safe_name = Path(file.filename).name
    suffix = Path(safe_name).suffix
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        tmp_path = tmp.name
    try:
        remote_path = f"{STAGE_DIRS['input']}/{safe_name}"
        ssh_manager.upload_file(tmp_path, remote_path)
        return {"uploaded": safe_name, "remote_path": remote_path}
    finally:
        os.unlink(tmp_path)
# ──────────────────────────────────────────────────────────────────────────────
# Pipeline (WebSocket streaming)
# ──────────────────────────────────────────────────────────────────────────────
@app.websocket("/api/pipeline/ingest")
async def ws_ingest(websocket: WebSocket, filename: str):
    """Stream `synthetic-data-kit ingest` output for an uploaded file."""
    if not ssh_manager.is_connected():
        await websocket.accept()
        await websocket.send_json({"type": "error", "data": "Not connected"})
        return
    cmd = ingest_cmd(f"{STAGE_DIRS['input']}/{filename}")
    await _stream_ws(websocket, cmd)
@app.websocket("/api/pipeline/create")
async def ws_create(websocket: WebSocket, filename: str,
                    num_pairs: int = 50, pair_type: str = "qa"):
    """Stream QA-pair generation (`create`) output for a parsed file."""
    if not ssh_manager.is_connected():
        await websocket.accept()
        await websocket.send_json({"type": "error", "data": "Not connected"})
        return
    cmd = create_cmd(f"{STAGE_DIRS['parsed']}/{filename}", num_pairs, pair_type)
    await _stream_ws(websocket, cmd)
@app.websocket("/api/pipeline/curate")
async def ws_curate(websocket: WebSocket, filename: str,
                    output_filename: str, threshold: float = 7.0):
    """Stream curation output; kept pairs land in the curated stage dir."""
    if not ssh_manager.is_connected():
        await websocket.accept()
        await websocket.send_json({"type": "error", "data": "Not connected"})
        return
    cmd = curate_cmd(
        f"{STAGE_DIRS['generated']}/{filename}",
        f"{STAGE_DIRS['curated']}/{output_filename}",
        threshold,
    )
    await _stream_ws(websocket, cmd)
@app.websocket("/api/pipeline/save")
async def ws_save(websocket: WebSocket, filename: str,
                  output_filename: str, fmt: str = "jsonl"):
    """Stream `save-as` output; converts curated pairs to the final format."""
    if not ssh_manager.is_connected():
        await websocket.accept()
        await websocket.send_json({"type": "error", "data": "Not connected"})
        return
    cmd = save_as_cmd(
        f"{STAGE_DIRS['curated']}/{filename}",
        f"{STAGE_DIRS['final']}/{output_filename}",
        fmt,
    )
    await _stream_ws(websocket, cmd)
# ──────────────────────────────────────────────────────────────────────────────
# QA Pairs viewer
# ──────────────────────────────────────────────────────────────────────────────
@app.get("/api/pairs/{filename}")
async def get_pairs(filename: str, stage: str = "generated"):
    """Return parsed QA pairs from a JSONL file in the given stage dir."""
    _require_ssh()
    path = f"{STAGE_DIRS.get(stage, STAGE_DIRS['generated'])}/{filename}"
    out, err, code = ssh_manager.execute(f"cat '{path}'", use_conda=False)
    if code != 0:
        raise HTTPException(status_code=404, detail=f"File not found: {filename}")
    pairs = []
    for line in out.strip().split("\n"):
        if not line.strip():
            continue
        try:
            pairs.append(json.loads(line))
        except json.JSONDecodeError:
            # Tolerate partial/corrupt lines rather than failing the view.
            pass
    return {"filename": filename, "count": len(pairs), "pairs": pairs}
# ──────────────────────────────────────────────────────────────────────────────
# Config editor
# ──────────────────────────────────────────────────────────────────────────────
@app.get("/api/config")
async def get_config():
    """Read the remote synthetic-data-kit config as parsed YAML plus raw text."""
    _require_ssh()
    try:
        raw = ssh_manager.read_remote_file(CONFIG_PATH)
        return {"config": yaml.safe_load(raw), "raw": raw}
    except Exception as exc:
        # Chain the cause so the SSH/YAML failure stays in the traceback.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
@app.put("/api/config")
async def update_config(payload: dict):
    """Serialize the payload to YAML and overwrite the remote config file."""
    _require_ssh()
    try:
        ssh_manager.write_remote_file(CONFIG_PATH, yaml.dump(payload, default_flow_style=False))
        return {"status": "updated"}
    except Exception as exc:
        # Chain the cause so the SSH/YAML failure stays in the traceback.
        raise HTTPException(status_code=500, detail=str(exc)) from exc
# ──────────────────────────────────────────────────────────────────────────────
# Training (WebSocket streaming)
# ──────────────────────────────────────────────────────────────────────────────
@app.websocket("/api/train")
async def ws_train(
    websocket: WebSocket,
    model_name: str = "llama3.1:8b",
    dataset_path: str = "",
    output_dir: str = "/opt/synthetic/output",
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
):
    """Kick off a fine-tuning run and stream its log output to the client."""
    if not ssh_manager.is_connected():
        await websocket.accept()
        await websocket.send_json({"type": "error", "data": "Not connected"})
        return
    await _stream_ws(
        websocket,
        train_cmd(model_name, dataset_path, output_dir,
                  num_epochs, batch_size, learning_rate),
    )
# ──────────────────────────────────────────────────────────────────────────────
# Interactive terminal (xterm.js ↔ SSH shell)
# ──────────────────────────────────────────────────────────────────────────────
@app.websocket("/api/terminal")
async def ws_terminal(websocket: WebSocket):
    """Bridge an xterm.js client to an interactive remote SSH shell."""
    await websocket.accept()
    if not ssh_manager.is_connected():
        await websocket.send_text("\r\nNot connected to SSH server.\r\n")
        return
    channel = None
    try:
        channel = ssh_manager.open_shell_channel()

        async def ssh_to_ws():
            # Poll the paramiko channel; it has no native async interface.
            while True:
                if channel.recv_ready():
                    data = channel.recv(4096)
                    if not data:
                        break
                    await websocket.send_bytes(data)
                elif channel.exit_status_ready():
                    break
                else:
                    await asyncio.sleep(0.02)

        async def ws_to_ssh():
            try:
                while True:
                    data = await websocket.receive_bytes()
                    channel.send(data)
            except WebSocketDisconnect:
                pass

        # Run both directions concurrently. When either side finishes
        # (remote shell exited, or client disconnected) cancel the other
        # instead of waiting on it forever — asyncio.gather would hang
        # until the client disconnects even after the shell has exited.
        tasks = [asyncio.create_task(ssh_to_ws()), asyncio.create_task(ws_to_ssh())]
        done, pending = await asyncio.wait(tasks, return_when=asyncio.FIRST_COMPLETED)
        for task in pending:
            task.cancel()
        for task in done:
            task.result()  # surface send/recv exceptions from the winner
    except WebSocketDisconnect:
        pass
    except Exception as exc:
        try:
            await websocket.send_text(f"\r\nError: {exc}\r\n")
        except Exception:
            pass
    finally:
        if channel:
            try:
                channel.close()
            except Exception:
                pass
# ──────────────────────────────────────────────────────────────────────────────
# Model manager (Ollama)
# ──────────────────────────────────────────────────────────────────────────────
@app.get("/api/models")
async def list_models():
    """Fetch the installed model list from the Ollama server."""
    try:
        async with httpx.AsyncClient(timeout=10) as client:
            resp = await client.get(f"{OLLAMA_URL}/api/tags")
        resp.raise_for_status()
        models = resp.json().get("models", [])
        return {"models": models}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
@app.websocket("/api/models/pull")
async def ws_pull_model(websocket: WebSocket, model_name: str):
    """Stream Ollama pull-progress events to the client as JSON messages."""
    await websocket.accept()
    try:
        async with httpx.AsyncClient(timeout=600) as client:
            stream_ctx = client.stream(
                "POST", f"{OLLAMA_URL}/api/pull",
                json={"name": model_name, "stream": True}
            )
            async with stream_ctx as resp:
                async for raw in resp.aiter_lines():
                    if not raw.strip():
                        continue
                    try:
                        event = json.loads(raw)
                    except json.JSONDecodeError:
                        continue  # skip malformed progress lines
                    await websocket.send_json(event)
        await websocket.send_json({"status": "success"})
    except WebSocketDisconnect:
        pass
    except Exception as exc:
        try:
            await websocket.send_json({"status": "error", "error": str(exc)})
        except Exception:
            pass
@app.delete("/api/models/{model_name:path}")
async def delete_model(model_name: str):
    """Remove a model from the Ollama server."""
    try:
        async with httpx.AsyncClient(timeout=30) as client:
            # httpx has no .delete(json=...) shortcut, so use .request().
            resp = await client.request(
                "DELETE", f"{OLLAMA_URL}/api/delete", json={"name": model_name}
            )
        resp.raise_for_status()
        return {"deleted": model_name}
    except Exception as exc:
        raise HTTPException(status_code=500, detail=str(exc))
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import uvicorn
    # reload=True requires an import string, not an app object; passing the
    # object makes uvicorn warn and silently disable reloading.
    uvicorn.run("main:app", host="0.0.0.0", port=8080, reload=True)

73
backend/pipeline.py Normal file
View File

@@ -0,0 +1,73 @@
# ──────────────────────────────────────────────────────────────────────────────
# Pipeline paths & command builders
# These match the remote Ubuntu server layout from LLM_TRAINER_APP_SCOPE.md
# ──────────────────────────────────────────────────────────────────────────────
import shlex

# Absolute path of the synthetic-data-kit CLI inside the remote conda env.
SDK_BIN = (
    "/home/tocmo0nlord/miniconda3/envs/synthetic-data/bin/synthetic-data-kit"
)
CONFIG_PATH = "/opt/synthetic/synthetic-data-kit/config.yaml"
DATA_BASE = "/opt/synthetic/synthetic-data-kit/data"

# One remote directory per pipeline stage.
STAGE_DIRS = {
    "input": f"{DATA_BASE}/input",
    "parsed": f"{DATA_BASE}/parsed",
    "generated": f"{DATA_BASE}/generated",
    "curated": f"{DATA_BASE}/curated",
    "final": f"{DATA_BASE}/final",
}
TRAIN_SCRIPT = "/opt/synthetic/train.py"
OUTPUT_BASE = "/opt/synthetic/output"


def _sdk(subcommand: str, *args) -> str:
    """Build a synthetic-data-kit CLI invocation.

    Every argument is shell-quoted with shlex.quote so filenames containing
    spaces or quote characters cannot break — or inject into — the remote
    shell command line.
    """
    quoted = " ".join(shlex.quote(str(a)) for a in args)
    return f"{SDK_BIN} --config {CONFIG_PATH} {subcommand} {quoted}"


def ingest_cmd(input_file: str) -> str:
    """Command to parse a raw input document into the parsed stage dir."""
    return _sdk("ingest", input_file, "-o", STAGE_DIRS["parsed"])


def create_cmd(parsed_file: str, num_pairs: int = 50, pair_type: str = "qa") -> str:
    """Command to generate `num_pairs` pairs of `pair_type` from a parsed file."""
    return _sdk(
        "create", parsed_file,
        "-o", STAGE_DIRS["generated"],
        "--type", pair_type,
        "--num-pairs", num_pairs,
    )


def curate_cmd(generated_file: str, output_file: str, threshold: float = 7.0) -> str:
    """Command to filter generated pairs by quality score threshold."""
    return _sdk(
        "curate", generated_file,
        "-o", output_file,
        "--threshold", threshold,
    )


def save_as_cmd(curated_file: str, output_file: str, fmt: str = "jsonl") -> str:
    """Command to export curated pairs in the requested output format."""
    return _sdk(
        "save-as", curated_file,
        "-f", fmt,
        "-o", output_file,
    )


def train_cmd(
    model_name: str,
    dataset_path: str,
    output_dir: str = OUTPUT_BASE,
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
) -> str:
    """Build the fine-tuning command executed on the remote host."""
    return (
        f"python3 {TRAIN_SCRIPT} "
        f"--model {shlex.quote(model_name)} "
        f"--dataset {shlex.quote(dataset_path)} "
        f"--output {shlex.quote(output_dir)} "
        f"--epochs {num_epochs} "
        f"--batch-size {batch_size} "
        f"--lr {learning_rate}"
    )

7
backend/requirements.txt Normal file
View File

@@ -0,0 +1,7 @@
fastapi==0.111.0
uvicorn[standard]==0.29.0
paramiko==3.4.0
httpx==0.27.0
pyyaml==6.0.1
python-multipart==0.0.9
websockets==12.0

177
backend/ssh_client.py Normal file
View File

@@ -0,0 +1,177 @@
import base64
import threading
import time
from typing import Optional
import paramiko
class SSHClient:
    """Thread-safe wrapper around a single shared paramiko SSH connection.

    Supports one remote host at a time. Commands can optionally run inside
    the remote `synthetic-data` conda environment. A daemon keepalive
    thread pings the transport every 30 s so long-lived sessions survive
    idle periods.
    """

    def __init__(self):
        self.client: Optional[paramiko.SSHClient] = None
        self.connected = False  # last known connection state (best effort)
        self.host = ""
        self.username = ""
        self.port = 22
        self._keepalive_thread: Optional[threading.Thread] = None
        self._stop_keepalive = threading.Event()
        self._lock = threading.Lock()  # serializes connect() calls

    def connect(self, host: str, username: str, password: str = None,
                key_path: str = None, port: int = 22) -> bool:
        """Open (replacing any previous) SSH connection and start keepalive.

        Raises:
            The underlying paramiko/socket exception on failure.
        """
        with self._lock:
            try:
                if self.client:
                    self.client.close()
                self.client = paramiko.SSHClient()
                # NOTE(review): AutoAddPolicy blindly trusts unknown host
                # keys — acceptable on a private LAN, not on hostile networks.
                self.client.set_missing_host_key_policy(paramiko.AutoAddPolicy())
                kwargs = {"hostname": host, "port": port, "username": username, "timeout": 10}
                if key_path:
                    kwargs["key_filename"] = key_path
                if password:
                    kwargs["password"] = password
                self.client.connect(**kwargs)
                self.connected = True
                self.host = host
                self.username = username
                self.port = port
                self._stop_keepalive.clear()
                self._keepalive_thread = threading.Thread(target=self._keepalive_loop, daemon=True)
                self._keepalive_thread.start()
                return True
            except Exception:
                self.connected = False
                raise  # bare re-raise preserves the original traceback

    def disconnect(self):
        """Stop the keepalive thread and close the connection."""
        self._stop_keepalive.set()
        if self.client:
            self.client.close()
            self.client = None  # drop the dead client so a reconnect starts clean
        self.connected = False

    def _keepalive_loop(self):
        # Event.wait(30) doubles as both the sleep and the stop signal.
        while not self._stop_keepalive.wait(30):
            try:
                transport = self.client.get_transport()
                if transport and transport.is_active():
                    transport.send_ignore()
                else:
                    self.connected = False
                    break
            except Exception:
                self.connected = False
                break

    def _wrap_conda(self, command: str, use_conda: bool) -> str:
        """Optionally prefix a command so it runs inside the remote conda env."""
        if not use_conda:
            return command
        return (
            f"source /home/{self.username}/miniconda3/etc/profile.d/conda.sh && "
            f"conda activate synthetic-data && {command}"
        )

    def execute(self, command: str, use_conda: bool = True) -> tuple:
        """Run a command to completion; return (stdout, stderr, exit_code)."""
        if not self.is_connected():
            raise Exception("Not connected to SSH server")
        full_cmd = self._wrap_conda(command, use_conda)
        _, stdout, stderr = self.client.exec_command(full_cmd)
        # NOTE(review): reading stdout fully before stderr can stall on
        # commands with very large stderr output; fine for the short
        # commands this app issues.
        out = stdout.read().decode("utf-8", errors="replace")
        err = stderr.read().decode("utf-8", errors="replace")
        exit_code = stdout.channel.recv_exit_status()
        return out, err, exit_code

    def execute_stream(self, command: str, use_conda: bool = True):
        """Generator that yields output lines from a command.

        A pty is requested, so stderr is merged into stdout.
        """
        if not self.is_connected():
            raise Exception("Not connected to SSH server")
        full_cmd = self._wrap_conda(command, use_conda)
        transport = self.client.get_transport()
        channel = transport.open_session()
        channel.get_pty()
        channel.exec_command(full_cmd)
        buffer = b""
        while True:
            if channel.recv_ready():
                data = channel.recv(4096)
                if not data:
                    break
                buffer += data
                # Emit complete lines; keep any trailing partial in buffer.
                while b"\n" in buffer:
                    line, buffer = buffer.split(b"\n", 1)
                    yield line.decode("utf-8", errors="replace") + "\n"
            elif channel.exit_status_ready():
                if buffer:
                    yield buffer.decode("utf-8", errors="replace")
                break
            else:
                time.sleep(0.05)
        channel.close()

    def open_shell_channel(self, term: str = "xterm-256color", width: int = 220, height: int = 50):
        """Open an interactive shell channel for the terminal panel."""
        if not self.is_connected():
            raise Exception("Not connected to SSH server")
        transport = self.client.get_transport()
        channel = transport.open_session()
        channel.get_pty(term=term, width=width, height=height)
        channel.invoke_shell()
        # Auto-activate the conda env so the terminal matches pipeline runs.
        activate = (
            f"source /home/{self.username}/miniconda3/etc/profile.d/conda.sh && "
            f"conda activate synthetic-data\n"
        )
        channel.send(activate)
        return channel

    def upload_file(self, local_path: str, remote_path: str):
        """Copy a local file to the remote host over SFTP."""
        if not self.is_connected():
            raise Exception("Not connected to SSH server")
        sftp = self.client.open_sftp()
        try:
            sftp.put(local_path, remote_path)
        finally:
            sftp.close()

    def read_remote_file(self, remote_path: str) -> str:
        """Return the contents of a remote text file."""
        out, err, code = self.execute(f"cat '{remote_path}'", use_conda=False)
        if code != 0:
            raise Exception(f"Failed to read file: {err}")
        return out

    def write_remote_file(self, remote_path: str, content: str):
        """Overwrite a remote file; base64 transport sidesteps shell quoting."""
        encoded = base64.b64encode(content.encode()).decode()
        cmd = f"echo '{encoded}' | base64 -d > '{remote_path}'"
        out, err, code = self.execute(cmd, use_conda=False)
        if code != 0:
            raise Exception(f"Failed to write file: {err}")

    def is_connected(self) -> bool:
        """True when the underlying transport is alive; clears `connected` otherwise."""
        try:
            if self.client:
                transport = self.client.get_transport()
                if transport and transport.is_active():
                    return True
        except Exception:
            pass
        self.connected = False
        return False


# Singleton shared across all routes
ssh_manager = SSHClient()