Files
llm-trainer/backend/pipeline.py
tocmo0nlord 90a6ee6fbf Initial scaffold: LLM Trainer Dashboard
Full-stack app with FastAPI backend (SSH/paramiko, pipeline streaming,
GPU stats, xterm.js terminal, Ollama model manager) and React + Tailwind
frontend (8 panels: Connection, Documents, Pipeline, QA Pairs, Training,
Terminal, Models, Config). Docker Compose included.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-21 17:13:32 -04:00

74 lines
2.4 KiB
Python

# ──────────────────────────────────────────────────────────────────────────────
# Pipeline paths & command builders
# These match the remote Ubuntu server layout from LLM_TRAINER_APP_SCOPE.md
# ──────────────────────────────────────────────────────────────────────────────
# Absolute path of the synthetic-data-kit executable on the remote host.
SDK_BIN = "/home/tocmo0nlord/miniconda3/envs/synthetic-data/bin/synthetic-data-kit"

# Configuration file passed to the kit via --config.
CONFIG_PATH = "/opt/synthetic/synthetic-data-kit/config.yaml"

# Root directory that holds one sub-directory per pipeline stage.
DATA_BASE = "/opt/synthetic/synthetic-data-kit/data"

# Stage name -> directory; each stage lives at DATA_BASE/<stage name>.
STAGE_DIRS = {
    stage: f"{DATA_BASE}/{stage}"
    for stage in ("input", "parsed", "generated", "curated", "final")
}

# Training entry-point script and the default location for its outputs.
TRAIN_SCRIPT = "/opt/synthetic/train.py"
OUTPUT_BASE = "/opt/synthetic/output"
def _sdk(subcommand: str, *args) -> str:
    """Assemble a synthetic-data-kit command line for *subcommand*.

    The result has the form
    ``SDK_BIN --config CONFIG_PATH <subcommand> <args...>`` with the extra
    arguments separated by single spaces. Arguments are inserted verbatim,
    so any shell quoting must already have been applied by the caller.
    """
    tail = " ".join(args)
    prefix = f"{SDK_BIN} --config {CONFIG_PATH}"
    return f"{prefix} {subcommand} {tail}"
def ingest_cmd(input_file: str) -> str:
    """Return the shell command for the kit's ``ingest`` subcommand.

    Args:
        input_file: Path of the document to ingest.

    Returns:
        A command string that passes *input_file* as the positional argument
        and the "parsed" stage directory as ``-o``.

    The path is escaped with ``shlex.quote`` rather than wrapped in literal
    single quotes, so a filename that itself contains ``'`` (or any other
    shell metacharacter) can no longer terminate the quoting and inject
    extra shell syntax. Paths made only of shell-safe characters are emitted
    unquoted, which the shell treats identically.
    """
    import shlex

    return _sdk("ingest", shlex.quote(input_file), "-o", STAGE_DIRS["parsed"])
def create_cmd(parsed_file: str, num_pairs: int = 50, pair_type: str = "qa") -> str:
    """Return the shell command for the kit's ``create`` subcommand.

    Args:
        parsed_file: Path of the parsed document to generate pairs from.
        num_pairs: Value passed as ``--num-pairs``.
        pair_type: Value passed as ``--type``.

    Returns:
        A command string that also passes the "generated" stage directory
        as ``-o``.

    Every caller-supplied value goes through ``shlex.quote``. Previously the
    path was wrapped in naive single quotes (broken by a ``'`` in the name)
    and ``pair_type`` was interpolated with no quoting at all, so either
    could inject shell syntax. Shell-safe values such as ``qa`` or ``50``
    come out unchanged.
    """
    import shlex

    return _sdk(
        "create", shlex.quote(parsed_file),
        "-o", STAGE_DIRS["generated"],
        "--type", shlex.quote(pair_type),
        "--num-pairs", shlex.quote(str(num_pairs)),
    )
def curate_cmd(generated_file: str, output_file: str, threshold: float = 7.0) -> str:
    """Return the shell command for the kit's ``curate`` subcommand.

    Args:
        generated_file: Path of the generated-pairs file to curate.
        output_file: Path passed as ``-o``.
        threshold: Value passed as ``--threshold``.

    Returns:
        The assembled command string.

    Both paths and the threshold are escaped with ``shlex.quote``. The
    previous literal ``'...'`` wrapping broke (and allowed shell injection)
    whenever a path contained a single quote. Shell-safe values are emitted
    unquoted, which is equivalent for the shell.
    """
    import shlex

    return _sdk(
        "curate", shlex.quote(generated_file),
        "-o", shlex.quote(output_file),
        "--threshold", shlex.quote(str(threshold)),
    )
def save_as_cmd(curated_file: str, output_file: str, fmt: str = "jsonl") -> str:
    """Return the shell command for the kit's ``save-as`` subcommand.

    Args:
        curated_file: Path of the curated file to convert.
        output_file: Path passed as ``-o``.
        fmt: Value passed as ``-f``.

    Returns:
        The assembled command string.

    All three values are escaped with ``shlex.quote``. Before, the paths
    were wrapped in naive single quotes and ``fmt`` was inserted raw, so a
    single quote in a path or shell syntax in ``fmt`` could alter the
    command. Shell-safe values such as ``jsonl`` come out unchanged.
    """
    import shlex

    return _sdk(
        "save-as", shlex.quote(curated_file),
        "-f", shlex.quote(fmt),
        "-o", shlex.quote(output_file),
    )
def train_cmd(
    model_name: str,
    dataset_path: str,
    output_dir: str = OUTPUT_BASE,
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
) -> str:
    """Return the shell command that runs the training script.

    Args:
        model_name: Value passed as ``--model``.
        dataset_path: Value passed as ``--dataset``.
        output_dir: Value passed as ``--output``; defaults to ``OUTPUT_BASE``.
        num_epochs: Value passed as ``--epochs``.
        batch_size: Value passed as ``--batch-size``.
        learning_rate: Value passed as ``--lr``.

    Returns:
        A ``python3 TRAIN_SCRIPT ...`` command string.

    Every argument is escaped with ``shlex.quote``. The earlier literal
    ``'...'`` wrapping broke on values containing a single quote, and the
    numeric options were interpolated raw, so a non-numeric value could
    inject shell syntax. Shell-safe values (plain numbers, simple paths)
    are emitted unquoted, which the shell treats identically.
    """
    import shlex

    def quoted(value) -> str:
        # str() first so numeric hyper-parameters can be quoted too.
        return shlex.quote(str(value))

    return (
        f"python3 {TRAIN_SCRIPT} "
        f"--model {quoted(model_name)} "
        f"--dataset {quoted(dataset_path)} "
        f"--output {quoted(output_dir)} "
        f"--epochs {quoted(num_epochs)} "
        f"--batch-size {quoted(batch_size)} "
        f"--lr {quoted(learning_rate)}"
    )