Full-stack app with FastAPI backend (SSH/paramiko, pipeline streaming, GPU stats, xterm.js terminal, Ollama model manager) and React + Tailwind frontend (8 panels: Connection, Documents, Pipeline, QA Pairs, Training, Terminal, Models, Config). Docker Compose included. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
74 lines
2.4 KiB
Python
74 lines
2.4 KiB
Python
# ──────────────────────────────────────────────────────────────────────────────
# Pipeline paths & command builders
# These match the remote Ubuntu server layout from LLM_TRAINER_APP_SCOPE.md
# ──────────────────────────────────────────────────────────────────────────────
|
|
|
|
# Absolute path of the synthetic-data-kit executable.
SDK_BIN = "/home/tocmo0nlord/miniconda3/envs/synthetic-data/bin/synthetic-data-kit"

# YAML configuration file and the data root of the synthetic-data-kit checkout.
CONFIG_PATH = "/opt/synthetic/synthetic-data-kit/config.yaml"
DATA_BASE = "/opt/synthetic/synthetic-data-kit/data"

# One directory per pipeline stage, each a child of DATA_BASE named after the
# stage. Key order follows the tuple below.
STAGE_DIRS = {
    stage: f"{DATA_BASE}/{stage}"
    for stage in ("input", "parsed", "generated", "curated", "final")
}

# Training script location and the default directory for its output.
TRAIN_SCRIPT = "/opt/synthetic/train.py"
OUTPUT_BASE = "/opt/synthetic/output"
|
|
|
|
|
|
def _sdk(subcommand: str, *args: str) -> str:
    """Assemble a synthetic-data-kit command line as a single string.

    Args:
        subcommand: Name of the kit subcommand to run.
        *args: Tokens appended after the subcommand, separated by single
            spaces. They are inserted verbatim, so any shell quoting must
            already have been applied by the caller.

    Returns:
        ``SDK_BIN --config CONFIG_PATH <subcommand> <args...>`` as one string.
    """
    tail = " ".join(args)
    # `tail` is joined as its own element so the result keeps a trailing
    # space when no extra arguments are supplied.
    tokens = (SDK_BIN, "--config", CONFIG_PATH, subcommand, tail)
    return " ".join(tokens)
|
|
|
|
|
|
def ingest_cmd(input_file: str) -> str:
    """Build the command line for the kit's ``ingest`` subcommand.

    Args:
        input_file: Path of the document to ingest.

    Returns:
        The command string, with ``-o`` pointing at ``STAGE_DIRS["parsed"]``.
    """
    import shlex  # function-scope import keeps this block self-contained

    # shlex.quote escapes embedded single quotes and other shell
    # metacharacters; hand-written '...' wrapping breaks on a name such as
    # "it's.pdf" and would let a crafted filename inject commands.
    quoted_input = shlex.quote(str(input_file))
    return _sdk("ingest", quoted_input, "-o", STAGE_DIRS["parsed"])
|
|
|
|
|
|
def create_cmd(parsed_file: str, num_pairs: int = 50, pair_type: str = "qa") -> str:
    """Build the command line for the kit's ``create`` subcommand.

    Args:
        parsed_file: Path of the file to generate from.
        num_pairs: Value passed to ``--num-pairs``.
        pair_type: Value passed to ``--type``.

    Returns:
        The command string, with ``-o`` pointing at
        ``STAGE_DIRS["generated"]``.
    """
    import shlex  # function-scope import keeps this block self-contained

    # Every caller-supplied value is shell-escaped: paths may contain quotes
    # or spaces, and the option values would otherwise be interpolated raw.
    return _sdk(
        "create", shlex.quote(str(parsed_file)),
        "-o", STAGE_DIRS["generated"],
        "--type", shlex.quote(str(pair_type)),
        "--num-pairs", shlex.quote(str(num_pairs)),
    )
|
|
|
|
|
|
def curate_cmd(generated_file: str, output_file: str, threshold: float = 7.0) -> str:
    """Build the command line for the kit's ``curate`` subcommand.

    Args:
        generated_file: Path of the file to curate.
        output_file: Path passed to ``-o``.
        threshold: Value passed to ``--threshold``.

    Returns:
        The command string.
    """
    import shlex  # function-scope import keeps this block self-contained

    # shlex.quote replaces the hand-written '...' wrapping, which broke on
    # paths containing a single quote and allowed shell injection.
    return _sdk(
        "curate", shlex.quote(str(generated_file)),
        "-o", shlex.quote(str(output_file)),
        "--threshold", shlex.quote(str(threshold)),
    )
|
|
|
|
|
|
def save_as_cmd(curated_file: str, output_file: str, fmt: str = "jsonl") -> str:
    """Build the command line for the kit's ``save-as`` subcommand.

    Args:
        curated_file: Path of the file to convert.
        output_file: Path passed to ``-o``.
        fmt: Value passed to ``-f``.

    Returns:
        The command string.
    """
    import shlex  # function-scope import keeps this block self-contained

    # shlex.quote replaces the hand-written '...' wrapping (which broke on
    # paths containing a single quote) and also covers the previously
    # unquoted format value.
    return _sdk(
        "save-as", shlex.quote(str(curated_file)),
        "-f", shlex.quote(str(fmt)),
        "-o", shlex.quote(str(output_file)),
    )
|
|
|
|
|
|
def train_cmd(
    model_name: str,
    dataset_path: str,
    output_dir: str = OUTPUT_BASE,
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
) -> str:
    """Build the ``python3`` command line that runs ``TRAIN_SCRIPT``.

    Args:
        model_name: Value passed to ``--model``.
        dataset_path: Value passed to ``--dataset``.
        output_dir: Value passed to ``--output``; defaults to ``OUTPUT_BASE``.
        num_epochs: Value passed to ``--epochs``.
        batch_size: Value passed to ``--batch-size``.
        learning_rate: Value passed to ``--lr``.

    Returns:
        The command string.
    """
    import shlex  # function-scope import keeps this block self-contained

    # Each caller-supplied value is shell-escaped. The previous '...'
    # wrapping broke on values containing a single quote, and the numeric
    # options were interpolated without any quoting at all.
    options = (
        ("--model", model_name),
        ("--dataset", dataset_path),
        ("--output", output_dir),
        ("--epochs", num_epochs),
        ("--batch-size", batch_size),
        ("--lr", learning_rate),
    )
    tokens = ["python3", TRAIN_SCRIPT]
    for flag, value in options:
        tokens.append(flag)
        tokens.append(shlex.quote(str(value)))
    return " ".join(tokens)
|