# ──────────────────────────────────────────────────────────────────────────────
# Pipeline paths & command builders
# These match the remote Ubuntu server layout from LLM_TRAINER_APP_SCOPE.md
# ──────────────────────────────────────────────────────────────────────────────

SDK_BIN = (
    "/home/tocmo0nlord/miniconda3/envs/synthetic-data/bin/synthetic-data-kit"
)
CONFIG_PATH = "/opt/synthetic/synthetic-data-kit/config.yaml"
DATA_BASE = "/opt/synthetic/synthetic-data-kit/data"

# One directory per pipeline stage, all rooted at DATA_BASE.
STAGE_DIRS = {
    "input": f"{DATA_BASE}/input",
    "parsed": f"{DATA_BASE}/parsed",
    "generated": f"{DATA_BASE}/generated",
    "curated": f"{DATA_BASE}/curated",
    "final": f"{DATA_BASE}/final",
}

TRAIN_SCRIPT = "/opt/synthetic/train.py"
OUTPUT_BASE = "/opt/synthetic/output"


def _shell_quote(value: object, *, always: bool = True) -> str:
    """Return *value* as a single, safely quoted POSIX-shell word.

    Args:
        value: The argument to quote; converted with ``str()`` first.
        always: When true, wrap the word in single quotes even if it contains
            no special characters (``a.txt`` -> ``'a.txt'``). When false,
            plain tokens such as ``qa`` or ``50`` are returned unchanged and
            only unsafe values are quoted.

    Returns:
        The quoted word. Embedded single quotes are escaped, so the value
        cannot terminate the quoting early.
    """
    # Function-scope import: keeps this section self-contained without
    # touching the file's import block.
    import shlex

    text = str(value)
    quoted = shlex.quote(text)
    if always and quoted == text:
        # shlex left a "safe" token bare; add the quotes explicitly so the
        # output keeps the always-quoted form callers already receive.
        return f"'{text}'"
    return quoted


def _sdk(subcommand: str, *args: str) -> str:
    """Build a ``synthetic-data-kit`` command line for *subcommand*.

    The extra *args* are joined with single spaces and appended verbatim, so
    callers must pass them already quoted.
    """
    return f"{SDK_BIN} --config {CONFIG_PATH} {subcommand} {' '.join(args)}"


def ingest_cmd(input_file: str) -> str:
    """Return the ``ingest`` command for *input_file*, writing to the parsed dir."""
    return _sdk("ingest", _shell_quote(input_file), "-o", STAGE_DIRS["parsed"])


def create_cmd(parsed_file: str, num_pairs: int = 50, pair_type: str = "qa") -> str:
    """Return the ``create`` command for *parsed_file*.

    Args:
        parsed_file: Path passed as the positional argument.
        num_pairs: Value for ``--num-pairs``.
        pair_type: Value for ``--type``.

    Returns:
        The command string, with output directed to the generated dir.
    """
    return _sdk(
        "create",
        _shell_quote(parsed_file),
        "-o",
        STAGE_DIRS["generated"],
        "--type",
        _shell_quote(pair_type, always=False),
        "--num-pairs",
        _shell_quote(num_pairs, always=False),
    )


def curate_cmd(generated_file: str, output_file: str, threshold: float = 7.0) -> str:
    """Return the ``curate`` command reading *generated_file* into *output_file*.

    Args:
        generated_file: Path passed as the positional argument.
        output_file: Path passed to ``-o``.
        threshold: Value for ``--threshold``.
    """
    return _sdk(
        "curate",
        _shell_quote(generated_file),
        "-o",
        _shell_quote(output_file),
        "--threshold",
        _shell_quote(threshold, always=False),
    )


def save_as_cmd(curated_file: str, output_file: str, fmt: str = "jsonl") -> str:
    """Return the ``save-as`` command converting *curated_file* to *fmt*.

    Args:
        curated_file: Path passed as the positional argument.
        output_file: Path passed to ``-o``.
        fmt: Value for ``-f``.
    """
    return _sdk(
        "save-as",
        _shell_quote(curated_file),
        "-f",
        _shell_quote(fmt, always=False),
        "-o",
        _shell_quote(output_file),
    )


def train_cmd(
    model_name: str,
    dataset_path: str,
    output_dir: str = OUTPUT_BASE,
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
) -> str:
    """Return the ``python3 TRAIN_SCRIPT`` command line for a training run.

    Args:
        model_name: Value for ``--model``.
        dataset_path: Value for ``--dataset``.
        output_dir: Value for ``--output``; defaults to ``OUTPUT_BASE``.
        num_epochs: Value for ``--epochs``.
        batch_size: Value for ``--batch-size``.
        learning_rate: Value for ``--lr``.

    Returns:
        The command string. String arguments are always single-quoted;
        numeric options are quoted only if they contain unsafe characters.
    """
    return (
        f"python3 {TRAIN_SCRIPT} "
        f"--model {_shell_quote(model_name)} "
        f"--dataset {_shell_quote(dataset_path)} "
        f"--output {_shell_quote(output_dir)} "
        f"--epochs {_shell_quote(num_epochs, always=False)} "
        f"--batch-size {_shell_quote(batch_size, always=False)} "
        f"--lr {_shell_quote(learning_rate, always=False)}"
    )