# llm-trainer/backend/pipeline.py

# ──────────────────────────────────────────────────────────────────────────────
#  Pipeline paths & command builders
#  These match the remote Ubuntu server layout from LLM_TRAINER_APP_SCOPE.md
# ──────────────────────────────────────────────────────────────────────────────

# Executable of the synthetic-data-kit CLI (lives in a miniconda env).
SDK_BIN = "/home/tocmo0nlord/miniconda3/envs/synthetic-data/bin/synthetic-data-kit"
# Config file handed to every synthetic-data-kit invocation via --config.
CONFIG_PATH = "/opt/synthetic/synthetic-data-kit/config.yaml"
# Root directory under which each pipeline stage has its own sub-directory.
DATA_BASE = "/opt/synthetic/synthetic-data-kit/data"

# Stage name -> directory, i.e. "<DATA_BASE>/<stage>" for every stage.
STAGE_DIRS = {
    stage: f"{DATA_BASE}/{stage}"
    for stage in ("input", "parsed", "generated", "curated", "final")
}

# Training entry point and the default location for training output.
TRAIN_SCRIPT = "/opt/synthetic/train.py"
OUTPUT_BASE = "/opt/synthetic/output"


def _sdk(subcommand: str, *args) -> str:
    """Assemble a synthetic-data-kit command line for ``subcommand``.

    The result has the shape
    ``<SDK_BIN> --config <CONFIG_PATH> <subcommand> <args...>`` with the
    extra ``args`` separated by single spaces.  Arguments are inserted
    verbatim: any shell quoting must already have been applied by the
    caller.
    """
    trailing = " ".join(args)
    template = "{binary} --config {config} {sub} {rest}"
    return template.format(
        binary=SDK_BIN,
        config=CONFIG_PATH,
        sub=subcommand,
        rest=trailing,
    )


def ingest_cmd(input_file: str) -> str:
    """Build the shell command for the ``ingest`` subcommand.

    Args:
        input_file: Path of the file to ingest.

    Returns:
        A command string that invokes ``ingest`` on ``input_file`` and
        passes the ``parsed`` stage directory as ``-o``.
    """
    # Function-scope import so this builder needs no module-level changes.
    import shlex

    # shlex.quote escapes embedded single quotes and other shell
    # metacharacters.  Wrapping the path in bare '...' produced a broken
    # (and injectable) command for names such as "it's.pdf".
    return _sdk("ingest", shlex.quote(input_file), "-o", STAGE_DIRS["parsed"])


def create_cmd(parsed_file: str, num_pairs: int = 50, pair_type: str = "qa") -> str:
    """Build the shell command for the ``create`` subcommand.

    Args:
        parsed_file: Path of the parsed file to pass to ``create``.
        num_pairs: Value for the ``--num-pairs`` option.
        pair_type: Value for the ``--type`` option.

    Returns:
        A command string that invokes ``create`` on ``parsed_file`` and
        passes the ``generated`` stage directory as ``-o``.
    """
    # Function-scope import so this builder needs no module-level changes.
    import shlex

    # Every caller-supplied value is shell-quoted: the path used to be
    # wrapped in bare '...' (which breaks on an embedded single quote) and
    # the option values were interpolated with no quoting at all.
    return _sdk(
        "create", shlex.quote(parsed_file),
        "-o", STAGE_DIRS["generated"],
        "--type", shlex.quote(str(pair_type)),
        "--num-pairs", shlex.quote(str(num_pairs)),
    )


def curate_cmd(generated_file: str, output_file: str, threshold: float = 7.0) -> str:
    """Build the shell command for the ``curate`` subcommand.

    Args:
        generated_file: Path of the generated file to pass to ``curate``.
        output_file: Path passed as the ``-o`` option.
        threshold: Value for the ``--threshold`` option.

    Returns:
        A command string that invokes ``curate`` with the given paths and
        threshold.
    """
    # Function-scope import so this builder needs no module-level changes.
    import shlex

    # shlex.quote handles embedded single quotes and shell metacharacters
    # that the previous bare '...' wrapping let through.
    return _sdk(
        "curate", shlex.quote(generated_file),
        "-o", shlex.quote(output_file),
        "--threshold", shlex.quote(str(threshold)),
    )


def save_as_cmd(curated_file: str, output_file: str, fmt: str = "jsonl") -> str:
    """Build the shell command for the ``save-as`` subcommand.

    Args:
        curated_file: Path of the curated file to pass to ``save-as``.
        output_file: Path passed as the ``-o`` option.
        fmt: Value for the ``-f`` option.

    Returns:
        A command string that invokes ``save-as`` with the given paths and
        format.
    """
    # Function-scope import so this builder needs no module-level changes.
    import shlex

    # Quote every caller-supplied value: bare '...' wrapping broke on an
    # embedded single quote, and ``fmt`` was previously not quoted at all.
    return _sdk(
        "save-as", shlex.quote(curated_file),
        "-f", shlex.quote(str(fmt)),
        "-o", shlex.quote(output_file),
    )


def train_cmd(
    model_name: str,
    dataset_path: str,
    output_dir: str = OUTPUT_BASE,
    num_epochs: int = 3,
    batch_size: int = 2,
    learning_rate: float = 2e-4,
) -> str:
    """Build the shell command that runs the training script.

    Args:
        model_name: Value for the ``--model`` option.
        dataset_path: Value for the ``--dataset`` option.
        output_dir: Value for the ``--output`` option; defaults to
            ``OUTPUT_BASE``.
        num_epochs: Value for the ``--epochs`` option.
        batch_size: Value for the ``--batch-size`` option.
        learning_rate: Value for the ``--lr`` option.

    Returns:
        A ``python3 <TRAIN_SCRIPT> ...`` command string with one
        ``--flag value`` pair per argument, in the order listed above.
    """
    # Function-scope import so this builder needs no module-level changes.
    import shlex

    options = (
        ("--model", model_name),
        ("--dataset", dataset_path),
        ("--output", output_dir),
        ("--epochs", num_epochs),
        ("--batch-size", batch_size),
        ("--lr", learning_rate),
    )
    # shlex.quote escapes embedded single quotes and shell metacharacters;
    # the previous bare '...' wrapping broke on values containing a single
    # quote, and the numeric options were interpolated without any quoting.
    rendered = " ".join(
        f"{flag} {shlex.quote(str(value))}" for flag, value in options
    )
    return f"python3 {TRAIN_SCRIPT} {rendered}"