#!/usr/bin/env bash
# llm-trainer remote bootstrap
# Sets up the GPU host so the dashboard can run training pipelines end-to-end.
# Runs as the SSH user. sudo is optional: it is only used to create the data
# tree under /opt when /opt is not writable; if sudo is not installed the data
# tree falls back to $HOME/synthetic instead.
set -e

CONDA_DIR="$HOME/miniconda3"
ENV_NAME="synthetic-data"
SYNTHETIC_DIR="/opt/synthetic"
# Directory containing this script; train.py and config.yaml are copied from it.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# Try to use sudo only when we actually need it for /opt
SUDO=""
if [ ! -w /opt ] && [ "$(id -u)" -ne 0 ]; then
  if command -v sudo >/dev/null 2>&1; then
    SUDO="sudo"
  else
    SYNTHETIC_DIR="$HOME/synthetic"
    echo "[bootstrap] No sudo, falling back to $SYNTHETIC_DIR"
  fi
fi

# Derived only after SYNTHETIC_DIR has its final value, so the default and
# fallback locations cannot drift apart.
DATA_DIR="$SYNTHETIC_DIR/synthetic-data-kit/data"

# Startup banner. The "::stage::" lines look like machine-readable progress
# markers (presumably parsed by the dashboard -- confirm before rewording).
echo "::stage:: starting"
# USER is not guaranteed to be set in non-login sessions; fall back to id(1).
echo "[bootstrap] User: ${USER:-$(id -un)} Home: $HOME"
echo "[bootstrap] Conda: $CONDA_DIR"
echo "[bootstrap] Synthetic dir: $SYNTHETIC_DIR"
echo

# ── Step 1: miniconda ────────────────────────────────────────
# Installs Miniconda into $CONDA_DIR unless a conda executable is already
# there, then loads conda's shell integration into this shell.
echo "::stage:: miniconda"
if [ ! -x "$CONDA_DIR/bin/conda" ]; then
  echo "[1/5] Installing miniconda to $CONDA_DIR"

  # Pick the installer that matches this machine instead of assuming x86_64.
  case "$(uname -m)" in
    x86_64 | amd64) MINICONDA_ARCH="x86_64" ;;
    aarch64 | arm64) MINICONDA_ARCH="aarch64" ;;
    *)
      echo "[bootstrap] Unsupported CPU architecture: $(uname -m)" >&2
      exit 1
      ;;
  esac
  MINICONDA_URL="https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh"

  TMP_INSTALLER="$(mktemp /tmp/miniconda.XXXXXX.sh)"
  # Under `set -e` a failed download or install exits immediately; the trap
  # makes sure the temporary installer is removed on that path too.
  trap 'rm -f -- "$TMP_INSTALLER"' EXIT

  if command -v wget >/dev/null 2>&1; then
    wget -q -O "$TMP_INSTALLER" "$MINICONDA_URL"
  elif command -v curl >/dev/null 2>&1; then
    curl -fsSL -o "$TMP_INSTALLER" "$MINICONDA_URL"
  else
    echo "[bootstrap] Neither wget nor curl is available; cannot download miniconda" >&2
    exit 1
  fi

  # -b: batch mode (no prompts), -p: install prefix.
  bash "$TMP_INSTALLER" -b -p "$CONDA_DIR"
  rm -f -- "$TMP_INSTALLER"
  trap - EXIT
else
  echo "[1/5] Miniconda already installed at $CONDA_DIR"
fi

# Defines the `conda` shell function so `conda activate` works in this script.
source "$CONDA_DIR/etc/profile.d/conda.sh"

# ── Step 2: conda env ────────────────────────────────────────
# Accept Anaconda channel ToS (required by recent conda versions).
# Best-effort on purpose: output is discarded and failures are ignored
# (presumably because older conda releases lack the `tos` subcommand).
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main >/dev/null 2>&1 || true
conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r >/dev/null 2>&1 || true

echo "::stage:: conda_env"
# Compare the first column of `conda env list` to the env name as an exact
# string (not a regex). awk reads the whole listing before deciding, so the
# pipeline cannot be cut short by an early-exiting reader.
if conda env list | awk -v name="$ENV_NAME" '$1 == name { found = 1 } END { exit !found }'; then
  echo "[2/5] Conda env $ENV_NAME already exists"
else
  echo "[2/5] Creating conda env $ENV_NAME (python 3.10)"
  conda create -y -q -n "$ENV_NAME" python=3.10
fi

conda activate "$ENV_NAME"

# ── Step 3: Python packages ──────────────────────────────────
# Installs the training stack into the currently activated conda env.
echo "::stage:: python_packages"
echo "[3/5] Installing Python packages (this may take several minutes)"

# Requirement specifiers are kept in an array so the `>=` constraints stay
# single, correctly quoted arguments.
PIP_PACKAGES=(
  synthetic-data-kit
  "torch>=2.1"
  "transformers>=4.40"
  "peft>=0.10"
  "trl>=0.8"
  "accelerate>=0.30"
  "bitsandbytes>=0.43"
  "datasets>=2.18"
  pyyaml
  sentencepiece
  scipy
)

# `python -m pip` runs pip under the interpreter that is first on PATH, so the
# packages land in the active env even if another `pip` shadows it.
python -m pip install --upgrade pip setuptools wheel
python -m pip install "${PIP_PACKAGES[@]}"

# ── Step 4: Directories ──────────────────────────────────────
# Creates the data tree and the output directory; when sudo was needed to
# create them, ownership is handed back to the invoking user afterwards.
echo "::stage:: directories"
echo "[4/5] Creating data directories under $SYNTHETIC_DIR"

# Refuse to continue with empty paths: "$DATA_DIR/input" would otherwise
# resolve to "/input" and the chown below would target the wrong tree.
: "${SYNTHETIC_DIR:?SYNTHETIC_DIR must be set}" "${DATA_DIR:?DATA_DIR must be set}"

# $SUDO is intentionally unquoted: when empty it must expand to no word at all.
# shellcheck disable=SC2086
$SUDO mkdir -p \
  "$DATA_DIR/input" \
  "$DATA_DIR/parsed" \
  "$DATA_DIR/generated" \
  "$DATA_DIR/curated" \
  "$DATA_DIR/final" \
  "$SYNTHETIC_DIR/output"

if [ -n "$SUDO" ]; then
  # Ask id(1) for the real user and primary group: $USER may be unset, and a
  # group with the same name as the user does not exist on every system.
  OWNER_SPEC="$(id -un):$(id -gn)"
  $SUDO chown -R "$OWNER_SPEC" "$SYNTHETIC_DIR"
fi

# ── Step 5: Drop train.py + config.yaml ──────────────────────
# Copies the training script next to the data tree (always overwritten) and
# seeds a default config only when none exists, so an existing config is kept.
echo "::stage:: assets"
echo "[5/5] Installing train.py and default config"
cp "$SCRIPT_DIR/train.py" "$SYNTHETIC_DIR/train.py"
chmod +x "$SYNTHETIC_DIR/train.py"

if [ ! -f "$SYNTHETIC_DIR/synthetic-data-kit/config.yaml" ]; then
  cp "$SCRIPT_DIR/config.yaml" "$SYNTHETIC_DIR/synthetic-data-kit/config.yaml"
fi

echo
echo "::stage:: done"
echo "[bootstrap] OK Setup complete"
echo "[bootstrap] Conda env: $CONDA_DIR/envs/$ENV_NAME"
echo "[bootstrap] Data dirs: $DATA_DIR"
echo "[bootstrap] Train script: $SYNTHETIC_DIR/train.py"