* slurm example and make preprocess play nicely * start slurm if its init file exists * remove incorrect comment * feat: add slurm docs --------- Co-authored-by: NanoCode012 <nano@axolotl.ai>
21 lines
670 B
Bash
21 lines
670 B
Bash
#!/bin/bash
# Multinode axolotl training job for SLURM.
#
# Prior to running this script, export your HF_TOKEN and WANDB_API_KEY to your environment; i.e.
#
# export HF_TOKEN="..."
# export WANDB_API_KEY="..."
#
# Then submit with:  sbatch <this-script>

# ---------- SBATCH commands ---------- #
# NOTE(review): sbatch does NOT expand shell variables inside #SBATCH
# directives, so "$NUM_NODES" below is taken literally and the directive is
# ineffective as written. Replace it with a concrete number, or override at
# submit time instead:  sbatch --nodes=<N> <this-script>
#SBATCH --job-name=axolotl-slurm-multinode
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=$NUM_NODES
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-task=128

set -euo pipefail

# Fail fast inside the job if the required credentials were not exported
# before submission (sbatch propagates the submission environment by default).
: "${HF_TOKEN:?export HF_TOKEN before submitting this job}"
: "${WANDB_API_KEY:?export WANDB_API_KEY before submitting this job}"

# Disable torch.distributed's barrier during process-group initialization.
export TORCH_DIST_INIT_BARRIER=0

# With --ntasks-per-node=1, srun runs this once per allocated node:
# preprocess/tokenize the dataset before training starts.
srun axolotl preprocess train.yaml

# Launch one torchrun per node; each spawns $NUM_TRAINERS workers, and the
# workers rendezvous via the c10d backend at $PRIMARY_ADDR:$PRIMARY_PORT.
# (join_timeout raised to 1800s so slow-starting nodes can still join.)
srun axolotl train train.yaml --launcher torchrun -- \
  --nproc_per_node="$NUM_TRAINERS" --nnodes="$NUM_NODES" \
  --rdzv_id=axolotl-cli --rdzv_backend=c10d \
  --rdzv_endpoint="${PRIMARY_ADDR}:${PRIMARY_PORT}" \
  --rdzv_conf="join_timeout=1800"