From bb483ad4c47c7d90b3a33997f8da2de9dcccc03a Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Thu, 19 Mar 2026 08:29:24 -0400 Subject: [PATCH] make the CI fail GitHub Actions on test failures (#3517) * make the CI fail GitHub Actions on test failures * use model bundle * install zstd for compressed model artifact --- cicd/Dockerfile-uv.jinja | 2 +- cicd/Dockerfile.jinja | 2 +- cicd/cicd.sh | 11 ++++++----- cicd/single_gpu.py | 10 +++------- 4 files changed, 11 insertions(+), 14 deletions(-) diff --git a/cicd/Dockerfile-uv.jinja b/cicd/Dockerfile-uv.jinja index 29c2e79d5..857b94c6b 100644 --- a/cicd/Dockerfile-uv.jinja +++ b/cicd/Dockerfile-uv.jinja @@ -11,7 +11,7 @@ ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}" ENV HF_HOME="{{ HF_HOME }}" RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm + apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm WORKDIR /workspace diff --git a/cicd/Dockerfile.jinja b/cicd/Dockerfile.jinja index 4f0140fc6..7344f2a2c 100644 --- a/cicd/Dockerfile.jinja +++ b/cicd/Dockerfile.jinja @@ -12,7 +12,7 @@ ENV HF_HOME="{{ HF_HOME }}" ENV AXOLOTL_DATASET_NUM_PROC="8" RUN apt-get update && \ - apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm + apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm WORKDIR /workspace diff --git a/cicd/cicd.sh b/cicd/cicd.sh index 462b874a6..5058779fb 100755 --- a/cicd/cicd.sh +++ b/cicd/cicd.sh @@ -3,11 +3,12 @@ set -e python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__" -# curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1 -hf download "NousResearch/Meta-Llama-3-8B" -hf download "NousResearch/Meta-Llama-3-8B-Instruct" -hf download "microsoft/Phi-4-reasoning" -hf download "microsoft/Phi-3.5-mini-instruct" +curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/" --use-compress-program unzstd --strip-components=1 +# hf download "NousResearch/Meta-Llama-3-8B" +# hf download "NousResearch/Meta-Llama-3-8B-Instruct" +# hf download "microsoft/Phi-4-reasoning" +# hf download "microsoft/Phi-3.5-mini-instruct" +# hf download "microsoft/Phi-3-medium-128k-instruct" # Run unit tests with initial coverage report pytest -v --durations=10 -n8 \ diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py index cd73f60b8..592b6b931 100644 --- a/cicd/single_gpu.py +++ b/cicd/single_gpu.py @@ -68,10 +68,6 @@ def run_cmd(cmd: str, run_folder: str): sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8" # Propagate errors from subprocess. - try: - exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env) # nosec - if exit_code: - print(f"Command '{cmd}' failed with exit code {exit_code}") - return exit_code - except Exception as e: # pylint: disable=broad-except - print(f"Command '{cmd}' failed with exception {e}") + exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env) # nosec + if exit_code: + raise RuntimeError(f"Command '{cmd}' failed with exit code {exit_code}")