* update trl to 0.17.0 * grpo + vllm no longer supported with 2.5.1 due to vllm constraints * disable VLLM_USE_V1 for ci * improve handling of killing off the multiprocessing vllm service * debug why this doesn't run in CI * increase vllm wait time * increase timeout to 5min * upgrade to vllm 0.8.4 * dump out the vllm log for debugging * use debug logging * increase vllm start timeout * use NVL instead * disable torch compile cache * revert some commented checks now that grpo tests are fixed * increase vllm timeout back to 5min
24 lines
757 B
Bash
Executable File
24 lines
757 B
Bash
Executable File
#!/bin/bash
# Run the multi-GPU e2e test suite in three batches and upload the combined
# coverage report to Codecov.
#
# Env vars read:
#   CODECOV_TOKEN    - Codecov upload token (upload is best-effort, see below)
#   PYTORCH_VERSION  - appended to the Codecov flags for this run
set -e -o pipefail

# Only run two tests at a time to avoid OOM on GPU (with coverage collection).
# The solo/ and patched/ suites are excluded here; they run separately below.
pytest -v -n2 \
    --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
    --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
    /workspace/axolotl/tests/e2e/multigpu/ \
    --cov=axolotl

# Run solo tests with coverage append (one at a time, -n1)
pytest -v --durations=10 -n1 \
    /workspace/axolotl/tests/e2e/multigpu/solo/ \
    --cov=axolotl \
    --cov-append

# Run patched tests last and emit the combined XML coverage report
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
    --cov=axolotl \
    --cov-append \
    --cov-report=xml:multigpu-coverage.xml

# Upload coverage to Codecov; '|| true' keeps a failed upload from failing
# the CI job under 'set -e' (intentional best-effort).
codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml \
    -F "multigpu,docker-tests,pytorch-${PYTORCH_VERSION}" || true