update doc and use P2P=LOC for brittle grpo test (#2649)

* update doc and skip brittle grpo test * fix the path to run the multigpu tests * increase timeout, use LOC instead of NVL * typo * use hf cache from s3 backed cloudfront * mark grpo as flaky test dues to vllm start
2025-05-12 14:17:25 -04:00
parent c7b6790614
commit f34eef546a
6 changed files with 131 additions and 110 deletions
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -166,6 +166,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
 """
            )

+    @pytest.mark.skip(reason="flaky test")
    @pytest.mark.parametrize(
        "num_gpus",
        [1, 2],
@@ -227,7 +228,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):

        current_env = os.environ.copy()
        env = {
-            "NCCL_P2P_LEVEL": "NVL",
+            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
            "VLLM_DISABLE_COMPILE_CACHE": "1",
@@ -257,7 +258,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
-                    "NCCL_P2P_LEVEL": "NVL",
+                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
@@ -265,6 +266,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
        finally:
            recursive_kill(vllm_process)

+    @pytest.mark.skip(reason="flaky test")
    @pytest.mark.parametrize(
        "num_gpus",
        [1, 2],
@@ -320,7 +322,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):

        current_env = os.environ.copy()
        env = {
-            "NCCL_P2P_LEVEL": "NVL",  # nccl can be brittle, assume P2P isn't reliable
+            "NCCL_P2P_LEVEL": "LOC",  # nccl can be brittle, assume P2P isn't reliable
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
            "VLLM_DISABLE_COMPILE_CACHE": "1",
@@ -350,7 +352,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
-                    "NCCL_P2P_LEVEL": "NVL",
+                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },