From e993ed5208cd3b8d4d21073ff407ce0be4289f27 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Mon, 13 Apr 2026 18:29:55 +0000
Subject: [PATCH] retry head-server probe with longer timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

``get_server_configs`` was hardcoded to a 5s timeout with no retry.
That's empirically too tight to survive a kill-and-relaunch cycle: when
the agent server is finishing in-flight rollouts from a prior run, it
can take 10-30s to respond to /global_config_dict_yaml, and the trainer
would crash at startup with a ReadTimeoutError.

Bump the per-attempt timeout to 30s and retry up to 3 times with a 2s/4s
backoff. The retry intentionally raises a RuntimeError after the third
failure rather than returning empty config — silent failure here would
let training proceed with no agent servers discovered, which is also a
no-op trainer.

The RuntimeError is chained to the last underlying exception so the
original traceback is preserved, and the warning only says "Retrying..."
when another attempt will actually be made.
---
 src/axolotl/integrations/nemo_gym/server.py | 44 ++++++++++++++++-----
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/src/axolotl/integrations/nemo_gym/server.py b/src/axolotl/integrations/nemo_gym/server.py
index 0af9b3b71..bd619569e 100644
--- a/src/axolotl/integrations/nemo_gym/server.py
+++ b/src/axolotl/integrations/nemo_gym/server.py
@@ -130,21 +130,45 @@ def start_servers(
     )
 
 
-def get_server_configs(head_port: int = 11000) -> dict:
+def get_server_configs(head_port: int = 11000, timeout: float = 30.0) -> dict:
     """Fetch the global config from the NeMo Gym head server.
 
+    Retries up to 3 times with a 2s/4s backoff. The default per-attempt
+    timeout is 30s (raised from the original 5s) because head servers can
+    be slow to respond when they're concurrently serving rollouts from a
+    prior training run. A 5s timeout was empirically too tight to survive
+    a kill-and-relaunch cycle.
+
     Returns:
         Dict mapping server_name -> server config.
+
+    Raises:
+        RuntimeError: If the head server does not respond after 3 attempts.
     """
-    response = requests.get(
-        f"http://127.0.0.1:{head_port}/global_config_dict_yaml", timeout=5
+    url = f"http://127.0.0.1:{head_port}/global_config_dict_yaml"
+    last_exc: Exception | None = None
+    for attempt in (1, 2, 3):
+        try:
+            response = requests.get(url, timeout=timeout)
+            response.raise_for_status()
+            result = yaml.safe_load(response.text)
+            # NeMo Gym head server double-encodes: YAML string inside a YAML string
+            if isinstance(result, str):
+                result = yaml.safe_load(result)
+            return result
+        except (requests.exceptions.RequestException, OSError) as exc:
+            last_exc = exc
+            LOG.warning(
+                "NeMo Gym head probe attempt %d/3 failed: %s.%s",
+                attempt,
+                type(exc).__name__,
+                " Retrying..." if attempt < 3 else "",
+            )
+            if attempt < 3:
+                time.sleep(2.0 * attempt)
+    raise RuntimeError(
+        f"NeMo Gym head server at {url} did not respond after 3 attempts: {last_exc}"
-    )
+    ) from last_exc
-    response.raise_for_status()
-    result = yaml.safe_load(response.text)
-    # NeMo Gym head server double-encodes: YAML string inside a YAML string
-    if isinstance(result, str):
-        result = yaml.safe_load(result)
-    return result
 
 
 def get_agent_servers(