use smaller pretrained models for ci (#3620) [skip ci]

* use smaller pretrained models for ci * more steps for loss check * fix tests * more train steps * fix losses
2026-04-27 13:22:56 -04:00
parent 798c8fba89
commit ac77da96da
24 changed files with 716 additions and 288 deletions
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -199,6 +199,106 @@ def check_tensorboard(
        assert df.value.values[-1] > 1e-5, "Expected loss to be greater than zero"


+def check_tensorboard_loss_decreased(
+    temp_run_dir: str,
+    tag: str | None = None,
+    initial_window: int = 1,
+    final_window: int = 1,
+    min_delta: float | None = None,
+    max_initial: float | None = None,
+    max_final: float | None = None,
+    max_loss_ratio: float = 0.95,
+) -> None:
+    """Check that training learned: loss went down and stayed in a sensible range.
+
+    Used with the tiny ``axolotl-ai-co/tiny-*`` CI models, where pretraining
+    was brief enough that final loss won't clear the absolute thresholds used
+    for 135M+ models — but the training pipeline should still behave.
+
+    ``train/train_loss`` is only logged once (end-of-training aggregate). The
+    per-step tag is ``train/loss`` for SFT/LM trainers and may vary across
+    trainers (e.g. DPO). When ``tag`` is None we try common per-step tags in
+    order and use the first with enough samples.
+
+    Two kinds of regression we guard against:
+
+    1. **Loss blew up.** A silent bug (e.g. broken label masking) can start
+       training at an absurdly high loss. ``max_initial`` / ``max_final``
+       assert the measured means stay at-or-below bounds measured from a
+       known-good run. Both are optional but strongly encouraged — loss
+       going *down* from a bad starting scale still looks like "learning."
+
+    2. **Loss didn't go down enough.** ``max_loss_ratio`` (default 0.95)
+       requires ``final <= initial * ratio``. A default below 1.0 means the
+       final window mean must sit at least 5% below the initial window mean
+       — real learning, not noise that happened to land below start. Only
+       raise this for configs where a smaller drop is expected *and*
+       documented (e.g. DPO with near-trivial pairs); in that case you are
+       intentionally weakening the test.
+
+    ``min_delta`` is optional; when set, additionally requires
+    ``final + min_delta <= initial`` — use for configs with enough signal to demand
+    a specific minimum absolute drop. ``initial_window`` / ``final_window`` set how
+    many first / last logged values are averaged; each must be >= 1 (``ValueError``).
+    """
+    if initial_window < 1 or final_window < 1:
+        raise ValueError("initial_window and final_window must both be >= 1")
+
+    tb_log_path = most_recent_subdir(temp_run_dir)
+    event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
+    df = SummaryReader(event_file).scalars
+
+    candidates = ["train/loss", "train/train_loss"] if tag is None else [tag]
+    required = initial_window + final_window
+    chosen_tag, values = None, None
+    for candidate in candidates:
+        sub = df[df.tag == candidate]
+        if len(sub) >= required:
+            chosen_tag = candidate
+            values = sub.value.values
+            break
+
+    available = sorted({t for t in df.tag.unique() if "loss" in t.lower()})
+    assert values is not None, (
+        f"None of the tags {candidates} had ≥{required} logged steps. "
+        f"Loss tags present: {available}"
+    )
+
+    initial = float(values[:initial_window].mean())
+    final = float(values[-final_window:].mean())
+    # NaN, not ZeroDivisionError, when the initial mean is exactly zero.
+    ratio = final / initial if initial else float("nan")
+    print(
+        f"[check_tensorboard_loss_decreased] tag={chosen_tag} n={len(values)} "
+        f"initial_mean{initial_window}={initial:.4f} final_mean{final_window}={final:.4f}"
+    )
+    assert final > 1e-5, "Expected loss to be greater than zero"
+    assert final <= initial * max_loss_ratio, (
+        f"Loss did not decrease enough for {chosen_tag}: "
+        f"initial(mean of first {initial_window})={initial:.4f}, "
+        f"final(mean of last {final_window})={final:.4f}, "
+        f"ratio={ratio:.4f} (max allowed {max_loss_ratio}). "
+        f"Expected final <= initial * max_loss_ratio — training did not learn."
+    )
+    if min_delta is not None:
+        assert final + min_delta <= initial, (
+            f"Expected loss to decrease by at least {min_delta} for {chosen_tag}: "
+            f"initial={initial:.4f}, final={final:.4f}, delta={initial - final:.4f}"
+        )
+    if max_initial is not None:
+        assert initial <= max_initial, (
+            f"Initial loss {initial:.4f} is above the expected max {max_initial}. "
+            f"Absolute scale is wrong — probably a silent regression "
+            f"(e.g. bad label masking) that bumped the starting point."
+        )
+    if max_final is not None:
+        assert final <= max_final, (
+            f"Final loss {final:.4f} is above the expected max {max_final}. "
+            f"Absolute scale is wrong — probably a silent regression "
+            f"(e.g. bad label masking) that bumped the endpoint."
+        )
+
+
 def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None:
    """
    helper function to check if a model output file exists after training