Lint fixes and additional train metric checks for knowledge distillation (KD)

2025-02-26 03:19:42 -05:00
parent afbb44f08b
commit 23f029a89c
5 changed files with 35 additions and 4 deletions
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -90,6 +90,12 @@ class TestKnowledgeDistillation:
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
+        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
+        )

    @pytest.mark.parametrize(
        "load_in_8bit",
@@ -121,3 +127,9 @@ class TestKnowledgeDistillation:
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
+        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
+        )
--- a/tests/e2e/integrations/test_kl_loss.py
+++ b/tests/e2e/integrations/test_kl_loss.py
@@ -27,6 +27,7 @@ def test_kl_loss_gradient():
    student_logits_triton = student_logits.detach().clone().requires_grad_(True)

    # Generate random target token IDs, ensuring they're valid indices
+    # pylint: disable=duplicate-code
    target_token_ids = torch.randint(
        0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
    )
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -102,7 +102,11 @@ def is_hopper():


 def check_tensorboard(
-    temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
+    temp_run_dir: str,
+    tag: str,
+    comparison_val: float,
+    assertion_err: str,
+    lt: bool = True,
 ) -> None:
    """
    helper function to parse and check tensorboard logs
@@ -112,10 +116,20 @@ def check_tensorboard(
    reader = SummaryReader(event_file)
    df = reader.scalars  # pylint: disable=invalid-name
    df = df[(df.tag == tag)]  # pylint: disable=invalid-name
-    if "%s" in assertion_err:
-        assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1]
+    if lt:
+        if "%s" in assertion_err:
+            assert df.value.values[-1] < comparison_val, (
+                assertion_err % df.value.values[-1]
+            )
+        else:
+            assert df.value.values[-1] < comparison_val, assertion_err
    else:
-        assert df.value.values[-1] < lt_val, assertion_err
+        if "%s" in assertion_err:
+            assert df.value.values[-1] > comparison_val, (
+                assertion_err % df.value.values[-1]
+            )
+        else:
+            assert df.value.values[-1] > comparison_val, assertion_err


 def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None: