Lint fixes and additional train metric checks for KD (knowledge distillation)

2025-02-26 03:19:42 -05:00
parent afbb44f08b
commit 23f029a89c
5 changed files with 35 additions and 4 deletions
--- a/tests/e2e/integrations/test_kd.py
+++ b/tests/e2e/integrations/test_kd.py
@@ -90,6 +90,12 @@ class TestKnowledgeDistillation:
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
+        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
+        )

    @pytest.mark.parametrize(
        "load_in_8bit",
@@ -121,3 +127,9 @@ class TestKnowledgeDistillation:
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
+        )
+        check_tensorboard(
+            temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
+        )
--- a/tests/e2e/integrations/test_kl_loss.py
+++ b/tests/e2e/integrations/test_kl_loss.py
@@ -27,6 +27,7 @@ def test_kl_loss_gradient():
    student_logits_triton = student_logits.detach().clone().requires_grad_(True)

    # Generate random target token IDs, ensuring they're valid indices
+    # pylint: disable=duplicate-code
    target_token_ids = torch.randint(
        0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
    )