lint and additional train metric checks for kd

This commit is contained in:
Wing Lian
2025-02-26 03:19:42 -05:00
parent afbb44f08b
commit 23f029a89c
5 changed files with 35 additions and 4 deletions

View File

@@ -90,6 +90,12 @@ class TestKnowledgeDistillation:
check_tensorboard(
temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
)
check_tensorboard(
temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
)
check_tensorboard(
temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
)
@pytest.mark.parametrize(
"load_in_8bit",
@@ -121,3 +127,9 @@ class TestKnowledgeDistillation:
check_tensorboard(
temp_dir + "/runs", "train/loss", 1.0, "Train Loss is too high"
)
check_tensorboard(
temp_dir + "/runs", "train/loss", 0.0, "Train Loss is too low", lt=False
)
check_tensorboard(
temp_dir + "/runs", "train/grad_norm", 8.0, "Train grad norm is too high"
)

View File

@@ -27,6 +27,7 @@ def test_kl_loss_gradient():
student_logits_triton = student_logits.detach().clone().requires_grad_(True)
# Generate random target token IDs, ensuring they're valid indices
# pylint: disable=duplicate-code
target_token_ids = torch.randint(
0, vocab_size, (batch_size, seq_len, top_k), device="cuda"
)

View File

@@ -102,7 +102,11 @@ def is_hopper():
def check_tensorboard(
temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
temp_run_dir: str,
tag: str,
comparison_val: float,
assertion_err: str,
lt: bool = True,
) -> None:
"""
helper function to parse and check tensorboard logs
@@ -112,10 +116,20 @@ def check_tensorboard(
reader = SummaryReader(event_file)
df = reader.scalars # pylint: disable=invalid-name
df = df[(df.tag == tag)] # pylint: disable=invalid-name
if "%s" in assertion_err:
assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1]
if lt:
if "%s" in assertion_err:
assert df.value.values[-1] < comparison_val, (
assertion_err % df.value.values[-1]
)
else:
assert df.value.values[-1] < comparison_val, assertion_err
else:
assert df.value.values[-1] < lt_val, assertion_err
if "%s" in assertion_err:
assert df.value.values[-1] > comparison_val, (
assertion_err % df.value.values[-1]
)
else:
assert df.value.values[-1] > comparison_val, assertion_err
def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None: