replace tensorboard checks with helper function (#2120) [skip ci]

* replace tensorboard checks with helper function
* move helper function
* use relative
2024-12-03 21:06:20 -05:00
parent 418ad2b586
commit a1790f2652
7 changed files with 50 additions and 74 deletions
--- a/tests/e2e/patched/test_fa_xentropy.py
+++ b/tests/e2e/patched/test_fa_xentropy.py
@@ -8,7 +8,6 @@ from importlib import reload
 from pathlib import Path

 import pytest
-from tbparse import SummaryReader
 from transformers.utils import is_torch_bf16_gpu_available

 from axolotl.cli import load_datasets
@@ -17,7 +16,7 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

-from ..utils import most_recent_subdir
+from ..utils import check_tensorboard

 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -94,9 +93,6 @@ class TestFAXentropyLlama:
        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.bin").exists()

-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 1.5, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
+        )
--- a/tests/e2e/patched/test_unsloth_qlora.py
+++ b/tests/e2e/patched/test_unsloth_qlora.py
@@ -6,8 +6,6 @@ import os
 from pathlib import Path

 import pytest
-from e2e.utils import most_recent_subdir
-from tbparse import SummaryReader

 from axolotl.cli import load_datasets
 from axolotl.common.cli import TrainerCliArgs
@@ -15,6 +13,8 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault

+from ..utils import check_tensorboard
+
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"

@@ -73,12 +73,9 @@ class TestUnslothQLoRA:
        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.bin").exists()

-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )

    def test_unsloth_llama_qlora_unpacked(self, temp_dir):
        cfg = DictDefault(
@@ -123,12 +120,9 @@ class TestUnslothQLoRA:
        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.bin").exists()

-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )

    @pytest.mark.parametrize(
        "sdp_attention",
@@ -178,9 +172,6 @@ class TestUnslothQLoRA:
        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.bin").exists()

-        tb_log_path = most_recent_subdir(temp_dir + "/runs")
-        event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
-        reader = SummaryReader(event_file)
-        df = reader.scalars  # pylint: disable=invalid-name
-        df = df[(df.tag == "train/train_loss")]  # pylint: disable=invalid-name
-        assert df.value.values[-1] < 2.0, "Loss is too high"
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
+        )