replace tensorboard checks with helper function (#2120) [skip ci]
* replace tensorboard checks with helper function * move helper function * use relative import
This commit is contained in:
@@ -7,12 +7,11 @@ from pathlib import Path
|
|||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
from accelerate.test_utils import execute_subprocess_async
|
from accelerate.test_utils import execute_subprocess_async
|
||||||
from tbparse import SummaryReader
|
|
||||||
from transformers.testing_utils import get_torch_dist_unique_port
|
from transformers.testing_utils import get_torch_dist_unique_port
|
||||||
|
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from ..utils import most_recent_subdir
|
from ..utils import check_tensorboard
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
|
LOG = logging.getLogger("axolotl.tests.e2e.multigpu")
|
||||||
os.environ["WANDB_DISABLED"] = "true"
|
os.environ["WANDB_DISABLED"] = "true"
|
||||||
@@ -91,12 +90,8 @@ class TestMultiGPUEval:
|
|||||||
str(Path(temp_dir) / "config.yaml"),
|
str(Path(temp_dir) / "config.yaml"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high")
|
||||||
reader = SummaryReader(event_file)
|
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "eval/loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.5, "Loss is too high"
|
|
||||||
|
|
||||||
def test_eval(self, temp_dir):
|
def test_eval(self, temp_dir):
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
@@ -164,9 +159,5 @@ class TestMultiGPUEval:
|
|||||||
str(Path(temp_dir) / "config.yaml"),
|
str(Path(temp_dir) / "config.yaml"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
check_tensorboard(temp_dir + "/runs", "eval/loss", 2.9, "Eval Loss is too high")
|
||||||
reader = SummaryReader(event_file)
|
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "eval/loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.9, "Loss is too high"
|
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ from importlib import reload
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from tbparse import SummaryReader
|
|
||||||
from transformers.utils import is_torch_bf16_gpu_available
|
from transformers.utils import is_torch_bf16_gpu_available
|
||||||
|
|
||||||
from axolotl.cli import load_datasets
|
from axolotl.cli import load_datasets
|
||||||
@@ -17,7 +16,7 @@ from axolotl.train import train
|
|||||||
from axolotl.utils.config import normalize_config
|
from axolotl.utils.config import normalize_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from ..utils import most_recent_subdir
|
from ..utils import check_tensorboard
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||||
os.environ["WANDB_DISABLED"] = "true"
|
os.environ["WANDB_DISABLED"] = "true"
|
||||||
@@ -94,9 +93,6 @@ class TestFAXentropyLlama:
|
|||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 1.5, "Loss is too high"
|
|
||||||
|
|||||||
@@ -6,8 +6,6 @@ import os
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
from e2e.utils import most_recent_subdir
|
|
||||||
from tbparse import SummaryReader
|
|
||||||
|
|
||||||
from axolotl.cli import load_datasets
|
from axolotl.cli import load_datasets
|
||||||
from axolotl.common.cli import TrainerCliArgs
|
from axolotl.common.cli import TrainerCliArgs
|
||||||
@@ -15,6 +13,8 @@ from axolotl.train import train
|
|||||||
from axolotl.utils.config import normalize_config
|
from axolotl.utils.config import normalize_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
|
from ..utils import check_tensorboard
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||||
os.environ["WANDB_DISABLED"] = "true"
|
os.environ["WANDB_DISABLED"] = "true"
|
||||||
|
|
||||||
@@ -73,12 +73,9 @@ class TestUnslothQLoRA:
|
|||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.0, "Loss is too high"
|
|
||||||
|
|
||||||
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
def test_unsloth_llama_qlora_unpacked(self, temp_dir):
|
||||||
cfg = DictDefault(
|
cfg = DictDefault(
|
||||||
@@ -123,12 +120,9 @@ class TestUnslothQLoRA:
|
|||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.0, "Loss is too high"
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"sdp_attention",
|
"sdp_attention",
|
||||||
@@ -178,9 +172,6 @@ class TestUnslothQLoRA:
|
|||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.0, "Loss is too high"
|
|
||||||
|
|||||||
@@ -7,15 +7,13 @@ import os
|
|||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from tbparse import SummaryReader
|
|
||||||
|
|
||||||
from axolotl.cli import load_datasets
|
from axolotl.cli import load_datasets
|
||||||
from axolotl.common.cli import TrainerCliArgs
|
from axolotl.common.cli import TrainerCliArgs
|
||||||
from axolotl.train import train
|
from axolotl.train import train
|
||||||
from axolotl.utils.config import normalize_config
|
from axolotl.utils.config import normalize_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from .utils import most_recent_subdir, with_temp_dir
|
from .utils import check_tensorboard, with_temp_dir
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||||
os.environ["WANDB_DISABLED"] = "true"
|
os.environ["WANDB_DISABLED"] = "true"
|
||||||
@@ -66,12 +64,9 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
|||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "model.safetensors").exists()
|
assert (Path(temp_dir) / "model.safetensors").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.0, "Loss is too high"
|
|
||||||
|
|
||||||
@with_temp_dir
|
@with_temp_dir
|
||||||
def test_train_w_embedding_lr(self, temp_dir):
|
def test_train_w_embedding_lr(self, temp_dir):
|
||||||
@@ -113,9 +108,6 @@ class TestEmbeddingsLrScale(unittest.TestCase):
|
|||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "model.safetensors").exists()
|
assert (Path(temp_dir) / "model.safetensors").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.0, "Loss is too high"
|
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ import logging
|
|||||||
import os
|
import os
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
from tbparse import SummaryReader
|
|
||||||
from transformers.utils import is_torch_bf16_gpu_available
|
from transformers.utils import is_torch_bf16_gpu_available
|
||||||
|
|
||||||
from axolotl.cli import load_datasets
|
from axolotl.cli import load_datasets
|
||||||
@@ -15,7 +14,7 @@ from axolotl.train import train
|
|||||||
from axolotl.utils.config import normalize_config
|
from axolotl.utils.config import normalize_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from .utils import most_recent_subdir, with_temp_dir
|
from .utils import check_tensorboard, with_temp_dir
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||||
os.environ["WANDB_DISABLED"] = "true"
|
os.environ["WANDB_DISABLED"] = "true"
|
||||||
@@ -66,9 +65,6 @@ class TestPackedLlama(unittest.TestCase):
|
|||||||
|
|
||||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/train_loss")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 2.0, "Loss is too high"
|
|
||||||
|
|||||||
@@ -7,15 +7,13 @@ import os
|
|||||||
import unittest
|
import unittest
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from tbparse import SummaryReader
|
|
||||||
|
|
||||||
from axolotl.cli import load_datasets
|
from axolotl.cli import load_datasets
|
||||||
from axolotl.common.cli import TrainerCliArgs
|
from axolotl.common.cli import TrainerCliArgs
|
||||||
from axolotl.train import train
|
from axolotl.train import train
|
||||||
from axolotl.utils.config import normalize_config
|
from axolotl.utils.config import normalize_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from .utils import most_recent_subdir, with_temp_dir
|
from .utils import check_tensorboard, with_temp_dir
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||||
os.environ["WANDB_DISABLED"] = "true"
|
os.environ["WANDB_DISABLED"] = "true"
|
||||||
@@ -85,9 +83,6 @@ class TestReLoraLlama(unittest.TestCase):
|
|||||||
).exists()
|
).exists()
|
||||||
assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists()
|
assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists()
|
||||||
|
|
||||||
tb_log_path = most_recent_subdir(temp_dir + "/runs")
|
check_tensorboard(
|
||||||
event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
|
temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high"
|
||||||
reader = SummaryReader(event_file)
|
)
|
||||||
df = reader.scalars # pylint: disable=invalid-name
|
|
||||||
df = df[(df.tag == "train/grad_norm")] # pylint: disable=invalid-name
|
|
||||||
assert df.value.values[-1] < 0.2, "grad_norm is too high"
|
|
||||||
|
|||||||
@@ -12,6 +12,7 @@ import torch
|
|||||||
|
|
||||||
# from importlib.metadata import version
|
# from importlib.metadata import version
|
||||||
from packaging import version
|
from packaging import version
|
||||||
|
from tbparse import SummaryReader
|
||||||
|
|
||||||
|
|
||||||
def with_temp_dir(test_func):
|
def with_temp_dir(test_func):
|
||||||
@@ -66,3 +67,17 @@ def require_torch_2_5_1(test_case):
|
|||||||
def is_hopper():
|
def is_hopper():
|
||||||
compute_capability = torch.cuda.get_device_capability()
|
compute_capability = torch.cuda.get_device_capability()
|
||||||
return compute_capability == (9, 0)
|
return compute_capability == (9, 0)
|
||||||
|
|
||||||
|
|
||||||
|
def check_tensorboard(
    temp_run_dir: str, tag: str, lt_val: float, assertion_err: str
) -> None:
    """
    helper function to parse and check tensorboard logs

    Picks the run directory returned by ``most_recent_subdir(temp_run_dir)``,
    reads the first event file in it (by sorted file name) and asserts that
    the last logged value of the scalar ``tag`` is strictly below ``lt_val``.

    Args:
        temp_run_dir: directory holding the tensorboard run subdirectories,
            e.g. ``temp_dir + "/runs"``
        tag: scalar tag to inspect, e.g. ``"train/train_loss"``
        lt_val: exclusive upper bound for the last logged value
        assertion_err: message reported when the value is not below ``lt_val``

    Raises:
        AssertionError: if the run directory has no files, if nothing was
            logged under ``tag``, or if the last value is not below ``lt_val``
    """
    tb_log_path = most_recent_subdir(temp_run_dir)

    # Fail with a readable message instead of a bare IndexError when the
    # run directory is empty.
    event_files = sorted(os.listdir(tb_log_path))
    assert event_files, f"no tensorboard event files found in {tb_log_path}"
    event_file = os.path.join(tb_log_path, event_files[0])

    reader = SummaryReader(event_file)
    df = reader.scalars  # pylint: disable=invalid-name
    df = df[(df.tag == tag)]  # pylint: disable=invalid-name

    # A missing tag would otherwise surface as an IndexError on values[-1],
    # which hides the real cause (the metric was never logged).
    assert len(df) > 0, f"no scalars logged for tag {tag!r} in {event_file}"
    assert df.value.values[-1] < lt_val, assertion_err
|
||||||
|
|||||||
Reference in New Issue
Block a user