From cf17649ef3a1eada923668b37d84511ef72caabf Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Fri, 31 Jan 2025 08:58:04 -0500 Subject: [PATCH] Misc fixes 20250130 (#2301) * misc fixes for garbage collection and L40S w NCCL P2P * patch bnb fix for triton check * chore: lint * change up import * try patching differently * remove patch for bnb fix for now * more verbose checks and tweak train loss threshold --- docs/dataset-formats/stepwise_supervised.qmd | 2 +- src/axolotl/utils/callbacks/__init__.py | 8 +++++++- src/axolotl/utils/environment.py | 2 +- tests/e2e/test_process_reward_model_smollm2.py | 2 +- tests/e2e/utils.py | 5 ++++- 5 files changed, 14 insertions(+), 5 deletions(-) diff --git a/docs/dataset-formats/stepwise_supervised.qmd b/docs/dataset-formats/stepwise_supervised.qmd index 072bf8353..2cec8e1bd 100644 --- a/docs/dataset-formats/stepwise_supervised.qmd +++ b/docs/dataset-formats/stepwise_supervised.qmd @@ -23,4 +23,4 @@ Here's a simple example of a stepwise supervised dataset entry: ], "labels": [true, false] } -``` \ No newline at end of file +``` diff --git a/src/axolotl/utils/callbacks/__init__.py b/src/axolotl/utils/callbacks/__init__.py index d92cb9d99..9ca0e84fe 100644 --- a/src/axolotl/utils/callbacks/__init__.py +++ b/src/axolotl/utils/callbacks/__init__.py @@ -846,6 +846,12 @@ class GCCallback(TrainerCallback): def on_step_end( self, args, state, control, **kwargs # pylint: disable=unused-argument ): - if state.global_step % self.gc_steps == 0: + if self.gc_steps > 0 and state.global_step % self.gc_steps == 0: torch.cuda.empty_cache() gc.collect() + + def on_epoch_end( + self, args, state, control, **kwargs # pylint: disable=unused-argument + ): + torch.cuda.empty_cache() + gc.collect() diff --git a/src/axolotl/utils/environment.py b/src/axolotl/utils/environment.py index cf2e5d23d..381fec84c 100644 --- a/src/axolotl/utils/environment.py +++ b/src/axolotl/utils/environment.py @@ -10,7 +10,7 @@ from accelerate.utils.environment import get_gpu_info def check_cuda_p2p_ib_support(): if not accelerate_check_cuda_p2p_ib_support(): return False - unsupported_devices = {"RTX 6000 Ada"} + unsupported_devices = {"RTX 6000 Ada", "L40S"} try: device_names, device_count = get_gpu_info() if 1 < device_count < 8: diff --git a/tests/e2e/test_process_reward_model_smollm2.py b/tests/e2e/test_process_reward_model_smollm2.py index 16bf2cdc8..19347cf92 100644 --- a/tests/e2e/test_process_reward_model_smollm2.py +++ b/tests/e2e/test_process_reward_model_smollm2.py @@ -63,7 +63,7 @@ class TestProcessRewardSmolLM2(unittest.TestCase): train(cfg=cfg, dataset_meta=dataset_meta) check_tensorboard( - temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss is too high" + temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high" ) check_model_output_exists(temp_dir, cfg) diff --git a/tests/e2e/utils.py b/tests/e2e/utils.py index de0dba33a..2baead7d2 100644 --- a/tests/e2e/utils.py +++ b/tests/e2e/utils.py @@ -82,7 +82,10 @@ def check_tensorboard( reader = SummaryReader(event_file) df = reader.scalars # pylint: disable=invalid-name df = df[(df.tag == tag)] # pylint: disable=invalid-name - assert df.value.values[-1] < lt_val, assertion_err + if "%s" in assertion_err: + assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1] + else: + assert df.value.values[-1] < lt_val, assertion_err def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None: