add e2e tests for checking functionality of resume from checkpoint (#865)

* use tensorboard to see if resume from checkpoint works

* make sure e2e test is either fp16 or bf16

* set max_steps and save limit so we have the checkpoint when testing resuming

* fix test parameters
This commit is contained in:
Wing Lian
2023-11-15 23:05:55 -05:00
committed by GitHub
parent 8a8d1c4023
commit b3a61e8ce2
4 changed files with 109 additions and 1 deletions

View File

@@ -1,10 +1,11 @@
"""
helper utils for tests
"""
import os
import shutil
import tempfile
from functools import wraps
from pathlib import Path
def with_temp_dir(test_func):
@@ -20,3 +21,13 @@ def with_temp_dir(test_func):
shutil.rmtree(temp_dir)
return wrapper
def most_recent_subdir(path):
    """Return the immediate subdirectory of ``path`` with the largest ctime.

    Only direct children of ``path`` that are directories are considered;
    regular files are ignored. Candidates are ranked by
    ``os.path.getctime``.

    Args:
        path: Directory to scan, as a string or ``Path``.

    Returns:
        The ``Path`` of the winning subdirectory, or ``None`` when ``path``
        contains no subdirectories.
    """
    # Materialize the candidates first so the directory is fully listed
    # before any timestamps are read.
    candidates = list(filter(Path.is_dir, Path(path).iterdir()))
    # ``default=None`` covers the "no subdirectories" case without a branch.
    return max(candidates, key=os.path.getctime, default=None)