add e2e tests for checking functionality of resume from checkpoint (#865)
* use tensorboard to check whether resume from checkpoint works
* make sure each e2e test uses either fp16 or bf16
* set max_steps and the save limit so a checkpoint exists when testing resume
* fix test parameters
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
"""
|
||||
helper utils for tests
|
||||
"""
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import tempfile
|
||||
from functools import wraps
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def with_temp_dir(test_func):
|
||||
@@ -20,3 +21,13 @@ def with_temp_dir(test_func):
|
||||
shutil.rmtree(temp_dir)
|
||||
|
||||
return wrapper
|
||||
|
||||
|
||||
def most_recent_subdir(path):
    """
    Return the newest immediate subdirectory of ``path``.

    "Newest" is decided by ``os.path.getctime``, which reports the creation
    time on Windows and the time of the last metadata change on Unix.

    :param path: directory to scan; anything accepted by ``pathlib.Path``.
    :return: the ``Path`` of the subdirectory with the greatest ctime, or
        ``None`` when ``path`` contains no subdirectories.
    """
    base_path = Path(path)
    # Only direct children are considered; regular files are ignored.
    subdirectories = (d for d in base_path.iterdir() if d.is_dir())
    # ``default`` covers the "no subdirectories" case without a separate check.
    return max(subdirectories, key=os.path.getctime, default=None)
|
||||
|
||||
Reference in New Issue
Block a user