add e2e tests for checking functionality of resume from checkpoint (#865)
* use tensorboard to see if resume from checkpoint works
* make sure e2e test is either fp16 or bf16
* set max_steps and save limit so we have the checkpoint when testing resuming
* fix test parameters
This commit is contained in:
@@ -101,6 +101,7 @@ class TestLoraLlama(unittest.TestCase):
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
"bf16": True,
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
|
||||
Reference in New Issue
Block a user