add e2e tests for checking functionality of resume from checkpoint (#865)

* use tensorboard to see if resume from checkpoint works

* make sure e2e test is either fp16 or bf16

* set max_steps and save limit so we have the checkpoint when testing resuming

* fix test parameters
This commit is contained in:
Wing Lian
2023-11-15 23:05:55 -05:00
committed by GitHub
parent 8a8d1c4023
commit b3a61e8ce2
4 changed files with 109 additions and 1 deletions

View File

@@ -32,3 +32,4 @@ pynvml
art art
fschat==0.2.29 fschat==0.2.29
gradio gradio
tensorboard

View File

@@ -101,6 +101,7 @@ class TestLoraLlama(unittest.TestCase):
"learning_rate": 0.00001, "learning_rate": 0.00001,
"optimizer": "adamw_torch", "optimizer": "adamw_torch",
"lr_scheduler": "cosine", "lr_scheduler": "cosine",
"bf16": True,
} }
) )
normalize_config(cfg) normalize_config(cfg)

95
tests/e2e/test_resume.py Normal file
View File

@@ -0,0 +1,95 @@
"""
E2E tests for resuming training
"""
import logging
import os
import re
import subprocess
import unittest
from pathlib import Path
from transformers.utils import is_torch_bf16_gpu_available
from axolotl.cli import load_datasets
from axolotl.common.cli import TrainerCliArgs
from axolotl.train import train
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault
from .utils import most_recent_subdir, with_temp_dir
LOG = logging.getLogger("axolotl.tests.e2e")
os.environ["WANDB_DISABLED"] = "true"
class TestResumeLlama(unittest.TestCase):
"""
Test case for resuming training of llama models
"""
@with_temp_dir
def test_resume_qlora(self, temp_dir):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "JackFram/llama-68m",
"tokenizer_type": "LlamaTokenizer",
"sequence_len": 1024,
"sample_packing": True,
"flash_attention": True,
"load_in_4bit": True,
"adapter": "qlora",
"lora_r": 32,
"lora_alpha": 64,
"lora_dropout": 0.05,
"lora_target_linear": True,
"val_set_size": 0.1,
"special_tokens": {},
"datasets": [
{
"path": "vicgalle/alpaca-gpt4",
"type": "alpaca",
},
],
"num_epochs": 2,
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_torch",
"lr_scheduler": "cosine",
"save_steps": 10,
"save_total_limit": 5,
"max_steps": 40,
}
)
if is_torch_bf16_gpu_available():
cfg.bf16 = True
else:
cfg.fp16 = True
normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
resume_cfg = cfg | DictDefault(
{
"resume_from_checkpoint": f"{temp_dir}/checkpoint-30/",
}
)
normalize_config(resume_cfg)
cli_args = TrainerCliArgs()
train(cfg=resume_cfg, cli_args=cli_args, dataset_meta=dataset_meta)
assert (Path(temp_dir) / "adapter_model.bin").exists()
tb_log_path_1 = most_recent_subdir(temp_dir + "/runs")
cmd = f"tensorboard --inspect --logdir {tb_log_path_1}"
res = subprocess.run(
cmd, shell=True, text=True, capture_output=True, check=True
)
pattern = r"first_step\s+(\d+)"
first_steps = int(re.findall(pattern, res.stdout)[0])
assert first_steps == 31

View File

@@ -1,10 +1,11 @@
""" """
helper utils for tests helper utils for tests
""" """
import os
import shutil import shutil
import tempfile import tempfile
from functools import wraps from functools import wraps
from pathlib import Path
def with_temp_dir(test_func): def with_temp_dir(test_func):
@@ -20,3 +21,13 @@ def with_temp_dir(test_func):
shutil.rmtree(temp_dir) shutil.rmtree(temp_dir)
return wrapper return wrapper
def most_recent_subdir(path):
base_path = Path(path)
subdirectories = [d for d in base_path.iterdir() if d.is_dir()]
if not subdirectories:
return None
subdir = max(subdirectories, key=os.path.getctime)
return subdir