From b164725417fd29e65f74760e08332fe34eefbe02 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 15 Apr 2023 12:14:52 -0400
Subject: [PATCH] improve prepared dataset loading, fix inference

---
 configs/cerebras_1_3B_alpaca.yml |  1 +
 configs/llama_65B_alpaca.yml     |  1 +
 configs/pythia_1_2B_alpaca.yml   |  1 +
 scripts/finetune.py              | 13 +++++++++----
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/configs/cerebras_1_3B_alpaca.yml b/configs/cerebras_1_3B_alpaca.yml
index d2f0bb3be..1c8bbff84 100644
--- a/configs/cerebras_1_3B_alpaca.yml
+++ b/configs/cerebras_1_3B_alpaca.yml
@@ -11,6 +11,7 @@ datasets:
     type: gpteacher
   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
     type: gpteacher
+dataset_prepared_path: data/last_run
 val_set_size: 0.05
 adapter: lora
 sequence_len: 2048
diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml
index d3f98e3ec..4d2e8681f 100644
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -11,6 +11,7 @@ datasets:
     type: gpteacher
   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
     type: gpteacher
+dataset_prepared_path: data/last_run
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
diff --git a/configs/pythia_1_2B_alpaca.yml b/configs/pythia_1_2B_alpaca.yml
index 3c2c8592e..4aeb79dfd 100644
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -11,6 +11,7 @@ datasets:
     type: gpteacher
   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
     type: gpteacher
+dataset_prepared_path: data/last_run
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
diff --git a/scripts/finetune.py b/scripts/finetune.py
index bbba3dc93..d52975a96 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -173,6 +173,8 @@ def do_inference(cfg, model, tokenizer):
     input = ""
     prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
     batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+
+    model.eval()
     with torch.no_grad():
         generated = model.generate(inputs=batch["input_ids"],
                                    do_sample=True, use_cache=True,
@@ -255,13 +257,12 @@ def train(
         do_inference(cfg, model, tokenizer)
         return

-    datasets = []
-    if not isinstance(cfg.datasets, list) and isinstance(cfg.datasets, str):
-        # assumption that we are loading a previously saved/cached dataset
+    if cfg.dataset_prepared_path and any(Path(cfg.dataset_prepared_path).glob("*")):
         print("Loading prepared dataset from disk...")
         dataset = load_from_disk(cfg.datasets)
         print("Prepared dataset loaded from disk...")
     else:
+        datasets = []
         for d in cfg.datasets:
             ds: IterableDataset = load_dataset(
                 "json", data_files=d.path, streaming=True, split=None
@@ -291,8 +292,12 @@ def train(
     dataset = Dataset.from_list(
         [_ for _ in constant_len_dataset]
     ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
+
     print("Saving prepared dataset to disk...")
-    dataset.save_to_disk("data/last_run")
+    if cfg.dataset_prepared_path:
+        dataset.save_to_disk(cfg.dataset_prepared_path)
+    else:
+        dataset.save_to_disk("data/last_run")

     train_dataset = dataset["train"]
     eval_dataset = dataset["test"]
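
For reference, below is a minimal sketch of the load-or-build caching flow that dataset_prepared_path enables. The helper name load_or_build_dataset and the build_fn callback are illustrative stand-ins, not code from scripts/finetune.py; the cache directory is set per the YAML changes above, e.g. dataset_prepared_path: data/last_run.

# Minimal sketch of the caching behaviour this patch introduces.
# load_or_build_dataset and build_fn are illustrative names, not part of the patch.
from pathlib import Path

from datasets import load_from_disk


def load_or_build_dataset(prepared_path, build_fn, val_set_size=0.05):
    # Reuse the tokenized/packed dataset from an earlier run when the
    # prepared-dataset directory already contains files.
    if prepared_path and any(Path(prepared_path).glob("*")):
        print("Loading prepared dataset from disk...")
        return load_from_disk(prepared_path)

    # Otherwise build from the raw dataset configs, split, and cache the
    # result where the next run can find it (falling back to data/last_run).
    dataset = build_fn().train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    dataset.save_to_disk(prepared_path or "data/last_run")
    return dataset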