From b164725417fd29e65f74760e08332fe34eefbe02 Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sat, 15 Apr 2023 12:14:52 -0400
Subject: [PATCH] improve prepared dataset loading, fix inference

---
 configs/cerebras_1_3B_alpaca.yml |  1 +
 configs/llama_65B_alpaca.yml     |  1 +
 configs/pythia_1_2B_alpaca.yml   |  1 +
 scripts/finetune.py              | 13 +++++++++----
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/configs/cerebras_1_3B_alpaca.yml b/configs/cerebras_1_3B_alpaca.yml
index d2f0bb3be..1c8bbff84 100644
--- a/configs/cerebras_1_3B_alpaca.yml
+++ b/configs/cerebras_1_3B_alpaca.yml
@@ -11,6 +11,7 @@ datasets:
     type: gpteacher
   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
     type: gpteacher
+dataset_prepared_path: data/last_run
 val_set_size: 0.05
 adapter: lora
 sequence_len: 2048
diff --git a/configs/llama_65B_alpaca.yml b/configs/llama_65B_alpaca.yml
index d3f98e3ec..4d2e8681f 100644
--- a/configs/llama_65B_alpaca.yml
+++ b/configs/llama_65B_alpaca.yml
@@ -11,6 +11,7 @@ datasets:
     type: gpteacher
   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
     type: gpteacher
+dataset_prepared_path: data/last_run
 val_set_size: 0.04
 adapter: lora
 lora_model_dir:
diff --git a/configs/pythia_1_2B_alpaca.yml b/configs/pythia_1_2B_alpaca.yml
index 3c2c8592e..4aeb79dfd 100644
--- a/configs/pythia_1_2B_alpaca.yml
+++ b/configs/pythia_1_2B_alpaca.yml
@@ -11,6 +11,7 @@ datasets:
     type: gpteacher
   - path: data/roleplay-similarity_0.6-instruct-dataset.jsonl
     type: gpteacher
+dataset_prepared_path: data/last_run
 val_set_size: 0.05
 adapter: lora
 lora_model_dir:
diff --git a/scripts/finetune.py b/scripts/finetune.py
index bbba3dc93..d52975a96 100644
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -173,6 +173,8 @@ def do_inference(cfg, model, tokenizer):
     input = ""
     prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n".format(instruction=instruction, input=input)
     batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+
+    model.eval()
     with torch.no_grad():
         generated = model.generate(inputs=batch["input_ids"],
                                    do_sample=True, use_cache=True,
@@ -255,13 +257,12 @@ def train(
         do_inference(cfg, model, tokenizer)
         return

-    datasets = []
-    if not isinstance(cfg.datasets, list) and isinstance(cfg.datasets, str):
-        # assumption that we are loading a previously saved/cached dataset
+    if cfg.dataset_prepared_path and any(Path(cfg.dataset_prepared_path).glob("*")):
         print("Loading prepared dataset from disk...")
         dataset = load_from_disk(cfg.datasets)
         print("Prepared dataset loaded from disk...")
     else:
+        datasets = []
         for d in cfg.datasets:
             ds: IterableDataset = load_dataset(
                 "json", data_files=d.path, streaming=True, split=None
@@ -291,8 +292,12 @@ def train(
     dataset = Dataset.from_list(
         [_ for _ in constant_len_dataset]
     ).train_test_split(test_size=cfg.val_set_size, shuffle=True, seed=42)
+
     print("Saving prepared dataset to disk...")
-    dataset.save_to_disk("data/last_run")
+    if cfg.dataset_prepared_path:
+        dataset.save_to_disk(cfg.dataset_prepared_path)
+    else:
+        dataset.save_to_disk("data/last_run")

     train_dataset = dataset["train"]
     eval_dataset = dataset["test"]
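
For reference, below is a minimal sketch of the load-or-build caching flow that dataset_prepared_path enables. The helper name load_or_build_dataset and the build_fn callback are illustrative stand-ins, not code from scripts/finetune.py; the cache directory is set per the YAML changes above, e.g. dataset_prepared_path: data/last_run.

# Minimal sketch of the caching behaviour this patch introduces.
# load_or_build_dataset and build_fn are illustrative names, not part of the patch.
from pathlib import Path

from datasets import load_from_disk


def load_or_build_dataset(prepared_path, build_fn, val_set_size=0.05):
    # Reuse the tokenized/packed dataset from an earlier run when the
    # prepared-dataset directory already contains files.
    if prepared_path and any(Path(prepared_path).glob("*")):
        print("Loading prepared dataset from disk...")
        return load_from_disk(prepared_path)

    # Otherwise build from the raw dataset configs, split, and cache the
    # result where the next run can find it (falling back to data/last_run).
    dataset = build_fn().train_test_split(
        test_size=val_set_size, shuffle=True, seed=42
    )
    dataset.save_to_disk(prepared_path or "data/last_run")
    return dataset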