prepared dataset caching, other misc fixes (#665)
* prepared dataset caching, other misc fixes
* also don't load from disk cache unless explicit
This commit is contained in:
@@ -7,7 +7,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./lora-out
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./qlora-out
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./lora-out
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./qlora-out
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./lora-out
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./qlora-out
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca:chat
|
type: alpaca:chat
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -18,7 +18,7 @@ datasets:
|
|||||||
data_files:
|
data_files:
|
||||||
- Chain-of-Thought/formatted_cot_data/gsm8k_train.json
|
- Chain-of-Thought/formatted_cot_data/gsm8k_train.json
|
||||||
type: "alpaca:chat"
|
type: "alpaca:chat"
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
# enable QLoRA
|
# enable QLoRA
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca:chat
|
type: alpaca:chat
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
adapter:
|
adapter:
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ load_in_8bit: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: openaccess-ai-collective/jeopardy
|
- path: openaccess-ai-collective/jeopardy
|
||||||
type: jeopardy
|
type: jeopardy
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.02
|
val_set_size: 0.02
|
||||||
adapter:
|
adapter:
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ hf_use_auth_token: true
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./lora-out
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./qlora-out
|
output_dir: ./qlora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./relora-out
|
output_dir: ./relora-out
|
||||||
|
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./lora-out
|
output_dir: ./lora-out
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ strict: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
output_dir: ./out
|
output_dir: ./out
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ load_in_8bit: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.02
|
val_set_size: 0.02
|
||||||
adapter:
|
adapter:
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.02
|
val_set_size: 0.02
|
||||||
adapter:
|
adapter:
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.02
|
val_set_size: 0.02
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ push_dataset_to_hub:
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ datasets:
|
|||||||
- path: garage-bAInd/Open-Platypus
|
- path: garage-bAInd/Open-Platypus
|
||||||
type: alpaca
|
type: alpaca
|
||||||
|
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.05
|
val_set_size: 0.05
|
||||||
output_dir: ./phi-sft-out
|
output_dir: ./phi-sft-out
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ datasets:
|
|||||||
- path: garage-bAInd/Open-Platypus
|
- path: garage-bAInd/Open-Platypus
|
||||||
type: alpaca
|
type: alpaca
|
||||||
|
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.05
|
val_set_size: 0.05
|
||||||
output_dir: ./phi-sft-out
|
output_dir: ./phi-sft-out
|
||||||
|
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ device_map: auto
|
|||||||
datasets:
|
datasets:
|
||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.05
|
val_set_size: 0.05
|
||||||
adapter:
|
adapter:
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ load_in_8bit: true
|
|||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.05
|
val_set_size: 0.05
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ load_in_8bit: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.02
|
val_set_size: 0.02
|
||||||
adapter:
|
adapter:
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ load_in_8bit: false
|
|||||||
datasets:
|
datasets:
|
||||||
- path: vicgalle/alpaca-gpt4
|
- path: vicgalle/alpaca-gpt4
|
||||||
type: alpaca
|
type: alpaca
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.05
|
val_set_size: 0.05
|
||||||
adapter: lora
|
adapter: lora
|
||||||
lora_model_dir:
|
lora_model_dir:
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ datasets:
|
|||||||
data_files:
|
data_files:
|
||||||
- openassistant_best_replies_train.jsonl
|
- openassistant_best_replies_train.jsonl
|
||||||
type: "completion"
|
type: "completion"
|
||||||
dataset_prepared_path: last_run_prepared
|
dataset_prepared_path:
|
||||||
val_set_size: 0.01
|
val_set_size: 0.01
|
||||||
# enable QLoRA
|
# enable QLoRA
|
||||||
adapter: qlora
|
adapter: qlora
|
||||||
|
|||||||
@@ -51,7 +51,7 @@ def print_axolotl_text_art(suffix=None):
|
|||||||
|
|
||||||
|
|
||||||
def get_multi_line_input() -> Optional[str]:
|
def get_multi_line_input() -> Optional[str]:
|
||||||
print("Give me an instruction (Ctrl + D to finish): ")
|
print("Give me an instruction (Ctrl + D to submit): ")
|
||||||
instruction = ""
|
instruction = ""
|
||||||
for line in sys.stdin:
|
for line in sys.stdin:
|
||||||
instruction += line # pylint: disable=consider-using-join
|
instruction += line # pylint: disable=consider-using-join
|
||||||
|
|||||||
@@ -122,7 +122,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
|
|
||||||
if dataset:
|
if dataset:
|
||||||
...
|
...
|
||||||
elif any(prepared_ds_path.glob("*")):
|
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
||||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||||
dataset = load_from_disk(str(prepared_ds_path))
|
dataset = load_from_disk(str(prepared_ds_path))
|
||||||
LOG.info("Prepared dataset loaded from disk...")
|
LOG.info("Prepared dataset loaded from disk...")
|
||||||
@@ -357,7 +357,7 @@ def load_tokenized_prepared_datasets(
|
|||||||
if len(datasets) > 1:
|
if len(datasets) > 1:
|
||||||
LOG.info("shuffle merged datasets")
|
LOG.info("shuffle merged datasets")
|
||||||
dataset = dataset.shuffle(seed=seed)
|
dataset = dataset.shuffle(seed=seed)
|
||||||
if cfg.local_rank == 0:
|
if cfg.local_rank == 0 and cfg.dataset_prepared_path:
|
||||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||||
dataset.save_to_disk(prepared_ds_path)
|
dataset.save_to_disk(prepared_ds_path)
|
||||||
if cfg.push_dataset_to_hub:
|
if cfg.push_dataset_to_hub:
|
||||||
@@ -425,7 +425,7 @@ def load_prepare_datasets(
|
|||||||
|
|
||||||
if dataset:
|
if dataset:
|
||||||
...
|
...
|
||||||
elif any(prepared_ds_path.glob("*")):
|
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -31,7 +31,8 @@ def check_example_labels(example, tokenizer, text_only=False):
|
|||||||
)
|
)
|
||||||
colored_tokens.append(colored_token)
|
colored_tokens.append(colored_token)
|
||||||
|
|
||||||
LOG.info(" ".join(colored_tokens))
|
delimiter = "" if text_only else " "
|
||||||
|
LOG.info(delimiter.join(colored_tokens))
|
||||||
LOG.info("\n\n\n")
|
LOG.info("\n\n\n")
|
||||||
print(" ".join(colored_tokens))
|
print(" ".join(colored_tokens))
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user