prepared dataset caching, other misc fixes (#665)
* prepared dataset caching, other misc fixes
* also don't load from disk cache unless explicit
This commit is contained in:
@@ -51,7 +51,7 @@ def print_axolotl_text_art(suffix=None):
|
||||
|
||||
|
||||
def get_multi_line_input() -> Optional[str]:
|
||||
print("Give me an instruction (Ctrl + D to finish): ")
|
||||
print("Give me an instruction (Ctrl + D to submit): ")
|
||||
instruction = ""
|
||||
for line in sys.stdin:
|
||||
instruction += line # pylint: disable=consider-using-join
|
||||
|
||||
@@ -122,7 +122,7 @@ def load_tokenized_prepared_datasets(
|
||||
|
||||
if dataset:
|
||||
...
|
||||
elif any(prepared_ds_path.glob("*")):
|
||||
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||
dataset = load_from_disk(str(prepared_ds_path))
|
||||
LOG.info("Prepared dataset loaded from disk...")
|
||||
@@ -357,7 +357,7 @@ def load_tokenized_prepared_datasets(
|
||||
if len(datasets) > 1:
|
||||
LOG.info("shuffle merged datasets")
|
||||
dataset = dataset.shuffle(seed=seed)
|
||||
if cfg.local_rank == 0:
|
||||
if cfg.local_rank == 0 and cfg.dataset_prepared_path:
|
||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||
dataset.save_to_disk(prepared_ds_path)
|
||||
if cfg.push_dataset_to_hub:
|
||||
@@ -425,7 +425,7 @@ def load_prepare_datasets(
|
||||
|
||||
if dataset:
|
||||
...
|
||||
elif any(prepared_ds_path.glob("*")):
|
||||
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
||||
LOG.info(
|
||||
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
||||
)
|
||||
|
||||
@@ -31,7 +31,8 @@ def check_example_labels(example, tokenizer, text_only=False):
|
||||
)
|
||||
colored_tokens.append(colored_token)
|
||||
|
||||
LOG.info(" ".join(colored_tokens))
|
||||
delimiter = "" if text_only else " "
|
||||
LOG.info(delimiter.join(colored_tokens))
|
||||
LOG.info("\n\n\n")
|
||||
print(" ".join(colored_tokens))
|
||||
|
||||
|
||||
Reference in New Issue
Block a user