Make sure to use the "train" split when loading a prepared dataset from the Hugging Face Hub

This commit is contained in:
Wing Lian
2023-05-21 22:04:39 -04:00
parent 99383f14a3
commit 607a4d33f2

View File

@@ -58,6 +58,7 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
try:
if cfg.push_dataset_to_hub:
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
dataset = dataset["train"]
except:
pass
@@ -232,6 +233,7 @@ def load_prepare_datasets(tokenizer: PreTrainedTokenizerBase, cfg, default_datas
f"checkking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
)
dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
dataset = dataset["train"]
except:
pass