Fix the repo id used when pushing a dataset to the hub (#1657)

* use the ds hash as the dataset's config_name

* improve logging for loading/pushing ds to hub

* fix missing f-string
This commit is contained in:
ripes
2024-08-05 09:42:15 -07:00
committed by GitHub
parent 203816f7b4
commit 7402eb9dcb

View File

@@ -160,8 +160,12 @@ def load_tokenized_prepared_datasets(
use_auth_token = cfg.hf_use_auth_token
try:
if cfg.push_dataset_to_hub:
LOG.info(
f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
)
dataset = load_dataset(
f"{cfg.push_dataset_to_hub}/{ds_hash}",
cfg.push_dataset_to_hub,
ds_hash,
token=use_auth_token,
)
dataset = dataset[split]
@@ -181,6 +185,8 @@ def load_tokenized_prepared_datasets(
dataset = load_from_disk(str(prepared_ds_path))
LOG.info("Prepared dataset loaded from disk...")
else:
if cfg.push_dataset_to_hub:
LOG.info("Unable to find prepared dataset in Huggingface hub")
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
LOG.info("Loading raw datasets...")
if not cfg.is_preprocess:
@@ -433,10 +439,12 @@ def load_tokenized_prepared_datasets(
dataset.save_to_disk(str(prepared_ds_path))
if cfg.push_dataset_to_hub:
LOG.info(
f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
)
dataset.push_to_hub(
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
cfg.push_dataset_to_hub,
ds_hash,
private=True,
)
return dataset, prompters