Fix setting correct repo id when pushing dataset to hub (#1657)
* use the ds hash as the dataset's config_name
* improve logging for loading/pushing ds to hub
* fix missing f-string
This commit is contained in:
@@ -160,8 +160,12 @@ def load_tokenized_prepared_datasets(
|
|||||||
use_auth_token = cfg.hf_use_auth_token
|
use_auth_token = cfg.hf_use_auth_token
|
||||||
try:
|
try:
|
||||||
if cfg.push_dataset_to_hub:
|
if cfg.push_dataset_to_hub:
|
||||||
|
LOG.info(
|
||||||
|
f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
|
||||||
|
)
|
||||||
dataset = load_dataset(
|
dataset = load_dataset(
|
||||||
f"{cfg.push_dataset_to_hub}/{ds_hash}",
|
cfg.push_dataset_to_hub,
|
||||||
|
ds_hash,
|
||||||
token=use_auth_token,
|
token=use_auth_token,
|
||||||
)
|
)
|
||||||
dataset = dataset[split]
|
dataset = dataset[split]
|
||||||
@@ -181,6 +185,8 @@ def load_tokenized_prepared_datasets(
|
|||||||
dataset = load_from_disk(str(prepared_ds_path))
|
dataset = load_from_disk(str(prepared_ds_path))
|
||||||
LOG.info("Prepared dataset loaded from disk...")
|
LOG.info("Prepared dataset loaded from disk...")
|
||||||
else:
|
else:
|
||||||
|
if cfg.push_dataset_to_hub:
|
||||||
|
LOG.info("Unable to find prepared dataset in Huggingface hub")
|
||||||
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
|
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
|
||||||
LOG.info("Loading raw datasets...")
|
LOG.info("Loading raw datasets...")
|
||||||
if not cfg.is_preprocess:
|
if not cfg.is_preprocess:
|
||||||
@@ -433,10 +439,12 @@ def load_tokenized_prepared_datasets(
|
|||||||
dataset.save_to_disk(str(prepared_ds_path))
|
dataset.save_to_disk(str(prepared_ds_path))
|
||||||
if cfg.push_dataset_to_hub:
|
if cfg.push_dataset_to_hub:
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
|
f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
|
||||||
)
|
)
|
||||||
dataset.push_to_hub(
|
dataset.push_to_hub(
|
||||||
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
|
cfg.push_dataset_to_hub,
|
||||||
|
ds_hash,
|
||||||
|
private=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
return dataset, prompters
|
return dataset, prompters
|
||||||
|
|||||||
Reference in New Issue
Block a user