From 7402eb9dcb1c3cc9a2dc8ba0de8cab68147ece3d Mon Sep 17 00:00:00 2001
From: ripes <44345856+chrislee973@users.noreply.github.com>
Date: Mon, 5 Aug 2024 09:42:15 -0700
Subject: [PATCH] Fix setting correct repo id when pushing dataset to hub (#1657)

* use the ds hash as the dataset's config_name

* improve logging for loading/pushing ds to hub

* fix missing f string
---
 src/axolotl/utils/data/sft.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/src/axolotl/utils/data/sft.py b/src/axolotl/utils/data/sft.py
index 2e923057d..97061cc62 100644
--- a/src/axolotl/utils/data/sft.py
+++ b/src/axolotl/utils/data/sft.py
@@ -160,8 +160,12 @@ def load_tokenized_prepared_datasets(
     use_auth_token = cfg.hf_use_auth_token
     try:
         if cfg.push_dataset_to_hub:
+            LOG.info(
+                f"Attempting to load prepared dataset from Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
+            )
             dataset = load_dataset(
-                f"{cfg.push_dataset_to_hub}/{ds_hash}",
+                cfg.push_dataset_to_hub,
+                ds_hash,
                 token=use_auth_token,
             )
             dataset = dataset[split]
@@ -181,6 +185,8 @@ def load_tokenized_prepared_datasets(
         dataset = load_from_disk(str(prepared_ds_path))
         LOG.info("Prepared dataset loaded from disk...")
     else:
+        if cfg.push_dataset_to_hub:
+            LOG.info("Unable to find prepared dataset in Huggingface hub")
         LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
         LOG.info("Loading raw datasets...")
         if not cfg.is_preprocess:
@@ -433,10 +439,12 @@ def load_tokenized_prepared_datasets(
             dataset.save_to_disk(str(prepared_ds_path))
             if cfg.push_dataset_to_hub:
                 LOG.info(
-                    f"Saving merged prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
+                    f"Pushing merged prepared dataset to Huggingface hub at {cfg.push_dataset_to_hub} (version {ds_hash})..."
                 )
                 dataset.push_to_hub(
-                    f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
+                    cfg.push_dataset_to_hub,
+                    ds_hash,
+                    private=True,
                 )
 
     return dataset, prompters