make sure to use train split if loading from hf

2023-05-21 22:04:39 -04:00
parent 99383f14a3
commit 607a4d33f2
1 changed files with 2 additions and 0 deletions
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -58,6 +58,7 @@ def load_tokenized_prepared_datasets(tokenizer, cfg, default_dataset_prepared_pa
    try:
        if cfg.push_dataset_to_hub:
            dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
+            dataset = dataset["train"]
    except:
        pass

@@ -232,6 +233,7 @@ def load_prepare_datasets(tokenizer: PreTrainedTokenizerBase, cfg, default_datas
                    f"checkking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
                )
                dataset = load_dataset(f"{cfg.push_dataset_to_hub}/{ds_hash}", use_auth_token=True)
+                dataset = dataset["train"]
        except:
            pass