From 8fe0e633d2715d84333ce68fdd45d2d975638a54 Mon Sep 17 00:00:00 2001
From: Ethan Smith
Date: Wed, 27 Sep 2023 10:41:31 -0700
Subject: [PATCH] Fix bug in dataset loading (#284)

* Fix bug in dataset loading

This fixes a bug when loading datasets. `d.data_files` is a list, so it cannot be directly passed to `hf_hub_download`

* Check type of data_files, and load accordingly
---
NOTE(review): line breaks and leading whitespace in this patch were restored
from a whitespace-collapsed copy; verify the hunk indentation against commit
8fe0e633 before applying.

 src/axolotl/utils/data.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py
index eeac15d30..271379677 100644
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -205,11 +205,26 @@ def load_tokenized_prepared_datasets(
                     use_auth_token=use_auth_token,
                 )
             else:
-                fp = hf_hub_download(
-                    repo_id=d.path,
-                    repo_type="dataset",
-                    filename=d.data_files,
-                )
+                if isinstance(d.data_files, str):
+                    fp = hf_hub_download(
+                        repo_id=d.path,
+                        repo_type="dataset",
+                        filename=d.data_files,
+                    )
+                elif isinstance(d.data_files, list):
+                    fp = []
+                    for file in d.data_files:
+                        fp.append(
+                            hf_hub_download(
+                                repo_id=d.path,
+                                repo_type="dataset",
+                                filename=file,
+                            )
+                        )
+                else:
+                    raise ValueError(
+                        "data_files must be either a string or list of strings"
+                    )
                 ds = load_dataset(
                     "json", name=d.name, data_files=fp, streaming=False, split=None
                 )