From 9bdd30cdfdfad725b03620fdb933689fe1b828d5 Mon Sep 17 00:00:00 2001 From: Utensil Date: Wed, 21 Jun 2023 08:00:58 +0000 Subject: [PATCH] Support loading data files from a local directory ref: https://huggingface.co/docs/datasets/v2.13.0/en/package_reference/loading_methods#datasets.load_dataset.path --- src/axolotl/utils/data.py | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/src/axolotl/utils/data.py b/src/axolotl/utils/data.py index c36bfcee9..eed7d6db1 100644 --- a/src/axolotl/utils/data.py +++ b/src/axolotl/utils/data.py @@ -102,13 +102,26 @@ def load_tokenized_prepared_datasets( pass # prefer local dataset, even if hub exists - if Path(d.path).exists(): - ds = load_dataset( - "json", - data_files=d.path, - streaming=False, - split=None, - ) + local_path = Path(d.path) + if local_path.exists(): + if local_path.is_dir(): + ds = load_dataset( + d.path, + data_files=d.data_files, + streaming=False, + split=None, + ) + elif local_path.is_file(): + ds = load_dataset( + "json", + data_files=d.path, + streaming=False, + split=None, + ) + else: + raise ValueError( + "unhandled dataset load: local path exists, but is neither a directory or a file" + ) elif ds_from_hub: if d.data_files: ds = load_dataset(