fix(preprocess): Make sure the dataset is not loaded from cache when using the preprocess CLI (#1136)
This commit is contained in:
@@ -152,7 +152,11 @@ def load_tokenized_prepared_datasets(
|
|||||||
|
|
||||||
if dataset:
|
if dataset:
|
||||||
...
|
...
|
||||||
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
elif (
|
||||||
|
cfg.dataset_prepared_path
|
||||||
|
and any(prepared_ds_path.glob("*"))
|
||||||
|
and not cfg.is_preprocess
|
||||||
|
):
|
||||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||||
dataset = load_from_disk(str(prepared_ds_path))
|
dataset = load_from_disk(str(prepared_ds_path))
|
||||||
LOG.info("Prepared dataset loaded from disk...")
|
LOG.info("Prepared dataset loaded from disk...")
|
||||||
@@ -465,7 +469,11 @@ def load_prepare_datasets(
|
|||||||
|
|
||||||
if dataset:
|
if dataset:
|
||||||
...
|
...
|
||||||
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
elif (
|
||||||
|
cfg.dataset_prepared_path
|
||||||
|
and any(prepared_ds_path.glob("*"))
|
||||||
|
and not cfg.is_preprocess
|
||||||
|
):
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user