This commit is contained in:
Dan Saunders
2025-08-20 04:28:18 +00:00
parent 846aa41baa
commit e4e8ffd40c

View File

@@ -1,5 +1,5 @@
""" """
Module containing Dataset functionality Module containing dataset functionality.
We want this to be a wrapper for an existing dataset that we have loaded. Let's use the We want this to be a wrapper for an existing dataset that we have loaded. Let's use the
concept of middlewares to wrap each dataset, for example: concept of middlewares to wrap each dataset, for example:
@@ -47,6 +47,7 @@ class TokenizedPromptDataset(Dataset):
) )
def process(self, dataset: Dataset | IterableDataset) -> Dataset | IterableDataset: def process(self, dataset: Dataset | IterableDataset) -> Dataset | IterableDataset:
"""Apply filtering and tokenization."""
# For IterableDataset, we can't access features up front. Anyway, we don't care # For IterableDataset, we can't access features up front. Anyway, we don't care
# to remove unused columns from streaming datasets. # to remove unused columns from streaming datasets.
features = None features = None
@@ -104,7 +105,8 @@ def wrap_dataset_for_tokenized_prompt(
return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs) return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)
# TODO this isn't the best since it can't interleave datasets # TODO: this isn't the best since it can't interleave datasets.
# NOTE: this is only used in a test. Can it be deleted?
class ConstantLengthDataset(IterableDataset): class ConstantLengthDataset(IterableDataset):
"""Iterable dataset that returns constant-length chunks of tokens from a stream of """Iterable dataset that returns constant-length chunks of tokens from a stream of
text files. text files.