nits
This commit is contained in:
@@ -1,5 +1,5 @@
|
|||||||
"""
|
"""
|
||||||
Module containing Dataset functionality
|
Module containing dataset functionality.
|
||||||
|
|
||||||
We want this to be a wrapper for an existing dataset that we have loaded. Let's use the
|
We want this to be a wrapper for an existing dataset that we have loaded. Let's use the
|
||||||
concept of middlewares to wrap each dataset, for example:
|
concept of middlewares to wrap each dataset, for example:
|
||||||
@@ -47,6 +47,7 @@ class TokenizedPromptDataset(Dataset):
|
|||||||
)
|
)
|
||||||
|
|
||||||
def process(self, dataset: Dataset | IterableDataset) -> Dataset | IterableDataset:
|
def process(self, dataset: Dataset | IterableDataset) -> Dataset | IterableDataset:
|
||||||
|
"""Apply filtering and tokenization."""
|
||||||
# For IterableDataset, we can't access features up front. Anyway, we don't care
|
# For IterableDataset, we can't access features up front. Anyway, we don't care
|
||||||
# to remove unused columns from streaming datasets.
|
# to remove unused columns from streaming datasets.
|
||||||
features = None
|
features = None
|
||||||
@@ -104,7 +105,8 @@ def wrap_dataset_for_tokenized_prompt(
|
|||||||
return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)
|
return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
# TODO this isn't the best since it can't interleave datasets
|
# TODO: this isn't the best since it can't interleave datasets.
|
||||||
|
# NOTE: this is only used in a test. Can it be deleted?
|
||||||
class ConstantLengthDataset(IterableDataset):
|
class ConstantLengthDataset(IterableDataset):
|
||||||
"""Iterable dataset that returns constant length chunks of tokens from stream of
|
"""Iterable dataset that returns constant length chunks of tokens from stream of
|
||||||
text files.
|
text files.
|
||||||
|
|||||||
Reference in New Issue
Block a user