From e4e8ffd40c6d3c27a16689e0530e9dea956456e5 Mon Sep 17 00:00:00 2001 From: Dan Saunders Date: Wed, 20 Aug 2025 04:28:18 +0000 Subject: [PATCH] nits --- src/axolotl/datasets.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/axolotl/datasets.py b/src/axolotl/datasets.py index 99ba2522b..87f26275f 100644 --- a/src/axolotl/datasets.py +++ b/src/axolotl/datasets.py @@ -1,5 +1,5 @@ """ -Module containing Dataset functionality +Module containing dataset functionality. We want this to be a wrapper for an existing dataset that we have loaded. Lets use the concept of middlewares to wrap each dataset, for example: @@ -47,6 +47,7 @@ class TokenizedPromptDataset(Dataset): ) def process(self, dataset: Dataset | IterableDataset) -> Dataset | IterableDataset: + """Apply filtering and tokenization.""" # For IterableDataset, we can't access features up front. Anyways, we don't care # to remove unused columns from streaming datasets. features = None @@ -104,7 +105,8 @@ def wrap_dataset_for_tokenized_prompt( return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs) -# TODO this isn't the best since it can't interleave datasets +# TODO: this isn't the best since it can't interleave datasets. +# NOTE: this is only used in a test. Can it be deleted? class ConstantLengthDataset(IterableDataset): """Iterable dataset that returns constant length chunks of tokens from stream of text files.