Streaming SFT support (#3101)

* working

* fixes

* deprecate --iterable; cleanup

* pretrain_multipack_buffer_size -> streaming_multipack_buffer_size

* improvements

* tests

* remove unused

* docs, examples

* nit

* nit

* add val_set_size validation

* val

* nit

* nit

* address CodeRabbit review feedback

* cleanup

* nit

* add depr warning, cleanup

* nit

* fix test, fix quarto

* fix

* review comments

* review comments

* fix
This commit is contained in:
Dan Saunders
2025-09-02 12:08:44 -04:00
committed by GitHub
parent 0094a2d744
commit 231a67e70b
24 changed files with 849 additions and 283 deletions

View File

@@ -9,7 +9,7 @@ import torch
from datasets import IterableDataset
from torch.utils.data import DataLoader
from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset
from axolotl.utils.data import get_dataset_wrapper, wrap_streaming_dataset
from axolotl.utils.dict import DictDefault
@@ -77,14 +77,11 @@ class TestPretrainingPacking:
)
original_bsz = cfg.micro_batch_size
train_dataset = wrap_pretraining_dataset(
train_dataset = wrap_streaming_dataset(
dataset,
tokenizer_huggyllama,
cfg,
ds_wrapper_partial,
max_tokens=cfg.sequence_len,
batch_size=cfg.micro_batch_size,
seed=cfg.seed or 42,
)
trainer_loader = DataLoader(