Streaming SFT support (#3101)
* working
* fixes
* deprecate --iterable; cleanup
* pretrain_multipack_buffer_size -> streaming_multipack_buffer_size
* improvements
* tests
* remove unused
* docs, examples
* nit
* nit
* add val_set_size validation
* val
* nit
* min
* coderabbito
* cleanup
* nit
* add depr warning, cleanup
* nit
* fix test, fix quarto
* fix
* review comments
* review comments
* fix
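The pretrain_multipack_buffer_size -> streaming_multipack_buffer_size rename above, together with the "add depr warning" item, implies a backward-compatibility shim for the old config key. Below is a minimal sketch of what such a shim could look like; the normalize_buffer_size_cfg helper name and the warning text are assumptions for illustration, not the PR's actual code.

import warnings

from axolotl.utils.dict import DictDefault  # same import the test module uses


def normalize_buffer_size_cfg(cfg: DictDefault) -> DictDefault:
    # Hypothetical helper: map the deprecated key onto its replacement so
    # old configs keep working while emitting a deprecation warning.
    if cfg.get("pretrain_multipack_buffer_size") is not None:
        warnings.warn(
            "pretrain_multipack_buffer_size is deprecated; "
            "use streaming_multipack_buffer_size instead",
            DeprecationWarning,
        )
        # Only fall back to the old value if the new key was not set.
        if cfg.get("streaming_multipack_buffer_size") is None:
            cfg["streaming_multipack_buffer_size"] = cfg[
                "pretrain_multipack_buffer_size"
            ]
    return cfg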
@@ -9,7 +9,7 @@ import torch
 from datasets import IterableDataset
 from torch.utils.data import DataLoader
 
-from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset
+from axolotl.utils.data import get_dataset_wrapper, wrap_streaming_dataset
 from axolotl.utils.dict import DictDefault
 
 
@@ -77,14 +77,11 @@ class TestPretrainingPacking:
         )
 
         original_bsz = cfg.micro_batch_size
-        train_dataset = wrap_pretraining_dataset(
+        train_dataset = wrap_streaming_dataset(
             dataset,
             tokenizer_huggyllama,
             cfg,
             ds_wrapper_partial,
-            max_tokens=cfg.sequence_len,
-            batch_size=cfg.micro_batch_size,
-            seed=cfg.seed or 42,
         )
 
         trainer_loader = DataLoader(
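The dropped max_tokens, batch_size, and seed kwargs in the second hunk suggest wrap_streaming_dataset now reads these values from cfg. Since this test exercises multipack, the wrapped dataset yields already-packed, pre-collated batches, so the downstream DataLoader does not need to re-batch them. Below is a runnable sketch of that pass-through pattern; PackedStream is a hypothetical stand-in for the wrapped streaming dataset, and batch_size=None is an assumption about how such batches are consumed, not necessarily the test's exact DataLoader arguments.

import torch
from torch.utils.data import DataLoader, IterableDataset


class PackedStream(IterableDataset):
    # Stand-in for the dataset returned by wrap_streaming_dataset: each item
    # it yields is already a collated batch of packed sequences.
    def __iter__(self):
        for _ in range(3):
            yield {
                "input_ids": torch.zeros(4, 2048, dtype=torch.long),
                "attention_mask": torch.ones(4, 2048, dtype=torch.long),
            }


# batch_size=None disables the DataLoader's automatic batching, so the
# pre-packed batches pass through unchanged.
trainer_loader = DataLoader(PackedStream(), batch_size=None)
for batch in trainer_loader:
    print(batch["input_ids"].shape)  # torch.Size([4, 2048])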