Streaming SFT support (#3101)

* working

* fixes

* deprecate --iterable; cleanup

* pretrain_multipack_buffer_size -> streaming_multipack_buffer_size

* improvements

* tests

* remove unused

* docs, examples

* nit

* nit

* add val_set_size validation

* val

* nit

* min

* coderabbito

* cleanup

* nit

* add depr warning, cleanup

* nit

* fix test, fix quarto

* fix

* review comments

* review comments

* fix
This commit is contained in:
Dan Saunders
2025-09-02 12:08:44 -04:00
committed by GitHub
parent 0094a2d744
commit 231a67e70b
24 changed files with 849 additions and 283 deletions

View File

@@ -6,7 +6,7 @@ import unittest
from transformers import LlamaTokenizer
-from axolotl.utils.data import encode_pretraining, md5
+from axolotl.utils.data import encode_streaming, md5
from tests.hf_offline_utils import enable_hf_offline
@@ -39,7 +39,7 @@ class TestEncodePretraining(unittest.TestCase):
"hello, hello",
]
}
-result = encode_pretraining(self.tokenizer, self.max_tokens, examples)
+result = encode_streaming(examples, self.tokenizer, self.max_tokens)
self.assertEqual(len(result["input_ids"]), 3)