Streaming SFT support (#3101)
* working * fixes * deprecate --iterable; cleanup * pretrain_multipack_buffer_size -> streaming_multipack_buffer_size * improvements * tests * remove unused * docs, examples * nit * nit * add val_set_size validation * val * nit * min * coderabbito * cleanup * nit * add depr warning, cleanup * nit * fix test, fix quarto * fix * review comments * review comments * fix
This commit is contained in:
@@ -6,7 +6,7 @@ import unittest
|
||||
|
||||
from transformers import LlamaTokenizer
|
||||
|
||||
from axolotl.utils.data import encode_pretraining, md5
|
||||
from axolotl.utils.data import encode_streaming, md5
|
||||
|
||||
from tests.hf_offline_utils import enable_hf_offline
|
||||
|
||||
@@ -39,7 +39,7 @@ class TestEncodePretraining(unittest.TestCase):
|
||||
"hello, hello",
|
||||
]
|
||||
}
|
||||
result = encode_pretraining(self.tokenizer, self.max_tokens, examples)
|
||||
result = encode_streaming(examples, self.tokenizer, self.max_tokens)
|
||||
|
||||
self.assertEqual(len(result["input_ids"]), 3)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user