From 05d19d20373c1ab1117b150c15c6a9cb8de07f27 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Tue, 13 Jun 2023 21:32:47 -0400 Subject: [PATCH] remove debugging, use gpt2 since starcoder requires consent --- tests/test_packed_dataset.py | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/tests/test_packed_dataset.py b/tests/test_packed_dataset.py index 65bb2eb60..2733131b4 100644 --- a/tests/test_packed_dataset.py +++ b/tests/test_packed_dataset.py @@ -18,7 +18,7 @@ class TestGpt2Packing(unittest.TestCase): def setUp(self) -> None: # pylint: disable=duplicate-code - self.tokenizer = AutoTokenizer.from_pretrained("bigcode/starcoderplus") + self.tokenizer = AutoTokenizer.from_pretrained("gpt2") self.tokenizer.add_special_tokens( { "bos_token": "<|endoftext|>", @@ -52,16 +52,11 @@ class TestGpt2Packing(unittest.TestCase): packed_dataset = Dataset.from_list(list(constant_len_dataset)) example = packed_dataset[0] - from axolotl.utils.tokenization import check_example_labels - - check_example_labels(example, self.tokenizer) # tokenizers where eos and bos tokens are the same, don't have a bos token next_eos_index = ( example["input_ids"][1:].index(self.tokenizer.eos_token_id) + 1 ) # add one since we sliced - print(example["input_ids"][next_eos_index + 1]) - assert example["input_ids"][next_eos_index] == self.tokenizer.eos_token_id assert example["attention_mask"][next_eos_index + 1] == 0 @@ -103,9 +98,6 @@ class TestLlamaPacking(unittest.TestCase): ) packed_dataset = Dataset.from_list(list(constant_len_dataset)) example = packed_dataset[0] - from axolotl.utils.tokenization import check_example_labels - - check_example_labels(example, self.tokenizer) next_bos_index = ( example["input_ids"][1:].index(self.tokenizer.bos_token_id) + 1 ) # add one since we sliced