Compare commits

...

2 Commits

Author     SHA1        Message                                                       Date
Wing Lian  05d19d2037  remove debugging, use gpt2 since starcoder requires consent  2023-06-13 21:32:47 -04:00
Wing Lian  61f44f311e  fix packing for tokenizers that don't use a bos_token when the bos token and eos token are both the same  2023-06-13 21:26:13 -04:00
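
The second commit swaps the test tokenizer from the gated bigcode/starcoder checkpoint to gpt2. A minimal sketch of why gpt2 is a good stand-in (assuming the stock Hugging Face gpt2 checkpoint): it needs no license consent to download, and it reuses a single <|endoftext|> special token, so it still exercises the bos_token_id == eos_token_id path the fix targets.

from transformers import AutoTokenizer

# gpt2 is ungated, unlike bigcode/starcoder, which requires accepting
# a license agreement on the Hugging Face Hub before it can be loaded.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

# GPT-2 reuses <|endoftext|> for bos, eos, and unk alike, so the
# bos and eos ids coincide (50256 by default).
print(tokenizer.bos_token_id == tokenizer.eos_token_id)  # True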
3 changed files with 59 additions and 3 deletions


@@ -132,8 +132,12 @@ class ConstantLengthDataset(IterableDataset):
                 attention_mask = example["attention_mask"]
                 labels = example["labels"]
 
                 if (
-                    buffer["input_ids"]
-                    and input_ids[0] == self.tokenizer.bos_token_id
+                    (
+                        buffer["input_ids"]
+                        and input_ids[0] == self.tokenizer.bos_token_id
+                    )
+                    or self.tokenizer.bos_token_id
+                    == self.tokenizer.eos_token_id
                 ):
                     attention_mask[0] = 0
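
In words: when a new example is appended to a packed buffer, its first token must not attend back across the seam. With a dedicated bos token, that token marks the seam; when bos and eos share one id (as with GPT-2), no packed example ever starts with a distinct bos, so the mask is reset unconditionally. A minimal standalone sketch of that rule (illustrative names, not the axolotl code):

def should_reset_attention(buffer_input_ids, first_token_id, bos_id, eos_id):
    """Decide whether the first token of a newly packed example
    should have its attention masked out (set to 0)."""
    # Case 1: the buffer already holds tokens and the new example
    # starts with a dedicated bos token.
    starts_with_bos = bool(buffer_input_ids) and first_token_id == bos_id
    # Case 2: the tokenizer reuses one token for bos and eos (e.g. GPT-2),
    # so the mask must be reset at every seam regardless.
    return starts_with_bos or bos_id == eos_id

# With GPT-2-style ids (bos == eos), the rule always fires.
assert should_reset_attention([], 5, bos_id=0, eos_id=0)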


@@ -34,3 +34,5 @@ def check_example_labels(example, tokenizer):
     logging.info(" ".join(colored_tokens))
     logging.info("\n\n\n")
+    print(" ".join(colored_tokens))
+
 


@@ -11,7 +11,57 @@ from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
 from axolotl.prompters import AlpacaPrompter
 
 
-class TestPacking(unittest.TestCase):
+class TestGpt2Packing(unittest.TestCase):
     """
     Test class for packing dataset sequences
     """
+
+    def setUp(self) -> None:
+        # pylint: disable=duplicate-code
+        self.tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        self.tokenizer.add_special_tokens(
+            {
+                "bos_token": "<|endoftext|>",
+                "eos_token": "<|endoftext|>",
+                "unk_token": "<|endoftext|>",
+            }
+        )
+        self.tokenizer.bos_token_id = 0
+        self.tokenizer.eos_token_id = 0
+        self.tokenizer.unk_token_id = 0
+
+    def test_resets_attention(self):
+        prompter = AlpacaPrompter("chat")
+        strat = AlpacaPromptTokenizingStrategy(
+            prompter,
+            self.tokenizer,
+            False,
+            2048,
+        )
+        dataset = load_dataset(
+            "json",
+            data_files=str(Path(__file__).parent / "fixtures/alpaca/alpaca.json"),
+        )["train"]
+        dataset = Dataset.from_list(list(TokenizedPromptDataset(strat, dataset)))
+
+        constant_len_dataset = ConstantLengthDataset(
+            self.tokenizer,
+            [dataset],
+            seq_length=2048,
+        )
+        packed_dataset = Dataset.from_list(list(constant_len_dataset))
+        example = packed_dataset[0]
+        # tokenizers where the eos and bos tokens are the same have no distinct bos token
+        next_eos_index = (
+            example["input_ids"][1:].index(self.tokenizer.eos_token_id) + 1
+        )  # add one since we sliced
+        assert example["input_ids"][next_eos_index] == self.tokenizer.eos_token_id
+        assert example["attention_mask"][next_eos_index + 1] == 0
+
+
+class TestLlamaPacking(unittest.TestCase):
+    """
+    Test class for packing dataset sequences
+    """