* fix attention mask with packing * set position ids and use block diagonal attn mask * fix expand mask for multiple batch items, make sure we pad position_ids * don't move masks to cpu * use multi pack dataloader w random sampler * add position_ids back * more fixes for dataloader integration * est total tokens, fix field loop * more fixes, position_ids seems broken * more fixes for sample packing * use distributed sampler, avoid accelerate prepare * use accelerator prepare for dataloader * fix for position_ids w packing * Update src/axolotl/utils/dataloader.py * validation for sample packing and doc * more fixes for 4k and optimizations * optimized expand mask fn * better handling of variance in multipack dataloader length and trainer hanging when it runs out of data * fix rounding of len of batches to int * better handling so that all devices have the same dataloader len * fix step calc for packing * pass sample packing efficiency to training args * add a test for the mask expansion for sequence packing * only process eval dataset for packing if not None * don't split batches when packing * weighted CE losses * weighted CEL fixes * limit packing to sequences of max seq len * seq_len_multiple for packing * make sure the chunk size is an int * sample_packing_seq_len_multiplier config * use cumulative seq len with var len flash attn v2 w packing * properly calculate max len * fix flash-attn, xformers, packing, support chatml * fix chatml system prompt for openorca, legacy tokenizer opts * add chatml * add unit tests for cum seq lens, add ability to build cu_seq_lens from positional ids, fix prompt test * fix test and pylint checks * more packing and dataset optimizations and fixes * filter w multiple cpus * more fixes and optimizations * fixes and go back to distributed sampler since batch sampler won't work * fix counts by accounting for num devices * fix steps calculation * previous accelerate is still most performant * add numba to requirements. 
* use custom distributed checks * fix sampler to prevent overfit w new epochs * let's not cleanup the cached datasets * calculate cum seq lens with pos_ids instead of mask, simplify packing params, fix distributed barrier * speed optimizations and set accelerate fsdp env vars * optimize dataset concatenation? * more optimizations for dataset handling * fix import for annotation * manual pre-commit fixes * another sum optimization and bug fix for calc steps * fix packing estimations * fix formatting * pylint problems * add back flash attention branch for handling unpacked sequences separately * Address PR feedback * add optional sample packing config params to readme
70 lines
2.2 KiB
Python
70 lines
2.2 KiB
Python
"""Module for testing dataset sequence packing"""
|
|
|
|
import unittest
|
|
from pathlib import Path
|
|
|
|
from datasets import Dataset, load_dataset
|
|
from transformers import AutoTokenizer
|
|
|
|
from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
|
|
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
|
|
from axolotl.prompters import AlpacaPrompter
|
|
|
|
|
|
class TestPacking(unittest.TestCase):
    """
    Checks on the rows produced when tokenized prompts are packed together.
    """

    def setUp(self) -> None:
        """Load the tokenizer and register the bos/eos/unk special tokens."""
        # pylint: disable=duplicate-code
        special_tokens = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
        }
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(special_tokens)

    def test_increments_attention(self):
        """
        Pack the alpaca fixture and inspect the first packed row.

        Asserts that the row starts with BOS, attention mask value 1 and
        position ids 0, 1, and that at the next BOS token the attention
        mask value is 2 while the position ids start over at 0, 1.
        """
        strategy = AlpacaPromptTokenizingStrategy(
            AlpacaPrompter("chat"),
            self.tokenizer,
            False,
            2048,
        )
        fixture_path = Path(__file__).parent / "fixtures/alpaca/alpaca.json"
        raw_rows = load_dataset("json", data_files=str(fixture_path))["train"]
        tokenized = Dataset.from_list(list(TokenizedPromptDataset(strategy, raw_rows)))

        packer = ConstantLengthDataset(
            self.tokenizer,
            [tokenized],
            seq_length=2048,
        )
        first_row = Dataset.from_list(list(packer))[0]

        bos_id = self.tokenizer.bos_token_id
        token_ids = first_row["input_ids"]
        # Start the search at index 1 so the leading BOS is skipped; the
        # returned index is already relative to the full sequence.
        second_start = token_ids.index(bos_id, 1)

        # The first packed sequence carries no mask reset.
        assert token_ids[0] == bos_id
        assert first_row["attention_mask"][0] == 1
        assert first_row["position_ids"][0] == 0
        assert first_row["position_ids"][1] == 1

        # The sequence that follows it does.
        assert token_ids[second_start] == bos_id
        assert first_row["attention_mask"][second_start] == 2
        assert first_row["position_ids"][second_start] == 0
        assert first_row["position_ids"][second_start + 1] == 1
|
|
|
|
|
|
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|