* hf offline decorator for tests to workaround rate limits * fail quicker so we can see logs * try new cache name * limit files downloaded * phi mini predownload * offline decorator for phi tokenizer * handle meta llama 8b offline too * make sure to return fixtures if they are wrapped too * more fixes * more things offline * more offline things * fix the env var * fix the model name * handle gemma also * force reload of modules to recheck offline status * prefetch mistral too * use reset_sessions so hub picks up offline mode * more fixes * rename so it doesn't seem like a context manager * fix backoff * switch out tinyshakespeare dataset since it runs a py script to fetch data and doesn't work offline * include additional dataset * more fixes * more fixes * replace tiny shakespeaere dataset * skip some tests for now * use more robust check using snapshot download to determine if a dataset name is on the hub * typo for skip reason * use local_files_only * more fixtures * remove local only * use tiny shakespeare as pretrain dataset and streaming can't be offline even if precached * make sure fixtures aren't offline improve the offline reset try bumping version of datasets reorder reloading and setting prime a new cache run the tests now with fresh cache try with a static cache * now run all the ci again with hopefully a correct cache * skip wonky tests for now * skip wonky tests for now * handle offline mode for model card creation
103 lines
3.1 KiB
Python
103 lines
3.1 KiB
Python
"""Module for testing streaming dataset sequence packing"""
|
|
|
|
import functools
|
|
import unittest
|
|
|
|
import pytest
|
|
import torch
|
|
from datasets import load_dataset
|
|
from torch.utils.data import DataLoader
|
|
from transformers import AutoTokenizer
|
|
from utils import disable_hf_offline, enable_hf_offline
|
|
|
|
from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset
|
|
from axolotl.utils.dict import DictDefault
|
|
|
|
|
|
class TestPretrainingPacking(unittest.TestCase):
    """
    Test class for packing streaming dataset sequences
    """

    @enable_hf_offline
    def setUp(self) -> None:
        """Create the tokenizer shared by the tests and set its pad token."""
        # pylint: disable=duplicate-code
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.pad_token = "</s>"

    @pytest.mark.flaky(retries=1, delay=5)
    @disable_hf_offline
    def test_packing_stream_dataset(self):
        """Verify the tensor shapes of packed batches from a streaming dataset.

        The streamed ``winglian/tiny-shakespeare`` train split is wrapped with
        ``wrap_pretraining_dataset`` and iterated through a ``DataLoader``.
        For up to the first 11 batches, ``input_ids``, ``position_ids`` and
        ``labels`` must each have shape
        ``[1, micro_batch_size * sequence_len]`` and no ``attention_mask`` key
        may be present.  The test also fails if the loader yields no batch at
        all, so it cannot pass without checking anything.
        """
        # pylint: disable=duplicate-code
        dataset = load_dataset(
            "winglian/tiny-shakespeare",
            streaming=True,
        )["train"]

        cfg = DictDefault(
            {
                "pretraining_dataset": [
                    {
                        "path": "winglian/tiny-shakespeare",
                        "type": "pretrain",
                    }
                ],
                "sample_packing": True,
                "pretrain_multipack_attn": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
                "micro_batch_size": 2,
                "sample_packing_group_size": 100000,
                "sample_packing_bin_size": 200,
            }
        )

        ds_wrapper_partial = functools.partial(
            get_dataset_wrapper,
            cfg.pretraining_dataset[0],
            self.tokenizer,
            cfg,
            cfg.pretraining_dataset[0]["type"] or "pretrain",
        )

        # Captured before wrapping the dataset.
        # NOTE(review): presumably wrap_pretraining_dataset may modify
        # cfg.micro_batch_size - confirm against its implementation.
        original_bsz = cfg.micro_batch_size
        train_dataset = wrap_pretraining_dataset(
            dataset,
            self.tokenizer,
            cfg,
            ds_wrapper_partial,
            max_tokens=cfg.sequence_len,
            batch_size=cfg.micro_batch_size,
            seed=cfg.seed or 42,
        )

        trainer_loader = DataLoader(
            train_dataset,
            batch_size=1,
            collate_fn=None,
            drop_last=True,
        )

        expected_shape = torch.Size([1, original_bsz * cfg.sequence_len])
        checked_batches = 0
        for idx, data in enumerate(trainer_loader):
            # Only the first 11 batches (idx 0..10) are inspected.
            if idx > 10:
                break
            for key in ("input_ids", "position_ids", "labels"):
                assert (
                    data[key].shape == expected_shape
                ), f"unexpected shape for {key}: {data[key].shape}"
            assert "attention_mask" not in data
            # FIXME add back once we fix packing unpad/pad with attention mask
            # assert data["attention_mask"].shape == torch.Size(
            #     [1, original_bsz * cfg.sequence_len]
            # )
            checked_batches += 1

        # Guard against a vacuous pass when the loader produces nothing.
        assert checked_batches > 0, "DataLoader yielded no packed batches"
|
|
|
|
|
|
# Allow running this test module directly as a script.
if __name__ == "__main__":
    unittest.main()
|