Pretrain transforms (#1261)

* wip for pretraining/iterable data with arbitrary prompt strategies

* more fixes, wip

* more fixes for custom pretraining

* iterable ds wrapper not needed

* remove extra features

* chore: lint

* update pretraining example yml

* fix order for partials

* fixup for tests
This commit is contained in:
Wing Lian
2024-02-06 00:37:03 -05:00
committed by GitHub
parent 8c2e05ade3
commit c7cf3810bd
5 changed files with 145 additions and 62 deletions

View File

@@ -1,14 +1,14 @@
"""Module for testing streaming dataset sequence packing"""
import functools
import unittest
from functools import partial
import torch
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
from axolotl.utils.data import encode_packed_pretraining
from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset
from axolotl.utils.dict import DictDefault
class TestPretrainingPacking(unittest.TestCase):
@@ -20,8 +20,6 @@ class TestPretrainingPacking(unittest.TestCase):
# pylint: disable=duplicate-code
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
self.tokenizer.pad_token = "</s>"
self.max_seq_length = 2048
self.batch_size = 2
def test_packing_stream_dataset(self):
# pylint: disable=duplicate-code
@@ -31,30 +29,43 @@ class TestPretrainingPacking(unittest.TestCase):
streaming=True,
)["train"]
collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
self.tokenizer,
return_tensors="pt",
padding=True,
pad_to_multiple_of=self.max_seq_length,
cfg = DictDefault(
{
"pretraining_dataset": [
{
"path": "c4",
"name": "en",
"type": "pretrain",
}
],
"sample_packing": True,
"pad_to_sequence_len": True,
"sequence_len": 2048,
"micro_batch_size": 2,
}
)
encode = partial(
encode_packed_pretraining,
ds_wrapper_partial = functools.partial(
get_dataset_wrapper,
cfg.pretraining_dataset[0],
self.tokenizer,
collate_fn,
max_seq_length=self.max_seq_length,
batch_size=self.batch_size,
cfg,
cfg.pretraining_dataset[0]["type"] or "pretrain",
)
dataset = dataset.map(
encode,
batched=True,
input_columns="text",
remove_columns=dataset.features.keys(),
original_bsz = cfg.micro_batch_size
train_dataset = wrap_pretraining_dataset(
dataset,
self.tokenizer,
cfg,
ds_wrapper_partial,
max_tokens=cfg.sequence_len,
batch_size=cfg.micro_batch_size,
seed=cfg.seed or 42,
)
trainer_loader = DataLoader(
dataset,
train_dataset,
batch_size=1,
collate_fn=None,
drop_last=True,
@@ -64,16 +75,16 @@ class TestPretrainingPacking(unittest.TestCase):
if idx > 10:
break
assert data["input_ids"].shape == torch.Size(
[1, self.batch_size * self.max_seq_length]
[1, original_bsz * cfg.sequence_len]
)
assert data["position_ids"].shape == torch.Size(
[1, self.batch_size * self.max_seq_length]
[1, original_bsz * cfg.sequence_len]
)
assert data["labels"].shape == torch.Size(
[1, self.batch_size * self.max_seq_length]
[1, original_bsz * cfg.sequence_len]
)
assert data["attention_mask"].shape == torch.Size(
[1, self.batch_size * self.max_seq_length]
[1, original_bsz * cfg.sequence_len]
)
idx += 1