Pretrain transforms (#1261)
* wip for pretraining/iterable data with arbitrary prompt strategies
* more fixes, wip
* more fixes for custom pretraining
* iterable ds wrapper not needed
* remove extra features
* chore: lint
* update pretraining example yml
* fix order for partials
* fixup for tests
This commit is contained in:
@@ -1,14 +1,14 @@
|
||||
"""Module for testing streaming dataset sequence packing"""
|
||||
import functools
|
||||
import unittest
|
||||
from functools import partial
|
||||
|
||||
import torch
|
||||
from datasets import load_dataset
|
||||
from torch.utils.data import DataLoader
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
|
||||
from axolotl.utils.data import encode_packed_pretraining
|
||||
from axolotl.utils.data import get_dataset_wrapper, wrap_pretraining_dataset
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
|
||||
class TestPretrainingPacking(unittest.TestCase):
|
||||
@@ -20,8 +20,6 @@ class TestPretrainingPacking(unittest.TestCase):
|
||||
# pylint: disable=duplicate-code
|
||||
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
|
||||
self.tokenizer.pad_token = "</s>"
|
||||
self.max_seq_length = 2048
|
||||
self.batch_size = 2
|
||||
|
||||
def test_packing_stream_dataset(self):
|
||||
# pylint: disable=duplicate-code
|
||||
@@ -31,30 +29,43 @@ class TestPretrainingPacking(unittest.TestCase):
|
||||
streaming=True,
|
||||
)["train"]
|
||||
|
||||
collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
|
||||
self.tokenizer,
|
||||
return_tensors="pt",
|
||||
padding=True,
|
||||
pad_to_multiple_of=self.max_seq_length,
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"pretraining_dataset": [
|
||||
{
|
||||
"path": "c4",
|
||||
"name": "en",
|
||||
"type": "pretrain",
|
||||
}
|
||||
],
|
||||
"sample_packing": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"sequence_len": 2048,
|
||||
"micro_batch_size": 2,
|
||||
}
|
||||
)
|
||||
|
||||
encode = partial(
|
||||
encode_packed_pretraining,
|
||||
ds_wrapper_partial = functools.partial(
|
||||
get_dataset_wrapper,
|
||||
cfg.pretraining_dataset[0],
|
||||
self.tokenizer,
|
||||
collate_fn,
|
||||
max_seq_length=self.max_seq_length,
|
||||
batch_size=self.batch_size,
|
||||
cfg,
|
||||
cfg.pretraining_dataset[0]["type"] or "pretrain",
|
||||
)
|
||||
|
||||
dataset = dataset.map(
|
||||
encode,
|
||||
batched=True,
|
||||
input_columns="text",
|
||||
remove_columns=dataset.features.keys(),
|
||||
original_bsz = cfg.micro_batch_size
|
||||
train_dataset = wrap_pretraining_dataset(
|
||||
dataset,
|
||||
self.tokenizer,
|
||||
cfg,
|
||||
ds_wrapper_partial,
|
||||
max_tokens=cfg.sequence_len,
|
||||
batch_size=cfg.micro_batch_size,
|
||||
seed=cfg.seed or 42,
|
||||
)
|
||||
|
||||
trainer_loader = DataLoader(
|
||||
dataset,
|
||||
train_dataset,
|
||||
batch_size=1,
|
||||
collate_fn=None,
|
||||
drop_last=True,
|
||||
@@ -64,16 +75,16 @@ class TestPretrainingPacking(unittest.TestCase):
|
||||
if idx > 10:
|
||||
break
|
||||
assert data["input_ids"].shape == torch.Size(
|
||||
[1, self.batch_size * self.max_seq_length]
|
||||
[1, original_bsz * cfg.sequence_len]
|
||||
)
|
||||
assert data["position_ids"].shape == torch.Size(
|
||||
[1, self.batch_size * self.max_seq_length]
|
||||
[1, original_bsz * cfg.sequence_len]
|
||||
)
|
||||
assert data["labels"].shape == torch.Size(
|
||||
[1, self.batch_size * self.max_seq_length]
|
||||
[1, original_bsz * cfg.sequence_len]
|
||||
)
|
||||
assert data["attention_mask"].shape == torch.Size(
|
||||
[1, self.batch_size * self.max_seq_length]
|
||||
[1, original_bsz * cfg.sequence_len]
|
||||
)
|
||||
idx += 1
|
||||
|
||||
|
||||
Reference in New Issue
Block a user