Pretrain multipack (#2278)

* fix for pretrain with packing

* fix model name and loss expected

* make sure to check with micro batch size for pretraining

* change loss thresholds based on parametrization

* make tests smaller for CI

* fix pretrain packing

* fix pretrain packing test

* address pr feedback
Author: Wing Lian
Date: 2025-01-24 12:55:20 -05:00 (committed by GitHub)
Parent: 6086162488
Commit: 20620771f1

5 changed files with 42 additions and 21 deletions


@@ -41,6 +41,7 @@ class TestPretrainingPacking(unittest.TestCase):
 }
 ],
 "sample_packing": True,
+"pretrain_multipack_attn": True,
 "pad_to_sequence_len": True,
 "sequence_len": 2048,
 "micro_batch_size": 2,
@@ -87,9 +88,11 @@ class TestPretrainingPacking(unittest.TestCase):
assert data["labels"].shape == torch.Size(
[1, original_bsz * cfg.sequence_len]
)
assert data["attention_mask"].shape == torch.Size(
[1, original_bsz * cfg.sequence_len]
)
assert "attention_mask" not in data
# FIXME add back once we fix packing unpad/pad with attention mask
# assert data["attention_mask"].shape == torch.Size(
# [1, original_bsz * cfg.sequence_len]
# )
idx += 1
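
The assertion change reflects how packed pretraining batches are now shaped: samples are concatenated into a single row of `micro_batch_size * sequence_len` tokens, and the collator no longer emits an `attention_mask` (per the FIXME, packing boundaries are meant to be handled by the unpad/pad path inside attention). A simplified stand-in for that behavior is sketched below; `pack_examples` is a hypothetical helper, not the actual axolotl collator:

    import torch

    def pack_examples(examples, sequence_len, micro_batch_size):
        """Concatenate tokenized examples into one flat packed row of
        micro_batch_size * sequence_len tokens, padding the tail with 0s."""
        flat = [tok for ex in examples for tok in ex["input_ids"]]
        target = micro_batch_size * sequence_len
        flat = (flat + [0] * target)[:target]
        input_ids = torch.tensor([flat])  # shape [1, micro_batch_size * sequence_len]
        labels = input_ids.clone()        # causal LM: labels mirror the inputs
        # No attention_mask is returned; packed boundaries would be recovered
        # inside the model (e.g. via unpad/pad), which is what the test expects.
        return {"input_ids": input_ids, "labels": labels}

    batch = pack_examples(
        [{"input_ids": [1, 2, 3]}, {"input_ids": [4, 5]}],
        sequence_len=2048,
        micro_batch_size=2,
    )
    assert batch["labels"].shape == torch.Size([1, 2 * 2048])
    assert "attention_mask" not in batch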