[feat] truncation support with excess_length_strategy (#3068) [skip ci]

* feat: truncation support with excess_len

* pre-commit

* excess_length_strategy

* requested changes

* lint

* added handle_long_seq_in_dataset in sft

* comments improved
This commit is contained in:
VED
2025-08-18 18:09:13 +05:30
committed by GitHub
parent ecbe8b2b61
commit 0eef385b1a
4 changed files with 61 additions and 8 deletions

View File

@@ -8,7 +8,7 @@ from transformers import AutoTokenizer
from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.completion import load
from axolotl.utils.collators import V2BatchSamplerDataCollatorForSeq2Seq
-from axolotl.utils.data.utils import drop_long_seq_in_dataset
+from axolotl.utils.data.utils import handle_long_seq_in_dataset
from axolotl.utils.dict import DictDefault
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
@@ -70,7 +70,7 @@ class TestBatchedSamplerPacking:
)
train_dataset = concatenate_datasets([dataset_wrapper])
-train_dataset = drop_long_seq_in_dataset(train_dataset, cfg.sequence_len, cfg)
+train_dataset = handle_long_seq_in_dataset(train_dataset, cfg.sequence_len, cfg)
lengths = get_dataset_lengths(train_dataset)
batch_sampler = MultipackBatchSampler(