"""
|
|
Dataset transform for unstructured text data with strided EBFT.
|
|
|
|
Tokenizes raw text into fixed-length input_ids for the strided trainer.
|
|
Sequences are padded to sequence_len for uniform batching.
|
|
"""


def transform(cfg, *args, **kwargs):
    seq_len = cfg.sequence_len

    def transform_fn(example, tokenizer=None):
        # Prefer the "question" field; fall back to raw "text".
        text = example.get("question", example.get("text", ""))

        # Without a tokenizer there is nothing to encode; pass the raw
        # text through as a prompt.
        if tokenizer is None:
            return {"prompt": text}

        # Encode to exactly seq_len tokens: truncate anything longer and
        # pad anything shorter so every example batches uniformly.
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=seq_len,
            padding="max_length",
            add_special_tokens=True,
            return_tensors=None,
        )
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            # Labels mirror input_ids; copy the list so downstream label
            # masking cannot mutate the inputs in place.
            "labels": list(encoded["input_ids"]),
        }

    return transform_fn, {"remove_columns": ["question", "answer"]}
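

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's contract: exercises
    # the transform with an assumed Hugging Face tokenizer. "gpt2" and
    # sequence_len=32 are illustrative choices, not values from the repo.
    from types import SimpleNamespace

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token

    fn, ds_kwargs = transform(SimpleNamespace(sequence_len=32))
    out = fn({"question": "What is EBFT?"}, tokenizer=tok)
    assert len(out["input_ids"]) == 32  # padded/truncated to sequence_len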