"""
|
|
Dataset transform for unstructured text data with strided EBFT.
|
|
|
|
Tokenizes raw text into fixed-length input_ids for the strided trainer.
|
|
Sequences are padded to sequence_len for uniform batching.
|
|
"""


def transform(cfg, *args, **kwargs):
    seq_len = cfg.sequence_len

    def transform_fn(example, tokenizer=None):
        # Prefer the "question" field; fall back to raw "text".
        text = example.get("question", example.get("text", ""))

        # Without a tokenizer there is nothing to encode; pass the raw
        # text through as a prompt.
        if tokenizer is None:
            return {"prompt": text}

        # Encode to exactly seq_len tokens: truncate anything longer and
        # pad anything shorter so every example batches uniformly.
        encoded = tokenizer(
            text,
            truncation=True,
            max_length=seq_len,
            padding="max_length",
            add_special_tokens=True,
            return_tensors=None,
        )
        return {
            "input_ids": encoded["input_ids"],
            "attention_mask": encoded["attention_mask"],
            # Labels mirror input_ids; copy the list so downstream label
            # masking cannot mutate the inputs in place.
            "labels": list(encoded["input_ids"]),
        }

    return transform_fn, {"remove_columns": ["question", "answer"]}
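

if __name__ == "__main__":
    # Minimal usage sketch, not part of the module's contract: exercises
    # the transform with an assumed Hugging Face tokenizer. "gpt2" and
    # sequence_len=32 are illustrative choices, not values from the repo.
    from types import SimpleNamespace

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # GPT-2 ships without a pad token

    fn, ds_kwargs = transform(SimpleNamespace(sequence_len=32))
    out = fn({"question": "What is EBFT?"}, tokenizer=tok)
    assert len(out["input_ids"]) == 32  # padded/truncated to sequence_len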