Compare commits
2 Commits
sequence-p
...
pretrain-d
| Author | SHA1 | Date |
|---|---|---|
| | 9eaae5925a | |
| | d000851eeb | |
@@ -41,6 +41,7 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
|
|||||||
seq + [self.tokenizer.eos_token_id] for seq in res["input_ids"]
|
seq + [self.tokenizer.eos_token_id] for seq in res["input_ids"]
|
||||||
]
|
]
|
||||||
res["attention_mask"] = [seq + [1] for seq in res["attention_mask"]]
|
res["attention_mask"] = [seq + [1] for seq in res["attention_mask"]]
|
||||||
|
res["labels"] = res["input_ids"].copy()
|
||||||
|
|
||||||
return res
|
return res
|
||||||
|
|
||||||
@@ -49,12 +50,16 @@ class PretrainTokenizationStrategy(PromptTokenizingStrategy):
|
|||||||
|
|
||||||
|
|
||||||
def load(tokenizer, cfg):
    """Build a :class:`PretrainTokenizationStrategy` from the run config.

    Prefers ``cfg.pretraining_dataset`` when it is set (truthy); otherwise
    falls back to ``cfg.datasets``. The first dataset entry supplies the
    text column name, defaulting to ``"text"`` when missing or falsy.

    Args:
        tokenizer: the tokenizer passed through to the strategy.
        cfg: run configuration providing ``pretraining_dataset`` /
            ``datasets``, ``train_on_inputs``, and ``sequence_len``.

    Returns:
        A configured ``PretrainTokenizationStrategy`` instance.
    """
    if cfg.pretraining_dataset:
        cfg_ds = cfg.pretraining_dataset
    else:
        cfg_ds = cfg.datasets
    strat = PretrainTokenizationStrategy(
        PretrainTokenizer(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
        # .get() tolerates dataset entries without a "text_column" key
        # (plain dicts would raise KeyError under item access); the
        # `or "text"` fallback also covers None/empty values, as before.
        text_column=cfg_ds[0].get("text_column") or "text",
        # Tokenize long concatenated spans: 64x the model sequence length
        # per batch fed to the strategy. NOTE(review): factor 64 is a
        # magic constant carried over from the original — confirm intent.
        max_length=cfg.sequence_len * 64,
    )
    return strat
|
|||||||
Reference in New Issue
Block a user