Built site for gh-pages

This commit is contained in:
Quarto GHA Workflow Runner
2024-06-29 05:39:41 +00:00
parent c1008cc28d
commit 43c266673d
25 changed files with 76 additions and 65 deletions

View File

@@ -414,7 +414,7 @@
"href": "docs/dataset-formats/tokenized.html",
"title": "Custom Pre-Tokenized Dataset",
"section": "",
"text": "Do not pass a type: in your axolotl config.\nColumns in Dataset must be exactly input_ids, attention_mask, labels\n\n\n\nconfig.yml\n\n- path: ...",
"text": "Pass an empty type: in your axolotl config.\nColumns in Dataset must be exactly input_ids, attention_mask, labels\nTo indicate that a token should be ignored during training, set its corresponding label to -100.\nDo not add BOS/EOS. Axolotl will add them for you based on the default tokenizer for the model you're using.\nFor pretraining, do not truncate/pad documents to the context window length.\nFor instruction training, documents must be truncated/padded as desired.\n\nSample config:\n\n\nconfig.yml\n\ndatasets:\n - path: /path/to/your/file.jsonl\n ds_type: json\n type:\n\nSample jsonl:\n{\"input_ids\":[271,299,99],\"attention_mask\":[1,1,1],\"labels\":[271,-100,99]}\n{\"input_ids\":[87,227,8383,12],\"attention_mask\":[1,1,1,1],\"labels\":[87,227,8383,12]}",
"crumbs": [
"Dataset Formats",
"Custom Pre-Tokenized Dataset"