From 1b26b6a2f3417c86cdd1c7be0023b23e198ce649 Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Mon, 13 Jan 2025 15:46:05 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- docs/dataset-formats/index.html | 10 ++--- docs/dataset-formats/pretraining.html | 11 +++++- search.json | 2 +- sitemap.xml | 56 +++++++++++++-------------- 5 files changed, 44 insertions(+), 37 deletions(-) diff --git a/.nojekyll b/.nojekyll index 1ae0613af..6c82489a4 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -aa0c628c \ No newline at end of file +5173f1f8 \ No newline at end of file diff --git a/docs/dataset-formats/index.html b/docs/dataset-formats/index.html index 0b54be381..b8fb070e0 100644 --- a/docs/dataset-formats/index.html +++ b/docs/dataset-formats/index.html @@ -363,7 +363,7 @@ Description - + Pre-training @@ -371,7 +371,7 @@ Description Data format for a pre-training completion task. - + Instruction Tuning @@ -379,7 +379,7 @@ Description Instruction tuning formats for supervised fine-tuning. - + Conversation @@ -387,7 +387,7 @@ Description Conversation format for supervised fine-tuning. - + Template-Free @@ -395,7 +395,7 @@ Description Construct prompts without a template. - + Custom Pre-Tokenized Dataset diff --git a/docs/dataset-formats/pretraining.html b/docs/dataset-formats/pretraining.html index b10d26c69..64970be83 100644 --- a/docs/dataset-formats/pretraining.html +++ b/docs/dataset-formats/pretraining.html @@ -342,8 +342,15 @@ Streaming is recommended for large datasets
config.yaml
-
pretraining_dataset: # hf path only
-...
+
pretraining_dataset:
+  - name:
+    path:
+    split:
+    text_column: # column in dataset with the data, usually `text`
+    type: pretrain
+    trust_remote_code:
+    skip: # number of rows of data to skip over from the beginning
+...
diff --git a/search.json b/search.json index 7552df95d..79269069c 100644 --- a/search.json +++ b/search.json @@ -629,7 +629,7 @@ "href": "docs/dataset-formats/pretraining.html", "title": "Pre-training", "section": "", - "text": "For pretraining, there is no prompt template or roles. The only required field is text:\n\n\ndata.jsonl\n\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\n\n\n\n\n\n\n\nStreaming is recommended for large datasets\n\n\n\nAxolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:\n\n\nconfig.yaml\n\npretraining_dataset: # hf path only\n...", + "text": "For pretraining, there is no prompt template or roles. The only required field is text:\n\n\ndata.jsonl\n\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\n\n\n\n\n\n\n\nStreaming is recommended for large datasets\n\n\n\nAxolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:\n\n\nconfig.yaml\n\npretraining_dataset:\n - name:\n path:\n split:\n text_column: # column in dataset with the data, usually `text`\n type: pretrain\n trust_remote_code:\n skip: # number of rows of data to skip over from the beginning\n...", "crumbs": [ "Dataset Formats", "Pre-training" diff --git a/sitemap.xml b/sitemap.xml index cbbba9f59..5d765ab41 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,114 +2,114 @@ https://axolotl-ai-cloud.github.io/axolotl/index.html - 2025-01-13T15:44:24.847Z + 2025-01-13T15:44:58.956Z https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html - 2025-01-13T15:44:24.851Z + 2025-01-13T15:44:58.960Z https://axolotl-ai-cloud.github.io/axolotl/docs/nccl.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/input_output.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset_preprocessing.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/torchao.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/rlhf.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/config.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/template_free.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/tokenized.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/mac.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/FAQS.html - 2025-01-13T15:44:24.831Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/TODO.html - 2025-01-13T15:44:24.831Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/pretraining.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/index.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/unsloth.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/multimodal.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/batch_vs_grad.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/fsdp_qlora.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/docs/amd_hpc.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.940Z https://axolotl-ai-cloud.github.io/axolotl/examples/colab-notebooks/colab-axolotl-example.html - 2025-01-13T15:44:24.835Z + 2025-01-13T15:44:58.944Z https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-01-13T15:44:24.851Z + 2025-01-13T15:44:58.960Z