diff --git a/.nojekyll b/.nojekyll
index 1ae0613af..6c82489a4 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-aa0c628c
\ No newline at end of file
+5173f1f8
\ No newline at end of file
diff --git a/docs/dataset-formats/index.html b/docs/dataset-formats/index.html
index 0b54be381..b8fb070e0 100644
--- a/docs/dataset-formats/index.html
+++ b/docs/dataset-formats/index.html
@@ -363,7 +363,7 @@ Description
diff --git a/docs/dataset-formats/pretraining.html b/docs/dataset-formats/pretraining.html
index b10d26c69..64970be83 100644
--- a/docs/dataset-formats/pretraining.html
+++ b/docs/dataset-formats/pretraining.html
@@ -342,8 +342,15 @@ Streaming is recommended for large datasets
config.yaml
-
pretraining_dataset: # hf path only
-...
+
pretraining_dataset:
+-name:
+path:
+split:
+text_column: # column in dataset with the data, usually `text`
+type: pretrain
+trust_remote_code:
+skip: # number of rows of data to skip over from the beginning
+...
diff --git a/search.json b/search.json
index 7552df95d..79269069c 100644
--- a/search.json
+++ b/search.json
@@ -629,7 +629,7 @@
"href": "docs/dataset-formats/pretraining.html",
"title": "Pre-training",
"section": "",
- "text": "For pretraining, there is no prompt template or roles. The only required field is text:\n\n\ndata.jsonl\n\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\n\n\n\n\n\n\n\nStreaming is recommended for large datasets\n\n\n\nAxolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:\n\n\nconfig.yaml\n\npretraining_dataset: # hf path only\n...",
+ "text": "For pretraining, there is no prompt template or roles. The only required field is text:\n\n\ndata.jsonl\n\n{\"text\": \"first row\"}\n{\"text\": \"second row\"}\n...\n\n\n\n\n\n\n\nStreaming is recommended for large datasets\n\n\n\nAxolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:\n\n\nconfig.yaml\n\npretraining_dataset:\n - name:\n path:\n split:\n text_column: # column in dataset with the data, usually `text`\n type: pretrain\n trust_remote_code:\n skip: # number of rows of data to skip over from the beginning\n...",
"crumbs": [
"Dataset Formats",
"Pre-training"
diff --git a/sitemap.xml b/sitemap.xml
index cbbba9f59..5d765ab41 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,114 +2,114 @@
https://axolotl-ai-cloud.github.io/axolotl/index.html
- 2025-01-13T15:44:24.847Z
+ 2025-01-13T15:44:58.956Zhttps://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html
- 2025-01-13T15:44:24.851Z
+ 2025-01-13T15:44:58.960Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/nccl.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/input_output.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset_preprocessing.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/torchao.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/rlhf.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/config.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/template_free.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/tokenized.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/mac.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/FAQS.html
- 2025-01-13T15:44:24.831Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/TODO.html
- 2025-01-13T15:44:24.831Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/faq.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/pretraining.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/index.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/unsloth.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/multimodal.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/batch_vs_grad.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/fsdp_qlora.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/docs/amd_hpc.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.940Zhttps://axolotl-ai-cloud.github.io/axolotl/examples/colab-notebooks/colab-axolotl-example.html
- 2025-01-13T15:44:24.835Z
+ 2025-01-13T15:44:58.944Zhttps://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html
- 2025-01-13T15:44:24.851Z
+ 2025-01-13T15:44:58.960Z