diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 89b2746e4..4f8074ad1 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -15,16 +15,6 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.10"
-            pytorch: 2.3.1
-            axolotl_extras: mamba-ssm
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras: mamba-ssm
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -82,16 +72,6 @@ jobs:
     strategy:
       matrix:
         include:
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.10"
-            pytorch: 2.3.1
-            axolotl_extras:
-          - cuda: 121
-            cuda_version: 12.1.1
-            python_version: "3.11"
-            pytorch: 2.3.1
-            axolotl_extras:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
@@ -148,7 +128,7 @@ jobs:
           - cuda: 121
             cuda_version: 12.1.1
             python_version: "3.11"
-            pytorch: 2.3.1
+            pytorch: 2.4.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
diff --git a/.nojekyll b/.nojekyll
index 446ec107d..afc349f6e 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-907d35b7
\ No newline at end of file
+ed280e0b
\ No newline at end of file
diff --git a/docs/dataset-formats/index.html b/docs/dataset-formats/index.html
index cad6a42eb..c6781595a 100644
--- a/docs/dataset-formats/index.html
+++ b/docs/dataset-formats/index.html
@@ -363,7 +363,7 @@ Description
 </tr>
 </thead>
 <tbody class="list">
-<tr data-index="0" data-listing-file-modified-sort="1737741398671" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Pre-training" data-listing-filename-sort="pretraining.qmd">
+<tr data-index="0" data-listing-file-modified-sort="1738124636858" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Pre-training" data-listing-filename-sort="pretraining.qmd">
 <td>
 <a href="../../docs/dataset-formats/pretraining.html" class="title listing-title">Pre-training</a>
 </td>
@@ -371,7 +371,7 @@ Description
 <span class="listing-description">Data format for a pre-training completion task.</span>
 </td>
 </tr>
-<tr data-index="1" data-listing-file-modified-sort="1737741398671" data-listing-reading-time-sort="2" data-listing-word-count-sort="308" data-listing-title-sort="Instruction Tuning" data-listing-filename-sort="inst_tune.qmd">
+<tr data-index="1" data-listing-file-modified-sort="1738124636858" data-listing-reading-time-sort="2" data-listing-word-count-sort="308" data-listing-title-sort="Instruction Tuning" data-listing-filename-sort="inst_tune.qmd">
 <td>
 <a href="../../docs/dataset-formats/inst_tune.html" class="title listing-title">Instruction Tuning</a>
 </td>
@@ -379,7 +379,7 @@ Description
 <span class="listing-description">Instruction tuning formats for supervised fine-tuning.</span>
 </td>
 </tr>
-<tr data-index="2" data-listing-file-modified-sort="1737741398671" data-listing-reading-time-sort="4" data-listing-word-count-sort="625" data-listing-title-sort="Conversation" data-listing-filename-sort="conversation.qmd">
+<tr data-index="2" data-listing-file-modified-sort="1738124636858" data-listing-reading-time-sort="4" data-listing-word-count-sort="625" data-listing-title-sort="Conversation" data-listing-filename-sort="conversation.qmd">
 <td>
 <a href="../../docs/dataset-formats/conversation.html" class="title listing-title">Conversation</a>
 </td>
@@ -387,7 +387,7 @@ Description
 <span class="listing-description">Conversation format for supervised fine-tuning.</span>
 </td>
 </tr>
-<tr data-index="3" data-listing-file-modified-sort="1737741398671" data-listing-reading-time-sort="1" data-listing-word-count-sort="3" data-listing-title-sort="Template-Free" data-listing-filename-sort="template_free.qmd">
+<tr data-index="3" data-listing-file-modified-sort="1738124636858" data-listing-reading-time-sort="1" data-listing-word-count-sort="3" data-listing-title-sort="Template-Free" data-listing-filename-sort="template_free.qmd">
 <td>
 <a href="../../docs/dataset-formats/template_free.html" class="title listing-title">Template-Free</a>
 </td>
@@ -395,7 +395,7 @@ Description
 <span class="listing-description">Construct prompts without a template.</span>
 </td>
 </tr>
-<tr data-index="4" data-listing-file-modified-sort="1737741398671" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Custom Pre-Tokenized Dataset" data-listing-filename-sort="tokenized.qmd">
+<tr data-index="4" data-listing-file-modified-sort="1738124636858" data-listing-reading-time-sort="1" data-listing-word-count-sort="92" data-listing-title-sort="Custom Pre-Tokenized Dataset" data-listing-filename-sort="tokenized.qmd">
 <td>
 <a href="../../docs/dataset-formats/tokenized.html" class="title listing-title">Custom Pre-Tokenized Dataset</a>
 </td>
diff --git a/index.html b/index.html
index 07c175538..ea8534b49 100644
--- a/index.html
+++ b/index.html
@@ -368,7 +368,7 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin
 <section id="quickstart" class="level2">
 <h2 class="anchored" data-anchor-id="quickstart">Quickstart ⚡</h2>
 <p>Get started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.</p>
-<p><strong>Requirements</strong>: <em>Nvidia</em> GPU (Ampere architecture or newer for <code>bf16</code> and Flash Attention) or <em>AMD</em> GPU, Python &gt;=3.10 and PyTorch &gt;=2.3.1.</p>
+<p><strong>Requirements</strong>: <em>Nvidia</em> GPU (Ampere architecture or newer for <code>bf16</code> and Flash Attention) or <em>AMD</em> GPU, Python &gt;=3.10 and PyTorch &gt;=2.4.1.</p>
 <div class="sourceCode" id="cb1"><pre class="sourceCode bash code-with-copy"><code class="sourceCode bash"><span id="cb1-1"><a href="#cb1-1" aria-hidden="true" tabindex="-1"></a><span class="ex">pip3</span> install <span class="at">--no-build-isolation</span> axolotl<span class="pp">[</span><span class="ss">flash</span><span class="pp">-</span><span class="ss">attn,deepspeed</span><span class="pp">]</span></span>
 <span id="cb1-2"><a href="#cb1-2" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb1-3"><a href="#cb1-3" aria-hidden="true" tabindex="-1"></a><span class="co"># download examples and optionally deepspeed configs to the local path</span></span>
diff --git a/search.json b/search.json
index 25c7d5b13..13ce20ad9 100644
--- a/search.json
+++ b/search.json
@@ -557,7 +557,7 @@
     "href": "index.html#quickstart",
     "title": "Axolotl",
     "section": "Quickstart ⚡",
-    "text": "Quickstart ⚡\nGet started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.\nRequirements: Nvidia GPU (Ampere architecture or newer for bf16 and Flash Attention) or AMD GPU, Python &gt;=3.10 and PyTorch &gt;=2.3.1.\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\n\n# download examples and optionally deepspeed configs to the local path\naxolotl fetch examples\naxolotl fetch deepspeed_configs  # OPTIONAL\n\n# finetune using lora\naxolotl train examples/llama-3/lora-1b.yml\n\nEdge Builds 🏎️\nIf you’re looking for the latest features and updates between releases, you’ll need to install from source.\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install packaging ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAxolotl CLI Usage\nWe now support a new, more streamlined CLI using click.\n# preprocess datasets - optional but recommended\nCUDA_VISIBLE_DEVICES=\"0\" axolotl preprocess examples/llama-3/lora-1b.yml\n\n# finetune lora\naxolotl train examples/llama-3/lora-1b.yml\n\n# inference\naxolotl inference examples/llama-3/lora-1b.yml \\\n    --lora-model-dir=\"./outputs/lora-out\"\n\n# gradio\naxolotl inference examples/llama-3/lora-1b.yml \\\n    --lora-model-dir=\"./outputs/lora-out\" --gradio\n\n# remote yaml files - the yaml config can be hosted on a public URL\n# Note: the yaml config must directly link to the **raw** yaml\naxolotl train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml\nWe’ve also added a new command for fetching examples and deepspeed_configs to your local machine. This will come in handy when installing axolotl from PyPI.\n# Fetch example YAML files (stores in \"examples/\" folder)\naxolotl fetch examples\n\n# Fetch deepspeed config files (stores in \"deepspeed_configs/\" folder)\naxolotl fetch deepspeed_configs\n\n# Optionally, specify a destination folder\naxolotl fetch examples --dest path/to/folder\n\n\nLegacy Usage\n\n\nClick to Expand\n\nWhile the Axolotl CLI is the preferred method for interacting with axolotl, we still support the legacy -m axolotl.cli.* usage.\n# preprocess datasets - optional but recommended\nCUDA_VISIBLE_DEVICES=\"0\" python -m axolotl.cli.preprocess examples/llama-3/lora-1b.yml\n\n# finetune lora\naccelerate launch -m axolotl.cli.train examples/llama-3/lora-1b.yml\n\n# inference\naccelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \\\n    --lora_model_dir=\"./outputs/lora-out\"\n\n# gradio\naccelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \\\n    --lora_model_dir=\"./outputs/lora-out\" --gradio\n\n# remote yaml files - the yaml config can be hosted on a public URL\n# Note: the yaml config must directly link to the **raw** yaml\naccelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml",
+    "text": "Quickstart ⚡\nGet started with Axolotl in just a few steps! This quickstart guide will walk you through setting up and running a basic fine-tuning task.\nRequirements: Nvidia GPU (Ampere architecture or newer for bf16 and Flash Attention) or AMD GPU, Python &gt;=3.10 and PyTorch &gt;=2.4.1.\npip3 install --no-build-isolation axolotl[flash-attn,deepspeed]\n\n# download examples and optionally deepspeed configs to the local path\naxolotl fetch examples\naxolotl fetch deepspeed_configs  # OPTIONAL\n\n# finetune using lora\naxolotl train examples/llama-3/lora-1b.yml\n\nEdge Builds 🏎️\nIf you’re looking for the latest features and updates between releases, you’ll need to install from source.\ngit clone https://github.com/axolotl-ai-cloud/axolotl.git\ncd axolotl\npip3 install packaging ninja\npip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'\n\n\nAxolotl CLI Usage\nWe now support a new, more streamlined CLI using click.\n# preprocess datasets - optional but recommended\nCUDA_VISIBLE_DEVICES=\"0\" axolotl preprocess examples/llama-3/lora-1b.yml\n\n# finetune lora\naxolotl train examples/llama-3/lora-1b.yml\n\n# inference\naxolotl inference examples/llama-3/lora-1b.yml \\\n    --lora-model-dir=\"./outputs/lora-out\"\n\n# gradio\naxolotl inference examples/llama-3/lora-1b.yml \\\n    --lora-model-dir=\"./outputs/lora-out\" --gradio\n\n# remote yaml files - the yaml config can be hosted on a public URL\n# Note: the yaml config must directly link to the **raw** yaml\naxolotl train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml\nWe’ve also added a new command for fetching examples and deepspeed_configs to your local machine. This will come in handy when installing axolotl from PyPI.\n# Fetch example YAML files (stores in \"examples/\" folder)\naxolotl fetch examples\n\n# Fetch deepspeed config files (stores in \"deepspeed_configs/\" folder)\naxolotl fetch deepspeed_configs\n\n# Optionally, specify a destination folder\naxolotl fetch examples --dest path/to/folder\n\n\nLegacy Usage\n\n\nClick to Expand\n\nWhile the Axolotl CLI is the preferred method for interacting with axolotl, we still support the legacy -m axolotl.cli.* usage.\n# preprocess datasets - optional but recommended\nCUDA_VISIBLE_DEVICES=\"0\" python -m axolotl.cli.preprocess examples/llama-3/lora-1b.yml\n\n# finetune lora\naccelerate launch -m axolotl.cli.train examples/llama-3/lora-1b.yml\n\n# inference\naccelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \\\n    --lora_model_dir=\"./outputs/lora-out\"\n\n# gradio\naccelerate launch -m axolotl.cli.inference examples/llama-3/lora-1b.yml \\\n    --lora_model_dir=\"./outputs/lora-out\" --gradio\n\n# remote yaml files - the yaml config can be hosted on a public URL\n# Note: the yaml config must directly link to the **raw** yaml\naccelerate launch -m axolotl.cli.train https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/examples/llama-3/lora-1b.yml",
     "crumbs": [
       "Home"
     ]
diff --git a/sitemap.xml b/sitemap.xml
index 8e4eaa97b..aaa4c2db4 100644
--- a/sitemap.xml
+++ b/sitemap.xml
@@ -2,118 +2,118 @@
 <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/FAQS.html</loc>
-    <lastmod>2025-01-24T17:56:38.669Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.857Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/index.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/template_free.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/amd_hpc.html</loc>
-    <lastmod>2025-01-24T17:56:38.670Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/input_output.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/config.html</loc>
-    <lastmod>2025-01-24T17:56:38.670Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/lr_groups.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.859Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.859Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/unsloth.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/examples/colab-notebooks/colab-axolotl-example.html</loc>
-    <lastmod>2025-01-24T17:56:38.673Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.861Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/TODO.html</loc>
-    <lastmod>2025-01-24T17:56:38.669Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.857Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html</loc>
-    <lastmod>2025-01-24T17:56:38.687Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.874Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/src/axolotl/integrations/LICENSE.html</loc>
-    <lastmod>2025-01-24T17:56:38.686Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.874Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/index.html</loc>
-    <lastmod>2025-01-24T17:56:38.684Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.871Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/mac.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multimodal.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/batch_vs_grad.html</loc>
-    <lastmod>2025-01-24T17:56:38.670Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset_preprocessing.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/fsdp_qlora.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.859Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/rlhf.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/nccl.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/torchao.html</loc>
-    <lastmod>2025-01-24T17:56:38.672Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.860Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/tokenized.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/pretraining.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
   <url>
     <loc>https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/conversation.html</loc>
-    <lastmod>2025-01-24T17:56:38.671Z</lastmod>
+    <lastmod>2025-01-29T04:23:56.858Z</lastmod>
   </url>
 </urlset>