diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 87b11b786..0d8dd6aa0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -365,3 +365,43 @@ jobs:
       - name: Run tests job on Modal
         run: |
           modal run cicd.e2e_tests
+
+  docker-e2e-cleanup:
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [docker-e2e-tests]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras: vllm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==0.71.8 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+      - name: Run tests job on Modal
+        run: |
+          modal run cicd.cleanup
diff --git a/cicd/__init__.py b/cicd/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cicd/cicd.sh b/cicd/cicd.sh
index 86cc4fa96..65ee8699d 100755
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
   --cov-append
 
 # Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
+pytest --full-trace -vvv --durations=10 \
   --ignore=tests/e2e/patched/lora_kernels \
   /workspace/axolotl/tests/e2e/patched \
   --cov=axolotl \
diff --git a/cicd/cleanup.py b/cicd/cleanup.py
new file mode 100644
index 000000000..007489993
--- /dev/null
+++ b/cicd/cleanup.py
@@ -0,0 +1,19 @@
+"""Modal app to run axolotl GPU cleanup"""
+
+from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    timeout=60 * 60,
+    cpu=8.0,
+    memory=131072,
+    volumes=VOLUME_CONFIG,
+)
+def cleanup():
+    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    cleanup.remote()
diff --git a/cicd/cleanup.sh b/cicd/cleanup.sh
new file mode 100755
index 000000000..4ea851bb4
--- /dev/null
+++ b/cicd/cleanup.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+
+# cleanup old cache files for datasets processing and intermediate mappings
+find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
+find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
diff --git a/cicd/e2e_tests.py b/cicd/e2e_tests.py
index 998f8c35d..2bc8ca072 100644
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,69 +1,6 @@
 """Modal app to run axolotl GPU tests"""
 
-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
 
 
 @app.function(
diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py
new file mode 100644
index 000000000..d46d970cf
--- /dev/null
+++ b/cicd/single_gpu.py
@@ -0,0 +1,66 @@
+"""Modal app to run axolotl GPU tests"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    import subprocess  # nosec
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 5cb397b28..670561ede 100755
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -1057,6 +1057,8 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             # default to saving each epoch if not defined
             training_args_kwargs["save_strategy"] = "epoch"
 
+        training_args_kwargs["save_only_model"] = self.cfg.save_only_model
+
         if self.cfg.dataset_processes:
             training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
 
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index c38313c7c..2df2d9e19 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -6,7 +6,7 @@ into fixed-capacity batches to optimize memory usage and training throughput.
 import logging
 import math
 from concurrent.futures import ProcessPoolExecutor
-from multiprocessing import cpu_count
+from multiprocessing import cpu_count, get_context
 from typing import Iterable, Union
 
 import numba
@@ -126,6 +126,7 @@ def pack_parallel(
     bin_size: int,
     num_processes: int | None = None,
     safe_mode: bool = True,
+    mp_start_method: str | None = "spawn",
 ):
     """
     Pack sequences into bins using parallel processing
@@ -137,7 +138,9 @@ def pack_parallel(
         bin_size: Maximum number of bins to use
         num_processes: Number of parallel processes to use
         safe_mode: If True, use a more conservative packing approach
-
+        mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver').
+                         'spawn' is often safer with Numba/PyTorch.
+                         Set to None to use system default.
     Returns:
         List of bins, where each bin contains indices of sequences assigned to it
     """
@@ -154,9 +157,33 @@ def pack_parallel(
 
     # Process groups in parallel
     all_bins = []
-    with ProcessPoolExecutor(max_workers=num_processes) as executor:
-        for group_bins in executor.map(_process_group, tasks):
+
+    mp_ctx = None
+    if mp_start_method:
+        try:
+            mp_ctx = get_context(mp_start_method)
+        except ValueError:
+            LOG.warning(
+                f"Failed to get multiprocessing context '{mp_start_method}'. "
+                f"Falling back to default. Available: {get_context().get_all_start_methods()}"
+            )
+            mp_ctx = (
+                None  # Fallback to default context if specified one is not available
+            )
+
+    if num_processes == 1:
+        LOG.debug("Using single process for pack_parallel, running sequentially.")
+        for task_args in tasks:
+            group_bins = _process_group(task_args)
             all_bins.extend(group_bins)
+    else:
+        # Use ProcessPoolExecutor only if num_processes > 1
+        # Pass mp_context if available
+        with ProcessPoolExecutor(
+            max_workers=num_processes, mp_context=mp_ctx
+        ) as executor:
+            for group_bins in executor.map(_process_group, tasks):
+                all_bins.extend(group_bins)
 
     return all_bins
 
diff --git a/tests/e2e/patched/test_4d_multipack_llama.py b/tests/e2e/patched/test_4d_multipack_llama.py
index 270956883..12dd51c13 100644
--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -57,9 +57,9 @@ class Test4dMultipackLlama(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "fp16": True,
             }
         )
@@ -105,9 +105,9 @@ class Test4dMultipackLlama(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "fp16": True,
             }
         )
diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py
index ccfeb3d63..fe8fafb19 100644
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -57,9 +57,9 @@ class TestMistral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
@@ -99,9 +99,9 @@ class TestMistral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py
index f035b1f28..ebc2ba092 100644
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -54,9 +54,9 @@ class TestMixtral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
@@ -93,9 +93,9 @@ class TestMixtral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py
index c42ed8baf..d8130d119 100644
--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -56,9 +56,9 @@ class TestPhiMultipack(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "eval_steps": 10,
-                "save_steps": 10,
+                "max_steps": 5,
+                "eval_steps": 3,
+                "save_steps": 4,
                 "bf16": "auto",
             }
         )
@@ -108,9 +108,9 @@ class TestPhiMultipack(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "eval_steps": 10,
-                "save_steps": 10,
+                "max_steps": 5,
+                "eval_steps": 3,
+                "save_steps": 4,
                 "bf16": "auto",
             }
         )