diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 87b11b786..0d8dd6aa0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -365,3 +365,43 @@ jobs:
       - name: Run tests job on Modal
         run: |
           modal run cicd.e2e_tests
+
+  docker-e2e-cleanup:
+    runs-on: [self-hosted, modal]
+    timeout-minutes: 90
+    needs: [docker-e2e-tests]
+
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            num_gpus: 1
+            axolotl_extras: vllm
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.11"
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal==0.71.8 jinja2
+      - name: Update env vars
+        run: |
+          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
+          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
+          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
+          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
+          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
+          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
+          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
+          echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
+      - name: Run cleanup job on Modal
+        run: |
+          modal run cicd.cleanup
diff --git a/cicd/__init__.py b/cicd/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/cicd/cicd.sh b/cicd/cicd.sh
index 86cc4fa96..65ee8699d 100755
--- a/cicd/cicd.sh
+++ b/cicd/cicd.sh
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
   --cov-append
 
 # Run patched tests excluding lora kernels with coverage append
-pytest -v --durations=10 \
+pytest --full-trace -vvv --durations=10 \
   --ignore=tests/e2e/patched/lora_kernels \
   /workspace/axolotl/tests/e2e/patched \
   --cov=axolotl \
diff --git a/cicd/cleanup.py b/cicd/cleanup.py
new file mode 100644
index 000000000..007489993
--- /dev/null
+++ b/cicd/cleanup.py
@@ -0,0 +1,21 @@
+"""Modal app to clean up stale files on the axolotl CI cache volume"""
+
+from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
+
+
+@app.function(
+    image=cicd_image,
+    timeout=60 * 60,
+    cpu=8.0,
+    memory=131072,
+    volumes=VOLUME_CONFIG,
+)
+def cleanup():
+    """Run cicd/cleanup.sh from /workspace/axolotl with the cache volume mounted."""
+    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
+
+
+@app.local_entrypoint()
+def main():
+    """Local entrypoint: invoke the cleanup function remotely."""
+    cleanup.remote()
diff --git a/cicd/cleanup.sh b/cicd/cleanup.sh
new file mode 100755
index 000000000..4ea851bb4
--- /dev/null
+++ b/cicd/cleanup.sh
@@ -0,0 +1,15 @@
+#!/bin/bash
+set -e
+
+CACHE_DIR="/workspace/data/huggingface-cache/hub/datasets"
+
+# nothing to prune if the datasets cache does not exist yet; without this guard
+# `find` exits non-zero on the missing path and `set -e` fails the whole job
+if [ ! -d "${CACHE_DIR}" ]; then
+    echo "no datasets cache found at ${CACHE_DIR}, skipping cleanup"
+    exit 0
+fi
+
+# cleanup old cache files for datasets processing and intermediate mappings
+find "${CACHE_DIR}" -name "cache-*" -type f -mtime +1 -exec rm -f {} \;
+find "${CACHE_DIR}" -name "*.lock" -type f -mtime +1 -exec rm -f {} \;
diff --git a/cicd/e2e_tests.py b/cicd/e2e_tests.py
index 998f8c35d..2bc8ca072 100644
--- a/cicd/e2e_tests.py
+++ b/cicd/e2e_tests.py
@@ -1,69 +1,6 @@
 """Modal app to run axolotl GPU tests"""
 
-# pylint: disable=duplicate-code
-
-import os
-import pathlib
-import tempfile
-
-import jinja2
-import modal
-from jinja2 import select_autoescape
-from modal import App, Image
-
-cicd_path = pathlib.Path(__file__).parent.resolve()
-
-template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
-template_env = jinja2.Environment(
-    loader=template_loader, autoescape=select_autoescape()
-)
-df_template = template_env.get_template("Dockerfile.jinja")
-
-df_args = {
-    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
-    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
-    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
-    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
-    "CUDA": os.environ.get("CUDA", "121"),
-    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
-    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
-    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
-    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
-    "HF_HOME": "/workspace/data/huggingface-cache/hub",
-}
-
-dockerfile_contents = df_template.render(**df_args)
-
-temp_dir = tempfile.mkdtemp()
-with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
-    f.write(dockerfile_contents)
-
-cicd_image = Image.from_dockerfile(
-    pathlib.Path(temp_dir) / "Dockerfile",
-    context_mount=None,
-    force_build=True,
-    gpu="A10G",
-).env(df_args)
-
-app = App("Axolotl CI/CD", secrets=[])
-
-hf_cache_volume = modal.Volume.from_name(
-    "axolotl-ci-hf-hub-cache", create_if_missing=True
-)
-VOLUME_CONFIG = {
-    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
-}
-
-N_GPUS = int(os.environ.get("N_GPUS", 1))
-GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
-
-
-def run_cmd(cmd: str, run_folder: str):
-    import subprocess  # nosec
-
-    # Propagate errors from subprocess.
-    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
-        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
 
 
 @app.function(
diff --git a/cicd/single_gpu.py b/cicd/single_gpu.py
new file mode 100644
index 000000000..d46d970cf
--- /dev/null
+++ b/cicd/single_gpu.py
@@ -0,0 +1,69 @@
+"""Shared Modal app, image, volume and helper definitions for axolotl CI jobs"""
+
+# pylint: disable=duplicate-code
+
+import os
+import pathlib
+import tempfile
+
+import jinja2
+import modal
+from jinja2 import select_autoescape
+from modal import App, Image
+
+cicd_path = pathlib.Path(__file__).parent.resolve()
+
+template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
+template_env = jinja2.Environment(
+    loader=template_loader, autoescape=select_autoescape()
+)
+df_template = template_env.get_template("Dockerfile.jinja")
+
+df_args = {
+    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
+    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
+    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
+    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
+    "CUDA": os.environ.get("CUDA", "121"),
+    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
+    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
+    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
+    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
+    "HF_HOME": "/workspace/data/huggingface-cache/hub",
+}
+
+dockerfile_contents = df_template.render(**df_args)
+
+temp_dir = tempfile.mkdtemp()
+with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
+    f.write(dockerfile_contents)
+
+cicd_image = Image.from_dockerfile(
+    pathlib.Path(temp_dir) / "Dockerfile",
+    context_mount=None,
+    force_build=True,
+    gpu="A10G",
+).env(df_args)
+
+app = App("Axolotl CI/CD", secrets=[])
+
+hf_cache_volume = modal.Volume.from_name(
+    "axolotl-ci-hf-hub-cache", create_if_missing=True
+)
+VOLUME_CONFIG = {
+    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
+}
+
+N_GPUS = int(os.environ.get("N_GPUS", 1))
+GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
+
+
+def run_cmd(cmd: str, run_folder: str):
+    """Run `cmd` inside `run_folder`; exit with its return code if it is non-zero."""
+    import shlex
+    import subprocess  # nosec
+    import sys
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(shlex.split(cmd), cwd=run_folder):  # nosec
+        sys.exit(exit_code)
diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 5cb397b28..670561ede 100755
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -1057,6 +1057,8 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
             # default to saving each epoch if not defined
             training_args_kwargs["save_strategy"] = "epoch"
 
+        training_args_kwargs["save_only_model"] = self.cfg.save_only_model
+
         if self.cfg.dataset_processes:
             training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
 
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index c38313c7c..2df2d9e19 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -6,7 +6,7 @@ into fixed-capacity batches to optimize memory usage and training throughput.
 import logging
 import math
 from concurrent.futures import ProcessPoolExecutor
-from multiprocessing import cpu_count
+from multiprocessing import cpu_count, get_context
 from typing import Iterable, Union
 
 import numba
@@ -126,6 +126,7 @@ def pack_parallel(
     bin_size: int,
     num_processes: int | None = None,
     safe_mode: bool = True,
+    mp_start_method: str | None = "spawn",
 ):
     """
     Pack sequences into bins using parallel processing
@@ -137,7 +138,10 @@ def pack_parallel(
         bin_size: Maximum number of bins to use
         num_processes: Number of parallel processes to use
         safe_mode: If True, use a more conservative packing approach
+        mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver').
+            'spawn' is often safer with Numba/PyTorch.
+            Set to None to use system default.
 
     Returns:
         List of bins, where each bin contains indices of sequences assigned to it
     """
@@ -154,9 +158,31 @@ def pack_parallel(
 
     # Process groups in parallel
     all_bins = []
-    with ProcessPoolExecutor(max_workers=num_processes) as executor:
-        for group_bins in executor.map(_process_group, tasks):
+
+    mp_ctx = None
+    if mp_start_method:
+        try:
+            mp_ctx = get_context(mp_start_method)
+        except ValueError:
+            # mp_ctx stays None, so the executor below uses the default context
+            LOG.warning(
+                f"Failed to get multiprocessing context '{mp_start_method}'. "
+                f"Falling back to default. Available: {get_context().get_all_start_methods()}"
+            )
+
+    if num_processes == 1:
+        LOG.debug("Using single process for pack_parallel, running sequentially.")
+        for task_args in tasks:
+            group_bins = _process_group(task_args)
             all_bins.extend(group_bins)
+    else:
+        # Use ProcessPoolExecutor only if num_processes > 1
+        # Pass mp_context if available
+        with ProcessPoolExecutor(
+            max_workers=num_processes, mp_context=mp_ctx
+        ) as executor:
+            for group_bins in executor.map(_process_group, tasks):
+                all_bins.extend(group_bins)
 
     return all_bins
 
diff --git a/tests/e2e/patched/test_4d_multipack_llama.py b/tests/e2e/patched/test_4d_multipack_llama.py
index 270956883..12dd51c13 100644
--- a/tests/e2e/patched/test_4d_multipack_llama.py
+++ b/tests/e2e/patched/test_4d_multipack_llama.py
@@ -57,9 +57,9 @@ class Test4dMultipackLlama(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "fp16": True,
             }
         )
@@ -105,9 +105,9 @@ class Test4dMultipackLlama(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "fp16": True,
             }
         )
diff --git a/tests/e2e/patched/test_mistral_samplepack.py b/tests/e2e/patched/test_mistral_samplepack.py
index ccfeb3d63..fe8fafb19 100644
--- a/tests/e2e/patched/test_mistral_samplepack.py
+++ b/tests/e2e/patched/test_mistral_samplepack.py
@@ -57,9 +57,9 @@ class TestMistral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
@@ -99,9 +99,9 @@ class TestMistral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_torch_fused",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
diff --git a/tests/e2e/patched/test_mixtral_samplepack.py b/tests/e2e/patched/test_mixtral_samplepack.py
index f035b1f28..ebc2ba092 100644
--- a/tests/e2e/patched/test_mixtral_samplepack.py
+++ b/tests/e2e/patched/test_mixtral_samplepack.py
@@ -54,9 +54,9 @@ class TestMixtral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
@@ -93,9 +93,9 @@ class TestMixtral(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "save_steps": 10,
-                "eval_steps": 10,
+                "max_steps": 5,
+                "save_steps": 3,
+                "eval_steps": 4,
                 "bf16": "auto",
             }
         )
diff --git a/tests/e2e/patched/test_phi_multipack.py b/tests/e2e/patched/test_phi_multipack.py
index c42ed8baf..d8130d119 100644
--- a/tests/e2e/patched/test_phi_multipack.py
+++ b/tests/e2e/patched/test_phi_multipack.py
@@ -56,9 +56,9 @@ class TestPhiMultipack(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "eval_steps": 10,
-                "save_steps": 10,
+                "max_steps": 5,
+                "eval_steps": 3,
+                "save_steps": 4,
                 "bf16": "auto",
             }
         )
@@ -108,9 +108,9 @@ class TestPhiMultipack(unittest.TestCase):
                 "learning_rate": 0.00001,
                 "optimizer": "adamw_bnb_8bit",
                 "lr_scheduler": "cosine",
-                "max_steps": 20,
-                "eval_steps": 10,
-                "save_steps": 10,
+                "max_steps": 5,
+                "eval_steps": 3,
+                "save_steps": 4,
                 "bf16": "auto",
             }
         )