Various fixes for CI, save_only_model for RL, prevent packing multiprocessing deadlocks (#2661)
* lean mistral ft tests, remove e2e torch 2.4.1 test
* make sure to pass save_only_model for RL
* more tests to make ci leaner, add cleanup to modal ci
* fix module for import in e2e tests
* use mp spawn to prevent deadlocks with packing
* make sure cleanup shell script is executable when cloned out
This commit is contained in:
40
.github/workflows/tests.yml
vendored
40
.github/workflows/tests.yml
vendored
@@ -365,3 +365,43 @@ jobs:
|
|||||||
- name: Run tests job on Modal
|
- name: Run tests job on Modal
|
||||||
run: |
|
run: |
|
||||||
modal run cicd.e2e_tests
|
modal run cicd.e2e_tests
|
||||||
|
|
||||||
|
docker-e2e-cleanup:
|
||||||
|
runs-on: [self-hosted, modal]
|
||||||
|
timeout-minutes: 90
|
||||||
|
needs: [docker-e2e-tests]
|
||||||
|
|
||||||
|
strategy:
|
||||||
|
fail-fast: false
|
||||||
|
matrix:
|
||||||
|
include:
|
||||||
|
- cuda: 124
|
||||||
|
cuda_version: 12.4.1
|
||||||
|
python_version: "3.11"
|
||||||
|
pytorch: 2.6.0
|
||||||
|
num_gpus: 1
|
||||||
|
axolotl_extras: vllm
|
||||||
|
steps:
|
||||||
|
- name: Checkout
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
- name: Install Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: "3.11"
|
||||||
|
- name: Install Modal
|
||||||
|
run: |
|
||||||
|
python -m pip install --upgrade pip
|
||||||
|
pip install modal==0.71.8 jinja2
|
||||||
|
- name: Update env vars
|
||||||
|
run: |
|
||||||
|
echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
|
||||||
|
echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
|
||||||
|
echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
|
||||||
|
echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
|
||||||
|
echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
|
||||||
|
echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
|
||||||
|
echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
|
||||||
|
echo "CODECOV_TOKEN=${{ secrets.CODECOV_TOKEN }}" >> $GITHUB_ENV
|
||||||
|
- name: Run tests job on Modal
|
||||||
|
run: |
|
||||||
|
modal run cicd.cleanup
|
||||||
|
|||||||
0
cicd/__init__.py
Normal file
0
cicd/__init__.py
Normal file
@@ -18,7 +18,7 @@ pytest -v --durations=10 \
|
|||||||
--cov-append
|
--cov-append
|
||||||
|
|
||||||
# Run patched tests excluding lora kernels with coverage append
|
# Run patched tests excluding lora kernels with coverage append
|
||||||
pytest -v --durations=10 \
|
pytest --full-trace -vvv --durations=10 \
|
||||||
--ignore=tests/e2e/patched/lora_kernels \
|
--ignore=tests/e2e/patched/lora_kernels \
|
||||||
/workspace/axolotl/tests/e2e/patched \
|
/workspace/axolotl/tests/e2e/patched \
|
||||||
--cov=axolotl \
|
--cov=axolotl \
|
||||||
|
|||||||
19
cicd/cleanup.py
Normal file
19
cicd/cleanup.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
"""Modal app to run axolotl GPU cleanup"""
|
||||||
|
|
||||||
|
from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd
|
||||||
|
|
||||||
|
|
||||||
|
@app.function(
|
||||||
|
image=cicd_image,
|
||||||
|
timeout=60 * 60,
|
||||||
|
cpu=8.0,
|
||||||
|
memory=131072,
|
||||||
|
volumes=VOLUME_CONFIG,
|
||||||
|
)
|
||||||
|
def cleanup():
|
||||||
|
run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")
|
||||||
|
|
||||||
|
|
||||||
|
@app.local_entrypoint()
|
||||||
|
def main():
|
||||||
|
cleanup.remote()
|
||||||
6
cicd/cleanup.sh
Executable file
6
cicd/cleanup.sh
Executable file
@@ -0,0 +1,6 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
set -e
|
||||||
|
|
||||||
|
# cleanup old cache files for datasets processing and intermediate mappings
|
||||||
|
find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
|
||||||
|
find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;
|
||||||
@@ -1,69 +1,6 @@
|
|||||||
"""Modal app to run axolotl GPU tests"""
|
"""Modal app to run axolotl GPU tests"""
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
|
||||||
|
|
||||||
import os
|
|
||||||
import pathlib
|
|
||||||
import tempfile
|
|
||||||
|
|
||||||
import jinja2
|
|
||||||
import modal
|
|
||||||
from jinja2 import select_autoescape
|
|
||||||
from modal import App, Image
|
|
||||||
|
|
||||||
cicd_path = pathlib.Path(__file__).parent.resolve()
|
|
||||||
|
|
||||||
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
|
||||||
template_env = jinja2.Environment(
|
|
||||||
loader=template_loader, autoescape=select_autoescape()
|
|
||||||
)
|
|
||||||
df_template = template_env.get_template("Dockerfile.jinja")
|
|
||||||
|
|
||||||
df_args = {
|
|
||||||
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
|
||||||
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
|
||||||
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
|
|
||||||
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
|
|
||||||
"CUDA": os.environ.get("CUDA", "121"),
|
|
||||||
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
|
||||||
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
|
||||||
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
|
||||||
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
|
||||||
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
|
||||||
}
|
|
||||||
|
|
||||||
dockerfile_contents = df_template.render(**df_args)
|
|
||||||
|
|
||||||
temp_dir = tempfile.mkdtemp()
|
|
||||||
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
|
||||||
f.write(dockerfile_contents)
|
|
||||||
|
|
||||||
cicd_image = Image.from_dockerfile(
|
|
||||||
pathlib.Path(temp_dir) / "Dockerfile",
|
|
||||||
context_mount=None,
|
|
||||||
force_build=True,
|
|
||||||
gpu="A10G",
|
|
||||||
).env(df_args)
|
|
||||||
|
|
||||||
app = App("Axolotl CI/CD", secrets=[])
|
|
||||||
|
|
||||||
hf_cache_volume = modal.Volume.from_name(
|
|
||||||
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
|
||||||
)
|
|
||||||
VOLUME_CONFIG = {
|
|
||||||
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
|
||||||
}
|
|
||||||
|
|
||||||
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
|
||||||
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
|
|
||||||
|
|
||||||
|
|
||||||
def run_cmd(cmd: str, run_folder: str):
|
|
||||||
import subprocess # nosec
|
|
||||||
|
|
||||||
# Propagate errors from subprocess.
|
|
||||||
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
|
||||||
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
|
||||||
|
|
||||||
|
|
||||||
@app.function(
|
@app.function(
|
||||||
|
|||||||
66
cicd/single_gpu.py
Normal file
66
cicd/single_gpu.py
Normal file
@@ -0,0 +1,66 @@
|
|||||||
|
"""Modal app to run axolotl GPU tests"""
|
||||||
|
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
|
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
import tempfile
|
||||||
|
|
||||||
|
import jinja2
|
||||||
|
import modal
|
||||||
|
from jinja2 import select_autoescape
|
||||||
|
from modal import App, Image
|
||||||
|
|
||||||
|
cicd_path = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
|
||||||
|
template_env = jinja2.Environment(
|
||||||
|
loader=template_loader, autoescape=select_autoescape()
|
||||||
|
)
|
||||||
|
df_template = template_env.get_template("Dockerfile.jinja")
|
||||||
|
|
||||||
|
df_args = {
|
||||||
|
"AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
|
||||||
|
"AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
|
||||||
|
"PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.4.1"),
|
||||||
|
"BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu121-2.4.1"),
|
||||||
|
"CUDA": os.environ.get("CUDA", "121"),
|
||||||
|
"GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
|
||||||
|
"GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
|
||||||
|
"NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
|
||||||
|
"CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
|
||||||
|
"HF_HOME": "/workspace/data/huggingface-cache/hub",
|
||||||
|
}
|
||||||
|
|
||||||
|
dockerfile_contents = df_template.render(**df_args)
|
||||||
|
|
||||||
|
temp_dir = tempfile.mkdtemp()
|
||||||
|
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
|
||||||
|
f.write(dockerfile_contents)
|
||||||
|
|
||||||
|
cicd_image = Image.from_dockerfile(
|
||||||
|
pathlib.Path(temp_dir) / "Dockerfile",
|
||||||
|
context_mount=None,
|
||||||
|
force_build=True,
|
||||||
|
gpu="A10G",
|
||||||
|
).env(df_args)
|
||||||
|
|
||||||
|
app = App("Axolotl CI/CD", secrets=[])
|
||||||
|
|
||||||
|
hf_cache_volume = modal.Volume.from_name(
|
||||||
|
"axolotl-ci-hf-hub-cache", create_if_missing=True
|
||||||
|
)
|
||||||
|
VOLUME_CONFIG = {
|
||||||
|
"/workspace/data/huggingface-cache/hub": hf_cache_volume,
|
||||||
|
}
|
||||||
|
|
||||||
|
N_GPUS = int(os.environ.get("N_GPUS", 1))
|
||||||
|
GPU_CONFIG = modal.gpu.L40S(count=N_GPUS)
|
||||||
|
|
||||||
|
|
||||||
|
def run_cmd(cmd: str, run_folder: str):
|
||||||
|
import subprocess # nosec
|
||||||
|
|
||||||
|
# Propagate errors from subprocess.
|
||||||
|
if exit_code := subprocess.call(cmd.split(), cwd=run_folder): # nosec
|
||||||
|
exit(exit_code) # pylint: disable=consider-using-sys-exit
|
||||||
@@ -1057,6 +1057,8 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
# default to saving each epoch if not defined
|
# default to saving each epoch if not defined
|
||||||
training_args_kwargs["save_strategy"] = "epoch"
|
training_args_kwargs["save_strategy"] = "epoch"
|
||||||
|
|
||||||
|
training_args_kwargs["save_only_model"] = self.cfg.save_only_model
|
||||||
|
|
||||||
if self.cfg.dataset_processes:
|
if self.cfg.dataset_processes:
|
||||||
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
||||||
|
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ into fixed-capacity batches to optimize memory usage and training throughput.
|
|||||||
import logging
|
import logging
|
||||||
import math
|
import math
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
from multiprocessing import cpu_count
|
from multiprocessing import cpu_count, get_context
|
||||||
from typing import Iterable, Union
|
from typing import Iterable, Union
|
||||||
|
|
||||||
import numba
|
import numba
|
||||||
@@ -126,6 +126,7 @@ def pack_parallel(
|
|||||||
bin_size: int,
|
bin_size: int,
|
||||||
num_processes: int | None = None,
|
num_processes: int | None = None,
|
||||||
safe_mode: bool = True,
|
safe_mode: bool = True,
|
||||||
|
mp_start_method: str | None = "spawn",
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Pack sequences into bins using parallel processing
|
Pack sequences into bins using parallel processing
|
||||||
@@ -137,7 +138,9 @@ def pack_parallel(
|
|||||||
bin_size: Maximum number of bins to use
|
bin_size: Maximum number of bins to use
|
||||||
num_processes: Number of parallel processes to use
|
num_processes: Number of parallel processes to use
|
||||||
safe_mode: If True, use a more conservative packing approach
|
safe_mode: If True, use a more conservative packing approach
|
||||||
|
mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver').
|
||||||
|
'spawn' is often safer with Numba/PyTorch.
|
||||||
|
Set to None to use system default.
|
||||||
Returns:
|
Returns:
|
||||||
List of bins, where each bin contains indices of sequences assigned to it
|
List of bins, where each bin contains indices of sequences assigned to it
|
||||||
"""
|
"""
|
||||||
@@ -154,9 +157,33 @@ def pack_parallel(
|
|||||||
|
|
||||||
# Process groups in parallel
|
# Process groups in parallel
|
||||||
all_bins = []
|
all_bins = []
|
||||||
with ProcessPoolExecutor(max_workers=num_processes) as executor:
|
|
||||||
for group_bins in executor.map(_process_group, tasks):
|
mp_ctx = None
|
||||||
|
if mp_start_method:
|
||||||
|
try:
|
||||||
|
mp_ctx = get_context(mp_start_method)
|
||||||
|
except ValueError:
|
||||||
|
LOG.warning(
|
||||||
|
f"Failed to get multiprocessing context '{mp_start_method}'. "
|
||||||
|
f"Falling back to default. Available: {get_context().get_all_start_methods()}"
|
||||||
|
)
|
||||||
|
mp_ctx = (
|
||||||
|
None # Fallback to default context if specified one is not available
|
||||||
|
)
|
||||||
|
|
||||||
|
if num_processes == 1:
|
||||||
|
LOG.debug("Using single process for pack_parallel, running sequentially.")
|
||||||
|
for task_args in tasks:
|
||||||
|
group_bins = _process_group(task_args)
|
||||||
all_bins.extend(group_bins)
|
all_bins.extend(group_bins)
|
||||||
|
else:
|
||||||
|
# Use ProcessPoolExecutor only if num_processes > 1
|
||||||
|
# Pass mp_context if available
|
||||||
|
with ProcessPoolExecutor(
|
||||||
|
max_workers=num_processes, mp_context=mp_ctx
|
||||||
|
) as executor:
|
||||||
|
for group_bins in executor.map(_process_group, tasks):
|
||||||
|
all_bins.extend(group_bins)
|
||||||
|
|
||||||
return all_bins
|
return all_bins
|
||||||
|
|
||||||
|
|||||||
@@ -57,9 +57,9 @@ class Test4dMultipackLlama(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"save_steps": 10,
|
"save_steps": 3,
|
||||||
"eval_steps": 10,
|
"eval_steps": 4,
|
||||||
"fp16": True,
|
"fp16": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -105,9 +105,9 @@ class Test4dMultipackLlama(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"save_steps": 10,
|
"save_steps": 3,
|
||||||
"eval_steps": 10,
|
"eval_steps": 4,
|
||||||
"fp16": True,
|
"fp16": True,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -57,9 +57,9 @@ class TestMistral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"save_steps": 10,
|
"save_steps": 3,
|
||||||
"eval_steps": 10,
|
"eval_steps": 4,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -99,9 +99,9 @@ class TestMistral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"save_steps": 10,
|
"save_steps": 3,
|
||||||
"eval_steps": 10,
|
"eval_steps": 4,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -54,9 +54,9 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"save_steps": 10,
|
"save_steps": 3,
|
||||||
"eval_steps": 10,
|
"eval_steps": 4,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -93,9 +93,9 @@ class TestMixtral(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"save_steps": 10,
|
"save_steps": 3,
|
||||||
"eval_steps": 10,
|
"eval_steps": 4,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -56,9 +56,9 @@ class TestPhiMultipack(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"eval_steps": 10,
|
"eval_steps": 3,
|
||||||
"save_steps": 10,
|
"save_steps": 4,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
@@ -108,9 +108,9 @@ class TestPhiMultipack(unittest.TestCase):
|
|||||||
"learning_rate": 0.00001,
|
"learning_rate": 0.00001,
|
||||||
"optimizer": "adamw_bnb_8bit",
|
"optimizer": "adamw_bnb_8bit",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
"max_steps": 20,
|
"max_steps": 5,
|
||||||
"eval_steps": 10,
|
"eval_steps": 3,
|
||||||
"save_steps": 10,
|
"save_steps": 4,
|
||||||
"bf16": "auto",
|
"bf16": "auto",
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user