Compare commits


6 Commits

Author        SHA1        Message                                              Date
Dan Saunders  f3c8a25b30  Merge branch 'main' into codecov-pulls-only          2025-06-18 16:00:37 -04:00
Dan Saunders  016eb8055f  accidental file                                      2025-06-17 13:58:02 -04:00
Dan Saunders  639ddeff6a  return codecov artifact from modal image             2025-06-17 13:33:02 -04:00
Dan Saunders  753e4e3dec  updates                                              2025-06-17 10:45:32 -04:00
Dan Saunders  2538c3b761  update to run only if succeeded                      2025-06-17 10:45:32 -04:00
Dan Saunders  aa3639b7ad  run codecov action at end of CI; only_pulls: true    2025-06-17 10:45:32 -04:00
31 changed files with 150 additions and 82 deletions

View File

@@ -106,13 +106,12 @@ jobs:
           pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
           pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
+      - name: Upload coverage artifacts
+        uses: actions/upload-artifact@v4
         with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
+          name: coverage-${{ matrix.pytorch_version }}-${{ github.run_id }}
+          path: ./coverage.xml
+          retention-days: 1
       - name: cleanup pip cache
         run: |
@@ -234,6 +233,14 @@ jobs:
         run: |
           modal run cicd.e2e_tests
+      - name: Upload coverage artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-e2e-1st-${{ github.run_id }}
+          path: ./e2e-coverage.xml
+          retention-days: 1
+
   docker-e2e-tests:
     if: github.repository_owner == 'axolotl-ai-cloud'
     # this job needs to be run on self-hosted GPU runners...
@@ -297,6 +304,14 @@ jobs:
         run: |
           modal run cicd.e2e_tests
+      - name: Upload coverage artifacts
+        if: always()
+        uses: actions/upload-artifact@v4
+        with:
+          name: coverage-e2e-${{ matrix.cuda }}-${{ matrix.pytorch }}-${{ github.run_id }}
+          path: ./e2e-coverage.xml
+          retention-days: 1
+
   docker-e2e-cleanup:
     runs-on: [self-hosted, modal]
     timeout-minutes: 90
@@ -336,3 +351,26 @@ jobs:
       - name: Run tests job on Modal
         run: |
           modal run cicd.cleanup
+
+  upload-coverage:
+    name: Upload Coverage to Codecov
+    runs-on: ubuntu-latest
+    needs: [pytest, docker-e2e-tests, docker-e2e-tests-1st]
+    if: github.event_name == 'pull_request' || github.ref == 'refs/heads/main'
+    steps:
+      - name: Download coverage reports
+        uses: actions/download-artifact@v4
+        with:
+          path: coverage-reports
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v5
+        with:
+          token: ${{ secrets.CODECOV_TOKEN }}
+          directory: coverage-reports
+          fail_ci_if_error: false
+          verbose: true
+          name: codecov-umbrella
+          override_commit: ${{ github.event.pull_request.head.sha || github.sha }}
+          override_pr: ${{ github.event.pull_request.number }}

View File

@@ -51,5 +51,3 @@ pytest -v --durations=10 \
   --cov=axolotl \
   --cov-append \
   --cov-report=xml:e2e-coverage.xml
-
-codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true

View File

@@ -1,5 +1,7 @@
 """Modal app to run axolotl GPU tests"""
+
+import pathlib
 
 from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
@@ -12,9 +14,21 @@ from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd
     volumes=VOLUME_CONFIG,
 )
 def cicd_pytest():
     run_cmd("./cicd/cicd.sh", "/workspace/axolotl")
+
+    # Read the coverage file if it exists
+    coverage_file = pathlib.Path("/workspace/axolotl/e2e-coverage.xml")
+    if coverage_file.exists():
+        return coverage_file.read_text(encoding="utf-8")
+
+    return None
 
 
 @app.local_entrypoint()
 def main():
-    cicd_pytest.remote()
+    coverage = cicd_pytest.remote()
+
+    # Save the coverage file to the local filesystem if it was generated
+    if coverage:
+        with open("e2e-coverage.xml", "w", encoding="utf-8") as f:
+            f.write(coverage)
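
The pattern above (also used by the multi-GPU app in the next file) is: run the tests remotely, return the coverage XML as a string so the report survives the container, and rewrite it locally so the workflow's artifact-upload step can pick it up. A minimal self-contained sketch of that pattern, with an illustrative app name and paths that are not taken from this repo:

import pathlib

import modal

app = modal.App("coverage-example")


@app.function()
def run_tests():
    # ... run pytest with --cov-report=xml inside the container here ...
    coverage = pathlib.Path("/workspace/coverage.xml")
    # Return the report text; the container filesystem is discarded afterwards.
    return coverage.read_text(encoding="utf-8") if coverage.exists() else None


@app.local_entrypoint()
def main():
    report = run_tests.remote()
    if report:
        # Recreate the file locally so a later CI step can upload it as an artifact.
        pathlib.Path("coverage.xml").write_text(report, encoding="utf-8")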

View File

@@ -77,7 +77,18 @@ def run_cmd(cmd: str, run_folder: str):
 def cicd_pytest():
     run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")
+
+    # Read the coverage file if it exists
+    coverage_file = pathlib.Path("/workspace/axolotl/multigpu-coverage.xml")
+    if coverage_file.exists():
+        return coverage_file.read_text(encoding="utf-8")
+
+    return None
 
 
 @app.local_entrypoint()
 def main():
-    cicd_pytest.remote()
+    coverage = cicd_pytest.remote()
+
+    # Save the coverage file to the local filesystem if it was generated
+    if coverage:
+        with open("multigpu-coverage.xml", "w", encoding="utf-8") as file:
+            file.write(coverage)

View File

@@ -9,11 +9,11 @@ description: Frequently asked questions
 > A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)
 
-**Q: exitcode: -9**
+**Q: Exitcode -9**
 
 > A: This usually happens when you run out of system RAM.
 
-**Q: exitcode: -7 while using deepspeed**
+**Q: Exitcode -7 while using deepspeed**
 
 > A: Try upgrading deepspeed w: `pip install -U deepspeed`

View File

@@ -18,7 +18,7 @@ tokenizers>=0.21.1
 accelerate==1.7.0
 datasets==3.6.0
 deepspeed>=0.17.0
-trl==0.18.2
+trl==0.18.1
 hf_xet==1.1.2
 optimum==1.16.2

View File

@@ -7,6 +7,7 @@ from typing import Union
 import yaml
 
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.cloud.modal_ import ModalCloud
 from axolotl.utils.dict import DictDefault
@@ -23,6 +24,7 @@ def do_cli_preprocess(
     cloud_config: Union[Path, str],
     config: Union[Path, str],
 ) -> None:
+    print_axolotl_text_art()
     cloud_cfg = load_cloud_cfg(cloud_config)
     cloud = ModalCloud(cloud_cfg)
     with open(config, "r", encoding="utf-8") as file:
@@ -37,6 +39,7 @@ def do_cli_train(
     cwd=None,
     **kwargs,
 ) -> None:
+    print_axolotl_text_art()
     cloud_cfg = load_cloud_cfg(cloud_config)
     cloud = ModalCloud(cloud_cfg)
     with open(config, "r", encoding="utf-8") as file:
@@ -51,6 +54,7 @@ def do_cli_lm_eval(
     cloud_config: Union[Path, str],
     config: Union[Path, str],
 ) -> None:
+    print_axolotl_text_art()
     cloud_cfg = load_cloud_cfg(cloud_config)
     cloud = ModalCloud(cloud_cfg)
     with open(config, "r", encoding="utf-8") as file:

View File

@@ -26,9 +26,7 @@ from axolotl.utils.mlflow_ import setup_mlflow_env_vars
 from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
 from axolotl.utils.wandb_ import setup_wandb_env_vars
 
-LOG = get_logger(__name__)
-
-API_KEY_FIELDS = {"comet_api_key"}
+LOG = get_logger(__name__, use_environ=True)
 
 
 def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
@@ -235,15 +233,4 @@ def load_cfg(
     setup_comet_env_vars(cfg)
 
     plugin_set_cfg(cfg)
 
-    cfg_to_log = {
-        k: "[REDACTED]" if k in API_KEY_FIELDS else v
-        for k, v in cfg.items()
-        if v is not None
-    }
-    LOG.info(
-        "config:\n%s",
-        json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True),
-    )
-
     return cfg

View File

@@ -9,6 +9,7 @@ from dotenv import load_dotenv
 from transformers.hf_argparser import HfArgumentParser
 
 from axolotl.cli.args import TrainerCliArgs
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
@@ -34,6 +35,7 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
     patch_optimized_env()
 
     # pylint: disable=duplicate-code
+    print_axolotl_text_art()
     check_accelerate_default_config()
     if int(os.getenv("LOCAL_RANK", "0")) == 0:
         check_user_token()

View File

@@ -13,6 +13,7 @@ from dotenv import load_dotenv
 from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
 
 from axolotl.cli.args import InferenceCliArgs
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.utils.chat_templates import (
@@ -254,6 +255,7 @@ def do_cli(
         kwargs: Additional keyword arguments to override config file values.
     """
     # pylint: disable=duplicate-code
+    print_axolotl_text_art()
     parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs)
     parsed_cfg.sample_packing = False
 
     parser = transformers.HfArgumentParser(InferenceCliArgs)

View File

@@ -20,7 +20,6 @@ from axolotl.cli.args import (
     TrainerCliArgs,
     VllmServeCliArgs,
 )
-from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.sweeps import generate_sweep_configs
 from axolotl.cli.utils import (
     add_options_from_config,
@@ -41,7 +40,6 @@ LOG = get_logger(__name__)
 @click.version_option(version=axolotl.__version__, prog_name="axolotl")
 def cli():
     """Axolotl CLI - Train and fine-tune large language models"""
-    print_axolotl_text_art()
 
 
 @cli.command()

View File

@@ -6,6 +6,7 @@ from typing import Union
 import fire
 from dotenv import load_dotenv
 
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.cli.utils import load_model_and_tokenizer
 from axolotl.utils.dict import DictDefault
@@ -22,6 +23,8 @@ def do_merge_lora(*, cfg: DictDefault) -> None:
     Args:
         cfg: Dictionary mapping `axolotl` config keys to values.
     """
+    print_axolotl_text_art()
+
     model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)
     safe_serialization = cfg.save_safetensors is True

View File

@@ -22,6 +22,7 @@ from huggingface_hub import split_torch_state_dict_into_shards
 from safetensors.torch import save_file as safe_save_file
 from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner
 
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.utils.logging import get_logger
@@ -193,6 +194,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
         kwargs: Additional keyword arguments to override config file values.
     """
     # pylint: disable=duplicate-code
+    print_axolotl_text_art()
     parsed_cfg = load_cfg(config, **kwargs)
 
     fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0"

View File

@@ -12,6 +12,7 @@ from dotenv import load_dotenv
 from transformers import AutoModelForCausalLM
 
 from axolotl.cli.args import PreprocessCliArgs
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
@@ -32,6 +33,7 @@ def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
         cfg: Dictionary mapping `axolotl` config keys to values.
         cli_args: Preprocessing-specific CLI arguments.
     """
+    print_axolotl_text_art()
     check_accelerate_default_config()
     check_user_token()

View File

@@ -7,6 +7,7 @@ from typing import Union
 
 from transformers import AutoModelForCausalLM
 
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.config import load_cfg
 from axolotl.loaders import load_tokenizer
 from axolotl.utils.logging import get_logger
@@ -26,6 +27,7 @@ def do_quantize(
         config (Union[Path, str]): The path to the config file
         cli_args (dict): Additional command-line arguments
     """
+    print_axolotl_text_art()
     cfg = load_cfg(config)

View File

@@ -11,6 +11,7 @@ from dotenv import load_dotenv
 from transformers.hf_argparser import HfArgumentParser
 
 from axolotl.cli.args import TrainerCliArgs
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
@@ -34,6 +35,7 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
     # Enable expandable segments for cuda allocation to improve VRAM usage
     patch_optimized_env()
 
+    print_axolotl_text_art()
     check_accelerate_default_config()
     if int(os.getenv("LOCAL_RANK", "0")) == 0:
         check_user_token()

View File

@@ -33,7 +33,7 @@ from transformers import PreTrainedModel, Trainer
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.logging import get_logger
 
-LOG = get_logger(__name__)
+LOG = get_logger(__name__, use_environ=True)
 
 if TYPE_CHECKING:
     from axolotl.common.datasets import TrainDatasetMeta

View File

@@ -28,7 +28,7 @@ from axolotl.utils.logging import get_logger
 
 from .args import CutCrossEntropyArgs  # pylint: disable=unused-import. # noqa: F401
 
-LOG = get_logger(__name__)
+LOG = get_logger(__name__, use_environ=True)
 
 _CCE_INSTALL_MESSAGE = (
     "Please install cut_cross_entropy with transformers support using "

View File

@@ -27,7 +27,7 @@ from axolotl.utils.logging import get_logger
 
 from .args import LigerArgs  # pylint: disable=unused-import. # noqa: F401
 from .utils import patch_with_compile_disable
 
-LOG = get_logger(__name__)
+LOG = get_logger(__name__, use_environ=True)
 
 
 class LigerPlugin(BasePlugin):

View File

@@ -15,7 +15,6 @@
 """
 Module for handling LIGER input arguments.
 """
-
 from typing import Optional
 
 from pydantic import BaseModel, model_validator

View File

@@ -273,7 +273,7 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
             {"additional_special_tokens": additional_special_tokens}
         )
 
-    if is_main_process():
+    if is_main_process(use_environ=True):
         LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
         LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
         LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")

View File

@@ -13,9 +13,9 @@ import inspect
 
 import accelerate
 import torch
 import torch.distributed as dist
-from accelerate.logging import get_logger
 
 from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
+from axolotl.utils.logging import get_logger
 from axolotl.utils.schemas.enums import RingAttnFunc
 
 LOG = get_logger(__name__)

View File

@@ -4,12 +4,12 @@ import inspect
 import types
 
 import torch
-from accelerate.logging import get_logger
 from peft import PeftModelForCausalLM
 from torch import nn
 from transformers.models.llama.modeling_llama import LlamaFlashAttention2
 
 from axolotl.monkeypatch.utils import detab_code
+from axolotl.utils.logging import get_logger
 
 LOG = get_logger(__name__)

View File

@@ -23,6 +23,7 @@ from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer
 
+from axolotl.cli.art import print_axolotl_text_art
 from axolotl.common.datasets import TrainDatasetMeta
 from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
     fix_untrained_tokens,
@@ -544,6 +545,8 @@ def train(
     Returns:
         Tuple of (model, tokenizer) after training
     """
+    print_axolotl_text_art()
+
     # Setup model, tokenizer, (causal or RLHF) trainer, etc.
     (
         trainer,

View File

@@ -21,7 +21,7 @@ from axolotl.utils.schemas.config import (
 from axolotl.utils.schemas.config import AxolotlInputConfig as AxolotlInputConfigBase
 from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset
 
-LOG = get_logger(__name__)
+LOG = get_logger(__name__, use_environ=True)
 
 
 def choose_device(cfg):

View File

@@ -1,4 +1,6 @@
-"""Utilities for distributed functionality."""
+"""
+utility helpers for distributed checks
+"""
 
 import os
 import pickle  # nosec
@@ -17,7 +19,7 @@ from transformers.utils.import_utils import (
 
 distributed_state = None  # pylint: disable=invalid-name
 
-def get_device_type() -> torch.device:
+def get_device_type():
     device = torch.device("cpu")
     if is_torch_cuda_available():
         device = torch.device("cuda")
@@ -28,7 +30,7 @@ def get_device_type() -> torch.device:
     return device
 
 
-def get_device_count() -> int:
+def get_device_count():
     cur_device = get_device_type()
     if "cuda" in str(cur_device):
         return torch.cuda.device_count()
@@ -37,7 +39,7 @@ def get_device_count() -> int:
     return 1
 
 
-def get_current_device() -> int:
+def get_current_device():
     cur_device = get_device_type()
     if "cuda" in str(cur_device):
         return torch.cuda.current_device()
@@ -46,24 +48,15 @@ def get_current_device() -> int:
     return 0
 
 
-def init_distributed_state():
+def is_distributed():
+    """
+    Check if distributed training is initialized.
+    """
     global distributed_state  # pylint: disable=global-statement
-    if distributed_state is None:
+    if not distributed_state:
         timeout = int(os.environ.get("AXOLOTL_NCCL_TIMEOUT", 1800))
         distributed_state = PartialState(timeout=timedelta(seconds=timeout))
 
-
-def get_distributed_state() -> PartialState | None:
-    return distributed_state
-
-
-def is_distributed() -> bool:
-    """Check if distributed training is initialized."""
-    init_distributed_state()
-    if distributed_state is None:
-        return False
     return distributed_state.use_distributed and distributed_state.initialized
@@ -76,31 +69,31 @@ def barrier():
         dist.barrier()
 
 
-def is_main_process() -> bool:
+def is_main_process(use_environ=False):
     """
     Check if the current process is the main process. If not in distributed mode,
     always return `True`.
 
+    We use a simpler logic when the distributed state is not initialized: we just log
+    on the 0-th local rank.
+
+    Args:
+        - use_environ (bool, optional): Use environment variable to determine main process.
+
     Returns:
-        `True` if the current process is the main process, `False` otherwise.
+        - bool: `True` if the current process is the main process, `False` otherwise.
     """
-    if get_distributed_state() is None:
+    if use_environ:
         return os.environ.get("LOCAL_RANK", "0") == "0"
     if not is_distributed():
         return True
     return dist.get_rank() == 0
 
 
-def is_local_main_process() -> bool:
-    if get_distributed_state() is None:
+def is_local_main_process(use_environ=False):
+    if use_environ:
         return os.environ.get("LOCAL_RANK", "0") == "0"
     return PartialState().is_local_main_process
 
 
-def get_world_size() -> int:
+def get_world_size():
     return int(os.getenv("WORLD_SIZE", "1"))
@@ -122,7 +115,7 @@ def cleanup_distributed():
 
 
 @contextmanager
-def zero_first(is_main: bool):
+def zero_first(is_main):
     """
     runs the wrapped context so that rank 0 runs first before other ranks
     """

View File

@@ -5,8 +5,9 @@ module to freeze/unfreeze parameters by name
 import re
 from typing import Callable, List, Tuple, Union
 
-from accelerate.logging import get_logger
+from axolotl.utils.distributed import is_main_process
+from axolotl.utils.logging import get_logger
 
 LOG = get_logger(__name__)

View File

@@ -1,4 +1,6 @@
-"""Logging helpers to only log on main process."""
+"""
+logging helpers to only log on main process
+"""
 
 import functools
 import logging
@@ -12,18 +14,27 @@ from axolotl.utils.distributed import is_main_process
 
 class MultiProcessAdapter(logging.LoggerAdapter):
     """
-    Logger adapter for distributed logging, specifically to only log on main process.
+    logger adapter for distributed logging, specifically to only log on main process
     """
 
+    def __init__(self, logger, use_environ=False, extra=None):
+        super().__init__(logger, extra)
+        self.use_environ = use_environ
+
     @staticmethod
-    def _should_log(main_process_only: bool):
-        return not main_process_only or is_main_process()
+    def _should_log(main_process_only, use_environ=False):
+        return not main_process_only or (
+            main_process_only and is_main_process(use_environ=use_environ)
+        )
 
     def log(self, level, msg, *args, **kwargs):
+        use_environ = kwargs.pop("use_environ", self.use_environ)
         main_process_only = kwargs.pop("main_process_only", True)
         kwargs.setdefault("stacklevel", 2)
-        if self.isEnabledFor(level) and self._should_log(main_process_only):
+        if self.isEnabledFor(level) and self._should_log(
+            main_process_only, use_environ=use_environ
+        ):
             msg, kwargs = self.process(msg, kwargs)
             self.logger.log(level, msg, *args, **kwargs)
@@ -39,11 +50,13 @@ class MultiProcessAdapter(logging.LoggerAdapter):
         self.warning(*args, **kwargs)
 
 
-def get_logger(name: str, log_level: str | None = None) -> MultiProcessAdapter:
+def get_logger(
+    name: str, log_level: str | None = None, use_environ: bool = False
+) -> MultiProcessAdapter:
     if log_level is None:
         log_level = os.environ.get("AXOLOTL_LOG_LEVEL", None)
 
     logger = logging.getLogger(name)
     if log_level is not None:
         logger.setLevel(log_level.upper())
-        logger.root.setLevel(log_level.upper())
-    return MultiProcessAdapter(logger, extra={})
+    return MultiProcessAdapter(logger, use_environ=use_environ, extra={})
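
The effect of the `use_environ` plumbing above is that a module can opt into environment-based rank detection when it creates its logger, so import-time log lines no longer force distributed initialization. Typical usage, as a sketch:

from axolotl.utils.logging import get_logger

# Main-process detection falls back to the LOCAL_RANK env var, so this is
# safe to call before any process group exists.
LOG = get_logger(__name__, use_environ=True)
LOG.info("loading config")  # emitted on the main process only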

View File

@@ -48,7 +48,7 @@ from axolotl.utils.schemas.trl import TRLConfig
 from axolotl.utils.schemas.validation import ValidationMixin
 from axolotl.utils.schemas.vllm import VllmConfig
 
-LOG = get_logger(__name__)
+LOG = get_logger(__name__, use_environ=True)
 
 
 # pylint: disable=too-many-ancestors

View File

@@ -4,7 +4,7 @@ from pydantic import BaseModel, Field, field_validator
 
 from axolotl.utils.logging import get_logger
 
-LOG = get_logger(__name__)
+LOG = get_logger(__name__, use_environ=True)
 
 
 class ModelInputConfig(BaseModel):

View File

@@ -11,14 +11,14 @@ from typing import List, Optional
 import numpy as np
 import torch
 import torch.cuda
-from accelerate.logging import get_logger
 from datasets import IterableDataset, disable_caching, enable_caching
 from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 from transformers.utils import is_torch_bf16_gpu_available
 
 from axolotl.monkeypatch.trainer_eval_guard import patch_evaluation_loop_for_fsdp2
-from axolotl.utils.distributed import init_distributed_state, reduce_and_broadcast
+from axolotl.utils.distributed import reduce_and_broadcast
 from axolotl.utils.environment import check_cuda_p2p_ib_support
+from axolotl.utils.logging import get_logger
 from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
 
 LOG = get_logger(__name__)
@@ -537,12 +537,6 @@ def setup_deepspeed_env(cfg, stage=None):
         os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(stage)
         if stage == 3:
             os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"
-
-    # NOTE(djsaunde): The distribued state cannot be initialized prior to the
-    # ACCELERATE_USE_DEEPSPEED assignment, but it must be initialized some time prior
-    # to model load.
-    init_distributed_state()
-
     # If we don't assign this, it doesn't actually get set in the accelerate weakref
     _ = HfTrainerDeepSpeedConfig(cfg.deepspeed)