diff --git a/examples/cloud/modal.yaml b/examples/cloud/modal.yaml new file mode 100644 index 000000000..f42a65980 --- /dev/null +++ b/examples/cloud/modal.yaml @@ -0,0 +1,15 @@ +volumes: + - name: axolotl-data + mount: /workspace/data + - name: axolotl-artifacts + mount: /workspace/artifacts +secrets: + - HF_TOKEN + - WANDB_API_KEY +branch: +gpu: h100 +gpu_count: 1 +memory: 128 +timeout: 86400 +timeout_preprocess: 14400 +memory_preprocess: 32 diff --git a/scripts/motd b/scripts/motd index b3ffa165e..bc123c312 100644 --- a/scripts/motd +++ b/scripts/motd @@ -1,10 +1,15 @@ - dP dP dP - 88 88 88 - .d8888b. dP. .dP .d8888b. 88 .d8888b. d8888P 88 - 88' `88 `8bd8' 88' `88 88 88' `88 88 88 - 88. .88 .d88b. 88. .88 88 88. .88 88 88 - `88888P8 dP' `dP `88888P' dP `88888P' dP dP + #@@ #@@ @@# @@# + @@ @@ @@ @@ =@@# @@ #@ =@@#. + @@ #@@@@@@@@@ @@ #@#@= @@ #@ .=@@ + #@@@@@@@@@@@@@@@@@ =@# @# ##= ## =####=+ @@ =#####+ =#@@###. @@ + @@@@@@@@@@/ +@@/ +@@ #@ =@= #@= @@ =@#+ +#@# @@ =@#+ +#@# #@. @@ + @@@@@@@@@@ ##@@ ##@@ =@# @# =@# @# @@ @@ @@ @@ #@ #@ @@ + @@@@@@@@@@@@@@@@@@@@ #@=+++#@= =@@# @@ @@ @@ @@ #@ #@ @@ + =@#=====@@ =@# @# @@ @@ @@ @@ #@ #@ @@ + @@@@@@@@@@@@@@@@ @@@@ #@ #@= #@= +@@ #@# =@# @@. =@# =@# #@. @@ + =@# @# #@= #@ =#@@@@#= +#@@= +#@@@@#= .##@@+ @@ + @@@@ @@@@@@@@@@@@@@@@ Welcome to the axolotl cloud image! 
If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands: diff --git a/src/axolotl/cli/cloud/__init__.py b/src/axolotl/cli/cloud/__init__.py new file mode 100644 index 000000000..ef3cbc5c9 --- /dev/null +++ b/src/axolotl/cli/cloud/__init__.py @@ -0,0 +1,44 @@ +""" +launch axolotl in supported cloud platforms +""" +from pathlib import Path +from typing import Union + +import yaml + +from axolotl.cli import print_axolotl_text_art +from axolotl.cli.cloud.modal_ import ModalCloud +from axolotl.utils.dict import DictDefault + + +def load_cloud_cfg(cloud_config: Union[Path, str]) -> DictDefault: + """Load and validate cloud configuration.""" + # Load cloud configuration. + with open(cloud_config, encoding="utf-8") as file: + cloud_cfg: DictDefault = DictDefault(yaml.safe_load(file)) + return cloud_cfg + + +def do_cli_preprocess( + cloud_config: Union[Path, str], + config: Union[Path, str] = Path("examples/"), +) -> None: + print_axolotl_text_art() + cloud_cfg = load_cloud_cfg(cloud_config) + cloud = ModalCloud(cloud_cfg) + with open(config, "r", encoding="utf-8") as file: + config_yaml = file.read() + cloud.preprocess(config_yaml) + + +def do_cli_train( + cloud_config: Union[Path, str], + config: Union[Path, str] = Path("examples/"), + accelerate: bool = True, +) -> None: + print_axolotl_text_art() + cloud_cfg = load_cloud_cfg(cloud_config) + cloud = ModalCloud(cloud_cfg) + with open(config, "r", encoding="utf-8") as file: + config_yaml = file.read() + cloud.train(config_yaml, accelerate=accelerate) diff --git a/src/axolotl/cli/cloud/base.py b/src/axolotl/cli/cloud/base.py new file mode 100644 index 000000000..44d1b0c17 --- /dev/null +++ b/src/axolotl/cli/cloud/base.py @@ -0,0 +1,18 @@ +""" +base class for cloud platforms from cli +""" +from abc import ABC, abstractmethod + + +class Cloud(ABC): + """ + Abstract base class for cloud platforms. 
+ """ + + @abstractmethod + def preprocess(self, config_yaml: str, *args, **kwargs) -> None: + pass + + @abstractmethod + def train(self, config_yaml: str, accelerate: bool = True) -> str: + pass diff --git a/src/axolotl/cli/cloud/modal_.py b/src/axolotl/cli/cloud/modal_.py new file mode 100644 index 000000000..724e8cb20 --- /dev/null +++ b/src/axolotl/cli/cloud/modal_.py @@ -0,0 +1,229 @@ +""" +Modal Cloud support from CLI +""" +import copy +import os +from pathlib import Path + +import modal + +from axolotl.cli.cloud.base import Cloud + + +def run_cmd(cmd: str, run_folder: str, volumes=None): + """Run a command inside a folder, with Modal Volume reloading before and commit on success.""" + import subprocess # nosec B404 + + # Ensure volumes contain latest files. + if volumes: + for _, vol in volumes.items(): + vol.reload() + + # modal workaround so it doesn't use the automounted axolotl + new_env = copy.deepcopy(os.environ) + if "PYTHONPATH" in new_env: + del new_env["PYTHONPATH"] + + # Propagate errors from subprocess. + if exit_code := subprocess.call( # nosec B603 + cmd.split(), cwd=run_folder, env=new_env + ): + exit(exit_code) # pylint: disable=consider-using-sys-exit + + # Commit writes to volume. + if volumes: + for _, vol in volumes.items(): + vol.commit() + + +class ModalCloud(Cloud): + """ + Modal Cloud implementation. 
+ """ + + def __init__(self, config, app=None): + self.config = config + if not app: + app = modal.App() + self.app = app + + self.volumes = {} + if config.volumes: + for volume_config in config.volumes: + _, mount, vol = self.create_volume(volume_config) + self.volumes[mount] = (vol, volume_config) + + def get_env(self): + res = { + "HF_DATASETS_CACHE": "/workspace/data/huggingface-cache/datasets", + "HF_HUB_CACHE": "/workspace/data/huggingface-cache/hub", + } + + for key in self.config.get("env", []): + if isinstance(key, str): + if val := os.environ.get(key, ""): + res[key] = val + elif isinstance(key, dict): + (key_, val) = list(key.items())[0] + res[key_] = val + return res + + def get_image(self): + docker_tag = "main-py3.11-cu124-2.5.1" + if self.config.docker_tag: + docker_tag = self.config.docker_tag + image = modal.Image.from_registry(f"axolotlai/axolotl:{docker_tag}") + + # branch + if self.config.branch: + image = image.dockerfile_commands( + [ + f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch}", + ] + ) + + if env := self.get_env(): + image = image.env(env) + + image = image.pip_install("fastapi==0.110.0", "pydantic==2.6.3") + + return image + + def get_secrets(self): + res = [] + if self.config.secrets: + for key in self.config.get("secrets", []): + # pylint: disable=duplicate-code + if isinstance(key, str): + if val := os.environ.get(key, ""): + res.append(modal.Secret.from_dict({key: val})) + elif isinstance(key, dict): + (key_, val) = list(key.items())[0] + res.append(modal.Secret.from_dict({key_: val})) + return res + + def create_volume(self, volume_config): + name = volume_config.name + mount = volume_config.mount + return name, mount, modal.Volume.from_name(name, create_if_missing=True) + + def get_ephemeral_disk_size(self): + return 1000 * 525 # 1 TiB + + def get_preprocess_timeout(self): + if self.config.timeout_preprocess: + return int(self.config.timeout_preprocess) + return 60 * 60 * 3 # 3 hours + + def 
get_preprocess_memory(self): + memory = 128 # default to 128GiB + if self.config.memory: + memory = int(self.config.memory) + if self.config.memory_preprocess: + memory = int(self.config.memory_preprocess) + return 1024 * memory + + def get_preprocess_env(self): + return self.app.function( + image=self.get_image(), + volumes={k: v[0] for k, v in self.volumes.items()}, + cpu=8.0, + ephemeral_disk=self.get_ephemeral_disk_size(), + memory=self.get_preprocess_memory(), + timeout=self.get_preprocess_timeout(), + secrets=self.get_secrets(), + ) + + def preprocess(self, config_yaml: str, *args, **kwargs): + modal_fn = self.get_preprocess_env()(_preprocess) + with modal.enable_output(): + with self.app.run(detach=True): + modal_fn.remote( + config_yaml, + volumes={k: v[0] for k, v in self.volumes.items()}, + *args, + **kwargs, + ) + + def get_train_timeout(self): + if self.config.timeout: + return int(self.config.timeout) + return 60 * 60 * 24 # 24 hours + + def get_train_gpu(self): # pylint: disable=too-many-return-statements + count = self.config.gpu_count or 1 + family = self.config.gpu.lower() or "l40s" + + if family == "l40s": + return modal.gpu.L40S(count=count) + if family == "a100": + return modal.gpu.A100(count=count, size="40GB") + if family == "a100-80gb": + return modal.gpu.A100(count=count, size="80GB") + if family == "a10g": + return modal.gpu.A10G(count=count) + if family == "h100": + return modal.gpu.H100(count=count) + if family == "t4": + return modal.gpu.T4(count=count) + if family == "l4": + return modal.gpu.L4(count=count) + raise ValueError(f"Unsupported GPU family: {family}") + + def get_train_memory(self): + memory = 128 # default to 128GiB + if self.config.memory: + memory = int(self.config.memory) + return 1024 * memory + + def get_train_env(self): + return self.app.function( + image=self.get_image(), + volumes={k: v[0] for k, v in self.volumes.items()}, + cpu=16.0, + gpu=self.get_train_gpu(), + memory=self.get_train_memory(), + 
timeout=self.get_train_timeout(), + secrets=self.get_secrets(), + ) + + def train(self, config_yaml: str, accelerate: bool = True): + modal_fn = self.get_train_env()(_train) + with modal.enable_output(): + with self.app.run(detach=True): + modal_fn.remote( + config_yaml, + accelerate=accelerate, + volumes={k: v[0] for k, v in self.volumes.items()}, + ) + + +def _preprocess(config_yaml: str, volumes=None): + Path("/workspace/artifacts/axolotl").mkdir(parents=True, exist_ok=True) + with open( + "/workspace/artifacts/axolotl/config.yaml", "w", encoding="utf-8" + ) as f_out: + f_out.write(config_yaml) + run_folder = "/workspace/artifacts/axolotl" + run_cmd( + "axolotl preprocess /workspace/artifacts/axolotl/config.yaml --dataset-processes=8", + run_folder, + volumes, + ) + + +def _train(config_yaml: str, accelerate: bool = True, volumes=None): + with open( + "/workspace/artifacts/axolotl/config.yaml", "w", encoding="utf-8" + ) as f_out: + f_out.write(config_yaml) + run_folder = "/workspace/artifacts/axolotl" + if accelerate: + accelerate_args = "--accelerate" + else: + accelerate_args = "--no-accelerate" + run_cmd( + f"axolotl train {accelerate_args} /workspace/artifacts/axolotl/config.yaml", + run_folder, + volumes, + ) diff --git a/src/axolotl/cli/main.py b/src/axolotl/cli/main.py index 14803e43b..f3950f4b2 100644 --- a/src/axolotl/cli/main.py +++ b/src/axolotl/cli/main.py @@ -25,15 +25,21 @@ def cli(): @cli.command() @click.argument("config", type=click.Path(exists=True, path_type=str)) +@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str)) @add_options_from_dataclass(PreprocessCliArgs) @add_options_from_config(AxolotlInputConfig) -def preprocess(config: str, **kwargs): +def preprocess(config: str, cloud: Optional[str] = None, **kwargs): """Preprocess datasets before training.""" kwargs = {k: v for k, v in kwargs.items() if v is not None} - from axolotl.cli.preprocess import do_cli + if cloud: + from axolotl.cli.cloud import 
do_cli_preprocess - do_cli(config=config, **kwargs) + do_cli_preprocess(cloud_config=cloud, config=config) + else: + from axolotl.cli.preprocess import do_cli + + do_cli(config=config, **kwargs) @cli.command() @@ -43,25 +49,33 @@ def preprocess(config: str, **kwargs): default=True, help="Use accelerate launch for multi-GPU training", ) +@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str)) @add_options_from_dataclass(TrainerCliArgs) @add_options_from_config(AxolotlInputConfig) -def train(config: str, accelerate: bool, **kwargs): +def train(config: str, accelerate: bool, cloud: Optional[str], **kwargs): """Train or fine-tune a model.""" kwargs = {k: v for k, v in kwargs.items() if v is not None} # Enable expandable segments for cuda allocation to improve VRAM usage set_pytorch_cuda_alloc_conf() + from axolotl.cli.cloud import do_cli_train if accelerate: - base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"] - if config: - base_cmd.append(config) - cmd = build_command(base_cmd, kwargs) - subprocess.run(cmd, check=True) # nosec B603 + if cloud: + do_cli_train(cloud_config=cloud, config=config, accelerate=True) + else: + base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"] + if config: + base_cmd.append(config) + cmd = build_command(base_cmd, kwargs) + subprocess.run(cmd, check=True) # nosec B603 else: - from axolotl.cli.train import do_cli + if cloud: + do_cli_train(cloud_config=cloud, config=config, accelerate=False) + else: + from axolotl.cli.train import do_cli - do_cli(config=config, **kwargs) + do_cli(config=config, **kwargs) @cli.command()