native support for modal cloud from CLI

2025-01-05 21:49:53 -05:00
parent 3915abee4c
commit 6da8abc01f
6 changed files with 342 additions and 17 deletions
--- a/examples/cloud/modal.yaml
+++ b/examples/cloud/modal.yaml
@@ -0,0 +1,15 @@
 # Example cloud configuration for running axolotl on Modal from the CLI.
 # Modal volumes to attach. Each volume is looked up by `name` (and created
 # if it does not exist yet) and mounted inside the container at `mount`.
 volumes:
  - name: axolotl-data
    mount: /workspace/data
  - name: axolotl-artifacts
    mount: /workspace/artifacts
 # Names of local environment variables to forward as Modal secrets.
 # Variables that are unset or empty locally are skipped.
 secrets:
  - HF_TOKEN
  - WANDB_API_KEY
 # Optional git branch of axolotl to check out inside the image; left empty
 # here, so no checkout step is added.
 branch:
 # GPU family for training. Recognised values: l40s, a100, a100-80gb, a10g,
 # h100, t4, l4.
 gpu: h100
 # Number of GPUs requested for training.
 gpu_count: 1
 # Memory for the training function, in GiB.
 memory: 128
 # Training timeout, in seconds (86400 = 24 hours).
 timeout: 86400
 # Preprocessing timeout, in seconds (14400 = 4 hours).
 timeout_preprocess: 14400
 # Memory for the preprocessing function, in GiB; overrides `memory` for
 # that step only.
 memory_preprocess: 32
--- a/scripts/motd
+++ b/scripts/motd
@@ -1,10 +1,15 @@
-                                 dP            dP   dP
+     #@@ #@@      @@# @@#
-                                 88            88   88
+    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
-      .d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88
+    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
-      88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88
+      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
-      88.  .88  .d88b.  88.  .88 88 88.  .88   88   88
+    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@
-      `88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP
+    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@
     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@
                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@
    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@
                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
    @@@@  @@@@@@@@@@@@@@@@
 Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:
--- a/src/axolotl/cli/cloud/init.py
+++ b/src/axolotl/cli/cloud/init.py
@@ -0,0 +1,44 @@
 """
 launch axolotl in supported cloud platforms
 """
 from pathlib import Path
 from typing import Union
 import yaml
 from axolotl.cli import print_axolotl_text_art
 from axolotl.cli.cloud.modal_ import ModalCloud
 from axolotl.utils.dict import DictDefault
 def load_cloud_cfg(cloud_config: Union[Path, str]) -> DictDefault:
    """
    Read a cloud YAML file and wrap its parsed content in a ``DictDefault``.

    Args:
        cloud_config: Path to the cloud configuration YAML file.

    Returns:
        The parsed YAML document wrapped in a ``DictDefault``. The content is
        parsed with ``yaml.safe_load``; no schema validation happens here.
    """
    with open(cloud_config, encoding="utf-8") as cfg_stream:
        parsed = yaml.safe_load(cfg_stream)
    wrapped: DictDefault = DictDefault(parsed)
    return wrapped
 def do_cli_preprocess(
    cloud_config: Union[Path, str],
    config: Union[Path, str] = Path("examples/"),
 ) -> None:
    """
    Run dataset preprocessing on the cloud platform.

    Args:
        cloud_config: Path to the cloud configuration YAML file.
        config: Path to the axolotl config file; its raw text is sent to the
            cloud function unchanged.
    """
    print_axolotl_text_art()

    # Build the cloud client before touching the axolotl config so that
    # cloud-config errors surface first.
    cloud = ModalCloud(load_cloud_cfg(cloud_config))

    config_yaml = Path(config).read_text(encoding="utf-8")
    cloud.preprocess(config_yaml)
 def do_cli_train(
    cloud_config: Union[Path, str],
    config: Union[Path, str] = Path("examples/"),
    accelerate: bool = True,
 ) -> None:
    """
    Run training on the cloud platform.

    Args:
        cloud_config: Path to the cloud configuration YAML file.
        config: Path to the axolotl config file; its raw text is sent to the
            cloud function unchanged.
        accelerate: Forwarded to the cloud ``train`` call to choose between
            the accelerate and non-accelerate launch modes.
    """
    print_axolotl_text_art()

    # Build the cloud client before touching the axolotl config so that
    # cloud-config errors surface first.
    cloud = ModalCloud(load_cloud_cfg(cloud_config))

    config_yaml = Path(config).read_text(encoding="utf-8")
    cloud.train(config_yaml, accelerate=accelerate)
--- a/src/axolotl/cli/cloud/base.py
+++ b/src/axolotl/cli/cloud/base.py
@@ -0,0 +1,18 @@
 """
 base class for cloud platforms from cli
 """
 from abc import ABC, abstractmethod
 class Cloud(ABC):
    """
    Abstract base class for cloud platforms.

    Concrete platforms implement ``preprocess`` and ``train``; both receive
    the axolotl config as raw YAML text.
    """

    @abstractmethod
    def preprocess(self, config_yaml: str, *args, **kwargs) -> None:
        """Run dataset preprocessing for the given axolotl config text."""

    @abstractmethod
    def train(self, config_yaml: str, accelerate: bool = True) -> str:
        """
        Run training for the given axolotl config text.

        ``accelerate`` selects whether the job is launched through accelerate.
        """
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -0,0 +1,229 @@
 """
 Modal Cloud support from CLI
 """
 import copy
 import os
 from pathlib import Path
 import modal
 from axolotl.cli.cloud.base import Cloud
 def run_cmd(cmd: str, run_folder: str, volumes=None):
    """
    Run a command inside a folder, reloading Modal volumes first and
    committing them only when the command succeeds.

    Args:
        cmd: Command line to execute. It is tokenised with ``shlex.split`` and
            run without a shell.
        run_folder: Working directory for the command.
        volumes: Optional mapping whose values expose ``reload()`` and
            ``commit()`` (Modal volumes). Keys are ignored.

    Raises:
        SystemExit: With the command's exit code when it is non-zero. Volumes
            are not committed in that case.
    """
    import shlex
    import subprocess  # nosec B404
    import sys

    mounted = list(volumes.values()) if volumes else []

    # Ensure volumes contain latest files.
    for vol in mounted:
        vol.reload()

    # modal workaround so it doesn't use the automounted axolotl.
    # os.environ.copy() returns a plain dict; a deepcopy would yield another
    # os._Environ whose item deletion also unsets the variable for this
    # process, not just for the child.
    child_env = os.environ.copy()
    child_env.pop("PYTHONPATH", None)

    # Propagate errors from subprocess.
    exit_code = subprocess.call(  # nosec B603
        shlex.split(cmd), cwd=run_folder, env=child_env
    )
    if exit_code:
        sys.exit(exit_code)

    # Commit writes to volume.
    for vol in mounted:
        vol.commit()
 class ModalCloud(Cloud):
    """
    Modal Cloud implementation.

    Builds a Modal app, image, volumes and secrets from a cloud config object
    and runs axolotl preprocessing / training as remote Modal functions.
    The config is read through attribute access (``config.gpu``) and ``.get``;
    the code relies on missing keys reading as falsy values.
    """

    def __init__(self, config, app=None):
        """
        Args:
            config: Cloud configuration object (see class docstring).
            app: Optional existing ``modal.App``; a new one is created when
                omitted.
        """
        self.config = config
        if not app:
            app = modal.App()
        self.app = app
        # Maps mount path -> (modal volume, volume config entry it came from).
        self.volumes = {}
        if config.volumes:
            for volume_config in config.volumes:
                _, mount, vol = self.create_volume(volume_config)
                self.volumes[mount] = (vol, volume_config)

    @staticmethod
    def _resolve_entries(entries):
        """
        Yield ``(name, value)`` pairs from an ``env`` / ``secrets`` style list.

        String entries name local environment variables and are skipped when
        unset or empty. Dict entries contribute every key/value pair they
        hold. ``None`` is treated as an empty list; other entry types are
        ignored.
        """
        for entry in entries or []:
            if isinstance(entry, str):
                if val := os.environ.get(entry, ""):
                    yield entry, val
            elif isinstance(entry, dict):
                yield from entry.items()

    def _volume_mounts(self):
        """Return the ``{mount path: modal volume}`` mapping for this config."""
        return {mount: vol for mount, (vol, _) in self.volumes.items()}

    def get_env(self):
        """
        Return the environment variables baked into the image.

        Starts from the Hugging Face cache locations under ``/workspace/data``
        and overlays the entries of the config's ``env`` list.
        """
        res = {
            "HF_DATASETS_CACHE": "/workspace/data/huggingface-cache/datasets",
            "HF_HUB_CACHE": "/workspace/data/huggingface-cache/hub",
        }
        res.update(self._resolve_entries(self.config.get("env")))
        return res

    def get_image(self):
        """
        Build the Modal image from the axolotl registry image.

        Uses ``config.docker_tag`` when set, optionally checks out
        ``config.branch`` inside ``/workspace/axolotl``, applies ``get_env()``
        and installs the pinned fastapi / pydantic versions.
        """
        docker_tag = self.config.docker_tag or "main-py3.11-cu124-2.5.1"
        image = modal.Image.from_registry(f"axolotlai/axolotl:{docker_tag}")

        # branch
        if self.config.branch:
            image = image.dockerfile_commands(
                [
                    f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch}",
                ]
            )

        if env := self.get_env():
            image = image.env(env)

        image = image.pip_install("fastapi==0.110.0", "pydantic==2.6.3")

        return image

    def get_secrets(self):
        """
        Return one ``modal.Secret`` per resolved entry of ``config.secrets``.

        String entries are read from the local environment (skipped when
        unset or empty); dict entries are used as given.
        """
        return [
            modal.Secret.from_dict({name: value})
            for name, value in self._resolve_entries(self.config.get("secrets"))
        ]

    def create_volume(self, volume_config):
        """
        Look up (creating if missing) the Modal volume for a config entry.

        Returns:
            A ``(name, mount, volume)`` tuple.
        """
        name = volume_config.name
        mount = volume_config.mount
        return name, mount, modal.Volume.from_name(name, create_if_missing=True)

    def get_ephemeral_disk_size(self):
        """Return the ephemeral disk size requested for preprocessing."""
        # 525,000; passed to Modal's ``ephemeral_disk``, which is documented
        # in MiB, i.e. roughly 512 GiB (not 1 TiB).
        return 1000 * 525

    def get_preprocess_timeout(self):
        """Return the preprocessing timeout in seconds (default: 3 hours)."""
        if self.config.timeout_preprocess:
            return int(self.config.timeout_preprocess)
        return 60 * 60 * 3  # 3 hours

    def get_preprocess_memory(self):
        """
        Return the preprocessing memory request.

        The configured value is in GiB and is multiplied by 1024.
        ``memory_preprocess`` takes precedence over ``memory``; the default is
        128 GiB.
        """
        memory = 128  # default to 128GiB
        if self.config.memory:
            memory = int(self.config.memory)
        if self.config.memory_preprocess:
            memory = int(self.config.memory_preprocess)
        return 1024 * memory

    def get_preprocess_env(self):
        """Return the ``app.function`` decorator configured for preprocessing."""
        return self.app.function(
            image=self.get_image(),
            volumes=self._volume_mounts(),
            cpu=8.0,
            ephemeral_disk=self.get_ephemeral_disk_size(),
            memory=self.get_preprocess_memory(),
            timeout=self.get_preprocess_timeout(),
            secrets=self.get_secrets(),
        )

    def preprocess(self, config_yaml: str, *args, **kwargs):
        """
        Run ``_preprocess`` remotely with the given axolotl config text.

        Extra positional and keyword arguments are forwarded to the remote
        function; the volumes are always passed as the ``volumes`` keyword.
        """
        modal_fn = self.get_preprocess_env()(_preprocess)
        with modal.enable_output():
            with self.app.run(detach=True):
                modal_fn.remote(
                    config_yaml,
                    *args,
                    volumes=self._volume_mounts(),
                    **kwargs,
                )

    def get_train_timeout(self):
        """Return the training timeout in seconds (default: 24 hours)."""
        if self.config.timeout:
            return int(self.config.timeout)
        return 60 * 60 * 24  # 24 hours

    def get_train_gpu(self):
        """
        Return the Modal GPU spec for training.

        ``config.gpu`` is matched case-insensitively and defaults to ``l40s``
        when unset; ``config.gpu_count`` defaults to 1.

        Raises:
            ValueError: If the GPU family is not one of the supported names.
        """
        count = self.config.gpu_count or 1
        # Apply the default before lowering so an unset ``gpu`` does not
        # raise AttributeError on ``None.lower()``.
        family = (self.config.gpu or "l40s").lower()

        # Factories are lazy so only the requested GPU class is looked up.
        factories = {
            "l40s": lambda: modal.gpu.L40S(count=count),
            "a100": lambda: modal.gpu.A100(count=count, size="40GB"),
            "a100-80gb": lambda: modal.gpu.A100(count=count, size="80GB"),
            "a10g": lambda: modal.gpu.A10G(count=count),
            "h100": lambda: modal.gpu.H100(count=count),
            "t4": lambda: modal.gpu.T4(count=count),
            "l4": lambda: modal.gpu.L4(count=count),
        }
        if family not in factories:
            raise ValueError(f"Unsupported GPU family: {family}")
        return factories[family]()

    def get_train_memory(self):
        """
        Return the training memory request.

        The configured ``memory`` is in GiB and is multiplied by 1024; the
        default is 128 GiB.
        """
        memory = 128  # default to 128GiB
        if self.config.memory:
            memory = int(self.config.memory)
        return 1024 * memory

    def get_train_env(self):
        """Return the ``app.function`` decorator configured for training."""
        return self.app.function(
            image=self.get_image(),
            volumes=self._volume_mounts(),
            cpu=16.0,
            gpu=self.get_train_gpu(),
            memory=self.get_train_memory(),
            timeout=self.get_train_timeout(),
            secrets=self.get_secrets(),
        )

    def train(self, config_yaml: str, accelerate: bool = True):
        """
        Run ``_train`` remotely with the given axolotl config text.

        ``accelerate`` is forwarded to the remote function together with the
        volumes.
        """
        modal_fn = self.get_train_env()(_train)
        with modal.enable_output():
            with self.app.run(detach=True):
                modal_fn.remote(
                    config_yaml,
                    accelerate=accelerate,
                    volumes=self._volume_mounts(),
                )
 def _preprocess(config_yaml: str, volumes=None):
    """
    Write the axolotl config into the artifacts folder and run
    ``axolotl preprocess`` on it.

    Args:
        config_yaml: Raw text of the axolotl config file.
        volumes: Optional volumes mapping handed through to ``run_cmd``.
    """
    run_folder = "/workspace/artifacts/axolotl"
    Path(run_folder).mkdir(parents=True, exist_ok=True)

    config_file = Path("/workspace/artifacts/axolotl/config.yaml")
    config_file.write_text(config_yaml, encoding="utf-8")

    preprocess_cmd = "axolotl preprocess /workspace/artifacts/axolotl/config.yaml --dataset-processes=8"
    run_cmd(preprocess_cmd, run_folder, volumes)
 def _train(config_yaml: str, accelerate: bool = True, volumes=None):
    """
    Write the axolotl config into the artifacts folder and run
    ``axolotl train`` on it.

    Args:
        config_yaml: Raw text of the axolotl config file.
        accelerate: When true the command gets ``--accelerate``, otherwise
            ``--no-accelerate``.
        volumes: Optional volumes mapping handed through to ``run_cmd``.
    """
    run_folder = "/workspace/artifacts/axolotl"
    # Create the folder the same way _preprocess does, so training also works
    # when no preprocess run has created it yet.
    Path(run_folder).mkdir(parents=True, exist_ok=True)
    with open(
        "/workspace/artifacts/axolotl/config.yaml", "w", encoding="utf-8"
    ) as f_out:
        f_out.write(config_yaml)

    accelerate_args = "--accelerate" if accelerate else "--no-accelerate"
    run_cmd(
        f"axolotl train {accelerate_args} /workspace/artifacts/axolotl/config.yaml",
        run_folder,
        volumes,
    )
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -25,15 +25,21 @@ def cli():
@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(PreprocessCliArgs)
@add_options_from_config(AxolotlInputConfig)
-def preprocess(config: str, **kwargs):
+def preprocess(config: str, cloud: Optional[str] = None, **kwargs):
    """Preprocess datasets before training."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
-    from axolotl.cli.preprocess import do_cli
+    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess
-    do_cli(config=config, **kwargs)
+        do_cli_preprocess(cloud_config=cloud, config=config)
    else:
        from axolotl.cli.preprocess import do_cli
        do_cli(config=config, **kwargs)
@cli.command()
@@ -43,25 +49,33 @@ def preprocess(config: str, **kwargs):
    default=True,
    help="Use accelerate launch for multi-GPU training",
 )
@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
-def train(config: str, accelerate: bool, **kwargs):
+def train(config: str, accelerate: bool, cloud: Optional[str], **kwargs):
    """Train or fine-tune a model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}
    # Enable expandable segments for cuda allocation to improve VRAM usage
    set_pytorch_cuda_alloc_conf()
    from axolotl.cli.cloud import do_cli_train
    if accelerate:
-        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
+        if cloud:
-        if config:
+            do_cli_train(cloud_config=cloud, config=config, accelerate=True)
-            base_cmd.append(config)
+        else:
-        cmd = build_command(base_cmd, kwargs)
+            base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
-        subprocess.run(cmd, check=True)  # nosec B603
+            if config:
                base_cmd.append(config)
            cmd = build_command(base_cmd, kwargs)
            subprocess.run(cmd, check=True)  # nosec B603
    else:
-        from axolotl.cli.train import do_cli
+        if cloud:
            do_cli_train(cloud_config=cloud, config=config, accelerate=False)
        else:
            from axolotl.cli.train import do_cli
-        do_cli(config=config, **kwargs)
+            do_cli(config=config, **kwargs)
@cli.command()