use alternate math-hard repo

apply chat template as arg
revision support
2025-01-13 08:46:35 -05:00 · 2025-01-12 17:38:32 -05:00 · 2025-01-12 05:17:03 -05:00 · 2025-01-11 23:18:27 -05:00 · 2025-01-08 08:38:06 -05:00 · 2025-01-07 15:13:18 -05:00
15 changed files with 589 additions and 55 deletions
--- a/README.md
+++ b/README.md
@@ -217,7 +217,7 @@ If you love axolotl, consider sponsoring the project by reaching out directly to

 ---

- [Modal](https://modal.com/) Modal lets you run data/AI jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune LLM models, run protein folding simulations, and much more.
+- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl) Modal lets you run data/AI jobs in the cloud, by just writing a few lines of Python. Customers use Modal to deploy Gen AI models at large scale, fine-tune LLM models, run protein folding simulations, and much more.

 ---

--- a/examples/cloud/modal.yaml
+++ b/examples/cloud/modal.yaml
@@ -0,0 +1,15 @@
+volumes:
+  - name: axolotl-data
+    mount: /workspace/data
+  - name: axolotl-artifacts
+    mount: /workspace/artifacts
+secrets:
+  - HF_TOKEN
+  - WANDB_API_KEY
+branch: cli-cloud-modal
+gpu: h100
+gpu_count: 1
+memory: 128
+timeout: 86400
+timeout_preprocess: 14400
+memory_preprocess: 32
--- a/lm_eval-kd.yaml
+++ b/lm_eval-kd.yaml
@@ -0,0 +1,11 @@
+lm_eval_model: axolotl-ai-co/numina-8b-ep1-exp1
+lm_eval_tasks:
+  - leaderboard_math_hard
+lm_eval_batch_size: 64
+
+apply_chat_template: false
+wandb_project: numina-kd-experiment
+wandb_entity: axolotl-ai
+bf16: true
+flash_attention: true
+output_dir: ./outputs/model-evals-out
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,6 +25,7 @@ hf_transfer
 sentencepiece
 gradio==3.50.2

+modal==0.70.5
 pydantic==2.6.3
 addict
 fire
@@ -53,7 +54,7 @@ zstandard==0.22.0
 fastcore

 # lm eval harness
-lm_eval==0.4.4
+lm_eval==0.4.7
 langdetect==1.0.9
 immutabledict==4.2.0
 antlr4-python3-runtime==4.13.2
--- a/scripts/motd
+++ b/scripts/motd
@@ -1,10 +1,15 @@

-                                 dP            dP   dP
-                                 88            88   88
-      .d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88
-      88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88
-      88.  .88  .d88b.  88.  .88 88 88.  .88   88   88
-      `88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP
+     #@@ #@@      @@# @@#
+    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
+    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
+      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
+    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@
+    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@
+     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@
+                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@
+    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@
+                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
+    @@@@  @@@@@@@@@@@@@@@@

 Welcome to the axolotl cloud image! If you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:

--- a/src/axolotl/cli/cloud/__init__.py
+++ b/src/axolotl/cli/cloud/__init__.py
@@ -0,0 +1,56 @@
+"""
+launch axolotl in supported cloud platforms
+"""
+from pathlib import Path
+from typing import Union
+
+import yaml
+
+from axolotl.cli import print_axolotl_text_art
+from axolotl.cli.cloud.modal_ import ModalCloud
+from axolotl.utils.dict import DictDefault
+
+
+def load_cloud_cfg(cloud_config: Union[Path, str]) -> DictDefault:
+    """Load and validate cloud configuration."""
+    # Load cloud configuration.
+    with open(cloud_config, encoding="utf-8") as file:
+        cloud_cfg: DictDefault = DictDefault(yaml.safe_load(file))
+    return cloud_cfg
+
+
+def do_cli_preprocess(
+    cloud_config: Union[Path, str],
+    config: Union[Path, str] = Path("examples/"),
+) -> None:
+    print_axolotl_text_art()
+    cloud_cfg = load_cloud_cfg(cloud_config)
+    cloud = ModalCloud(cloud_cfg)
+    with open(config, "r", encoding="utf-8") as file:
+        config_yaml = file.read()
+    cloud.preprocess(config_yaml)
+
+
+def do_cli_train(
+    cloud_config: Union[Path, str],
+    config: Union[Path, str] = Path("examples/"),
+    accelerate: bool = True,
+) -> None:
+    print_axolotl_text_art()
+    cloud_cfg = load_cloud_cfg(cloud_config)
+    cloud = ModalCloud(cloud_cfg)
+    with open(config, "r", encoding="utf-8") as file:
+        config_yaml = file.read()
+    cloud.train(config_yaml, accelerate=accelerate)
+
+
+def do_cli_lm_eval(
+    cloud_config: Union[Path, str],
+    config: Union[Path, str] = Path("examples/"),
+) -> None:
+    print_axolotl_text_art()
+    cloud_cfg = load_cloud_cfg(cloud_config)
+    cloud = ModalCloud(cloud_cfg)
+    with open(config, "r", encoding="utf-8") as file:
+        config_yaml = file.read()
+    cloud.lm_eval(config_yaml)
--- a/src/axolotl/cli/cloud/base.py
+++ b/src/axolotl/cli/cloud/base.py
@@ -0,0 +1,18 @@
+"""
+base class for cloud platforms from cli
+"""
+from abc import ABC, abstractmethod
+
+
+class Cloud(ABC):
+    """
+    Abstract base class for cloud platforms.
+    """
+
+    @abstractmethod
+    def preprocess(self, config_yaml: str, *args, **kwargs) -> None:
+        pass
+
+    @abstractmethod
+    def train(self, config_yaml: str, accelerate: bool = True) -> str:
+        pass
--- a/src/axolotl/cli/cloud/modal_.py
+++ b/src/axolotl/cli/cloud/modal_.py
@@ -0,0 +1,272 @@
+"""
+Modal Cloud support from CLI
+"""
+import copy
+import json
+import os
+import subprocess  # nosec B404
+from pathlib import Path
+from random import randint
+
+import modal
+
+from axolotl.cli.cloud.base import Cloud
+
+
+def run_cmd(cmd: str, run_folder: str, volumes=None):
+    """Run a command inside a folder, with Modal Volume reloading before and commit on success."""
+    # Ensure volumes contain latest files.
+    if volumes:
+        for _, vol in volumes.items():
+            vol.reload()
+
+    # modal workaround so it doesn't use the automounted axolotl
+    new_env = copy.deepcopy(os.environ)
+    if "PYTHONPATH" in new_env:
+        del new_env["PYTHONPATH"]
+
+    # Propagate errors from subprocess.
+    if exit_code := subprocess.call(  # nosec B603
+        cmd.split(), cwd=run_folder, env=new_env
+    ):
+        exit(exit_code)  # pylint: disable=consider-using-sys-exit
+
+    # Commit writes to volume.
+    if volumes:
+        for _, vol in volumes.items():
+            vol.commit()
+
+
+class ModalCloud(Cloud):
+    """
+    Modal Cloud implementation.
+    """
+
+    def __init__(self, config, app=None):
+        self.config = config
+        if not app:
+            app = modal.App()
+        self.app = app
+
+        self.volumes = {}
+        if config.volumes:
+            for volume_config in config.volumes:
+                _, mount, vol = self.create_volume(volume_config)
+                self.volumes[mount] = (vol, volume_config)
+
+    def get_env(self):
+        res = {
+            "HF_DATASETS_CACHE": "/workspace/data/huggingface-cache/datasets",
+            "HF_HUB_CACHE": "/workspace/data/huggingface-cache/hub",
+        }
+
+        for key in self.config.get("env", []):
+            if isinstance(key, str):
+                if val := os.environ.get(key, ""):
+                    res[key] = val
+            elif isinstance(key, dict):
+                (key_, val) = list(key.items())[0]
+                res[key_] = val
+        return res
+
+    def get_image(self):
+        docker_tag = "main-py3.11-cu124-2.5.1"
+        if self.config.docker_tag:
+            docker_tag = self.config.docker_tag
+        docker_image = f"axolotlai/axolotl:{docker_tag}"
+
+        # grab the sha256 hash from docker hub for this image+tag
+        # this ensures that we always get the latest image for this tag, even if it's already cached
+        try:
+            manifest = subprocess.check_output(  # nosec B602
+                f"docker manifest inspect {docker_image}",
+                shell=True,
+            ).decode("utf-8")
+            sha256_hash = json.loads(manifest)["manifests"][0]["digest"]
+        except subprocess.CalledProcessError:
+            sha256_hash = None
+
+        # create the image
+        if sha256_hash:
+            image = modal.Image.from_registry(f"axolotlai/axolotl@{sha256_hash}")
+        else:
+            image = modal.Image.from_registry(docker_image)
+
+        # branch
+        if self.config.branch:
+            image = image.dockerfile_commands(
+                [
+                    # Random id for cache busting of branch commits
+                    f"RUN echo '{str(randint(0, 1000000))}'",  # nosec B311
+                    f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch}",
+                    "RUN cd /workspace/ && git clone https://github.com/winglian/lm-evaluation-harness.git && cd lm-evaluation-harness && pip install -e .[math]",
+                ]
+            )
+
+        if env := self.get_env():
+            image = image.env(env)
+
+        image = image.pip_install("fastapi==0.110.0", "pydantic==2.6.3")
+
+        return image
+
+    def get_secrets(self):
+        res = []
+        if self.config.secrets:
+            for key in self.config.get("secrets", []):
+                # pylint: disable=duplicate-code
+                if isinstance(key, str):
+                    if val := os.environ.get(key, ""):
+                        res.append(modal.Secret.from_dict({key: val}))
+                elif isinstance(key, dict):
+                    (key_, val) = list(key.items())[0]
+                    res.append(modal.Secret.from_dict({key_: val}))
+        return res
+
+    def create_volume(self, volume_config):
+        name = volume_config.name
+        mount = volume_config.mount
+        return name, mount, modal.Volume.from_name(name, create_if_missing=True)
+
+    def get_ephemeral_disk_size(self):
+        return 1000 * 525  # 1 TiB
+
+    def get_preprocess_timeout(self):
+        if self.config.timeout_preprocess:
+            return int(self.config.timeout_preprocess)
+        return 60 * 60 * 3  # 3 hours
+
+    def get_preprocess_memory(self):
+        memory = 128  # default to 128GiB
+        if self.config.memory:
+            memory = int(self.config.memory)
+        if self.config.memory_preprocess:
+            memory = int(self.config.memory_preprocess)
+        return 1024 * memory
+
+    def get_preprocess_env(self):
+        return self.app.function(
+            image=self.get_image(),
+            volumes={k: v[0] for k, v in self.volumes.items()},
+            cpu=8.0,
+            ephemeral_disk=self.get_ephemeral_disk_size(),
+            memory=self.get_preprocess_memory(),
+            timeout=self.get_preprocess_timeout(),
+            secrets=self.get_secrets(),
+        )
+
+    def preprocess(self, config_yaml: str, *args, **kwargs):
+        modal_fn = self.get_preprocess_env()(_preprocess)
+        with modal.enable_output():
+            with self.app.run(detach=True):
+                modal_fn.remote(
+                    config_yaml,
+                    volumes={k: v[0] for k, v in self.volumes.items()},
+                    *args,
+                    **kwargs,
+                )
+
+    def get_train_timeout(self):
+        if self.config.timeout:
+            return int(self.config.timeout)
+        return 60 * 60 * 24  # 24 hours
+
+    def get_train_gpu(self):  # pylint: disable=too-many-return-statements
+        count = self.config.gpu_count or 1
+        family = self.config.gpu.lower() or "l40s"
+
+        if family == "l40s":
+            return modal.gpu.L40S(count=count)
+        if family == "a100":
+            return modal.gpu.A100(count=count, size="40GB")
+        if family == "a100-80gb":
+            return modal.gpu.A100(count=count, size="80GB")
+        if family in ["a10", "a10g"]:
+            return modal.gpu.A10G(count=count)
+        if family == "h100":
+            return modal.gpu.H100(count=count)
+        if family == "t4":
+            return modal.gpu.T4(count=count)
+        if family == "l4":
+            return modal.gpu.L4(count=count)
+        raise ValueError(f"Unsupported GPU family: {family}")
+
+    def get_train_memory(self):
+        memory = 128  # default to 128GiB
+        if self.config.memory:
+            memory = int(self.config.memory)
+        return 1024 * memory
+
+    def get_train_env(self):
+        return self.app.function(
+            image=self.get_image(),
+            volumes={k: v[0] for k, v in self.volumes.items()},
+            cpu=16.0,
+            gpu=self.get_train_gpu(),
+            memory=self.get_train_memory(),
+            timeout=self.get_train_timeout(),
+            secrets=self.get_secrets(),
+        )
+
+    def train(self, config_yaml: str, accelerate: bool = True):
+        modal_fn = self.get_train_env()(_train)
+        with modal.enable_output():
+            with self.app.run(detach=True):
+                modal_fn.remote(
+                    config_yaml,
+                    accelerate=accelerate,
+                    volumes={k: v[0] for k, v in self.volumes.items()},
+                )
+
+    def lm_eval(self, config_yaml: str):
+        modal_fn = self.get_train_env()(_lm_eval)
+        with modal.enable_output():
+            with self.app.run(detach=True):
+                modal_fn.remote(
+                    config_yaml,
+                    volumes={k: v[0] for k, v in self.volumes.items()},
+                )
+
+
+def _preprocess(config_yaml: str, volumes=None):
+    Path("/workspace/artifacts/axolotl").mkdir(parents=True, exist_ok=True)
+    with open(
+        "/workspace/artifacts/axolotl/config.yaml", "w", encoding="utf-8"
+    ) as f_out:
+        f_out.write(config_yaml)
+    run_folder = "/workspace/artifacts/axolotl"
+    run_cmd(
+        "axolotl preprocess /workspace/artifacts/axolotl/config.yaml --dataset-processes=8",
+        run_folder,
+        volumes,
+    )
+
+
+def _train(config_yaml: str, accelerate: bool = True, volumes=None):
+    with open(
+        "/workspace/artifacts/axolotl/config.yaml", "w", encoding="utf-8"
+    ) as f_out:
+        f_out.write(config_yaml)
+    run_folder = "/workspace/artifacts/axolotl"
+    if accelerate:
+        accelerate_args = "--accelerate"
+    else:
+        accelerate_args = "--no-accelerate"
+    run_cmd(
+        f"axolotl train {accelerate_args} /workspace/artifacts/axolotl/config.yaml",
+        run_folder,
+        volumes,
+    )
+
+
+def _lm_eval(config_yaml: str, volumes=None):
+    with open(
+        "/workspace/artifacts/axolotl/config.yaml", "w", encoding="utf-8"
+    ) as f_out:
+        f_out.write(config_yaml)
+    run_folder = "/workspace/artifacts/axolotl"
+    run_cmd(
+        "axolotl lm-eval /workspace/artifacts/axolotl/config.yaml",
+        run_folder,
+        volumes,
+    )
--- a/src/axolotl/cli/main.py
+++ b/src/axolotl/cli/main.py
@@ -13,6 +13,7 @@ from axolotl.cli.utils import (
    fetch_from_github,
 )
 from axolotl.common.cli import EvaluateCliArgs, PreprocessCliArgs, TrainerCliArgs
+from axolotl.integrations.lm_eval.cli import lm_eval
 from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig

@@ -25,15 +26,21 @@ def cli():

@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
+@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(PreprocessCliArgs)
@add_options_from_config(AxolotlInputConfig)
-def preprocess(config: str, **kwargs):
+def preprocess(config: str, cloud: Optional[str] = None, **kwargs):
    """Preprocess datasets before training."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

-    from axolotl.cli.preprocess import do_cli
+    if cloud:
+        from axolotl.cli.cloud import do_cli_preprocess

-    do_cli(config=config, **kwargs)
+        do_cli_preprocess(cloud_config=cloud, config=config)
+    else:
+        from axolotl.cli.preprocess import do_cli
+
+        do_cli(config=config, **kwargs)


@cli.command()
@@ -43,25 +50,33 @@ def preprocess(config: str, **kwargs):
    default=True,
    help="Use accelerate launch for multi-GPU training",
 )
+@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
-def train(config: str, accelerate: bool, **kwargs):
+def train(config: str, accelerate: bool, cloud: Optional[str], **kwargs):
    """Train or fine-tune a model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    # Enable expandable segments for cuda allocation to improve VRAM usage
    set_pytorch_cuda_alloc_conf()
+    from axolotl.cli.cloud import do_cli_train

    if accelerate:
-        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
-        if config:
-            base_cmd.append(config)
-        cmd = build_command(base_cmd, kwargs)
-        subprocess.run(cmd, check=True)  # nosec B603
+        if cloud:
+            do_cli_train(cloud_config=cloud, config=config, accelerate=True)
+        else:
+            base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
+            if config:
+                base_cmd.append(config)
+            cmd = build_command(base_cmd, kwargs)
+            subprocess.run(cmd, check=True)  # nosec B603
    else:
-        from axolotl.cli.train import do_cli
+        if cloud:
+            do_cli_train(cloud_config=cloud, config=config, accelerate=False)
+        else:
+            from axolotl.cli.train import do_cli

-        do_cli(config=config, **kwargs)
+            do_cli(config=config, **kwargs)


@cli.command()
@@ -254,6 +269,9 @@ def fetch(directory: str, dest: Optional[str]):
    fetch_from_github(f"{directory}/", dest)


+cli.add_command(lm_eval)
+
+
 def main():
    cli()

--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -424,11 +424,6 @@ class SchedulerMixin(Trainer):

        return self.lr_scheduler

-    def _load_optimizer_and_scheduler(self, checkpoint):
-        if not checkpoint and self.args.optimizer_checkpoint is not None:
-            checkpoint = self.args.optimizer_checkpoint
-        return super()._load_optimizer_and_scheduler(checkpoint)
-

 class AxolotlTrainer(SchedulerMixin, Trainer):
    """
@@ -1769,10 +1764,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        ] = self.cfg.loraplus_lr_embedding
        training_arguments_kwargs["embedding_lr"] = self.cfg.embedding_lr
        training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale
-        if self.cfg.optimizer_checkpoint:
-            training_arguments_kwargs[
-                "optimizer_checkpoint"
-            ] = self.cfg.optimizer_checkpoint

        if self.cfg.lr_scheduler in ["one_cycle", "log_sweep"]:
            training_arguments_kwargs["lr_scheduler_type"] = "cosine"
--- a/src/axolotl/integrations/lm_eval/__init__.py
+++ b/src/axolotl/integrations/lm_eval/__init__.py
@@ -2,9 +2,9 @@
 Module for the Plugin for LM Eval Harness
 """
 import subprocess  # nosec
-from datetime import datetime

 from axolotl.integrations.base import BasePlugin
+from axolotl.integrations.lm_eval.cli import build_lm_eval_command

 from .args import LMEvalArgs  # pylint: disable=unused-import. # noqa: F401

@@ -18,25 +18,19 @@ class LMEvalPlugin(BasePlugin):
        return "axolotl.integrations.lm_eval.LMEvalArgs"

    def post_train_unload(self, cfg):
-        tasks = ",".join(cfg.lm_eval_tasks)
-        fa2 = ",attn_implementation=flash_attention_2" if cfg.flash_attention else ""
-        dtype = ",dtype=bfloat16" if cfg.bf16 else ",dtype=float16"
-        output_path = cfg.output_dir
-        output_path += "" if cfg.output_dir.endswith("/") else "/"
-        output_path += "lm_eval_results/" + datetime.now().strftime("%Y%m%d_%H%M%S")
-        subprocess.run(  # nosec
-            [
-                "lm_eval",
-                "--model",
-                "hf",
-                "--model_args",
-                f"pretrained={cfg.output_dir}{fa2}{dtype}",
-                "--tasks",
-                tasks,
-                "--batch_size",
-                str(cfg.lm_eval_batch_size),
-                "--output_path",
-                output_path,
-            ],
-            check=True,
-        )
+        if cfg.lm_eval_post_train:
+            # pylint: disable=duplicate-code
+            for lm_eval_args in build_lm_eval_command(
+                cfg.lm_eval_tasks,
+                bfloat16=cfg.bfloat16 or cfg.bf16,
+                flash_attention=cfg.flash_attention,
+                output_dir=cfg.output_dir,
+                batch_size=cfg.lm_eval_batch_size,
+                wandb_project=cfg.wandb_project,
+                wandb_entity=cfg.wandb_entity,
+                model=cfg.lm_eval_model or cfg.hub_model_id,
+            ):
+                subprocess.run(  # nosec
+                    lm_eval_args,
+                    check=True,
+                )
--- a/src/axolotl/integrations/lm_eval/args.py
+++ b/src/axolotl/integrations/lm_eval/args.py
@@ -13,3 +13,5 @@ class LMEvalArgs(BaseModel):

    lm_eval_tasks: List[str] = []
    lm_eval_batch_size: Optional[int] = 8
+    lm_eval_post_train: Optional[bool] = True
+    lm_eval_model: Optional[str] = None
--- a/src/axolotl/integrations/lm_eval/cli.py
+++ b/src/axolotl/integrations/lm_eval/cli.py
@@ -0,0 +1,113 @@
+"""
+axolotl CLI for running lm_eval tasks
+"""
+import subprocess  # nosec
+from collections import defaultdict
+from datetime import datetime
+from typing import Optional
+
+import click
+import yaml
+
+from axolotl.utils.dict import DictDefault
+
+
+def build_lm_eval_command(
+    tasks: list[str],
+    bfloat16=True,
+    flash_attention=False,
+    output_dir="./",
+    batch_size=8,
+    wandb_project=None,
+    wandb_entity=None,
+    model=None,
+    revision=None,
+    apply_chat_template=None,
+    fewshot_as_multiturn=None,
+):
+    tasks_by_num_fewshot: dict[str, list] = defaultdict(list)
+    for task in tasks:
+        num_fewshot = "-1"
+        task_parts = task.split(":")
+        task_name = task_parts[0]
+        if len(task_parts) == 2:
+            task_name, num_fewshot = task_parts
+        tasks_by_num_fewshot[str(num_fewshot)].append(task_name)
+
+    for num_fewshot, tasks_list in tasks_by_num_fewshot.items():
+        tasks_str = ",".join(tasks_list)
+        num_fewshot_val = num_fewshot if num_fewshot != "-1" else None
+        pretrained = "pretrained="
+        pretrained += model if model else output_dir
+        fa2 = ",attn_implementation=flash_attention_2" if flash_attention else ""
+        dtype = ",dtype=bfloat16" if bfloat16 else ",dtype=float16"
+        revision = f",revision={revision}" if revision else ""
+        output_path = output_dir
+        output_path += "" if output_dir.endswith("/") else "/"
+        output_path += "lm_eval_results/" + datetime.now().strftime("%Y%m%d_%H%M%S")
+        lm_eval_args = [
+            "lm_eval",
+            "--model",
+            "hf",
+            "--model_args",
+            f"{pretrained}{fa2}{dtype}{revision}",
+            "--tasks",
+            tasks_str,
+            "--batch_size",
+            str(batch_size),
+            "--output_path",
+            output_path,
+        ]
+        wandb_args = []
+        if wandb_project:
+            wandb_args.append(f"project={wandb_project}")
+        if wandb_entity:
+            wandb_args.append(f"entity={wandb_entity}")
+        if wandb_args:
+            lm_eval_args.append("--wandb_args")
+            lm_eval_args.append(",".join(wandb_args))
+        if apply_chat_template:
+            lm_eval_args.append("--apply_chat_template")
+        if num_fewshot_val:
+            lm_eval_args.append("--num_fewshot")
+            lm_eval_args.append(str(num_fewshot_val))
+            if apply_chat_template and fewshot_as_multiturn:
+                lm_eval_args.append("--fewshot_as_multiturn")
+
+        yield lm_eval_args
+
+
+@click.command()
+@click.argument("config", type=click.Path(exists=True, path_type=str))
+@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
+def lm_eval(config: str, cloud: Optional[str] = None):
+    """
+    use lm eval to evaluate a trained language model
+    """
+
+    if cloud:
+        from axolotl.cli.cloud import do_cli_lm_eval
+
+        do_cli_lm_eval(cloud_config=cloud, config=config)
+    else:
+        with open(config, encoding="utf-8") as file:
+            cfg: DictDefault = DictDefault(yaml.safe_load(file))
+
+        # pylint: disable=duplicate-code
+        for lm_eval_args in build_lm_eval_command(
+            cfg.lm_eval_tasks,
+            bfloat16=cfg.bfloat16 or cfg.bf16,
+            flash_attention=cfg.flash_attention,
+            output_dir=cfg.output_dir,
+            batch_size=cfg.lm_eval_batch_size,
+            wandb_project=cfg.wandb_project,
+            wandb_entity=cfg.wandb_entity,
+            model=cfg.lm_eval_model or cfg.hub_model_id,
+            revision=cfg.revision,
+            apply_chat_template=cfg.apply_chat_template,
+            fewshot_as_multiturn=cfg.fewshot_as_multiturn,
+        ):
+            subprocess.run(  # nosec
+                lm_eval_args,
+                check=True,
+            )
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -603,8 +603,6 @@ class AxolotlInputConfig(
    strict: Optional[bool] = Field(default=False)
    resume_from_checkpoint: Optional[str] = None
    auto_resume_from_checkpoints: Optional[bool] = None
-    optimizer_checkpoint: Optional[str] = None
-
    resize_token_embeddings_to_32x: Optional[bool] = None
    mean_resizing_embeddings: Optional[bool] = False

--- a/src/axolotl/utils/data/pretraining.py
+++ b/src/axolotl/utils/data/pretraining.py
@@ -28,8 +28,10 @@ def encode_pretraining(
    )
    # Convert to PyTorch tensors
    input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
+    targets = [torch.tensor(seq) for seq in res["input_ids"]]
    attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
    new_input_ids = []
+    new_labels = []
    new_attention_mask = []
    # Append EOS and PAD tokens to input_ids, and correct attention_mask
    for i, _ in enumerate(input_ids):
@@ -40,22 +42,34 @@ def encode_pretraining(
            ),
            dim=0,
        )
+        targets[i] = torch.cat(
+            (
+                targets[i],
+                torch.tensor([tokenizer.eos_token_id, -100]),
+            ),
+            dim=0,
+        )
        attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)

    # Concatenate tokens so that their lengths are less than max_tokens
    buffer_input_ids = torch.tensor([], dtype=torch.long)
+    buffer_labels = torch.tensor([], dtype=torch.long)
    buffer_attention_mask = torch.tensor([], dtype=torch.long)

-    for ids, mask in zip(input_ids, attention_mask):
+    for ids, labels, mask in zip(input_ids, targets, attention_mask):
        if buffer_input_ids.numel() == max_tokens:
            new_input_ids.append(buffer_input_ids)
+            new_labels.append(buffer_labels)
            new_attention_mask.append(buffer_attention_mask)
            buffer_input_ids = torch.tensor([], dtype=torch.long)
+            buffer_labels = torch.tensor([], dtype=torch.long)
            buffer_attention_mask = torch.tensor([], dtype=torch.long)
            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
        elif buffer_input_ids.numel() + ids.numel() <= max_tokens:
            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
        else:
            buffer_input_ids = torch.cat(
@@ -69,6 +83,17 @@ def encode_pretraining(
                ),
                dim=0,
            )
+            buffer_labels = torch.cat(
+                (
+                    buffer_labels,
+                    torch.full(
+                        (max_tokens - buffer_labels.numel(),),
+                        -100,
+                        dtype=torch.long,
+                    ),
+                ),
+                dim=0,
+            )
            buffer_attention_mask = torch.cat(
                (
                    buffer_attention_mask,
@@ -81,11 +106,14 @@ def encode_pretraining(
                dim=0,
            )
            new_input_ids.append(buffer_input_ids)
+            new_labels.append(buffer_labels)
            new_attention_mask.append(buffer_attention_mask)
            buffer_input_ids = torch.tensor([], dtype=torch.long)
+            buffer_labels = torch.tensor([], dtype=torch.long)
            buffer_attention_mask = torch.tensor([], dtype=torch.long)

            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
+            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)

    if buffer_input_ids.numel() > 0:  # for any leftover tokens
@@ -101,6 +129,17 @@ def encode_pretraining(
                ),
                dim=0,
            )
+            buffer_labels = torch.cat(
+                (
+                    buffer_labels,
+                    torch.full(
+                        (max_tokens - buffer_labels.numel(),),
+                        -100,
+                        dtype=torch.long,
+                    ),
+                ),
+                dim=0,
+            )
            buffer_attention_mask = torch.cat(
                (
                    buffer_attention_mask,
@@ -113,11 +152,12 @@ def encode_pretraining(
                dim=0,
            )
        new_input_ids.append(buffer_input_ids)
+        new_labels.append(buffer_labels)
        new_attention_mask.append(buffer_attention_mask)

    ret = {
        "input_ids": [seq.tolist() for seq in new_input_ids],
-        "labels": [seq.tolist() for seq in new_input_ids],
+        "labels": [seq.tolist() for seq in new_labels],
        "attention_mask": [seq.tolist() for seq in new_attention_mask],
    }
Author	SHA1	Message	Date
Wing Lian	ee20600b9a	use alternate math-hard repo	2025-01-13 08:46:35 -05:00
Wing Lian	fd91de3ea6	apply chat template as arg	2025-01-12 17:38:32 -05:00
Wing Lian	530bf77cf9	revision support	2025-01-12 05:17:03 -05:00
Wing Lian	bfc91a91ca	use chat template	2025-01-11 23:18:27 -05:00
Wing Lian	5c226b600d	pr feedback	2025-01-08 08:38:06 -05:00
Wing Lian	af66f7c274	update link in README to include utm	2025-01-07 15:13:18 -05:00
Wing Lian	079f94ee99	include modal in requirements	2025-01-07 08:48:25 -05:00
Wing Lian	981ad965d0	allow minimal yaml for lm eval	2025-01-06 17:41:10 -05:00
Wing Lian	7ba701a355	cache bust when using branch, grab sha of latest image tag, update lm-eval dep	2025-01-06 16:19:08 -05:00
Wing Lian	0390bce7aa	lm_eval option to not post eval, and append not extend	2025-01-06 11:52:07 -05:00
Wing Lian	2741d8de23	Fix the sub call to lm-eval	2025-01-06 11:44:55 -05:00
Wing Lian	27a88f37cd	do lm_eval in cloud too	2025-01-06 11:17:14 -05:00
Wing Lian	6da8abc01f	native support for modal cloud from CLI	2025-01-05 21:49:53 -05:00
Wing Lian	3915abee4c	make sure padding is labeled as -100 for pretraining (#2227 )	2024-12-31 15:22:18 -05:00