diff --git a/docs/cli.qmd b/docs/cli.qmd index d9f26dbf8..2bdaf9018 100644 --- a/docs/cli.qmd +++ b/docs/cli.qmd @@ -210,6 +210,8 @@ axolotl lm-eval config.yml Configuration options: ```yaml +lm_eval_model: # model to evaluate (local or hf path) + # List of tasks to evaluate lm_eval_tasks: - arc_challenge @@ -218,7 +220,7 @@ lm_eval_batch_size: # Batch size for evaluation output_dir: # Directory to save evaluation results ``` -See [LM Eval Harness](https://github.com/EleutherAI/lm-evaluation-harness) for more details. +See [LM Eval Harness integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#language-model-evaluation-harness-lm-eval) for full configuration details. ### delinearize-llama4 diff --git a/src/axolotl/integrations/lm_eval/README.md b/src/axolotl/integrations/lm_eval/README.md index f6ed5416e..860c6681d 100644 --- a/src/axolotl/integrations/lm_eval/README.md +++ b/src/axolotl/integrations/lm_eval/README.md @@ -6,6 +6,12 @@ See https://github.com/EleutherAI/lm-evaluation-harness ## Usage +There are two ways to use the LM Eval integration: + +### 1. Post-Training Evaluation + +When training with the plugin enabled, evaluation runs automatically after training completes: + ```yaml plugins: - axolotl.integrations.lm_eval.LMEvalPlugin @@ -16,9 +22,50 @@ lm_eval_tasks: - arc_easy lm_eval_batch_size: # Batch size for evaluation -output_dir: # Directory to save evaluation results + +# Directory to save evaluation results. +# The final model is loaded from this directory +# unless specified otherwise (see below) +output_dir: ``` +Run training as usual: +```bash +axolotl train config.yml +``` + +### 2. Standalone CLI Evaluation + +Evaluate any model directly without training: + +```yaml +lm_eval_model: meta-llama/Llama-2-7b-hf + +plugins: + - axolotl.integrations.lm_eval.LMEvalPlugin + +lm_eval_tasks: + - gsm8k + - hellaswag + - arc_easy + +lm_eval_batch_size: 8 +output_dir: ./outputs +``` + +Run evaluation: +```bash +axolotl lm-eval config.yml +``` + +## Model Selection Priority + +The model to evaluate is selected in the following priority order: + +1. **`lm_eval_model`** - Explicit model path or HuggingFace repo (highest priority) +2. **`hub_model_id`** - Trained model pushed to HuggingFace Hub +3. **`output_dir`** - Local checkpoint directory containing trained model weights + ## Citation ```bib diff --git a/src/axolotl/integrations/lm_eval/__init__.py b/src/axolotl/integrations/lm_eval/__init__.py index 0ab6b8697..6a82dd6cf 100644 --- a/src/axolotl/integrations/lm_eval/__init__.py +++ b/src/axolotl/integrations/lm_eval/__init__.py @@ -5,7 +5,7 @@ Module for the Plugin for LM Eval Harness import subprocess # nosec from axolotl.integrations.base import BasePlugin -from axolotl.integrations.lm_eval.cli import build_lm_eval_command +from axolotl.integrations.lm_eval.cli import build_lm_eval_command, get_model_path from .args import LMEvalArgs as LMEvalArgs @@ -29,7 +29,7 @@ class LMEvalPlugin(BasePlugin): wandb_project=cfg.wandb_project, wandb_entity=cfg.wandb_entity, wandb_name=cfg.wandb_name, - model=cfg.lm_eval_model or cfg.hub_model_id, + model=get_model_path(cfg), ): subprocess.run( # nosec lm_eval_args, diff --git a/src/axolotl/integrations/lm_eval/cli.py b/src/axolotl/integrations/lm_eval/cli.py index ead82dcb7..4b905d476 100644 --- a/src/axolotl/integrations/lm_eval/cli.py +++ b/src/axolotl/integrations/lm_eval/cli.py @@ -13,6 +13,21 @@ import yaml from axolotl.utils.dict import DictDefault +def get_model_path(cfg: DictDefault) -> str | None: + """ + Determine which model path to use for evaluation. + + Priority order (highest to lowest): + 1. lm_eval_model - Explicit model path override + 2. hub_model_id - Model pushed to HuggingFace Hub + 3. None - Falls back to output_dir in build_lm_eval_command + + Returns: + Model path string or None to use output_dir fallback + """ + return cfg.lm_eval_model or cfg.hub_model_id or None + + def build_lm_eval_command( tasks: list[str], bfloat16=True, @@ -108,7 +123,7 @@ def lm_eval(config: str, cloud: Optional[str] = None): wandb_project=cfg.wandb_project, wandb_entity=cfg.wandb_entity, wandb_name=cfg.wandb_name, - model=cfg.lm_eval_model or cfg.hub_model_id, + model=get_model_path(cfg), revision=cfg.revision, apply_chat_template=cfg.apply_chat_template, fewshot_as_multiturn=cfg.fewshot_as_multiturn,