Fix eval + add smoke test (#2586)

* fix evaluate CLI * add smoke test * fix naming * lint
2025-04-29 12:58:54 -04:00
parent c64a951bc9
commit f04f7cf5ad
3 changed files with 83 additions and 26 deletions
--- a/src/axolotl/cli/evaluate.py
+++ b/src/axolotl/cli/evaluate.py
@@ -1,6 +1,7 @@
 """CLI to run evaluation on a model."""
 import logging
 import os
 from pathlib import Path
 from typing import Union
@@ -14,6 +15,7 @@ from axolotl.cli.checks import check_accelerate_default_config, check_user_token
 from axolotl.cli.config import load_cfg
 from axolotl.common.datasets import load_datasets, load_preference_datasets
 from axolotl.evaluate import evaluate
 from axolotl.utils import set_pytorch_cuda_alloc_conf
 from axolotl.utils.dict import DictDefault
 LOG = logging.getLogger(__name__)
@@ -29,10 +31,14 @@ def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: CLI arguments.
    """
    # Enable expandable segments for cuda allocation to improve VRAM usage
    set_pytorch_cuda_alloc_conf()
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    check_accelerate_default_config()
-    check_user_token()
+    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        check_user_token()
    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
--- a/src/axolotl/evaluate.py
+++ b/src/axolotl/evaluate.py
@@ -12,11 +12,12 @@ from datasets import Dataset
 from transformers.trainer import Trainer
 from axolotl.logging_config import configure_logging
-from axolotl.train import TrainDatasetMeta
+from axolotl.train import (
-from axolotl.utils import set_pytorch_cuda_alloc_conf
+    TrainDatasetMeta,
    setup_model_and_tokenizer,
 )
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import cleanup_distributed
 from axolotl.utils.models import load_model, load_processor, load_tokenizer
 from axolotl.utils.trainer import setup_trainer
 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -24,7 +25,7 @@ src_dir = os.path.join(project_root, "src")
 sys.path.insert(0, src_dir)
 configure_logging()
-LOG = get_logger("axolotl.evaluate")
+LOG = get_logger(__name__)
 def evaluate_dataset(
@@ -75,37 +76,22 @@ def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, f
    Returns:
        Dictionary mapping metric names to their values.
    """
-    # pylint: disable=duplicate-code
+    # Load tokenizer, processor and model
-    # Enable expandable segments for cuda allocation to improve VRAM usage
+    LOG.debug("loading model for evaluation...")
-    set_pytorch_cuda_alloc_conf()
+    model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)
    # Load tokenizer
    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
        main_process_only=True,
    )
    tokenizer = load_tokenizer(cfg)
    # Load processor for multimodal models if needed
    processor = None
    if cfg.is_multimodal:
        processor = load_processor(cfg, tokenizer)
    # Get datasets
    # pylint: disable=duplicate-code
    train_dataset = dataset_meta.train_dataset
    eval_dataset = dataset_meta.eval_dataset
    total_num_steps = dataset_meta.total_num_steps
    # Load model
    LOG.debug("loading model for evaluation...")
    model, _ = load_model(cfg, tokenizer, processor=processor)
    # Set up trainer
    trainer = setup_trainer(
-        cfg,
+        cfg=cfg,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
-        model=(model, None, None),  # No need for model_ref or peft_config
+        model=model,
        tokenizer=tokenizer,
        processor=processor,
        total_num_steps=total_num_steps,
--- a/tests/e2e/test_evaluate.py
+++ b/tests/e2e/test_evaluate.py
@@ -0,0 +1,65 @@
 """E2E smoke test for evaluate CLI command"""
 import os
 from pathlib import Path
 import yaml
 from accelerate.test_utils import execute_subprocess_async
 from transformers.testing_utils import get_torch_dist_unique_port
 from axolotl.utils.dict import DictDefault
 os.environ["WANDB_DISABLED"] = "true"
 class TestE2eEvaluate:
    """Test cases for evaluate CLI"""
    def test_evaluate(self, temp_dir):
        # pylint: disable=duplicate-code
        cfg = DictDefault(
            {
                "base_model": "JackFram/llama-68m",
                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
            }
        )
        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.evaluate",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )