Merge branch 'main' into cj_tokenizer_default_prompt_template

2024-10-11 12:29:34 +07:00
parent 7b4b665e99 2fbc6b0c64
commit 3c6a6c61be
9 changed files with 168 additions and 7 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -46,3 +46,9 @@ gcsfs>=2024.5.0
 trl==0.9.6
 zstandard==0.22.0
 fastcore
+
+# lm eval harness
+lm_eval==0.4.4
+langdetect==1.0.9
+immutabledict==4.2.0
+antlr4-python3-runtime==4.13.2
--- a/src/axolotl/cli/init.py
+++ b/src/axolotl/cli/init.py
@@ -55,8 +55,22 @@ LOG = logging.getLogger("axolotl.scripts")

 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

+AXOLOTL_LOGO = """
+     #@@ #@@      @@# @@#
+    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
+    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
+      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
+    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@
+    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@
+     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@
+                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@
+    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@
+                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
+    @@@@  @@@@@@@@@@@@@@@@
+"""

-def print_axolotl_text_art(suffix=None):
+
+def print_legacy_axolotl_text_art(suffix=None):
    font = "nancyj"
    ascii_text = "  axolotl"
    if suffix:
@@ -69,6 +83,13 @@ def print_axolotl_text_art(suffix=None):
    print_dep_versions()


+def print_axolotl_text_art(
+    **kwargs,  # pylint: disable=unused-argument
+):
+    if is_main_process():
+        print(AXOLOTL_LOGO)
+
+
 def print_dep_versions():
    packages = ["accelerate", "peft", "transformers", "trl", "torch", "bitsandbytes"]
    max_len = max(len(pkg) for pkg in packages)
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -3,13 +3,11 @@ CLI to run training on a model
 """
 import logging
 from pathlib import Path
-from typing import Tuple, Union
+from typing import Union

 import fire
 from dotenv import load_dotenv
 from transformers.hf_argparser import HfArgumentParser
-from transformers.modeling_utils import PreTrainedModel
-from transformers.tokenization_utils import PreTrainedTokenizer

 from axolotl.cli import (
    check_accelerate_default_config,
@@ -20,6 +18,7 @@ from axolotl.cli import (
    print_axolotl_text_art,
 )
 from axolotl.common.cli import TrainerCliArgs
+from axolotl.integrations.base import PluginManager
 from axolotl.prompt_strategies.sharegpt import (
    register_chatml_template,
    register_llama3_template,
@@ -39,7 +38,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    return do_train(parsed_cfg, parsed_cli_args)


-def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
+def do_train(cfg, cli_args) -> None:
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()
@@ -64,7 +63,13 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

-    return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+    model, tokenizer = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
+    plugin_manager = PluginManager.get_instance()
+
+    del model
+    del tokenizer
+
+    plugin_manager.post_train_unload(cfg)


 if __name__ == "__main__":
--- a/src/axolotl/integrations/base.py
+++ b/src/axolotl/integrations/base.py
@@ -159,6 +159,29 @@ class BasePlugin:
        List[callable]: A list of callback functions to be added to the TrainingArgs
        """

+    def post_train(self, cfg, model):
+        """
+        Performs actions after training is complete.
+
+        Parameters:
+        cfg (dict): The axolotl configuration
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+
+    def post_train_unload(self, cfg):
+        """
+        Performs actions after training is complete and the model is unloaded.
+
+        Parameters:
+        cfg (dict): The configuration for the plugin.
+
+        Returns:
+        None
+        """
+

 def load_plugin(plugin_name: str) -> BasePlugin:
    """
@@ -381,3 +404,17 @@ class PluginManager:
        for plugin in self.plugins:
            callbacks.extend(plugin.add_callbacks_post_trainer(cfg, trainer))
        return callbacks
+
+    def post_train_unload(self, cfg):
+        """
+        Calls the post_train_unload method of all registered plugins.
+
+        Parameters:
+        cfg (dict): The configuration for the plugins.
+        model (object): The loaded model.
+
+        Returns:
+        None
+        """
+        for plugin in self.plugins:
+            plugin.post_train_unload(cfg)
--- a/src/axolotl/integrations/lm_eval/README.md
+++ b/src/axolotl/integrations/lm_eval/README.md
@@ -0,0 +1,13 @@
+# LM Eval Harness
+
+### Usage
+
+```yaml
+plugins:
+  - axolotl.integrations.lm_eval.LMEvalPlugin
+
+lm_eval_tasks:
+  - gsm8k
+  - hellaswag
+  - arc_easy
+```
--- a/src/axolotl/integrations/lm_eval/init.py
+++ b/src/axolotl/integrations/lm_eval/init.py
@@ -0,0 +1,42 @@
+"""
+Module for the Plugin for LM Eval Harness
+"""
+import subprocess  # nosec
+from datetime import datetime
+
+from axolotl.integrations.base import BasePlugin
+
+from .args import LMEvalArgs  # pylint: disable=unused-import. # noqa: F401
+
+
+class LMEvalPlugin(BasePlugin):
+    """
+    Plugin for LM Evaluation Harness integraton with Axolotl.
+    """
+
+    def get_input_args(self):
+        return "axolotl.integrations.lm_eval.LMEvalArgs"
+
+    def post_train_unload(self, cfg):
+        tasks = ",".join(cfg.lm_eval_tasks)
+        fa2 = ",attn_implementation=flash_attention_2" if cfg.flash_attention else ""
+        dtype = ",dtype=bfloat16" if cfg.bf16 else ",dtype=float16"
+        output_path = cfg.output_dir
+        output_path += "" if cfg.output_dir.endswith("/") else "/"
+        output_path += "lm_eval_results/" + datetime.now().strftime("%Y%m%d_%H%M%S")
+        subprocess.run(  # nosec
+            [
+                "lm_eval",
+                "--model",
+                "hf",
+                "--model_args",
+                f"pretrained={cfg.output_dir}{fa2}{dtype}",
+                "--tasks",
+                tasks,
+                "--batch_size",
+                str(cfg.lm_eval_batch_size),
+                "--output_path",
+                output_path,
+            ],
+            check=True,
+        )
--- a/src/axolotl/integrations/lm_eval/args.py
+++ b/src/axolotl/integrations/lm_eval/args.py
@@ -0,0 +1,15 @@
+"""
+Module for handling lm eval harness input arguments.
+"""
+from typing import List, Optional
+
+from pydantic import BaseModel
+
+
+class LMEvalArgs(BaseModel):
+    """
+    Input args for lm eval harness
+    """
+
+    lm_eval_tasks: List[str] = []
+    lm_eval_batch_size: Optional[int] = 8
--- a/src/axolotl/utils/chat_templates.py
+++ b/src/axolotl/utils/chat_templates.py
@@ -30,7 +30,9 @@ _CHAT_TEMPLATES = {

 CHAT_TEMPLATES = {
    "alpaca": "{% for message in messages %}{% if message['role'] == 'user' %}{{ '### Instruction: ' + message['content'] + '\n\n' }}{% elif message['role'] == 'assistant' %}{{ '### Response: ' + message['content'] + eos_token}}{% endif %}{% endfor %}",
-    "inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # I don't know what this one is called. Used by Mistral/Mixtral.
+    "mistral_v1": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # Mistral 7B V1, Mistral 7B V2, Mixtral 8x7B V1...
+    "mistral_v2v3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # V3: Mistral 7B V3, Small, Large...
+    "mistral_v3_tekken": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST]' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}",  # V3-Tekken: Nemo, Pixtral...
    "chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
    "gemma": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}",
    "cohere": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}",
--- a/src/axolotl/utils/config/models/input/v0_4_1/init.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/init.py
@@ -1051,6 +1051,26 @@ class AxolotlInputConfig(
                "evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
            )

+        if data.get("do_bench_eval") and not (
+            data.get("evals_per_epoch") or data.get("eval_steps")
+        ):
+            raise ValueError(
+                "do_bench_eval requires evals_per_epoch or eval_steps to be set."
+            )
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_test_datasets_bench(cls, data):
+        if (
+            data.get("do_bench_eval")
+            and not data.get("test_datasets")
+            and not data.get("val_set_size")
+        ):
+            LOG.warning(
+                "`do_bench_eval` needs a test dataset to run evals, adding an empty test_dataset."
+            )
+            data["test_datasets"] = [{"path": "axolotl-ai-co/empty-test-ds"}]
        return data

    @model_validator(mode="before")