fix for gather across multiple gpus

gather benchmarks from all ranks
improve support for customized dataset for bench evals
2023-08-29 06:57:28 -07:00 · 2023-08-28 11:29:59 -04:00 · 2023-08-28 06:03:53 -04:00 · 2023-08-28 05:43:19 -04:00 · 2023-08-28 05:39:13 -04:00 · 2023-08-28 05:39:13 -04:00
10 changed files with 96 additions and 206 deletions
--- a/README.md
+++ b/README.md
@@ -328,15 +328,6 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
      name: enron_emails
      type: completion # format from earlier

-  # huggingface repo with multiple named configurations/subsets
-  datasets:
-    - path: bigcode/commitpackft
-      name:
-        - ruby
-        - python
-        - typescript
-      type: ... # unimplemented custom format
-
  # local
  datasets:
    - path: data.jsonl # or json
@@ -416,10 +407,6 @@ fp16: true
 # Use CUDA tf32
 tf32: true # require >=ampere

-# No AMP (automatic mixed precision)
-bfloat16: true # require >=ampere
-float16: true
-
 # a list of one or more datasets to finetune the model with
 datasets:
  # hf dataset repo | "json" for local dataset, make sure to fill data_files
@@ -472,9 +459,6 @@ dataset_shard_idx:
 # the maximum length of an input to train with, this should typically be less than 2048
 # as most models have a token/context limit of 2048
 sequence_len: 2048
-# pad inputs so each step uses constant sized buffers
-# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
-pad_to_sequence_len:
 # max sequence length to concatenate training samples together up to
 # inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
 # FutureWarning: This will soon be DEPRECATED
@@ -626,6 +610,9 @@ deepspeed:
 # Path to torch distx for optim 'adamw_anyprecision'
 torchdistx_path:

+# Set padding for data collator to 'longest'
+collator_pad_to_longest:
+
 # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
 pretraining_dataset:

@@ -665,7 +652,6 @@ fsdp:
 fsdp_config:
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
-  fsdp_sync_module_states: true
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
 ```

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -11,7 +11,7 @@ RUN apt-get update && \

 WORKDIR /workspace

-RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main"
+RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
 RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
 # If AXOLOTL_EXTRAS is set, append it in brackets
 RUN cd axolotl && \
--- a/examples/pythia-12b/config.yml
+++ b/examples/pythia-12b/config.yml
@@ -47,3 +47,4 @@ local_rank:
 gradient_checkpointing: true
 fsdp:
 fsdp_config:
+collator_pad_to_longest: true
--- a/requirements.txt
+++ b/requirements.txt
@@ -25,4 +25,3 @@ rouge-score==0.1.2
 scipy
 scikit-learn==1.2.2
 pynvml
-art
--- a/scripts/finetune.py
+++ b/scripts/finetune.py
@@ -6,17 +6,14 @@ import os
 import random
 import signal
 import sys
-from dataclasses import dataclass, field
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Union

 import fire
 import torch
-import transformers
 import yaml

 # add src to the pythonpath so we don't need to pip install this
-from art import text2art
 from optimum.bettertransformer import BetterTransformer
 from transformers import GenerationConfig, TextStreamer

@@ -25,7 +22,7 @@ from axolotl.utils.config import normalize_config, validate_config
 from axolotl.utils.data import prepare_dataset
 from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
-from axolotl.utils.models import load_model, load_model_config, load_tokenizer
+from axolotl.utils.models import load_model, load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
 from axolotl.utils.trainer import setup_trainer
 from axolotl.utils.wandb import setup_wandb_env_vars
@@ -40,26 +37,16 @@ LOG = logging.getLogger("axolotl.scripts")
 os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


-@dataclass
-class TrainerCliArgs:
-    """
-    dataclass representing the various non-training arguments
-    """
+def print_axolotl_text_art():
+    ascii_art = """
+                           dP            dP   dP
+                           88            88   88
+.d8888b. dP.  .dP .d8888b. 88 .d8888b. d8888P 88
+88'  `88  `8bd8'  88'  `88 88 88'  `88   88   88
+88.  .88  .d88b.  88.  .88 88 88.  .88   88   88
+`88888P8 dP'  `dP `88888P' dP `88888P'   dP   dP
+"""

-    debug: bool = field(default=False)
-    inference: bool = field(default=False)
-    merge_lora: bool = field(default=False)
-    prepare_ds_only: bool = field(default=False)
-    prompter: Optional[str] = field(default=None)
-    shard: bool = field(default=False)
-
-
-def print_axolotl_text_art(suffix=None):
-    font = "nancyj"
-    ascii_text = "  axolotl"
-    if suffix:
-        ascii_text += f"  x  {suffix}"
-    ascii_art = text2art(" axolotl", font=font)
    if is_main_process():
        print(ascii_art)

@@ -74,8 +61,6 @@ def get_multi_line_input() -> Optional[str]:


 def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
-    if prompter == "None":
-        prompter = None
    default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}

    for token, symbol in default_tokens.items():
@@ -150,10 +135,6 @@ def choose_config(path: Path):
            "No YAML config files found in the specified directory. Are you using a .yml extension?"
        )

-    if len(yaml_files) == 1:
-        print(f"Using default YAML file '{yaml_files[0]}'")
-        return yaml_files[0]
-
    print("Choose a YAML file:")
    for idx, file in enumerate(yaml_files):
        print(f"{idx + 1}. {file}")
@@ -177,20 +158,45 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b


 def train(
-    *,
-    cfg: DictDefault,
-    cli_args: TrainerCliArgs,
+    config: Path = Path("configs/"),
+    prepare_ds_only: bool = False,
+    **kwargs,
 ):
+    print_axolotl_text_art()
+    if Path(config).is_dir():
+        config = choose_config(config)
+
+    # load the config from the yaml file
+    with open(config, encoding="utf-8") as file:
+        cfg: DictDefault = DictDefault(yaml.safe_load(file))
+    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
+    # then overwrite the value
+    cfg_keys = cfg.keys()
+    for k, _ in kwargs.items():
+        # if not strict, allow writing to cfg even if it's not in the yml already
+        if k in cfg_keys or not cfg.strict:
+            # handle booleans
+            if isinstance(cfg[k], bool):
+                cfg[k] = bool(kwargs[k])
+            else:
+                cfg[k] = kwargs[k]
+
+    validate_config(cfg)
+
+    normalize_config(cfg)
+
+    setup_wandb_env_vars(cfg)
+
    # load the tokenizer first
    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)

-    if not (
-        cli_args.shard or cli_args.merge_lora or cli_args.inference
+    if (
+        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
    ):  # don't need to load dataset for these
        train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)

-    if cli_args.debug or cfg.debug:
+    if cfg.debug or "debug" in kwargs:
        LOG.info("check_dataset_labels...")
        check_dataset_labels(
            train_dataset.select(
@@ -199,17 +205,17 @@ def train(
            tokenizer,
        )

-    if cli_args.prepare_ds_only:
+    if prepare_ds_only:
        LOG.info("Finished preparing dataset. Exiting...")
        return

    # Load the model and tokenizer
    LOG.info("loading model and (optionally) peft_config...")
-    model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
+    model, peft_config = load_model(cfg, tokenizer)

    safe_serialization = cfg.save_safetensors is True

-    if cli_args.merge_lora and cfg.adapter is not None:
+    if "merge_lora" in kwargs and cfg.adapter is not None:
        LOG.info("running merge of LoRA with base model")
        model = model.merge_and_unload()
        model.to(dtype=torch.float16)
@@ -223,13 +229,18 @@ def train(
            tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
        return

-    if cli_args.inference:
-        LOG.debug("Running inference on model")
-        do_inference(cfg, model, tokenizer, prompter=cli_args.prompter)
+    if cfg.inference:
+        LOG.info("calling do_inference function")
+        prompter: Optional[str] = "AlpacaPrompter"
+        if "prompter" in kwargs:
+            if kwargs["prompter"] == "None":
+                prompter = None
+            else:
+                prompter = kwargs["prompter"]
+        do_inference(cfg, model, tokenizer, prompter=prompter)
        return

-    if cli_args.shard:
-        LOG.debug("Re-saving model w/ sharding")
+    if "shard" in kwargs:
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
        return

@@ -311,51 +322,5 @@ def train(
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)


-def load_cfg(config: Path = Path("examples/"), **kwargs):
-    if Path(config).is_dir():
-        config = choose_config(config)
-
-    # load the config from the yaml file
-    with open(config, encoding="utf-8") as file:
-        cfg: DictDefault = DictDefault(yaml.safe_load(file))
-    # if there are any options passed in the cli, if it is something that seems valid from the yaml,
-    # then overwrite the value
-    cfg_keys = cfg.keys()
-    for k, _ in kwargs.items():
-        # if not strict, allow writing to cfg even if it's not in the yml already
-        if k in cfg_keys or not cfg.strict:
-            # handle booleans
-            if isinstance(cfg[k], bool):
-                cfg[k] = bool(kwargs[k])
-            else:
-                cfg[k] = kwargs[k]
-
-    model_config = load_model_config(cfg)
-
-    # figure out if the model is llama
-    cfg.is_llama_derived_model = (
-        (hasattr(model_config, "model_type") and model_config.model_type == "llama")
-        or cfg.is_llama_derived_model
-        or "llama" in cfg.base_model
-        or (cfg.model_type and "llama" in cfg.model_type.lower())
-    )
-    validate_config(cfg)
-
-    normalize_config(cfg)
-
-    setup_wandb_env_vars(cfg)
-    return cfg
-
-
-def do_train(config: Path = Path("examples/"), **kwargs):
-    print_axolotl_text_art()
-    parsed_cfg = load_cfg(config, **kwargs)
-    parser = transformers.HfArgumentParser((TrainerCliArgs))
-    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
-        return_remaining_strings=True
-    )
-    train(cfg=parsed_cfg, cli_args=parsed_cli_args)
-
-
 if __name__ == "__main__":
-    fire.Fire(do_train)
+    fire.Fire(train)
--- a/src/axolotl/monkeypatch/fsdp.py
+++ b/src/axolotl/monkeypatch/fsdp.py
@@ -1,45 +0,0 @@
-"""
-Monkeypatch to fix fsdp set state when no previous state was set
-"""
-
-import contextlib
-from typing import Generator, Optional
-
-import torch
-from torch import nn
-from torch.distributed.fsdp.api import (
-    OptimStateDictConfig,
-    StateDictConfig,
-    StateDictType,
-)
-from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel
-
-
-@staticmethod
-@contextlib.contextmanager
-def state_dict_type_patch(
-    module: nn.Module,
-    state_dict_type: StateDictType,
-    state_dict_config: Optional[StateDictConfig] = None,
-    optim_state_dict_config: Optional[OptimStateDictConfig] = None,
-) -> Generator:
-    prev_state_dict_settings = FullyShardedDataParallel.set_state_dict_type(
-        module,
-        state_dict_type,
-        state_dict_config,
-        optim_state_dict_config,
-    )
-    yield
-    if prev_state_dict_settings.state_dict_type:
-        FullyShardedDataParallel.set_state_dict_type(
-            module,
-            prev_state_dict_settings.state_dict_type,
-            prev_state_dict_settings.state_dict_config,
-            prev_state_dict_settings.optim_state_dict_config,
-        )
-
-
-def replace_fsdp_state_dict_type():
-    torch.distributed.fsdp.fully_sharded_data_parallel.FullyShardedDataParallel.state_dict_type = (
-        state_dict_type_patch
-    )
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -152,16 +152,6 @@ def validate_config(cfg):
    if (cfg.base_model and "falcon" in cfg.base_model.lower()) and cfg.fsdp:
        raise ValueError("FSDP is not supported for falcon models")

-    if (
-        cfg.fsdp
-        and cfg.fsdp_config
-        and cfg.fsdp_config.fsdp_state_dict_type
-        and not cfg.fsdp_config.fsdp_sync_module_states
-    ):
-        LOG.warning(
-            "We recommend setting fsdp_config.fsdp_sync_module_states to `true`"
-        )
-
    if (
        cfg.base_model and "mpt" in cfg.base_model.lower()
    ) and cfg.gradient_checkpointing:
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -134,17 +134,8 @@ def load_tokenized_prepared_datasets(
            seed = 42

        datasets = []
-
-        def for_d_in_datasets(dataset_configs):
-            for dataset in dataset_configs:
-                if dataset.name and isinstance(dataset.name, list):
-                    for name in dataset.name:
-                        yield DictDefault({**dataset, "name": name})
-                else:
-                    yield dataset
-
        # pylint: disable=invalid-name
-        for d in for_d_in_datasets(cfg.datasets):
+        for d in cfg.datasets:
            ds: Union[Dataset, DatasetDict] = None
            ds_from_hub = False
            try:
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -5,13 +5,12 @@ import logging
 import math
 import os
 from pathlib import Path
-from typing import Optional, Tuple  # noqa: F401
+from typing import TYPE_CHECKING, Optional, Tuple  # noqa: F401

 import bitsandbytes as bnb
 import torch
 import transformers
 from optimum.bettertransformer import BetterTransformer
-from peft import PeftConfig
 from transformers import (  # noqa: F401
    AutoConfig,
    AutoModelForCausalLM,
@@ -24,17 +23,13 @@ from transformers import (  # noqa: F401

 from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
 from axolotl.utils.bench import log_gpu_memory_usage
-from axolotl.utils.dict import DictDefault

 LOG = logging.getLogger("axolotl")

+if TYPE_CHECKING:
+    from peft import PeftConfig  # noqa: F401

-def load_model_config(cfg):
-    model_config_name = cfg.base_model_config or cfg.base_model
-    trust_remote_code: bool = False or cfg.trust_remote_code
-    return AutoConfig.from_pretrained(
-        model_config_name, trust_remote_code=trust_remote_code
-    )
+    from axolotl.utils.dict import DictDefault  # noqa: F401


 def load_tokenizer(cfg):
@@ -91,10 +86,8 @@ def load_tokenizer(cfg):


 def load_model(
-    cfg: DictDefault,
-    tokenizer: PreTrainedTokenizerBase,
-    inference: bool = False,
-) -> Tuple[PreTrainedModel, Optional[PeftConfig]]:
+    cfg, tokenizer
+):  # type: (DictDefault, PreTrainedTokenizerBase) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
    """
    Load a model for a given configuration and tokenizer.
    """
@@ -104,9 +97,14 @@ def load_model(

    # TODO refactor as a kwarg
    load_in_8bit = cfg.load_in_8bit
+    cfg.is_llama_derived_model = (
+        "llama" in base_model
+        or (cfg.model_type and "llama" in cfg.model_type.lower())
+        or cfg.is_llama_derived_model
+    )

    if cfg.is_llama_derived_model and cfg.flash_attention:
-        if cfg.device not in ["mps", "cpu"] and not inference:
+        if cfg.device not in ["mps", "cpu"] and not cfg.inference:
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                replace_llama_attn_with_flash_attn,
            )
@@ -148,7 +146,7 @@ def load_model(
    if (
        cfg.is_llama_derived_model
        and (cfg.max_packed_sequence_len or cfg.sample_packing)
-        and not inference
+        and not cfg.inference
    ):
        from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask

@@ -426,15 +424,15 @@ def load_model(
    return model, lora_config


-def load_adapter(model, cfg, adapter, inference=False):
-    # type: (PreTrainedModel, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+def load_adapter(model, cfg, adapter):
+    # type: (PreTrainedModel, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]

    if adapter is None:
        return model, None
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    if adapter in ["lora", "qlora"]:
-        return load_lora(model, cfg, inference=inference)
+        return load_lora(model, cfg)
    if adapter == "llama-adapter":
        return load_llama_adapter(model, cfg)

@@ -466,8 +464,12 @@ def load_llama_adapter(model, cfg):
    return model, peft_config


-def find_all_linear_names(model):
-    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
+def find_all_linear_names(bits, model):
+    cls = (
+        bnb.nn.Linear4bit
+        if bits == 4
+        else (bnb.nn.Linear8bitLt if bits == 8 else torch.nn.Linear)
+    )
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
@@ -480,15 +482,21 @@ def find_all_linear_names(model):
    return list(lora_module_names)


-def load_lora(model, cfg, inference=False):
-    # type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
+def load_lora(model, cfg):
+    # type: (PreTrainedModel, DictDefault) -> Tuple[PreTrainedModel, Optional[PeftConfig]]

    from peft import LoraConfig, PeftModel, get_peft_model

    lora_target_modules = list(cfg.lora_target_modules or [])

    if cfg.lora_target_linear:
-        linear_names = find_all_linear_names(model)
+        bits = None
+        if cfg.load_in_4bit:
+            bits = 4
+        elif cfg.load_in_8bit:
+            bits = 8
+
+        linear_names = find_all_linear_names(bits, model)
        LOG.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))

@@ -508,7 +516,7 @@ def load_lora(model, cfg, inference=False):
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
-            is_trainable=(not inference),
+            is_trainable=not cfg.inference,
        )
    else:
        model = get_peft_model(model, lora_config)
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -471,9 +471,6 @@ def setup_fsdp_envs(cfg):
        os.environ[
            "FSDP_TRANSFORMER_CLS_TO_WRAP"
        ] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap
-    from axolotl.monkeypatch.fsdp import replace_fsdp_state_dict_type
-
-    replace_fsdp_state_dict_type()


 def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
@@ -650,12 +647,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        callbacks.append(SaveBetterTransformerModelCallback)

    data_collator_kwargs = {
-        "padding": True,  # True/"longest" is the default
+        "padding": True,
    }
-    if cfg.pad_to_sequence_len:
-        data_collator_kwargs["pad_to_multiple_of"] = 64 * math.ceil(
-            cfg.sequence_len / 64
-        )
+    if cfg.collator_pad_to_longest:
+        data_collator_kwargs["padding"] = "longest"
    else:
        # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
        # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
Author	SHA1	Message	Date
Wing Lian	c3de28942c	fix for gather across multiple gpus [some checks failed: pre-commit / pre-commit (push) cancelled; PyTest / test (3.10) (push) cancelled; PyTest / test (3.9) (push) cancelled]	2023-08-29 06:57:28 -07:00
Wing Lian	45848a9285	gather benchmarks from all ranks	2023-08-28 11:29:59 -04:00
Wing Lian	d6cea18034	improve support for customized dataset for bench evals	2023-08-28 06:03:53 -04:00
Wing Lian	606846e0a5	missing transformers import	2023-08-28 05:43:19 -04:00
Wing Lian	a6c9223114	more fixes	2023-08-28 05:39:13 -04:00
Wing Lian	8b16ecd448	updated dataset	2023-08-28 05:39:13 -04:00
Wing Lian	f5db88a10d	fixes	2023-08-28 05:39:13 -04:00
Wing Lian	99d844f215	benchmark callback has its own dataloader and collator	2023-08-28 05:39:13 -04:00
Wing Lian	aefd4d74fa	better handling when no subjects	2023-08-28 05:39:13 -04:00
Wing Lian	24b0e93235	dataset handling and aggregate across benchmark	2023-08-28 05:39:13 -04:00
Wing Lian	2455254b92	more fixes	2023-08-28 05:39:13 -04:00
Wing Lian	918e040601	rename mmlu to bench	2023-08-28 05:39:13 -04:00
Wing Lian	ef062d8fcb	more fixes	2023-08-28 05:39:13 -04:00
Wing Lian	d4c8b66f3d	fix elif and add better messaging	2023-08-28 05:39:13 -04:00
Wing Lian	64e9824d3e	fix the data file	2023-08-28 05:39:13 -04:00
Wing Lian	1134654c98	sample benchmarks, ensure we drop long samples	2023-08-28 05:39:13 -04:00
Wing Lian	2fc756c289	fix mmlu evals	2023-08-28 05:39:13 -04:00
Wing Lian	943b84c490	another callback fix for collator max len attribute	2023-08-28 05:39:13 -04:00
Wing Lian	6f166464d8	include metrics in callback	2023-08-28 05:39:13 -04:00
Wing Lian	e3b07402a7	make sure to define all the explicit positional args	2023-08-28 05:39:13 -04:00
Wing Lian	8d3c8a3eab	default to mmlu-zs	2023-08-28 05:39:13 -04:00
Wing Lian	c30120e684	use hf dataset for mmlu evals	2023-08-28 05:39:13 -04:00
Wing Lian	9aed60fa54	add mmlu callback	2023-08-28 05:39:12 -04:00