Compare commits (15 commits): feature/re... → feature/at...
| Author | SHA1 | Date |
|---|---|---|
|  | 956a177678 |  |
|  | 747e84d3bb |  |
|  | c45a786039 |  |
|  | 70e6c28121 |  |
|  | 729c299256 |  |
|  | 86a91e260b |  |
|  | 094fc2c6e6 |  |
|  | 2dafa730ef |  |
|  | 343ac84e5a |  |
|  | 0c967279ce |  |
|  | efb3b2c95e |  |
|  | 7b55fe6419 |  |
|  | e029ab34ea |  |
|  | 8cec513447 |  |
|  | a13e45d548 |  |
.github/FUNDING.yml (new file, 13 lines, vendored)
```
@@ -0,0 +1,13 @@
# These are supported funding model platforms

github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']
```
@@ -136,7 +136,7 @@ Have dataset(s) in one of the following format (JSONL recommended):

```json
{"instruction": "...", "input": "...", "output": "..."}
```

- `sharegpt:chat`: conversations
- `sharegpt:chat`: conversations where `from` is `human`/`gpt`

```json
{"conversations": [{"from": "...", "value": "..."}]}
```

@@ -225,6 +225,10 @@ Have dataset(s) in one of the following format (JSONL recommended):

```json
{"conversations": [{"role": "...", "value": "..."}]}
```

- `sharegpt_simple.load_guanaco`: conversations where `from` is `prompter`/`assistant` instead of default sharegpt

```json
{"conversations": [{"from": "...", "value": "..."}]}
```

- `sharegpt_jokes`: creates a chat where bot is asked to tell a joke, then explain why the joke is funny

```json
{"conversations": [{"title": "...", "text": "...", "explanation": "..."}]}
```
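For a quick sanity check of a dataset in one of these formats, a minimal sketch in Python (the file name `data.jsonl` and the key set are illustrative assumptions, matching the instruction-style rows shown above):

```python
import json

REQUIRED_KEYS = {"instruction", "input", "output"}  # assumed schema, per the example above

with open("data.jsonl", encoding="utf-8") as fh:  # hypothetical file name
    for lineno, line in enumerate(fh, start=1):
        row = json.loads(line)  # JSONL: one standalone JSON object per line
        missing = REQUIRED_KEYS - row.keys()
        if missing:
            raise ValueError(f"line {lineno} is missing keys: {missing}")
```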
```
@@ -18,7 +18,7 @@ from optimum.bettertransformer import BetterTransformer
from transformers import GenerationConfig, TextStreamer

from axolotl.logging_config import configure_logging
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.data import load_prepare_datasets, load_pretraining_dataset
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import barrier, is_main_process
@@ -29,7 +29,6 @@ from axolotl.utils.trainer import (
    process_datasets_for_packing,
    setup_trainer,
)
from axolotl.utils.validation import validate_config
from axolotl.utils.wandb import setup_wandb_env_vars

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -44,27 +43,6 @@ DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"


def choose_device(cfg):
    def get_device():
        try:
            if torch.cuda.is_available():
                return f"cuda:{cfg.local_rank}"

            if torch.backends.mps.is_available():
                return "mps"

            raise SystemError("No CUDA/mps device found")
        except Exception:  # pylint: disable=broad-exception-caught
            return "cpu"

    cfg.device = get_device()
    if cfg.device_map != "auto":
        if cfg.device.startswith("cuda"):
            cfg.device_map = {"": cfg.local_rank}
        else:
            cfg.device_map = {"": cfg.device}


def get_multi_line_input() -> Optional[str]:
    print("Give me an instruction (Ctrl + D to finish): ")
    instruction = ""
@@ -194,36 +172,13 @@ def train(

    validate_config(cfg)

    # setup some derived config / hyperparams
    cfg.gradient_accumulation_steps = cfg.gradient_accumulation_steps or (
        cfg.batch_size // cfg.micro_batch_size
    )
    cfg.batch_size = (
        cfg.batch_size or cfg.micro_batch_size * cfg.gradient_accumulation_steps
    )
    cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
    cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
    choose_device(cfg)
    cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
    if cfg.ddp:
        cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
        cfg.batch_size = cfg.batch_size * cfg.world_size
    normalize_config(cfg)

    setup_wandb_env_vars(cfg)
    if cfg.device == "mps":
        cfg.load_in_8bit = False
        cfg.tf32 = False
        if cfg.bf16:
            cfg.fp16 = True
        cfg.bf16 = False

    if cfg.tf32:
        torch.backends.cuda.matmul.allow_tf32 = True

    # load the tokenizer first
    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
    LOG.info(f"loading tokenizer... {tokenizer_config}")
    tokenizer = load_tokenizer(tokenizer_config, cfg.tokenizer_type, cfg)
    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)

    if (
        check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
@@ -269,8 +224,6 @@ def train(
        LOG.info("Finished preparing dataset. Exiting...")
        return

    log_gpu_memory_usage(LOG, "baseline", cfg.device)

    # Load the model and tokenizer
    LOG.info("loading model and (optionally) peft_config...")
    model, peft_config = load_model(cfg, tokenizer)
@@ -354,6 +307,7 @@ def train(

    if not Path(cfg.output_dir).is_dir():
        os.makedirs(cfg.output_dir, exist_ok=True)
    tokenizer.save_pretrained(cfg.output_dir)
    if cfg.flash_optimum:
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True, enable_math=True, enable_mem_efficient=True
```
```
@@ -2,142 +2,38 @@

# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py

import warnings
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
import transformers
from einops import rearrange
from flash_attn.bert_padding import pad_input, unpad_input
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids

try:
    from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func
    from flash_attn.flash_attn_interface import (  # pylint: disable=ungrouped-imports
        flash_attn_kvpacked_func,
        flash_attn_varlen_kvpacked_func,
        flash_attn_varlen_qkvpacked_func,
    )
except ImportError:
    from flash_attn.flash_attn_interface import (
        flash_attn_unpadded_kvpacked_func as flash_attn_varlen_kvpacked_func,
    )
    from flash_attn.flash_attn_interface import (
        flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
    )

from transformers.models.llama.modeling_llama import apply_rotary_pos_emb

from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids


def forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Input shape: Batch x Time x Channel

    attention_mask: [bsz, q_len]
    """
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    kv_seq_len = key_states.shape[-2]
    assert past_key_value is None, "past_key_value is not supported"

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]
    assert not output_attentions, "output_attentions is not supported"
    assert not use_cache, "use_cache is not supported"

    # Flash attention codes from
    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py

    # transform the data into the format required by flash attention
    qkv = torch.stack(
        [query_states, key_states, value_states], dim=2
    )  # [bsz, nh, 3, q_len, hd]
    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
    # We have disabled _prepare_decoder_attention_mask in LlamaModel
    # the attention_mask should be the same as the key_padding_mask
    key_padding_mask = attention_mask

    if key_padding_mask is None:
        qkv = rearrange(qkv, "b s ... -> (b s) ...")
        max_s = q_len
        cu_q_lens = torch.arange(
            0,
            (bsz + 1) * q_len,
            step=q_len,
            dtype=torch.int32,
            device=qkv.device,
        )
        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
    elif position_ids.shape[0] == 1:
        # special handling using sample packing
        qkv = rearrange(qkv, "b s ... -> (b s) ...")
        cu_q_lens, max_s = get_cu_seqlens_from_pos_ids(position_ids)
        cu_q_lens = cu_q_lens.squeeze()

        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
    else:
        nheads = qkv.shape[-2]

        # pylint: disable=invalid-name
        x = rearrange(qkv, "b s three h d -> b s (three h d)")
        x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
        x_unpad = rearrange(
            x_unpad,
            "nnz (three h d) -> nnz three h d",
            three=3,
            h=nheads,
        )
        output_unpad = flash_attn_varlen_qkvpacked_func(
            x_unpad,
            cu_q_lens,
            max_s,
            0.0,
            softmax_scale=None,
            causal=True,
        )
        output = rearrange(
            pad_input(
                rearrange(output_unpad, "nnz h d -> nnz (h d)"),
                indices,
                bsz,
                q_len,
            ),
            "b s (h d) -> b s h d",
            h=nheads,
        )

    return (
        self.o_proj(rearrange(output, "b s h d -> b s (h d)")),
        None,
        None,
def replace_llama_attn_with_flash_attn():
    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
        _prepare_decoder_attention_mask
    )
    transformers.models.llama.modeling_llama.LlamaAttention.forward = flashattn_forward


# Disable the transformation of the attention mask in LlamaModel as the flash attention
@@ -153,8 +49,310 @@ def _prepare_decoder_attention_mask(
    return attention_mask


def replace_llama_attn_with_flash_attn():
    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (  # pylint: disable=protected-access
        _prepare_decoder_attention_mask
def flashattn_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Input shape: Batch x Time x Channel

    attention_mask: [bsz, q_len]
    """
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    if not hasattr(self, "pretraining_tp"):
        self.pretraining_tp = 1

    if self.pretraining_tp > 1:
        key_value_slicing = (
            self.num_key_value_heads * self.head_dim
        ) // self.pretraining_tp
        query_slices = self.q_proj.weight.split(
            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
        )
        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

        query_states = [
            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
        ]
        query_states = torch.cat(query_states, dim=-1)

        key_states = [
            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
        ]
        key_states = torch.cat(key_states, dim=-1)

        value_states = [
            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
        ]
        value_states = torch.cat(value_states, dim=-1)

    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

    #
    # flash-attn v2 start
    #

    if self.training:
        # during training q,k,v always have same seqlen
        assert key_states.shape == query_states.shape
        is_causal = True
    else:
        # turn off FA causal mask after first inference autoregressive iteration
        # only on first autoregressive step q,k,v have same seqlen
        is_causal = past_key_value is not None

    if self.training and attention_mask.shape[0] == 1:
        # special handling using sample packing
        qkv = torch.stack(
            [query_states, key_states, value_states], dim=2
        )  # [bsz, nh, 3, q_len, hd]
        qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
        qkv = rearrange(qkv, "b s ... -> (b s) ...")
        cu_q_lens, max_s = get_cu_seqlens_from_pos_ids(position_ids)
        cu_q_lens = cu_q_lens.squeeze()

        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=is_causal
        )
        output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
    elif query_states.shape == key_states.shape:
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)
        qkv_unpad, cu_seqlens_q, max_seqlen_q, _, output_pad_fn = generate_qkv(
            query_states,
            key_states,
            value_states,
            qkvpacked=True,
            # We have disabled _prepare_decoder_attention_mask in LlamaModel
            # the attention_mask should be the same as the key_padding_mask
            key_padding_mask=attention_mask,
            query_padding_mask=attention_mask[:, -query_states.size(1) :]
            if attention_mask is not None
            else None,
        )
        output_unpad = flash_attn_varlen_qkvpacked_func(
            qkv_unpad,
            cu_seqlens_q,
            max_seqlen_q,
            0.0,
            softmax_scale=None,
            causal=is_causal,
        )
        output = output_pad_fn(output_unpad)
    else:
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)
        if attention_mask is None or attention_mask.all().item():
            output = flash_attn_kvpacked_func(
                query_states,
                torch.stack([key_states, value_states], 2),
                causal=is_causal,
            )
        else:
            (  # pylint: disable=unbalanced-tuple-unpacking
                q_unpad,
                kv_unpad,
                cu_seqlens_q,
                cu_seqlens_k,
                max_seqlen_q,
                max_seqlen_k,
                _,
                _,
                output_pad_fn,
            ) = generate_qkv(
                query_states,
                key_states,
                value_states,
                kvpacked=True,
                key_padding_mask=attention_mask,
                query_padding_mask=attention_mask[:, -query_states.size(1) :]
                if attention_mask is not None
                else None,
            )
            output_unpad = flash_attn_varlen_kvpacked_func(
                q_unpad,
                kv_unpad,
                cu_seqlens_q,
                cu_seqlens_k,
                max_seqlen_q,
                max_seqlen_k,
                0.0,
                softmax_scale=None,
                causal=is_causal,
            )
            output = output_pad_fn(output_unpad)

    attn_output = output
    if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )
    attn_output = rearrange(attn_output, "b s h d -> b s (h d)")

    #
    # flash-attn v2 end
    #

    if self.pretraining_tp > 1:
        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
        o_proj_slices = self.o_proj.weight.split(
            self.hidden_size // self.pretraining_tp, dim=1
        )
        attn_output = sum(
            F.linear(attn_output[i], o_proj_slices[i])
            for i in range(self.pretraining_tp)
        )
    else:
        attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value


# based on https://github.com/Dao-AILab/flash-attention/blob/364a5b/tests/test_flash_attn.py#L38
def generate_qkv(
    q,
    k,
    v,
    query_padding_mask=None,
    key_padding_mask=None,
    kvpacked=False,
    qkvpacked=False,
):  # pylint: disable=invalid-name,unnecessary-lambda-assignment
    """
    Arguments:
        q: (batch_size, seqlen_q, nheads, d)
        k: (batch_size, seqlen_k, nheads_k, d)
        v: (batch_size, seqlen_k, nheads_k, d)
        query_padding_mask: (batch_size, seqlen), bool
        key_padding_mask: (batch_size, seqlen), bool
    """
    assert not (kvpacked and qkvpacked)
    batch_size, seqlen_q, nheads, d = q.shape
    _, seqlen_k, nheads_k, _ = k.shape
    assert k.shape == (batch_size, seqlen_k, nheads_k, d)
    assert v.shape == (batch_size, seqlen_k, nheads_k, d)

    if query_padding_mask is not None:
        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(
            q, query_padding_mask
        )

        output_pad_fn = lambda output_unpad: pad_input(  # noqa: E731
            output_unpad, indices_q, batch_size, seqlen_q
        )

    else:
        q_unpad = rearrange(q, "b s h d -> (b s) h d")
        cu_seqlens_q = torch.arange(
            0,
            (batch_size + 1) * seqlen_q,
            step=seqlen_q,
            dtype=torch.int32,
            device=q_unpad.device,
        )
        max_seqlen_q = seqlen_q

        output_pad_fn = lambda output_unpad: rearrange(  # noqa: E731
            output_unpad, "(b s) h d -> b s h d", b=batch_size
        )

    if key_padding_mask is not None:
        k_unpad, _, cu_seqlens_k, max_seqlen_k = unpad_input(k, key_padding_mask)
        v_unpad, _, _, _ = unpad_input(v, key_padding_mask)
    else:
        k_unpad = rearrange(k, "b s h d -> (b s) h d")
        v_unpad = rearrange(v, "b s h d -> (b s) h d")
        cu_seqlens_k = torch.arange(
            0,
            (batch_size + 1) * seqlen_k,
            step=seqlen_k,
            dtype=torch.int32,
            device=k_unpad.device,
        )
        max_seqlen_k = seqlen_k

    if qkvpacked:
        assert nheads == nheads_k
        qkv_unpad = torch.stack([q_unpad, k_unpad, v_unpad], dim=1)
        qkv = torch.stack([q, k, v], dim=2)
        return (qkv_unpad, cu_seqlens_q, max_seqlen_q, qkv, output_pad_fn)

    if kvpacked:
        kv_unpad = torch.stack([k_unpad, v_unpad], dim=1)
        kv = torch.stack([k, v], dim=2)
        return (
            q_unpad,
            kv_unpad,
            cu_seqlens_q,
            cu_seqlens_k,
            max_seqlen_q,
            max_seqlen_k,
            q,
            kv,
            output_pad_fn,
        )

    return (
        q_unpad,
        k_unpad,
        v_unpad,
        cu_seqlens_q,
        cu_seqlens_k,
        max_seqlen_q,
        max_seqlen_k,
        q,
        k,
        v,
        output_pad_fn,
    )
    transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
```
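To make the sample-packing branch above easier to follow, here is a toy sketch of the cumulative sequence lengths (`cu_seqlens`) that the varlen flash-attention kernels expect. Deriving them inline only illustrates the role `get_cu_seqlens_from_pos_ids` plays in the patch; it is not that helper's actual implementation, and the tensors are made-up values:

```python
import torch

# One packed row containing three sequences of lengths 3, 2 and 4.
position_ids = torch.tensor([[0, 1, 2, 0, 1, 0, 1, 2, 3]])

# A new sequence starts wherever the position counter resets to 0.
starts = (position_ids[0] == 0).nonzero(as_tuple=True)[0]
ends = torch.cat([starts[1:], torch.tensor([position_ids.shape[1]])])
seq_lens = ends - starts

# Running offsets of each sequence plus the longest sequence length,
# which is what flash_attn_varlen_qkvpacked_func consumes.
cu_seqlens = torch.cat([torch.tensor([0]), seq_lens.cumsum(0)]).to(torch.int32)
max_seqlen = int(seq_lens.max())
print(cu_seqlens.tolist(), max_seqlen)  # [0, 3, 5, 9] 4
```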
src/axolotl/monkeypatch/llama_attn_hijack_sdp.py (new file, 140 lines)
```
@@ -0,0 +1,140 @@
"""
Patched LlamaAttention to use torch.nn.functional.scaled_dot_product_attention
"""

import warnings
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
import transformers.models.llama.modeling_llama
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv


def hijack_llama_sdp_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = (
        sdp_attention_forward
    )


def sdp_attention_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    if not hasattr(self, "pretraining_tp"):
        self.pretraining_tp = 1

    if self.pretraining_tp > 1:
        key_value_slicing = (
            self.num_key_value_heads * self.head_dim
        ) // self.pretraining_tp
        query_slices = self.q_proj.weight.split(
            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
        )
        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

        query_states = [
            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
        ]
        query_states = torch.cat(query_states, dim=-1)

        key_states = [
            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
        ]
        key_states = torch.cat(key_states, dim=-1)

        value_states = [
            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
        ]
        value_states = torch.cat(value_states, dim=-1)

    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

    #
    # sdp-attn start
    #

    with torch.backends.cuda.sdp_kernel():
        attn_output = torch.nn.functional.scaled_dot_product_attention(
            query_states,
            key_states,
            value_states,
            attn_mask=attention_mask,
            is_causal=False,
        )

    if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )
    attn_output = attn_output.transpose(1, 2)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    #
    # sdp-attn end
    #

    if self.pretraining_tp > 1:
        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
        o_proj_slices = self.o_proj.weight.split(
            self.hidden_size // self.pretraining_tp, dim=1
        )
        attn_output = sum(
            F.linear(attn_output[i], o_proj_slices[i])
            for i in range(self.pretraining_tp)
        )
    else:
        attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value
```
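A rough usage sketch for this new patch; applying the hijack before the model is instantiated mirrors how load_model wires up the other attention patches, and the checkpoint name is just a placeholder:

```python
from transformers import AutoModelForCausalLM

from axolotl.monkeypatch.llama_attn_hijack_sdp import hijack_llama_sdp_attention

# Swap LlamaAttention.forward for the SDP implementation before the model is built,
# so every attention layer picks up the patched forward.
hijack_llama_sdp_attention()

model = AutoModelForCausalLM.from_pretrained("huggyllama/llama-7b")
```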
```
@@ -3,13 +3,13 @@ Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-g
"""

import logging
import math
import warnings
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
import transformers.models.llama.modeling_llama
from torch import nn
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

try:
    import xformers.ops
@@ -21,12 +21,6 @@ def hijack_llama_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward


def hijack_llama_sdp_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = (
        sdp_attention_forward
    )


def xformers_forward(
    self,
    hidden_states: torch.Tensor,
@@ -81,15 +75,15 @@ def xformers_forward(
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]

    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    (
        query_states,
        key_states,
    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]
@@ -102,74 +96,50 @@ def xformers_forward(
    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = transformers.models.llama.modeling_llama.repeat_kv(
        key_states, self.num_key_value_groups
    )
    value_states = transformers.models.llama.modeling_llama.repeat_kv(
        value_states, self.num_key_value_groups
    )
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    # We only apply xformers optimizations if we don't need to output the whole attention matrix
    if not output_attentions:
        query_states = query_states.transpose(1, 2)
        key_states = key_states.transpose(1, 2)
        value_states = value_states.transpose(1, 2)
    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
        )

        # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
        # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
        if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
            # input and output should be of form (bsz, q_len, num_heads, head_dim)
            attn_output = xformers.ops.memory_efficient_attention(
                query_states, key_states, value_states, attn_bias=None
            )
        else:
            # input and output should be of form (bsz, q_len, num_heads, head_dim)
            attn_output = xformers.ops.memory_efficient_attention(
                query_states,
                key_states,
                value_states,
                # attn_bias=attention_mask,
                attn_bias=xformers.ops.LowerTriangularMask(),
            )
        attn_weights = None
    #
    # xformers-attn start
    #

    query_states = query_states.transpose(1, 2)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.transpose(1, 2)

    # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
    # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
    if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
        # input and output should be of form (bsz, q_len, num_heads, head_dim)
        attn_output = xformers.ops.memory_efficient_attention(
            query_states, key_states, value_states, attn_bias=None
        )
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
            )

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2).contiguous()
        # end x-formers vs. not x-formers if-else block
        # input and output should be of form (bsz, q_len, num_heads, head_dim)
        attn_output = xformers.ops.memory_efficient_attention(
            query_states,
            key_states,
            value_states,
            # attn_bias=attention_mask,
            attn_bias=xformers.ops.LowerTriangularMask(),
        )

    if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    #
    # xformers-attn end
    #

    if self.pretraining_tp > 1:
        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
        o_proj_slices = self.o_proj.weight.split(
@@ -182,103 +152,4 @@ def xformers_forward(
    else:
        attn_output = self.o_proj(attn_output)

    return attn_output, attn_weights, past_key_value


def sdp_attention_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    # pylint: disable=duplicate-code
    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    (
        query_states,
        key_states,
    ) = transformers.models.llama.modeling_llama.apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # We only apply sdp attention if we don't need to output the whole attention matrix
    if not output_attentions:
        with torch.backends.cuda.sdp_kernel():
            attn_output = torch.nn.functional.scaled_dot_product_attention(
                query_states,
                key_states,
                value_states,
                attn_mask=attention_mask,
                is_causal=False,
            )
        attn_weights = None
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask
            attn_weights = torch.max(
                attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min)
            )

        # upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.transpose(1, 2)
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    attn_output = self.o_proj(attn_output)

    return attn_output, attn_weights, past_key_value
    return attn_output, None, past_key_value
```
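The `attention_mask[0, 0, 0, 1] == 0` check above leans on the additive masks transformers constructs: a causal mask carries a very large negative value above the diagonal, while a no-op mask is all zeros. A tiny sketch of that distinction, with toy sizes:

```python
import torch

seq_len = 4
neg = torch.finfo(torch.float32).min

# Additive causal mask as transformers builds it: zeros on/below the diagonal,
# a huge negative bias above it.
causal = torch.triu(torch.full((1, 1, seq_len, seq_len), neg), diagonal=1)
no_mask = torch.zeros(1, 1, seq_len, seq_len)

print(causal[0, 0, 0, 1] == 0)   # tensor(False) -> use xformers.ops.LowerTriangularMask()
print(no_mask[0, 0, 0, 1] == 0)  # tensor(True)  -> attn_bias=None
```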
```
@@ -4,13 +4,23 @@ import pynvml
import torch


def gpu_memory_usage(device):
def gpu_memory_usage(device=0):
    return torch.cuda.memory_allocated(device) / 1024.0**3


def gpu_memory_usage_all(device=0):
    usage = torch.cuda.memory_allocated(device) / 1024.0**3
    reserved = torch.cuda.memory_reserved(device) / 1024.0**3
    smi = gpu_memory_usage_smi(device)
    return usage, reserved - usage, max(0, smi - reserved)


def gpu_memory_usage_smi(device=0):
    if isinstance(device, torch.device):
        device = device.index
    if isinstance(device, str) and device.startswith("cuda:"):
        device = int(device[5:])

    # NB torch.cuda.memory_usage returns zero so we use lower level api
    pynvml.nvmlInit()
    handle = pynvml.nvmlDeviceGetHandleByIndex(device)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
@@ -18,6 +28,13 @@ def gpu_memory_usage(device):


def log_gpu_memory_usage(log, msg, device):
    usage, cache, misc = gpu_memory_usage_all(device)
    extras = []
    if cache > 0:
        extras.append(f"+{cache:.03f}GB cache")
    if misc > 0:
        extras.append(f"+{misc:.03f}GB misc")
    log.info(
        f"GPU memory usage {msg}: {gpu_memory_usage(device):.03f} GB", stacklevel=2
        f"GPU memory usage {msg}: {usage:.03f}GB ({', '.join(extras)})", stacklevel=2
    )
    return usage, cache, misc
```
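A short sketch of how the reworked helpers are intended to be called; the logger name and the numbers in the sample log line are illustrative only:

```python
import logging

from axolotl.utils.bench import gpu_memory_usage_all, log_gpu_memory_usage

LOG = logging.getLogger("axolotl")

# (allocated, cache, misc) in GB for device 0; cache is reserved-but-unallocated,
# misc is whatever nvml reports beyond what torch has reserved.
allocated, cache, misc = gpu_memory_usage_all(0)

# Logs something like: "GPU memory usage after model load: 13.482GB (+0.311GB cache, +0.902GB misc)"
log_gpu_memory_usage(LOG, "after model load", 0)
```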
```
@@ -74,10 +74,10 @@ class SaveBetterTransformerModelCallback(
        return control


class PrintGPUStatsCallback(
class GPUStatsCallback(
    TrainerCallback
):  # pylint: disable=too-few-public-methods disable=unused-argument
    """Callback to print GPU utilization"""
    """Callback to track GPU utilization"""

    def __init__(self, cfg):
        self.cfg = cfg
@@ -90,7 +90,7 @@ class PrintGPUStatsCallback(
        control: TrainerControl,
        **kwargs,
    ):
        if not self.logged:
        if not self.logged and state.global_step > 1:
            log_gpu_memory_usage(LOG, "while training", self.cfg.device)
            self.logged = True
        return control
```
```
@@ -1,12 +1,70 @@
"""Module for validating config files"""
"""Module for working with config dicts"""

import logging
import os

import torch

from axolotl.utils.bench import log_gpu_memory_usage

LOG = logging.getLogger("axolotl")


def choose_device(cfg):
    def get_device():
        try:
            if torch.cuda.is_available():
                return f"cuda:{cfg.local_rank}"

            if torch.backends.mps.is_available():
                return "mps"

            raise SystemError("No CUDA/mps device found")
        except Exception:  # pylint: disable=broad-exception-caught
            return "cpu"

    cfg.device = get_device()
    if cfg.device_map != "auto":
        if cfg.device.startswith("cuda"):
            cfg.device_map = {"": cfg.local_rank}
        else:
            cfg.device_map = {"": cfg.device}

    # in `accelerate launch`, we need to not pass through any device map and let
    # accelerate figure out which parts of the model to put on which gpu
    accelerate_vars = [var for var in os.environ if var.startswith("ACCELERATE_USE_")]
    if accelerate_vars:
        cfg.device_map = None


def normalize_config(cfg):
    # setup some derived config / hyperparams
    cfg.gradient_accumulation_steps = cfg.gradient_accumulation_steps or (
        cfg.batch_size // cfg.micro_batch_size
    )
    cfg.batch_size = (
        cfg.batch_size or cfg.micro_batch_size * cfg.gradient_accumulation_steps
    )
    cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
    cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
    choose_device(cfg)
    cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
    if cfg.ddp:
        cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
        cfg.batch_size = cfg.batch_size * cfg.world_size

    if cfg.device == "mps":
        cfg.load_in_8bit = False
        cfg.tf32 = False
        if cfg.bf16:
            cfg.fp16 = True
        cfg.bf16 = False
    else:
        torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False

    log_gpu_memory_usage(LOG, "baseline", cfg.device)


def validate_config(cfg):
    if cfg.max_packed_sequence_len and cfg.sample_packing:
        raise ValueError(
```
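A small worked example of the derived hyperparameters `normalize_config` fills in. The numbers are made up, WORLD_SIZE is assumed unset, and a CUDA device is assumed to be present since the function ends by logging baseline GPU memory:

```python
from axolotl.utils.config import normalize_config
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "micro_batch_size": 2,
        "gradient_accumulation_steps": 4,
        # batch_size left unset: derived as micro_batch_size * gradient_accumulation_steps
    }
)
normalize_config(cfg)

assert cfg.batch_size == 8
assert cfg.world_size == 1  # default when the WORLD_SIZE env var is unset
```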
```
@@ -10,3 +10,6 @@ class DictDefault(Dict):

    def __missing__(self, key):
        return None

    def __or__(self, other):
        return DictDefault(super().__or__(other))
```
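The new `__or__` override keeps merged configs as `DictDefault`, so missing keys still resolve to `None` after using the `|` operator (Python 3.9+). A minimal sketch:

```python
from axolotl.utils.dict import DictDefault

base = DictDefault({"base_model": "huggyllama/llama-7b", "micro_batch_size": 1})
overrides = DictDefault({"micro_batch_size": 2})

cfg = base | overrides             # right-hand side wins, result is still a DictDefault
assert cfg.micro_batch_size == 2
assert cfg.learning_rate is None   # unset keys still fall back to None
```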
```
@@ -32,37 +32,27 @@ if TYPE_CHECKING:
    from axolotl.utils.dict import DictDefault  # noqa: F401


def load_tokenizer(
    tokenizer_config,
    tokenizer_type,
    cfg,
):
def load_tokenizer(cfg):
    tokenizer_kwargs = {}
    use_fast = True  # this is the default

    if cfg.tokenizer_use_fast is not None:
        use_fast = cfg.tokenizer_use_fast
    if cfg.tokenizer_legacy is not None:
        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
    if tokenizer_type:
        tokenizer = getattr(transformers, tokenizer_type).from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
            use_fast=use_fast,
            **tokenizer_kwargs,
        )
    else:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_config,
            trust_remote_code=cfg.trust_remote_code or False,
            use_fast=use_fast,
            **tokenizer_kwargs,
        )

    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
    tokenizer_cls = AutoTokenizer
    if cfg.tokenizer_type:
        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

    tokenizer_config = cfg.tokenizer_config or cfg.base_model_config
    tokenizer = tokenizer_cls.from_pretrained(
        tokenizer_config,
        trust_remote_code=cfg.trust_remote_code or False,
        use_fast=use_fast,
        **tokenizer_kwargs,
    )

    if tokenizer.__class__.__name__ in [
        "LlamaTokenizer",
@@ -70,6 +60,11 @@ def load_tokenizer(
    ]:
        tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN

    LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
    LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
    LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
    LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"
```
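With this change the tokenizer loader takes the whole config object instead of positional arguments; a usage sketch mirroring the updated tests (the import path `axolotl.utils.models` is an assumption based on where this diff appears to live):

```python
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_tokenizer

cfg = DictDefault(
    {
        "tokenizer_config": "huggyllama/llama-7b",  # falls back to cfg.base_model_config when unset
        "tokenizer_use_fast": False,
    }
)
tokenizer = load_tokenizer(cfg)
```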
```
@@ -92,7 +87,6 @@ def load_model(
    base_model = cfg.base_model
    base_model_config = cfg.base_model_config
    model_type = cfg.model_type
    adapter = cfg.adapter

    # TODO refactor as a kwarg
    load_in_8bit = cfg.load_in_8bit
@@ -118,9 +112,7 @@ def load_model(
        LOG.info("patching with xformers attention")
        hijack_llama_attention()
    elif cfg.is_llama_derived_model and cfg.sdp_attention:
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_sdp_attention,
        )
        from axolotl.monkeypatch.llama_attn_hijack_sdp import hijack_llama_sdp_attention

        LOG.info("patching with sdp attention")
        hijack_llama_sdp_attention()
@@ -241,6 +233,7 @@ def load_model(
            model = LlamaForCausalLM.from_pretrained(
                base_model,
                config=config,
                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                torch_dtype=torch_dtype,
@@ -275,6 +268,7 @@ def load_model(
        elif model_type and not cfg.trust_remote_code:
            model = getattr(transformers, model_type).from_pretrained(
                base_model,
                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                torch_dtype=torch_dtype,
@@ -305,6 +299,7 @@ def load_model(
            model = AutoModelForCausalLM.from_pretrained(
                base_model,
                config=config,
                device_map=cfg.device_map,
                load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
                load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
                torch_dtype=torch_dtype,
@@ -318,6 +313,7 @@ def load_model(
        LOG.exception(err)
        model = AutoModelForCausalLM.from_pretrained(
            base_model,
            device_map=cfg.device_map,
            load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
            load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
            torch_dtype=torch_dtype,
@@ -364,7 +360,7 @@ def load_model(
                if hasattr(module, "weight"):
                    module.to(torch_dtype)

    model, lora_config = load_adapter(model, cfg, adapter)
    model, lora_config = load_adapter(model, cfg, cfg.adapter)

    if cfg.ddp and not load_in_8bit:
        model.to(f"cuda:{cfg.local_rank}")
@@ -381,9 +377,6 @@ def load_model(
            module.scales = module.scales.half()
            module.bias = module.bias.half()

    if model.device.type == "cuda":
        log_gpu_memory_usage(LOG, "after adapters", model.device)

    if (
        torch.cuda.device_count() > 1
        and int(os.getenv("WORLD_SIZE", "1")) > 1
@@ -406,6 +399,9 @@ def load_model(
    if cfg.flash_optimum:
        model = BetterTransformer.transform(model)

    if cfg.adapter is not None:
        log_gpu_memory_usage(LOG, "after adapters", model.device)

    # TODO resume_from_checkpoint handling
    return model, lora_config
```
```
@@ -22,7 +22,7 @@ from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
from transformers.trainer_pt_utils import get_parameter_names

from axolotl.utils.callbacks import (
    PrintGPUStatsCallback,
    GPUStatsCallback,
    SaveBetterTransformerModelCallback,
    SavePeftModelCallback,
)
@@ -555,7 +555,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
        trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)

    callbacks = []
    callbacks.append(PrintGPUStatsCallback(cfg))
    callbacks.append(GPUStatsCallback(cfg))
    # TODO on_save callback to sync checkpoints to GCP/AWS in background
    if cfg.early_stopping_patience:
        early_stop_cb = EarlyStoppingCallback(
```
```
@@ -72,6 +72,13 @@ class DictDefaultTest(unittest.TestCase):

        assert cfg.random_key is None, "DictDefault should return None for missing keys"

    def test_dict_or(self):
        cfg = DictDefault({}) | DictDefault({})

        assert (
            cfg.random_key is None
        ), "DictDefault should return None for missing keys after | operation"

    def test_dict_nested_missingparentkey(self):
        """
        Due to subclassing Dict, DictDefault will error if we try to access a nested key whose parent key does not exist.
```
```
@@ -13,17 +13,22 @@ class TestTokenizers(unittest.TestCase):
    """

    def test_default_use_fast(self):
        cfg = DictDefault({})
        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "Fast" in tokenizer.__class__.__name__

    def test_dont_use_fast(self):
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "tokenizer_use_fast": False,
            }
        )
        tokenizer = load_tokenizer("huggyllama/llama-7b", None, cfg)
        tokenizer = load_tokenizer(cfg)
        assert "Fast" not in tokenizer.__class__.__name__
```
```
@@ -6,8 +6,8 @@ from typing import Optional

import pytest

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.validation import validate_config


class ValidationTest(unittest.TestCase):
```