From 14706504e34204066cf5d6df6d9be78d2d9e5f94 Mon Sep 17 00:00:00 2001 From: Wing Lian Date: Wed, 15 Nov 2023 12:23:18 -0500 Subject: [PATCH] various bugfixes (#856) * various bugfixes use latest tinyllama release check if val_set_size is empty first update sdp and xformers llama patches for updated upstream transformers fix system prompt when no input calculate total and total supervised tokens even when not sample packing * add fix for when eval size is estimated to be too small * should be len 1 for dataset length * add catchall kwargs --- examples/llama-2/tiny-llama.yml | 2 +- src/axolotl/core/trainer_builder.py | 8 ++-- .../monkeypatch/llama_attn_hijack_sdp.py | 2 + .../monkeypatch/llama_attn_hijack_xformers.py | 2 + src/axolotl/prompters.py | 2 +- src/axolotl/utils/samplers/multipack.py | 21 +++++---- src/axolotl/utils/trainer.py | 45 ++++++++++--------- 7 files changed, 45 insertions(+), 37 deletions(-) diff --git a/examples/llama-2/tiny-llama.yml b/examples/llama-2/tiny-llama.yml index b91877e97..6b3fa652f 100644 --- a/examples/llama-2/tiny-llama.yml +++ b/examples/llama-2/tiny-llama.yml @@ -1,4 +1,4 @@ -base_model: PY007/TinyLlama-1.1B-step-50K-105b +base_model: PY007/TinyLlama-1.1B-intermediate-step-715k-1.5T model_type: LlamaForCausalLM tokenizer_type: LlamaTokenizer diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py index 7ed98b8b6..bcd5e3219 100644 --- a/src/axolotl/core/trainer_builder.py +++ b/src/axolotl/core/trainer_builder.py @@ -543,16 +543,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase): "dataloader_prefetch_factor" ] = self.cfg.dataloader_prefetch_factor - if self.cfg.eval_steps: + if self.cfg.val_set_size == 0: + # no eval set, so don't eval + training_arguments_kwargs["evaluation_strategy"] = "no" + elif self.cfg.eval_steps: training_arguments_kwargs["evaluation_strategy"] = "steps" training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps elif self.cfg.evaluation_strategy: 
training_arguments_kwargs[ "evaluation_strategy" ] = self.cfg.evaluation_strategy - elif self.cfg.val_set_size == 0: - # no eval set, so don't eval - training_arguments_kwargs["evaluation_strategy"] = "no" else: # we have an eval set, but no steps defined, default to use epoch training_arguments_kwargs["evaluation_strategy"] = "epoch" diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py index 2a653ceb6..cfed8cb17 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py @@ -25,6 +25,8 @@ def sdp_attention_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument + **kwargs, # pylint: disable=unused-argument ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # pylint: disable=duplicate-code bsz, q_len, _ = hidden_states.size() diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py index c9d517646..8143750f0 100644 --- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py +++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py @@ -29,6 +29,8 @@ def xformers_forward( past_key_value: Optional[Tuple[torch.Tensor]] = None, output_attentions: bool = False, use_cache: bool = False, + padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument + **kwargs, # pylint: disable=unused-argument ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: # pylint: disable=duplicate-code bsz, q_len, _ = hidden_states.size() diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py index e749ad4c8..033077b1a 100644 --- a/src/axolotl/prompters.py +++ b/src/axolotl/prompters.py @@ -75,7 +75,7 @@ class AlpacaPrompter(Prompter): else: res = ( 
self.system_format.format(system=self.system_no_input_prompt) - if self.system_prompt + if self.system_no_input_prompt else "" ) + self.turn_no_input_format.format(instruction=instruction) if output: diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py index e57632082..451893971 100644 --- a/src/axolotl/utils/samplers/multipack.py +++ b/src/axolotl/utils/samplers/multipack.py @@ -181,13 +181,16 @@ class MultipackBatchSampler(BatchSampler): ) # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler - return ( - world_size - * math.floor( - 0.99 - * lengths_sum_per_device - / self.packing_efficiency_estimate - // self.batch_max_len - ) - - 1 + return max( + 1, + ( + world_size + * math.floor( + 0.99 + * lengths_sum_per_device + / self.packing_efficiency_estimate + // self.batch_max_len + ) + - 1 + ), ) diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py index f93316cde..cac760700 100644 --- a/src/axolotl/utils/trainer.py +++ b/src/axolotl/utils/trainer.py @@ -142,31 +142,32 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer): def calculate_total_num_steps(cfg, train_dataset): + if not cfg.total_num_tokens: + total_num_tokens = np.sum( + train_dataset.data.column("input_ids") + .to_pandas() + .apply(lambda x: len(x)) # pylint: disable=unnecessary-lambda + .values + ) + LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True) + cfg.total_num_tokens = total_num_tokens + + if not cfg.total_supervised_tokens: + total_supervised_tokens = ( + train_dataset.data.column("labels") + .to_pandas() + .apply(lambda x: np.sum(np.array(x) != -100)) + .sum() + ) + LOG.debug( + f"`total_supervised_tokens: {total_supervised_tokens}`", + main_process_only=True, + ) + cfg.total_supervised_tokens = total_supervised_tokens + if cfg.sample_packing: # we have to drop anything longer then sequence len otherwise # flash attention with position ids fails
if not cfg.total_num_tokens: - total_num_tokens = np.sum( - train_dataset.data.column("input_ids") - .to_pandas() - .apply(lambda x: len(x)) # pylint: disable=unnecessary-lambda - .values - ) - LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True) - cfg.total_num_tokens = total_num_tokens - - if not cfg.total_supervised_tokens: - total_supervised_tokens = ( - train_dataset.data.column("labels") - .to_pandas() - .apply(lambda x: np.sum(np.array(x) != -100)) - .sum() - ) - LOG.debug( - f"`total_supervised_tokens: {total_supervised_tokens}`", - main_process_only=True, - ) - cfg.total_supervised_tokens = total_supervised_tokens if cfg.sample_packing_eff_est: total_num_steps = (