diff --git a/examples/llama-2/tiny-llama.yml b/examples/llama-2/tiny-llama.yml
index b91877e97..6b3fa652f 100644
--- a/examples/llama-2/tiny-llama.yml
+++ b/examples/llama-2/tiny-llama.yml
@@ -1,4 +1,4 @@
-base_model: PY007/TinyLlama-1.1B-step-50K-105b
+base_model: PY007/TinyLlama-1.1B-intermediate-step-715k-1.5T
 
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
diff --git a/src/axolotl/core/trainer_builder.py b/src/axolotl/core/trainer_builder.py
index 7ed98b8b6..bcd5e3219 100644
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -543,16 +543,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                 "dataloader_prefetch_factor"
             ] = self.cfg.dataloader_prefetch_factor
 
-        if self.cfg.eval_steps:
+        if self.cfg.val_set_size == 0:
+            # no eval set, so don't eval
+            training_arguments_kwargs["evaluation_strategy"] = "no"
+        elif self.cfg.eval_steps:
             training_arguments_kwargs["evaluation_strategy"] = "steps"
             training_arguments_kwargs["eval_steps"] = self.cfg.eval_steps
         elif self.cfg.evaluation_strategy:
             training_arguments_kwargs[
                 "evaluation_strategy"
             ] = self.cfg.evaluation_strategy
-        elif self.cfg.val_set_size == 0:
-            # no eval set, so don't eval
-            training_arguments_kwargs["evaluation_strategy"] = "no"
         else:
             # we have an eval set, but no steps defined, default to use epoch
             training_arguments_kwargs["evaluation_strategy"] = "epoch"
diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
index 2a653ceb6..cfed8cb17 100644
--- a/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_sdp.py
@@ -25,6 +25,8 @@ def sdp_attention_forward(
     past_key_value: Optional[Tuple[torch.Tensor]] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    padding_mask: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
+    **kwargs,  # pylint: disable=unused-argument
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     # pylint: disable=duplicate-code
     bsz, q_len, _ = hidden_states.size()
diff --git a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
index c9d517646..8143750f0 100644
--- a/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
+++ b/src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
@@ -29,6 +29,8 @@ def xformers_forward(
     past_key_value: Optional[Tuple[torch.Tensor]] = None,
     output_attentions: bool = False,
     use_cache: bool = False,
+    padding_mask: Optional[torch.LongTensor] = None,  # pylint: disable=unused-argument
+    **kwargs,  # pylint: disable=unused-argument
 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
     # pylint: disable=duplicate-code
     bsz, q_len, _ = hidden_states.size()
diff --git a/src/axolotl/prompters.py b/src/axolotl/prompters.py
index e749ad4c8..033077b1a 100644
--- a/src/axolotl/prompters.py
+++ b/src/axolotl/prompters.py
@@ -75,7 +75,7 @@ class AlpacaPrompter(Prompter):
         else:
             res = (
                 self.system_format.format(system=self.system_no_input_prompt)
-                if self.system_prompt
+                if self.system_no_input_prompt
                 else ""
             ) + self.turn_no_input_format.format(instruction=instruction)
         if output:
diff --git a/src/axolotl/utils/samplers/multipack.py b/src/axolotl/utils/samplers/multipack.py
index e57632082..451893971 100644
--- a/src/axolotl/utils/samplers/multipack.py
+++ b/src/axolotl/utils/samplers/multipack.py
@@ -181,13 +181,16 @@ class MultipackBatchSampler(BatchSampler):
         )
 
         # shave off 1% + 1 for dealing with variance in packing from random sampler to sampler
-        return (
-            world_size
-            * math.floor(
-                0.99
-                * lengths_sum_per_device
-                / self.packing_efficiency_estimate
-                // self.batch_max_len
-            )
-            - 1
+        return min(
+            1,
+            (
+                world_size
+                * math.floor(
+                    0.99
+                    * lengths_sum_per_device
+                    / self.packing_efficiency_estimate
+                    // self.batch_max_len
+                )
+                - 1
+            ),
         )
diff --git a/src/axolotl/utils/trainer.py b/src/axolotl/utils/trainer.py
index f93316cde..cac760700 100644
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -142,31 +142,32 @@ def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
 
 
 def calculate_total_num_steps(cfg, train_dataset):
+    if not cfg.total_num_tokens:
+        total_num_tokens = np.sum(
+            train_dataset.data.column("input_ids")
+            .to_pandas()
+            .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
+            .values
+        )
+        LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
+        cfg.total_num_tokens = total_num_tokens
+
+    if not cfg.total_supervised_tokens:
+        total_supervised_tokens = (
+            train_dataset.data.column("labels")
+            .to_pandas()
+            .apply(lambda x: np.sum(np.array(x) != -100))
+            .sum()
+        )
+        LOG.debug(
+            f"`total_supervised_tokens: {total_supervised_tokens}`",
+            main_process_only=True,
+        )
+        cfg.total_supervised_tokens = total_supervised_tokens
+
     if cfg.sample_packing:
         # we have to drop anything longer then sequence len otherwise
         # flash attention with position ids fails
-        if not cfg.total_num_tokens:
-            total_num_tokens = np.sum(
-                train_dataset.data.column("input_ids")
-                .to_pandas()
-                .apply(lambda x: len(x))  # pylint: disable=unnecessary-lambda
-                .values
-            )
-            LOG.debug(f"total_num_tokens: {total_num_tokens}", main_process_only=True)
-            cfg.total_num_tokens = total_num_tokens
-
-        if not cfg.total_supervised_tokens:
-            total_supervised_tokens = (
-                train_dataset.data.column("labels")
-                .to_pandas()
-                .apply(lambda x: np.sum(np.array(x) != -100))
-                .sum()
-            )
-            LOG.debug(
-                f"`total_supervised_tokens: {total_supervised_tokens}`",
-                main_process_only=True,
-            )
-            cfg.total_supervised_tokens = total_supervised_tokens
         if cfg.sample_packing_eff_est:
             total_num_steps = (