From 2dac1edf7225cc75bd781d78a7d0cca33bd8560f Mon Sep 17 00:00:00 2001 From: Chiwan Park Date: Tue, 27 Aug 2024 01:56:12 +0900 Subject: [PATCH] Fix `drop_long_seq` bug due to truncation in prompt tokenization strategies when using `chat_template` (#1867) --- src/axolotl/prompt_strategies/chat_template.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/axolotl/prompt_strategies/chat_template.py b/src/axolotl/prompt_strategies/chat_template.py index 8ae668d7e..19e36531a 100644 --- a/src/axolotl/prompt_strategies/chat_template.py +++ b/src/axolotl/prompt_strategies/chat_template.py @@ -350,7 +350,8 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None): ), "roles": ds_cfg.get("roles"), "drop_system_message": ds_cfg.get("drop_system_message", False), - "max_length": cfg.sequence_len, + # we need to add one for detecting sequences exceeding the `sequence_len` limit. + "max_length": cfg.sequence_len + 1, } strategy_params = {