Fix drop_long_seq bug due to truncation in prompt tokenization strategies when using chat_template (#1867)

Chiwan Park
2024-08-27 01:56:12 +09:00
committed by GitHub
parent 6819c12cee
commit 2dac1edf72


@@ -350,7 +350,8 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
         ),
         "roles": ds_cfg.get("roles"),
         "drop_system_message": ds_cfg.get("drop_system_message", False),
-        "max_length": cfg.sequence_len,
+        # we need to add one for detecting sequences exceeding the `sequence_len` limit.
+        "max_length": cfg.sequence_len + 1,
     }
     strategy_params = {
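
Why the `+ 1` matters: if the tokenizer truncates at exactly `cfg.sequence_len`, a prompt that was longer than the limit becomes indistinguishable from one that is exactly `sequence_len` tokens long, so a `len(tokens) > sequence_len` check in a `drop_long_seq` filter never fires. Tokenizing to `sequence_len + 1` preserves one extra token as evidence of the overflow. A minimal sketch of this (a simplified stand-in tokenizer and filter, not the axolotl implementation; all helper names besides `drop_long_seq` are hypothetical):

def tokenize(text: str, max_length: int) -> list[str]:
    # Stand-in for a real tokenizer: whitespace split plus truncation.
    return text.split()[:max_length]


def drop_long_seq(tokens: list[str], sequence_len: int) -> bool:
    # Drop the example if it has more tokens than the limit allows.
    return len(tokens) > sequence_len


sequence_len = 4
long_prompt = "this prompt clearly exceeds the limit"  # 6 tokens

# Buggy behavior: truncating at exactly sequence_len hides the
# overflow, so the over-limit prompt is never dropped.
assert drop_long_seq(tokenize(long_prompt, sequence_len), sequence_len) is False

# Fixed behavior: one extra token survives truncation, so the
# over-limit prompt is detected and dropped.
assert drop_long_seq(tokenize(long_prompt, sequence_len + 1), sequence_len) is True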