Fix: Higher vram usage for mistral and sample_packing (#691)

* Fix: Higher vram usage for mistral and sample_packing
* chore: update comment
* chore: lint
2023-10-07 01:33:43 +09:00
parent d4a88e4eca
commit 669f1d052c
2 changed files with 6 additions and 5 deletions
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -81,7 +81,8 @@ def load_tokenizer(cfg):
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

-    if cfg.is_mistral_derived_model:
+    # Mistral's official FA implementation requires left padding
+    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
        tokenizer.padding_side = "left"

    if cfg.special_tokens: