From 42410c783ceb928281b396c01a4c033781e07ffd Mon Sep 17 00:00:00 2001
From: Wing Lian
Date: Sun, 14 May 2023 09:16:41 -0400
Subject: [PATCH] more fixes

---
 src/axolotl/utils/models.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 7eef944f3..4d7a45920 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -184,7 +184,8 @@ def load_model(
         for k, v in cfg.tokens.items():
             tokenizer.add_special_tokens({k: v})
 
-        model.resize_token_embeddings(len(tokenizer))
+        # this should only be needed if you are messing with new tokens in the vocab
+        # model.resize_token_embeddings(len(tokenizer))
 
     if cfg.adapter and load_in_8bit and not cfg.load_4bit:
         logging.info("converting PEFT model w/ prepare_model_for_int8_training")
@@ -207,7 +208,10 @@ def load_model(
                 m.scales = m.scales.half()
                 m.bias = m.bias.half()
 
-    if torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) > 1:
+    if torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) > 1 and cfg.load_4bit:
+        # llama is PROBABLY model parallelizable, but the default isn't that it is
+        # so let's only set it for the 4bit, see
+        # https://github.com/johnsmith0031/alpaca_lora_4bit/blob/08b3fca4a4a9e0d3945be1bab4529f100a428636/finetune.py#L130-L133
         model.is_parallelizable = True
         model.model_parallel = True
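
For readers looking at this change out of context, a minimal standalone sketch
of the gating the second hunk introduces, assuming a cfg object with a
load_4bit attribute and a Hugging Face-style model object (the
sketch_model_parallel_gate helper name is hypothetical, not part of axolotl):

    import os

    import torch


    def sketch_model_parallel_gate(model, cfg):
        # Only flag the model for model parallelism when there are multiple
        # GPUs, a distributed launcher set WORLD_SIZE > 1, and the 4-bit path
        # is active. Per the patch comment, llama is probably model
        # parallelizable, but the default for the flag is not True, so it is
        # only set on the 4-bit path.
        if (
            torch.cuda.device_count() > 1
            and int(os.getenv("WORLD_SIZE", "1")) > 1
            and cfg.load_4bit
        ):
            model.is_parallelizable = True
            model.model_parallel = True
        return model

Dropping the cfg.load_4bit clause from the condition would restore the prior
behavior of flagging model parallelism for any multi-GPU distributed run.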