fix for qwen w lora (#906)

ensure merged model matches the training dtype (#902)
* ensure merged model matches the training dtype * Update src/axolotl/cli/__init__.py * Update src/axolotl/cli/__init__.py
2023-11-30 12:45:50 -05:00 · 2023-11-29 09:55:19 -05:00 · 2023-11-29 08:36:35 -05:00 · 2023-11-27 21:23:54 +09:00
6 changed files with 19 additions and 26 deletions
--- a/examples/qwen/lora.yml
+++ b/examples/qwen/lora.yml
@@ -53,7 +53,7 @@ resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention: true
+flash_attention:

 warmup_steps: 10
 eval_steps: 0.05
--- a/examples/qwen/qlora.yml
+++ b/examples/qwen/qlora.yml
@@ -53,7 +53,7 @@ resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
-flash_attention: true
+flash_attention:

 warmup_steps: 10
 eval_steps: 0.05
--- a/src/axolotl/cli/__init__.py
+++ b/src/axolotl/cli/__init__.py
@@ -29,6 +29,7 @@ from axolotl.utils.dict import DictDefault
 from axolotl.utils.distributed import is_main_process
 from axolotl.utils.models import load_tokenizer
 from axolotl.utils.tokenization import check_dataset_labels
+from axolotl.utils.trainer import prepare_optim_env
 from axolotl.utils.wandb_ import setup_wandb_env_vars

 project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
@@ -71,7 +72,7 @@ def do_merge_lora(

    LOG.info("running merge of LoRA with base model")
    model = model.merge_and_unload()
-    model.to(dtype=torch.float16)
+    model.to(dtype=cfg.torch_dtype)

    if cfg.local_rank == 0:
        LOG.info(f"saving merged model to: {str(Path(cfg.output_dir) / 'merged')}")
@@ -296,6 +297,8 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):

    validate_config(cfg)

+    prepare_optim_env(cfg)
+
    normalize_config(cfg)

    setup_wandb_env_vars(cfg)
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -698,24 +698,6 @@ def get_dataset_wrapper(
    return dataset_wrapper, dataset_prompter


-def encode_packed_pretraining(
-    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
-):
-    # tokenize all the examples
-    # rows get split with stride (overlap)
-    res = tokenizer(
-        examples,
-        truncation=True,
-        max_length=max_tokens,
-        add_special_tokens=True,
-        return_overflowing_tokens=True,
-        stride=256,
-    )
-    # convert to a dataset.from_list
-    # use a dataloader and multipack batch sampler to pack the data
-    pass
-
-
 def encode_pretraining(
    tokenizer: PreTrainedTokenizerBase, max_tokens: int, examples: List[str]
 ) -> Dict[str, List]:
@@ -831,7 +813,6 @@ def load_pretraining_dataset(path, tokenizer, max_tokens=2048, seed=42):
    dataset = dataset.map(
        encode,
        batched=True,
-        batch_size=10_000,
        input_columns="text",
        # remove all the existing columns after mapping since they end up having
        # a different length than the encoded/tokenized column
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -412,15 +412,22 @@ def load_model(
                module.to(torch.float32)

    needs_fa2_dtype = cfg.adapter or cfg.fsdp
+    skip_prepare_model_for_kbit_training = False
+
+    if cfg.model_config_type == "qwen" and cfg.adapter == "lora":
+        # Qwen doesn't play nicely with LoRA if this is enabled
+        skip_prepare_model_for_kbit_training = True
+
    if (cfg.adapter == "lora" and load_in_8bit) or (
        cfg.adapter == "qlora" and cfg.load_in_4bit
    ):
        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
        if cfg.gradient_checkpointing:
            model.gradient_checkpointing_enable()
-        model = prepare_model_for_kbit_training(
-            model, use_gradient_checkpointing=cfg.gradient_checkpointing
-        )
+        if not skip_prepare_model_for_kbit_training:
+            model = prepare_model_for_kbit_training(
+                model, use_gradient_checkpointing=cfg.gradient_checkpointing
+            )
        needs_fa2_dtype = True

    # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
--- a/src/axolotl/utils/trainer.py
+++ b/src/axolotl/utils/trainer.py
@@ -267,12 +267,14 @@ def setup_fsdp_envs(cfg):
        ] = cfg.fsdp_config.fsdp_transformer_layer_cls_to_wrap


-def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
+def prepare_optim_env(cfg):
    if cfg.fsdp:
        setup_fsdp_envs(cfg)
    elif cfg.deepspeed:
        os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"

+
+def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
    trainer_builder = HFCausalTrainerBuilder(cfg, model, tokenizer)
    trainer_builder.train_dataset = train_dataset
    trainer_builder.eval_dataset = eval_dataset
Author	SHA1	Message	Date
Wing Lian	3e3229e2d9	fix for qwen w lora (#906 )	2023-11-30 12:45:50 -05:00
Wing Lian	1d21aa6b0a	ensure merged model matches the training dtype (#902 ) * ensure merged model matches the training dtype * Update src/axolotl/cli/__init__.py * Update src/axolotl/cli/__init__.py	2023-11-29 09:55:19 -05:00
kallewoof	71b7ea3c05	Determine FSDP/deepspeed settings on device select. (#883 ) * Determine FSDP/deepspeed settings on device select. Without this, the OS env check for accelerate will fail. * rename and move env setup call * chore: lint --------- Co-authored-by: Karl-Johan Alm <kalle@gmail.com> Co-authored-by: Wing Lian <wing.lian@gmail.com>	2023-11-29 08:36:35 -05:00
NanoCode012	a48dbf6561	fix: remove FA for qwen examples (#900 ) * fix: remove FA for qwen lora * fix: remove FA for qlora	2023-11-27 21:23:54 +09:00