add a prelim test for expading the 4d mask

2024-01-26 00:41:24 -05:00
17 changed files with 85 additions and 177 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,6 +1,6 @@
 # These are supported funding model platforms

-github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 patreon: # Replace with a single Patreon username
 open_collective: # Replace with a single Open Collective username
 ko_fi: axolotl_ai # Replace with a single Ko-fi username
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -73,7 +73,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
-            pytorch: 2.1.2
+            pytorch: 2.1.1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/README.md
+++ b/README.md
@@ -607,17 +607,6 @@ datasets:
      # For `completion` datsets only, uses the provided field instead of `text` column
      field:

-# A list of one or more datasets to eval the model with.
-# You can use either test_datasets, or val_set_size, but not both.
-test_datasets:
-  - path: /workspace/data/eval.jsonl
-    ds_type: json
-    # You need to specify a split. For "json" datasets the default split is called "train".
-    split: train
-    type: completion
-    data_files:
-      - /workspace/data/eval.jsonl
-
 # use RL training: dpo, ipo, kto_pair
 rl:

@@ -707,12 +696,6 @@ lora_modules_to_save:

 lora_fan_in_fan_out: false

-peft:
-  # Configuration options for loftq initialization for LoRA
-  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
-  loftq_config:
-    loftq_bits:  # typically 4 bits
-
 # ReLoRA configuration
 # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
 relora_steps: # Number of steps per ReLoRA restart
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -11,6 +11,7 @@ val_set_size: 0.05
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
+max_packed_sequence_len: 2048
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -67,3 +67,6 @@ weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -1,70 +0,0 @@
-base_model: NousResearch/Llama-2-7b-hf
-model_type: LlamaForCausalLM
-tokenizer_type: LlamaTokenizer
-is_llama_derived_model: true
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: mhenrichsen/alpaca_2k_test
-    type: alpaca
-dataset_prepared_path:
-val_set_size: 0.05
-output_dir: ./lora-out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-
-adapter: lora
-lora_model_dir:
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-peft:
-  loftq_config:
-    loftq_bits: 4
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 2
-num_epochs: 4
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-early_stopping_patience:
-resume_from_checkpoint:
-local_rank:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-s2_attention:
-
-warmup_steps: 10
-evals_per_epoch: 4
-eval_table_size:
-eval_table_max_new_tokens: 128
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -65,3 +65,6 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -65,3 +65,6 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
+  bos_token: "<s>"
+  eos_token: "</s>"
+  unk_token: "<unk>"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft @ git+https://github.com/huggingface/peft.git
+peft==0.7.1
 transformers==4.37.0
 tokenizers==0.15.0
 bitsandbytes>=0.41.1
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,6 @@ def parse_requirements():

    try:
        torch_version = version("torch")
-        _install_requires.append(f"torch=={torch_version}")
        if torch_version.startswith("2.1."):
            _install_requires.pop(_install_requires.index("xformers==0.0.22"))
            _install_requires.append("xformers>=0.0.23")
@@ -51,7 +50,7 @@ setup(
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.5.0",
+            "flash-attn==2.3.3",
        ],
        "fused-dense-lib": [
            "fused-dense-lib  @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -59,22 +59,6 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")


-def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
-    if isinstance(tag_names, str):
-        tag_names = [tag_names]
-
-    if kwargs is not None:
-        if "tags" not in kwargs:
-            kwargs["tags"] = tag_names
-        elif "tags" in kwargs and isinstance(kwargs["tags"], list):
-            kwargs["tags"].extend(tag_names)
-        elif "tags" in kwargs and isinstance(kwargs["tags"], str):
-            tag_names.append(kwargs["tags"])
-            kwargs["tags"] = tag_names
-
-    return kwargs
-
-
@dataclass
 class AxolotlTrainingArguments(TrainingArguments):
    """
@@ -365,13 +349,30 @@ class AxolotlTrainer(Trainer):
        #     return (loss, outputs) if return_outputs else loss
        return super().compute_loss(model, inputs, return_outputs=return_outputs)

+    def _sanitize_kwargs_for_tagging(self, tag_names, kwargs=None):
+        if isinstance(tag_names, str):
+            tag_names = [tag_names]
+
+        if kwargs is not None:
+            if "tags" not in kwargs:
+                kwargs["tags"] = tag_names
+            elif "tags" in kwargs and isinstance(kwargs["tags"], list):
+                kwargs["tags"].extend(tag_names)
+            elif "tags" in kwargs and isinstance(kwargs["tags"], str):
+                tag_names.append(kwargs["tags"])
+                kwargs["tags"] = tag_names
+
+        return kwargs
+
    @wraps(Trainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
-        kwargs = _sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)
+        kwargs = self._sanitize_kwargs_for_tagging(
+            tag_names=self.tag_names, kwargs=kwargs
+        )

        return super().push_to_hub(*args, **kwargs)

@@ -470,24 +471,6 @@ class ReLoRATrainer(AxolotlTrainer):
        return self.lr_scheduler


-class AxolotlDPOTrainer(DPOTrainer):
-    """
-    Extend the base DPOTrainer for axolotl helpers
-    """
-
-    tag_names = ["axolotl", "dpo"]
-
-    @wraps(DPOTrainer.push_to_hub)
-    def push_to_hub(self, *args, **kwargs) -> str:
-        """
-        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
-        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
-        """
-        kwargs = _sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)
-
-        return super().push_to_hub(*args, **kwargs)
-
-
 class TrainerBuilderBase(abc.ABC):
    """
    Base class for trainer builder
@@ -735,7 +718,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
            training_arguments_kwargs["dataloader_drop_last"] = True

-        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
+        if self.cfg.val_set_size == 0:
            # no eval set, so don't eval
            training_arguments_kwargs["evaluation_strategy"] = "no"
        elif self.cfg.eval_steps:
@@ -822,7 +805,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                self.cfg.load_best_model_at_end is not False
                or self.cfg.early_stopping_patience
            )
-            and not self.cfg.test_datasets
            and self.cfg.val_set_size > 0
            and self.cfg.save_steps
            and self.cfg.eval_steps
@@ -1094,7 +1076,7 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
            dpo_trainer_kwargs[
                "precompute_ref_log_probs"
            ] = self.cfg.precompute_ref_log_probs
-        dpo_trainer = AxolotlDPOTrainer(
+        dpo_trainer = DPOTrainer(
            self.model,
            self.model_ref,
            args=training_args,
--- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
@@ -94,7 +94,7 @@ def _prepare_decoder_attention_mask(
    sliding_window,
 ):  # pylint: disable=unused-argument
    # [bsz, seq_len]
-    if attention_mask is None or sliding_window is None:
+    if attention_mask is None:
        return attention_mask

    # NOTE: attention mask and sliding masks are only broadcastable in certain scenarios.
@@ -151,7 +151,7 @@ def flashattn_forward(
    )

    use_sliding_windows = (
-        getattr(self.config, "sliding_window") is not None
+        hasattr(self.config, "sliding_window") is not None
        and kv_seq_len > self.config.sliding_window
    )

--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -232,6 +232,9 @@ def validate_config(cfg):
            "eval_batch_size != micro_batch_size. This can lead to VRAM instability."
        )

+    if cfg.load_4bit:
+        raise ValueError("cfg.load_4bit parameter has been deprecated")
+
    if cfg.adapter == "qlora":
        if cfg.merge_lora:
            # can't merge qlora if loaded in 8bit or 4bit
@@ -257,8 +260,7 @@ def validate_config(cfg):
        if cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp:
            raise ValueError("Fused modules are not supported with QLoRA")

-    loftq = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
-    if not cfg.load_in_8bit and cfg.adapter == "lora" and not loftq:
+    if not cfg.load_in_8bit and cfg.adapter == "lora":
        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")

    if cfg.adapter == "lora" and (cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp):
@@ -338,11 +340,6 @@ def validate_config(cfg):
            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
        )

-    if cfg.hub_model_id and not (cfg.save_steps or cfg.saves_per_epoch):
-        LOG.warning(
-            "hub_model_id is set without any models being saved. To save a model, set either save_steps or saves_per_epoch."
-        )
-
    if cfg.gptq and cfg.model_revision:
        raise ValueError(
            "model_revision is not supported for GPTQ models. "
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -440,7 +440,7 @@ def load_prepare_datasets(
    split="train",
 ) -> Tuple[Dataset, Dataset, List[Prompter]]:
    dataset, prompters = load_tokenized_prepared_datasets(
-        tokenizer, cfg, default_dataset_prepared_path, split=split
+        tokenizer, cfg, default_dataset_prepared_path
    )

    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -9,7 +9,7 @@ import bitsandbytes as bnb
 import torch
 import transformers
 from optimum.bettertransformer import BetterTransformer
-from peft import LoftQConfig, PeftConfig, prepare_model_for_kbit_training
+from peft import PeftConfig, prepare_model_for_kbit_training
 from peft.tuners.lora import QuantLinear
 from transformers import (  # noqa: F401
    AddedToken,
@@ -667,17 +667,13 @@ def load_model(
        # Qwen doesn't play nicely with LoRA if this is enabled
        skip_prepare_model_for_kbit_training = True

-    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
-    if cfg.adapter == "lora" and loftq_bits:
-        skip_prepare_model_for_kbit_training = True
-
-    if cfg.adapter in ["lora", "qlora"]:
+    if (cfg.adapter == "lora" and load_in_8bit) or (
+        cfg.adapter == "qlora" and cfg.load_in_4bit
+    ):
+        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
        if cfg.gradient_checkpointing:
            model.gradient_checkpointing_enable()
-        if (
-            cfg.load_in_8bit or cfg.load_in_4bit
-        ) and not skip_prepare_model_for_kbit_training:
-            LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
+        if not skip_prepare_model_for_kbit_training:
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=cfg.gradient_checkpointing
            )
@@ -704,7 +700,6 @@ def load_model(
            model, lora_config = load_adapter(model, cfg, cfg.adapter)

    if cfg.ddp and not load_in_8bit and not (cfg.rl and cfg.load_in_4bit):
-        # TODO revaldate this conditional
        model.to(f"cuda:{cfg.local_rank}")

    if torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
@@ -756,7 +751,7 @@ def load_llama_adapter(model, cfg):
    )

    if cfg.lora_model_dir:
-        LOG.debug("Loading pretrained PEFT - llama_adapter")
+        LOG.debug("Loading pretained PEFT - llama_adapter")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
@@ -802,12 +797,6 @@ def load_lora(model, cfg, inference=False, config_only=False):
        LOG.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))

-    lora_config_kwargs = {}
-    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
-    if loftq_bits:
-        lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
-        lora_config_kwargs["init_lora_weights"] = "loftq"
-
    lora_config = LoraConfig(
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
@@ -818,14 +807,13 @@ def load_lora(model, cfg, inference=False, config_only=False):
        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
        bias="none",
        task_type="CAUSAL_LM",
-        **lora_config_kwargs,
    )

    if config_only:
        return None, lora_config

    if cfg.lora_model_dir:
-        LOG.debug("Loading pretrained PEFT - LoRA")
+        LOG.debug("Loading pretained PEFT - LoRA")
        model_kwargs: Any = {}
        if cfg.lora_on_cpu:
            model_kwargs["max_memory"] = {"cpu": "256GiB"}
--- a/tests/test_expand_mask.py
+++ b/tests/test_expand_mask.py
@@ -39,6 +39,32 @@ class TestExpandMask(unittest.TestCase):
        # Check that the output matches the expected output
        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))

+    def test_output_multipack(self):
+        mask = torch.tensor([[1, 1, 1, 0], [2, 2, 3, 3]])
+        dtype = torch.float32
+        expected_output = torch.tensor(
+            [
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, -3.4028e38, -3.4028e38],
+                    ]
+                ],
+                [
+                    [
+                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
+                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, 0.0000e00, -3.4028e38],
+                        [-3.4028e38, -3.4028e38, 0.0000e00, 0.0000e00],
+                    ]
+                ],
+            ]
+        )
+        # Check that the output matches the expected output
+        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
+

 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -26,12 +26,21 @@ class BaseValidation(unittest.TestCase):
        self._caplog = caplog


-# pylint: disable=too-many-public-methods
 class ValidationTest(BaseValidation):
    """
    Test the validation module
    """

+    def test_load_4bit_deprecate(self):
+        cfg = DictDefault(
+            {
+                "load_4bit": True,
+            }
+        )
+
+        with pytest.raises(ValueError):
+            validate_config(cfg)
+
    def test_batch_size_unused_warning(self):
        cfg = DictDefault(
            {
@@ -689,22 +698,6 @@ class ValidationTest(BaseValidation):
        ):
            validate_config(cfg)

-    def test_hub_model_id_save_value_warns(self):
-        cfg = DictDefault({"hub_model_id": "test"})
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert (
-                "set without any models being saved" in self._caplog.records[0].message
-            )
-
-    def test_hub_model_id_save_value(self):
-        cfg = DictDefault({"hub_model_id": "test", "saves_per_epoch": 4})
-
-        with self._caplog.at_level(logging.WARNING):
-            validate_config(cfg)
-            assert len(self._caplog.records) == 0
-

 class ValidationCheckModelConfig(BaseValidation):
    """