add a prelim test for expading the 4d mask

2024-01-26 00:41:24 -05:00
17 changed files with 85 additions and 177 deletions
--- a/.github/FUNDING.yml
+++ b/.github/FUNDING.yml
@@ -1,6 +1,6 @@
 # These are supported funding model platforms
-github: [winglian, OpenAccess-AI-Collective] # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
+github: OpenAccess-AI-Collective # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
 patreon: # Replace with a single Patreon username
 open_collective: # Replace with a single Open Collective username
 ko_fi: axolotl_ai # Replace with a single Ko-fi username
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -73,7 +73,7 @@ jobs:
          - cuda: 121
            cuda_version: 12.1.0
            python_version: "3.10"
-            pytorch: 2.1.2
+            pytorch: 2.1.1
    steps:
      - name: Checkout
        uses: actions/checkout@v4
--- a/README.md
+++ b/README.md
@@ -607,17 +607,6 @@ datasets:
      # For `completion` datsets only, uses the provided field instead of `text` column
      field:
 # A list of one or more datasets to eval the model with.
 # You can use either test_datasets, or val_set_size, but not both.
 test_datasets:
  - path: /workspace/data/eval.jsonl
    ds_type: json
    # You need to specify a split. For "json" datasets the default split is called "train".
    split: train
    type: completion
    data_files:
      - /workspace/data/eval.jsonl
 # use RL training: dpo, ipo, kto_pair
 rl:
@@ -707,12 +696,6 @@ lora_modules_to_save:
 lora_fan_in_fan_out: false
 peft:
  # Configuration options for loftq initialization for LoRA
  # https://huggingface.co/docs/peft/developer_guides/quantization#loftq-initialization
  loftq_config:
    loftq_bits:  # typically 4 bits
 # ReLoRA configuration
 # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
 relora_steps: # Number of steps per ReLoRA restart
--- a/examples/cerebras/qlora.yml
+++ b/examples/cerebras/qlora.yml
@@ -11,6 +11,7 @@ val_set_size: 0.05
 adapter: qlora
 lora_model_dir:
 sequence_len: 2048
 max_packed_sequence_len: 2048
 lora_r: 16
 lora_alpha: 32
 lora_dropout: 0.05
--- a/examples/llama-2/fft_optimized.yml
+++ b/examples/llama-2/fft_optimized.yml
@@ -67,3 +67,6 @@ weight_decay: 0.1
 fsdp:
 fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/examples/llama-2/loftq.yml
+++ b/examples/llama-2/loftq.yml
@@ -1,70 +0,0 @@
 base_model: NousResearch/Llama-2-7b-hf
 model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 is_llama_derived_model: true
 load_in_8bit: false
 load_in_4bit: false
 strict: false
 datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
 dataset_prepared_path:
 val_set_size: 0.05
 output_dir: ./lora-out
 sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 adapter: lora
 lora_model_dir:
 lora_r: 32
 lora_alpha: 16
 lora_dropout: 0.05
 lora_target_linear: true
 lora_fan_in_fan_out:
 peft:
  loftq_config:
    loftq_bits: 4
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 gradient_accumulation_steps: 4
 micro_batch_size: 2
 num_epochs: 4
 optimizer: adamw_bnb_8bit
 lr_scheduler: cosine
 learning_rate: 0.0002
 train_on_inputs: false
 group_by_length: false
 bf16: auto
 fp16:
 tf32: false
 gradient_checkpointing: true
 early_stopping_patience:
 resume_from_checkpoint:
 local_rank:
 logging_steps: 1
 xformers_attention:
 flash_attention: true
 s2_attention:
 warmup_steps: 10
 evals_per_epoch: 4
 eval_table_size:
 eval_table_max_new_tokens: 128
 saves_per_epoch: 1
 debug:
 deepspeed:
 weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
--- a/examples/llama-2/lora.yml
+++ b/examples/llama-2/lora.yml
@@ -65,3 +65,6 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/examples/llama-2/qlora.yml
+++ b/examples/llama-2/qlora.yml
@@ -65,3 +65,6 @@ weight_decay: 0.0
 fsdp:
 fsdp_config:
 special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft @ git+https://github.com/huggingface/peft.git
+peft==0.7.1
 transformers==4.37.0
 tokenizers==0.15.0
 bitsandbytes>=0.41.1
--- a/setup.py
+++ b/setup.py
@@ -27,7 +27,6 @@ def parse_requirements():
    try:
        torch_version = version("torch")
        _install_requires.append(f"torch=={torch_version}")
        if torch_version.startswith("2.1."):
            _install_requires.pop(_install_requires.index("xformers==0.0.22"))
            _install_requires.append("xformers>=0.0.23")
@@ -51,7 +50,7 @@ setup(
    dependency_links=dependency_links,
    extras_require={
        "flash-attn": [
-            "flash-attn==2.5.0",
+            "flash-attn==2.3.3",
        ],
        "fused-dense-lib": [
            "fused-dense-lib  @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",
--- a/src/axolotl/core/trainer_builder.py
+++ b/src/axolotl/core/trainer_builder.py
@@ -59,22 +59,6 @@ except ImportError:
 LOG = logging.getLogger("axolotl.core.trainer_builder")
 def _sanitize_kwargs_for_tagging(tag_names, kwargs=None):
    if isinstance(tag_names, str):
        tag_names = [tag_names]
    if kwargs is not None:
        if "tags" not in kwargs:
            kwargs["tags"] = tag_names
        elif "tags" in kwargs and isinstance(kwargs["tags"], list):
            kwargs["tags"].extend(tag_names)
        elif "tags" in kwargs and isinstance(kwargs["tags"], str):
            tag_names.append(kwargs["tags"])
            kwargs["tags"] = tag_names
    return kwargs
@dataclass
 class AxolotlTrainingArguments(TrainingArguments):
    """
@@ -365,13 +349,30 @@ class AxolotlTrainer(Trainer):
        #     return (loss, outputs) if return_outputs else loss
        return super().compute_loss(model, inputs, return_outputs=return_outputs)
    def _sanitize_kwargs_for_tagging(self, tag_names, kwargs=None):
        if isinstance(tag_names, str):
            tag_names = [tag_names]
        if kwargs is not None:
            if "tags" not in kwargs:
                kwargs["tags"] = tag_names
            elif "tags" in kwargs and isinstance(kwargs["tags"], list):
                kwargs["tags"].extend(tag_names)
            elif "tags" in kwargs and isinstance(kwargs["tags"], str):
                tag_names.append(kwargs["tags"])
                kwargs["tags"] = tag_names
        return kwargs
    @wraps(Trainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
-        kwargs = _sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)
+        kwargs = self._sanitize_kwargs_for_tagging(
            tag_names=self.tag_names, kwargs=kwargs
        )
        return super().push_to_hub(*args, **kwargs)
@@ -470,24 +471,6 @@ class ReLoRATrainer(AxolotlTrainer):
        return self.lr_scheduler
 class AxolotlDPOTrainer(DPOTrainer):
    """
    Extend the base DPOTrainer for axolotl helpers
    """
    tag_names = ["axolotl", "dpo"]
    @wraps(DPOTrainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
        kwargs = _sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)
        return super().push_to_hub(*args, **kwargs)
 class TrainerBuilderBase(abc.ABC):
    """
    Base class for trainer builder
@@ -735,7 +718,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
            training_arguments_kwargs["dataloader_drop_last"] = True
-        if not self.cfg.test_datasets and self.cfg.val_set_size == 0:
+        if self.cfg.val_set_size == 0:
            # no eval set, so don't eval
            training_arguments_kwargs["evaluation_strategy"] = "no"
        elif self.cfg.eval_steps:
@@ -822,7 +805,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                self.cfg.load_best_model_at_end is not False
                or self.cfg.early_stopping_patience
            )
            and not self.cfg.test_datasets
            and self.cfg.val_set_size > 0
            and self.cfg.save_steps
            and self.cfg.eval_steps
@@ -1094,7 +1076,7 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
            dpo_trainer_kwargs[
                "precompute_ref_log_probs"
            ] = self.cfg.precompute_ref_log_probs
-        dpo_trainer = AxolotlDPOTrainer(
+        dpo_trainer = DPOTrainer(
            self.model,
            self.model_ref,
            args=training_args,
--- a/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
+++ b/src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
@@ -94,7 +94,7 @@ def _prepare_decoder_attention_mask(
    sliding_window,
 ):  # pylint: disable=unused-argument
    # [bsz, seq_len]
-    if attention_mask is None or sliding_window is None:
+    if attention_mask is None:
        return attention_mask
    # NOTE: attention mask and sliding masks are only broadcastable in certain scenarios.
@@ -151,7 +151,7 @@ def flashattn_forward(
    )
    use_sliding_windows = (
-        getattr(self.config, "sliding_window") is not None
+        hasattr(self.config, "sliding_window") is not None
        and kv_seq_len > self.config.sliding_window
    )
--- a/src/axolotl/utils/config.py
+++ b/src/axolotl/utils/config.py
@@ -232,6 +232,9 @@ def validate_config(cfg):
            "eval_batch_size != micro_batch_size. This can lead to VRAM instability."
        )
    if cfg.load_4bit:
        raise ValueError("cfg.load_4bit parameter has been deprecated")
    if cfg.adapter == "qlora":
        if cfg.merge_lora:
            # can't merge qlora if loaded in 8bit or 4bit
@@ -257,8 +260,7 @@ def validate_config(cfg):
        if cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp:
            raise ValueError("Fused modules are not supported with QLoRA")
-    loftq = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
+    if not cfg.load_in_8bit and cfg.adapter == "lora":
    if not cfg.load_in_8bit and cfg.adapter == "lora" and not loftq:
        LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
    if cfg.adapter == "lora" and (cfg.flash_attn_fuse_qkv or cfg.flash_attn_fuse_mlp):
@@ -338,11 +340,6 @@ def validate_config(cfg):
            "push_to_hub_model_id is deprecated. Please use hub_model_id instead."
        )
    if cfg.hub_model_id and not (cfg.save_steps or cfg.saves_per_epoch):
        LOG.warning(
            "hub_model_id is set without any models being saved. To save a model, set either save_steps or saves_per_epoch."
        )
    if cfg.gptq and cfg.model_revision:
        raise ValueError(
            "model_revision is not supported for GPTQ models. "
--- a/src/axolotl/utils/data.py
+++ b/src/axolotl/utils/data.py
@@ -440,7 +440,7 @@ def load_prepare_datasets(
    split="train",
 ) -> Tuple[Dataset, Dataset, List[Prompter]]:
    dataset, prompters = load_tokenized_prepared_datasets(
-        tokenizer, cfg, default_dataset_prepared_path, split=split
+        tokenizer, cfg, default_dataset_prepared_path
    )
    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -9,7 +9,7 @@ import bitsandbytes as bnb
 import torch
 import transformers
 from optimum.bettertransformer import BetterTransformer
-from peft import LoftQConfig, PeftConfig, prepare_model_for_kbit_training
+from peft import PeftConfig, prepare_model_for_kbit_training
 from peft.tuners.lora import QuantLinear
 from transformers import (  # noqa: F401
    AddedToken,
@@ -667,17 +667,13 @@ def load_model(
        # Qwen doesn't play nicely with LoRA if this is enabled
        skip_prepare_model_for_kbit_training = True
-    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
+    if (cfg.adapter == "lora" and load_in_8bit) or (
-    if cfg.adapter == "lora" and loftq_bits:
+        cfg.adapter == "qlora" and cfg.load_in_4bit
-        skip_prepare_model_for_kbit_training = True
+    ):
-
+        LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
    if cfg.adapter in ["lora", "qlora"]:
        if cfg.gradient_checkpointing:
            model.gradient_checkpointing_enable()
-        if (
+        if not skip_prepare_model_for_kbit_training:
            cfg.load_in_8bit or cfg.load_in_4bit
        ) and not skip_prepare_model_for_kbit_training:
            LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
            model = prepare_model_for_kbit_training(
                model, use_gradient_checkpointing=cfg.gradient_checkpointing
            )
@@ -704,7 +700,6 @@ def load_model(
            model, lora_config = load_adapter(model, cfg, cfg.adapter)
    if cfg.ddp and not load_in_8bit and not (cfg.rl and cfg.load_in_4bit):
        # TODO revaldate this conditional
        model.to(f"cuda:{cfg.local_rank}")
    if torch.cuda.device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
@@ -756,7 +751,7 @@ def load_llama_adapter(model, cfg):
    )
    if cfg.lora_model_dir:
-        LOG.debug("Loading pretrained PEFT - llama_adapter")
+        LOG.debug("Loading pretained PEFT - llama_adapter")
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
@@ -802,12 +797,6 @@ def load_lora(model, cfg, inference=False, config_only=False):
        LOG.info(f"found linear modules: {repr(linear_names)}")
        lora_target_modules = list(set(lora_target_modules + linear_names))
    lora_config_kwargs = {}
    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
    if loftq_bits:
        lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
        lora_config_kwargs["init_lora_weights"] = "loftq"
    lora_config = LoraConfig(
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
@@ -818,14 +807,13 @@ def load_lora(model, cfg, inference=False, config_only=False):
        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
        bias="none",
        task_type="CAUSAL_LM",
        **lora_config_kwargs,
    )
    if config_only:
        return None, lora_config
    if cfg.lora_model_dir:
-        LOG.debug("Loading pretrained PEFT - LoRA")
+        LOG.debug("Loading pretained PEFT - LoRA")
        model_kwargs: Any = {}
        if cfg.lora_on_cpu:
            model_kwargs["max_memory"] = {"cpu": "256GiB"}
--- a/tests/test_expand_mask.py
+++ b/tests/test_expand_mask.py
@@ -39,6 +39,32 @@ class TestExpandMask(unittest.TestCase):
        # Check that the output matches the expected output
        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
    def test_output_multipack(self):
        mask = torch.tensor([[1, 1, 1, 0], [2, 2, 3, 3]])
        dtype = torch.float32
        expected_output = torch.tensor(
            [
                [
                    [
                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
                        [0.0000e00, 0.0000e00, 0.0000e00, -3.4028e38],
                        [-3.4028e38, -3.4028e38, -3.4028e38, -3.4028e38],
                    ]
                ],
                [
                    [
                        [0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
                        [0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
                        [-3.4028e38, -3.4028e38, 0.0000e00, -3.4028e38],
                        [-3.4028e38, -3.4028e38, 0.0000e00, 0.0000e00],
                    ]
                ],
            ]
        )
        # Check that the output matches the expected output
        self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
 if __name__ == "__main__":
    unittest.main()
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -26,12 +26,21 @@ class BaseValidation(unittest.TestCase):
        self._caplog = caplog
 # pylint: disable=too-many-public-methods
 class ValidationTest(BaseValidation):
    """
    Test the validation module
    """
    def test_load_4bit_deprecate(self):
        cfg = DictDefault(
            {
                "load_4bit": True,
            }
        )
        with pytest.raises(ValueError):
            validate_config(cfg)
    def test_batch_size_unused_warning(self):
        cfg = DictDefault(
            {
@@ -689,22 +698,6 @@ class ValidationTest(BaseValidation):
        ):
            validate_config(cfg)
    def test_hub_model_id_save_value_warns(self):
        cfg = DictDefault({"hub_model_id": "test"})
        with self._caplog.at_level(logging.WARNING):
            validate_config(cfg)
            assert (
                "set without any models being saved" in self._caplog.records[0].message
            )
    def test_hub_model_id_save_value(self):
        cfg = DictDefault({"hub_model_id": "test", "saves_per_epoch": 4})
        with self._caplog.at_level(logging.WARNING):
            validate_config(cfg)
            assert len(self._caplog.records) == 0
 class ValidationCheckModelConfig(BaseValidation):
    """