Compare commits

2 Commits

| Author | SHA1 | Date |
|---|---|---|
|  | cdd8be7097 |  |
|  | 08143c7b0d |  |
```diff
@@ -1,65 +0,0 @@
-base_model: mistral-community/pixtral-12b
-processor_type: AutoProcessor
-
-load_in_8bit: true
-strict: false
-
-# these 3 lines are needed for now to handle vision chat templates w images
-skip_prepare_dataset: true
-remove_unused_columns: false
-sample_packing: false
-
-chat_template: llama3_2_vision
-datasets:
-  - path: HuggingFaceH4/llava-instruct-mix-vsft
-    type: chat_template
-    split: train[:1%]
-    field_messages: messages
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.0
-output_dir: ./outputs/out
-
-adapter: lora
-lora_model_dir:
-
-sequence_len: 8192
-pad_to_sequence_len: false
-
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 4
-micro_batch_size: 1
-num_epochs: 1
-optimizer: adamw_bnb_8bit
-lr_scheduler: cosine
-learning_rate: 0.0002
-
-train_on_inputs: false
-group_by_length: false
-bf16: true
-fp16:
-tf32: true
-
-gradient_checkpointing: true
-local_rank:
-logging_steps: 1
-flash_attention: true
-eager_attention:
-
-warmup_ratio: 0.1
-evals_per_epoch: 1
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
```
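One detail worth noting in the removed config: `lora_target_modules` is a regex over full module paths rather than a list of layer-name suffixes (PEFT applies `re.fullmatch` when `target_modules` is a string). A minimal sketch of what that pattern selects, using hypothetical Pixtral module names:

```python
import re

# The lora_target_modules value from the removed config, as a Python regex.
pattern = re.compile(
    r"language_model.model.layers.[\d]+."
    r"(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj"
)

# Hypothetical module names for illustration only.
candidates = [
    "language_model.model.layers.0.self_attn.q_proj",      # matches
    "language_model.model.layers.31.mlp.gate_proj",        # matches
    "vision_tower.transformer.layers.0.attention.q_proj",  # vision tower: no match
    "language_model.model.embed_tokens",                   # no match
]

for name in candidates:
    print(name, "->", bool(pattern.fullmatch(name)))
```

The effect is that LoRA is applied only to the language model's attention and MLP projections, leaving the vision tower untouched.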
```diff
@@ -20,6 +20,7 @@ class MultiModalChatDataCollator(DataCollatorMixin):
     return_tensors: str = "pt"
     chat_template: Optional[str] = None
     packing: bool = False
+    sequence_length: Optional[int] = None
     max_images: int = -1
     padding: Union[bool, str, PaddingStrategy] = True
     pad_to_multiple_of: Optional[int] = None
```
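With the new `sequence_length` field in place, packing is opted into per collator instance. A hedged usage sketch (the import path and the `processor` keyword are assumptions inferred from the attributes the diff references; only `packing`, `sequence_length`, `chat_template`, and `max_images` are confirmed fields):

```python
from transformers import AutoProcessor

# Hypothetical import path; adjust to wherever MultiModalChatDataCollator
# actually lives in this repo.
from axolotl.utils.collators import MultiModalChatDataCollator

processor = AutoProcessor.from_pretrained("mistral-community/pixtral-12b")

collator = MultiModalChatDataCollator(
    processor=processor,   # read as self.processor in __call__ (assumed field)
    packing=True,          # route batches through process_rows_packing
    sequence_length=8192,  # target pack length; matches sequence_len in the config
)
```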
```diff
@@ -32,11 +33,112 @@ class MultiModalChatDataCollator(DataCollatorMixin):
         self, examples: List[Union[List[int], Any, Dict[str, Any]]]
     ) -> Dict[str, Any]:
         # Handle dict or lists with proper padding and conversion to tensor.
+        if self.packing:
+            return self.__class__.process_rows_packing(
+                examples,
+                self.processor,
+                self.chat_template,
+                self.max_images,
+                self.sequence_length,
+            )
         return self.__class__.process_rows(
             examples, self.processor, self.chat_template, self.max_images
         )
 
+    @staticmethod
+    def process_rows_packing(
+        examples,
+        processor,
+        chat_template,
+        max_images,
+        sequence_length,
+        length_only=False,
+    ):
+        import torch
+
+        # Perform sample packing within a batch, joining sequences with a
+        # separator token. Register a [SEP] token if the tokenizer lacks one.
+        if processor.tokenizer.sep_token is None:
+            sep_token = "[SEP]"
+            processor.tokenizer.add_tokens([sep_token])
+            processor.tokenizer.sep_token = sep_token
+        sep_token_id = processor.tokenizer.convert_tokens_to_ids(
+            processor.tokenizer.sep_token
+        )
+        pad_token_id = processor.tokenizer.pad_token_id
+
+        texts = [
+            processor.apply_chat_template(
+                example["messages"], chat_template=chat_template, tokenize=False
+            )
+            for example in examples
+        ]
+        images = [example["images"] for example in examples]
+
+        if max_images > 0:
+            images = [img_batch[:max_images] for img_batch in images]
+
+        batch = processor(text=texts, images=images, padding=False)
+
+        n_sequence = len(examples)
+        n_seq_in_batch = 0
+        pack_len = 0
+        features_pack = {}
+        packed = {}
+        features = list(batch.keys())
+        for feature in features:
+            features_pack[feature] = []
+            packed[feature] = []
+        features.remove("input_ids")
+
+        for seq_in_batch_id in range(n_sequence):
+            next_seq_len = len(batch["input_ids"][seq_in_batch_id])
+            if pack_len + next_seq_len + 1 < sequence_length:
+                # The sequence (plus separator) still fits: append it to the
+                # current pack.
+                n_seq_in_batch += 1
+                pack_len += next_seq_len + 1
+                features_pack["input_ids"] += batch["input_ids"][seq_in_batch_id] + [
+                    sep_token_id
+                ]
+                # TODO: do something with attention mask and cross-attention
+                for feature in features:
+                    features_pack[feature] += batch[feature][seq_in_batch_id]
+            else:
+                # The pack is full: pad it to sequence_length and flush it,
+                # then start a new pack with the current sequence.
+                for _ in range(sequence_length - pack_len):
+                    features_pack["input_ids"] += [pad_token_id]
+                packed["input_ids"].append(
+                    torch.tensor(features_pack["input_ids"].copy())
+                )
+                features_pack["input_ids"] = []
+                for feature in features:
+                    packed[feature].append(torch.tensor(features_pack[feature].copy()))
+                    features_pack[feature] = []
+                pack_len = next_seq_len + 1
+                features_pack["input_ids"] += batch["input_ids"][seq_in_batch_id] + [
+                    sep_token_id
+                ]
+                for feature in features:
+                    features_pack[feature] += batch[feature][seq_in_batch_id]
+
+        # Flush the final, partially filled pack.
+        if pack_len > 0:
+            for _ in range(sequence_length - pack_len):
+                features_pack["input_ids"] += [pad_token_id]
+            packed["input_ids"].append(torch.tensor(features_pack["input_ids"]))
+            for feature in features:
+                packed[feature].append(torch.tensor(features_pack[feature]))
+
+        image_token_id = processor.tokenizer.convert_tokens_to_ids(
+            processor.image_token
+        )
+        labels = [pack.clone() for pack in packed["input_ids"]]
+        for label_id, label in enumerate(labels):
+            labels[label_id][label == processor.tokenizer.pad_token_id] = -100
+            # Ignore the image token index in the loss computation (model specific)
+            labels[label_id][label == image_token_id] = -100
+        packed["labels"] = labels
+
+        if length_only:
+            return {"length": [len(sample) for sample in batch["input_ids"]]}
+        return packed
+
     @staticmethod
     def process_rows(examples, processor, chat_template, max_images, length_only=False):
         # HINT: use `_torch_collate_batch` to stack and pad tensors
```
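The packing strategy above is a greedy first-fit pass over the batch: token sequences are concatenated, each followed by a single `sep_token_id`, until the next sequence would overflow `sequence_length`, at which point the pack is padded with `pad_token_id` and flushed. A self-contained toy run of the same loop on plain integer lists (the token ids, separator, and pad id are made up for illustration):

```python
# Toy reproduction of the greedy packing loop above, on plain lists.
SEP, PAD, SEQ_LEN = 99, 0, 10

def pack(sequences, seq_len=SEQ_LEN):
    packs, current = [], []
    for seq in sequences:
        if len(current) + len(seq) + 1 < seq_len:
            # Sequence plus separator still fits in the current pack.
            current += seq + [SEP]
        else:
            # Pad the finished pack to seq_len, flush, start a new pack.
            packs.append(current + [PAD] * (seq_len - len(current)))
            current = seq + [SEP]
    if current:  # flush the final, partially filled pack
        packs.append(current + [PAD] * (seq_len - len(current)))
    return packs

print(pack([[1, 2, 3], [4, 5], [6, 7, 8, 9], [10]]))
# [[1, 2, 3, 99, 4, 5, 99, 0, 0, 0], [6, 7, 8, 9, 99, 10, 99, 0, 0, 0]]
```

Note the added method is still a draft: only `input_ids` is padded to `sequence_length`, so the other features (attention mask, cross-attention, pixel values) are concatenated unpadded, as the inline TODO acknowledges.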
```diff
@@ -1114,8 +1114,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
         fan_in_fan_out=cfg.lora_fan_in_fan_out,
         modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
         bias="none",
-        # task_type="CAUSAL_LM",
-        task_type="CONDITIONAL_GENERATION" if cfg.is_multimodal else "CAUSAL_LM",
+        task_type="CAUSAL_LM",
         **lora_config_kwargs,
     )
 
```
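This last hunk appears to revert the multimodal-aware task type, pinning `task_type` back to `"CAUSAL_LM"`. For reference, the keyword feeds PEFT's `LoraConfig`; a minimal sketch of an equivalent standalone config, with hyperparameters taken from the removed example config and the other `load_lora` keywords (`fan_in_fan_out`, `modules_to_save`, `**lora_config_kwargs`) omitted for brevity:

```python
from peft import LoraConfig

# Values mirror the removed Pixtral example config; this is a sketch,
# not the exact LoraConfig that load_lora builds.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=(
        r"language_model.model.layers.[\d]+."
        r"(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj"
    ),
    bias="none",
    task_type="CAUSAL_LM",
)
```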