Compare commits


2 Commits
mm2 ... mm3

Author   SHA1         Message                                    Date
sunny    cdd8be7097   wip on multimodal packing support          2024-10-04 15:08:36 -04:00
sunny    08143c7b0d   wip on multimodal sample packing support   2024-10-04 14:59:35 -04:00
3 changed files with 103 additions and 67 deletions

View File

@@ -1,65 +0,0 @@
base_model: mistral-community/pixtral-12b
processor_type: AutoProcessor
load_in_8bit: true
strict: false
# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
chat_template: llama3_2_vision
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out
adapter: lora
lora_model_dir:
sequence_len: 8192
pad_to_sequence_len: false
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
train_on_inputs: false
group_by_length: false
bf16: true
fp16:
tf32: true
gradient_checkpointing: true
local_rank:
logging_steps: 1
flash_attention: true
eager_attention:
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
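
Note (not part of the diff): the lora_target_modules value in the removed config above is a regular expression that PEFT matches against module names to decide which layers receive LoRA adapters. A minimal sketch of that matching, using hypothetical pixtral-style module names:

import re

# Regex copied from the config above; the module names are hypothetical examples.
pattern = r"language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj"
module_names = [
    "language_model.model.layers.0.self_attn.q_proj",      # expected to match
    "language_model.model.layers.11.mlp.down_proj",        # expected to match
    "vision_tower.transformer.layers.0.attention.q_proj",  # expected not to match
]
for name in module_names:
    print(name, bool(re.fullmatch(pattern, name)))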

View File

@@ -20,6 +20,7 @@ class MultiModalChatDataCollator(DataCollatorMixin):
    return_tensors: str = "pt"
    chat_template: Optional[str] = None
    packing: bool = False
    sequence_length: Optional[int] = None
    max_images: int = -1
    padding: Union[bool, str, PaddingStrategy] = True
    pad_to_multiple_of: Optional[int] = None
@@ -32,11 +33,112 @@ class MultiModalChatDataCollator(DataCollatorMixin):
        self, examples: List[Union[List[int], Any, Dict[str, Any]]]
    ) -> Dict[str, Any]:
        # Handle dict or lists with proper padding and conversion to tensor.
        if self.packing:
            return self.__class__.process_rows_packing(
                examples,
                self.processor,
                self.chat_template,
                self.max_images,
                self.sequence_length,
            )
        return self.__class__.process_rows(
            examples, self.processor, self.chat_template, self.max_images
        )

    @staticmethod
    def process_rows_packing(
        examples,
        processor,
        chat_template,
        max_images,
        sequence_length,
        length_only=False,
    ):
        import torch

        # Perform sample packing within a batch
        if processor.tokenizer.sep_token is None:
            sep_token = "[SEP]"
            processor.tokenizer.add_tokens([sep_token])
            processor.tokenizer.sep_token = sep_token
        sep_token_id = processor.tokenizer.convert_tokens_to_ids(
            processor.tokenizer.sep_token
        )
        pad_token_id = processor.tokenizer.pad_token_id

        texts = [
            processor.apply_chat_template(
                example["messages"], chat_template=chat_template, tokenize=False
            )
            for example in examples
        ]
        images = [example["images"] for example in examples]

        if max_images > 0:
            images = [img_batch[:max_images] for img_batch in images]

        batch = processor(text=texts, images=images, padding=False)

        n_sequence = len(examples)
        n_seq_in_batch = 0
        pack_len = 0
        features_pack = {}
        packed = {}
        features = list(batch.keys())
        for feature in features:
            features_pack[feature] = []
            packed[feature] = []
        features.remove("input_ids")

        for seq_in_batch_id in range(n_sequence):
            next_seq_len = len(batch["input_ids"][seq_in_batch_id])
            if pack_len + next_seq_len + 1 < sequence_length:
                # The example still fits: append it to the current pack,
                # followed by a separator token.
                n_seq_in_batch += 1
                pack_len += next_seq_len + 1
                features_pack["input_ids"] += batch["input_ids"][seq_in_batch_id] + [
                    sep_token_id
                ]
                # TODO: do something with attention mask and cross-attention
                for feature in features:
                    features_pack[feature] += batch[feature][seq_in_batch_id]
            else:
                # The pack is full: pad it to sequence_length, emit it, and reset.
                # TODO (wip): the overflowing example is currently dropped and the
                # final partial pack is never emitted.
                for _ in range(sequence_length - pack_len):
                    features_pack["input_ids"] += [pad_token_id]
                packed["input_ids"].append(
                    torch.tensor(features_pack["input_ids"].copy())
                )
                features_pack["input_ids"] = []
                for feature in features:
                    packed[feature].append(torch.tensor(features_pack[feature].copy()))
                    features_pack[feature] = []
                pack_len = 0

        image_token_id = processor.tokenizer.convert_tokens_to_ids(
            processor.image_token
        )
        labels = [pack.clone() for pack in packed["input_ids"]]
        for label_id, label in enumerate(labels):
            # Ignore pad tokens in the loss computation
            labels[label_id][label == processor.tokenizer.pad_token_id] = -100
            # Ignore the image token index in the loss computation (model specific)
            labels[label_id][label == image_token_id] = -100
        packed["labels"] = labels

        if length_only:
            return {"length": [len(sample) for sample in batch["input_ids"]]}
        return packed

    @staticmethod
    def process_rows(examples, processor, chat_template, max_images, length_only=False):
        # HINT: use `_torch_collate_batch` to stack and pad tensors
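
Aside (not part of the diff): a minimal, self-contained sketch of the greedy packing idea that process_rows_packing applies to input_ids: concatenate tokenized examples separated by a SEP token until the next example would overflow the target length, then pad the pack and start a new one. The pack_input_ids helper, token ids, and lengths below are hypothetical; unlike the WIP collator above, this sketch also carries the overflowing example into the next pack and emits the final partial pack.

import torch

def pack_input_ids(sequences, sequence_length, sep_token_id, pad_token_id):
    # Greedily concatenate token-id lists into fixed-length packs.
    packs, current, pack_len = [], [], 0
    for seq in sequences:
        if pack_len + len(seq) + 1 < sequence_length:
            # The example still fits: append it plus a separator token.
            current += seq + [sep_token_id]
            pack_len += len(seq) + 1
        else:
            # Pad the finished pack to the full length and start a new pack
            # with the overflowing example.
            current += [pad_token_id] * (sequence_length - pack_len)
            packs.append(torch.tensor(current))
            current, pack_len = seq + [sep_token_id], len(seq) + 1
    if current:
        current += [pad_token_id] * (sequence_length - pack_len)
        packs.append(torch.tensor(current))
    return packs

# Hypothetical usage: pack three short sequences into length-16 rows.
packs = pack_input_ids(
    [[1, 2, 3], [4, 5, 6, 7], [8, 9]],
    sequence_length=16,
    sep_token_id=0,
    pad_token_id=99,
)
print([p.tolist() for p in packs])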

View File

@@ -1114,8 +1114,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
        fan_in_fan_out=cfg.lora_fan_in_fan_out,
        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
        bias="none",
        # task_type="CAUSAL_LM",
        task_type="CONDITIONAL_GENERATION" if cfg.is_multimodal else "CAUSAL_LM",
        task_type="CAUSAL_LM",
        **lora_config_kwargs,
    )
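
Context (not part of the diff): the last hunk drops the multimodal conditional and passes task_type="CAUSAL_LM" to PEFT's LoraConfig unconditionally. A minimal sketch of the call shape, with hypothetical placeholder values standing in for axolotl's cfg-driven kwargs:

from peft import LoraConfig

# Hypothetical values; axolotl fills these from cfg.* and lora_config_kwargs.
lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
    target_modules=r"model.layers.[\d]+.self_attn.(q|k|v|o)_proj",
    task_type="CAUSAL_LM",  # the hunk reverts to this unconditional value
)
print(lora_config.task_type)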