Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a02af506ed | ||
|
|
431a0b0f9d |
65
examples/pixtral/lora-12b.yaml
Normal file
65
examples/pixtral/lora-12b.yaml
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
base_model: mistral-community/pixtral-12b
|
||||||
|
processor_type: AutoProcessor
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
# These 3 lines are needed for now to handle vision chat templates with images
|
||||||
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false
|
||||||
|
sample_packing: false
|
||||||
|
|
||||||
|
chat_template: llama3_2_vision
|
||||||
|
datasets:
|
||||||
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
|
type: chat_template
|
||||||
|
split: train[:1%]
|
||||||
|
field_messages: messages
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
|
||||||
|
sequence_len: 8192
|
||||||
|
pad_to_sequence_len: false
|
||||||
|
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 4
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
train_on_inputs: false
|
||||||
|
group_by_length: false
|
||||||
|
bf16: true
|
||||||
|
fp16:
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
local_rank:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
eager_attention:
|
||||||
|
|
||||||
|
warmup_ratio: 0.1
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
debug:
|
||||||
|
deepspeed:
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
fsdp_config:
|
||||||
@@ -20,7 +20,6 @@ class MultiModalChatDataCollator(DataCollatorMixin):
|
|||||||
return_tensors: str = "pt"
|
return_tensors: str = "pt"
|
||||||
chat_template: Optional[str] = None
|
chat_template: Optional[str] = None
|
||||||
packing: bool = False
|
packing: bool = False
|
||||||
sequence_length: Optional[int] = None
|
|
||||||
max_images: int = -1
|
max_images: int = -1
|
||||||
padding: Union[bool, str, PaddingStrategy] = True
|
padding: Union[bool, str, PaddingStrategy] = True
|
||||||
pad_to_multiple_of: Optional[int] = None
|
pad_to_multiple_of: Optional[int] = None
|
||||||
@@ -33,112 +32,11 @@ class MultiModalChatDataCollator(DataCollatorMixin):
|
|||||||
self, examples: List[Union[List[int], Any, Dict[str, Any]]]
|
self, examples: List[Union[List[int], Any, Dict[str, Any]]]
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
# Handle dict or lists with proper padding and conversion to tensor.
|
# Handle dict or lists with proper padding and conversion to tensor.
|
||||||
if self.packing:
|
|
||||||
return self.__class__.process_rows_packing(
|
|
||||||
examples,
|
|
||||||
self.processor,
|
|
||||||
self.chat_template,
|
|
||||||
self.max_images,
|
|
||||||
self.sequence_length,
|
|
||||||
)
|
|
||||||
|
|
||||||
return self.__class__.process_rows(
|
return self.__class__.process_rows(
|
||||||
examples, self.processor, self.chat_template, self.max_images
|
examples, self.processor, self.chat_template, self.max_images
|
||||||
)
|
)
|
||||||
|
|
||||||
@staticmethod
def process_rows_packing(
    examples,
    processor,
    chat_template,
    max_images,
    sequence_length,
    length_only=False,
):
    """Greedily pack the chat samples of one batch into padded rows.

    Each sample is rendered with ``processor.apply_chat_template``, run
    through ``processor`` without padding, and appended to the current pack
    followed by the tokenizer's separator token. When the next sample (plus
    its separator) would push the pack past ``sequence_length``, the pack is
    right-padded with ``pad_token_id`` and a new one is started. The last,
    partially filled pack is emitted as well.

    Args:
        examples: Rows that provide a ``"messages"`` and an ``"images"`` entry.
        processor: Callable multimodal processor exposing ``tokenizer``,
            ``apply_chat_template`` and ``image_token``.
        chat_template: Template forwarded to ``apply_chat_template``.
        max_images: When positive, keep at most this many images per row.
        sequence_length: Target number of tokens per pack.
        length_only: When true, skip packing and only report token counts.

    Returns:
        ``{"length": [...]}`` with one token count per sample when
        ``length_only`` is set. Otherwise a dict that maps every key returned
        by ``processor`` (plus ``"labels"``) to a list with one tensor per
        pack. Labels copy ``input_ids`` with pad and image tokens set to -100.

    Raises:
        ValueError: If a pack needs padding but the tokenizer has no
            ``pad_token_id``.
    """
    import torch

    tokenizer = processor.tokenizer

    # Samples inside a pack are delimited by a separator token; register one
    # when the tokenizer does not define it.
    # NOTE(review): adding a token grows the vocabulary -- confirm that the
    # model's embeddings are resized by the caller.
    if tokenizer.sep_token is None:
        sep_token = "[SEP]"
        tokenizer.add_tokens([sep_token])
        tokenizer.sep_token = sep_token
    sep_token_id = tokenizer.convert_tokens_to_ids(tokenizer.sep_token)
    pad_token_id = tokenizer.pad_token_id

    texts = [
        processor.apply_chat_template(
            example["messages"], chat_template=chat_template, tokenize=False
        )
        for example in examples
    ]
    images = [example["images"] for example in examples]
    if max_images > 0:
        images = [img_batch[:max_images] for img_batch in images]

    batch = processor(text=texts, images=images, padding=False)

    if length_only:
        # Nothing needs to be packed to report the per-sample token counts.
        return {"length": [len(ids) for ids in batch["input_ids"]]}

    feature_names = list(batch.keys())
    extra_features = [name for name in feature_names if name != "input_ids"]
    has_attention_mask = "attention_mask" in extra_features
    packed = {name: [] for name in feature_names}
    current = {name: [] for name in feature_names}

    def flush():
        """Pad the pack under construction, store it as tensors and reset it."""
        padding = max(sequence_length - len(current["input_ids"]), 0)
        if padding:
            if pad_token_id is None:
                raise ValueError(
                    "tokenizer.pad_token_id must be set to pad packed sequences"
                )
            current["input_ids"].extend([pad_token_id] * padding)
            if has_attention_mask:
                # Padding positions must not be attended to.
                current["attention_mask"].extend([0] * padding)
        for name in feature_names:
            # NOTE(review): features other than input_ids/attention_mask are
            # only concatenated, never padded -- confirm this is what the
            # model expects for image features.
            packed[name].append(torch.tensor(current[name]))
            current[name] = []

    for index, ids in enumerate(batch["input_ids"]):
        needed = len(ids) + 1  # +1 for the trailing separator
        # Close the current pack when this sample no longer fits. A sample
        # that is too long on its own still gets a (longer, unpadded) pack of
        # its own instead of being dropped or truncated.
        if current["input_ids"] and len(current["input_ids"]) + needed > sequence_length:
            flush()

        current["input_ids"].extend(ids)
        current["input_ids"].append(sep_token_id)
        for name in extra_features:
            current[name].extend(batch[name][index])
        if has_attention_mask:
            # The separator is a real token, keep the mask aligned with it.
            current["attention_mask"].append(1)
        # TODO: samples sharing a pack can still attend to each other; a
        # per-sample attention / cross-attention mask is not built here.

    if current["input_ids"]:
        flush()

    image_token_id = tokenizer.convert_tokens_to_ids(processor.image_token)
    labels = []
    for ids in packed["input_ids"]:
        label = ids.clone()
        if pad_token_id is not None:
            label[ids == pad_token_id] = -100
        # Ignore the image token index in the loss computation (model specific)
        label[ids == image_token_id] = -100
        labels.append(label)
    packed["labels"] = labels

    return packed
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def process_rows(examples, processor, chat_template, max_images, length_only=False):
|
def process_rows(examples, processor, chat_template, max_images, length_only=False):
|
||||||
# HINT: use `_torch_collate_batch` to stack and pad tensors
|
# HINT: use `_torch_collate_batch` to stack and pad tensors
|
||||||
|
|||||||
@@ -1114,7 +1114,8 @@ def load_lora(model, cfg, inference=False, config_only=False):
|
|||||||
fan_in_fan_out=cfg.lora_fan_in_fan_out,
|
fan_in_fan_out=cfg.lora_fan_in_fan_out,
|
||||||
modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
|
modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
|
||||||
bias="none",
|
bias="none",
|
||||||
task_type="CAUSAL_LM",
|
# task_type="CAUSAL_LM",
|
||||||
|
task_type="CONDITIONAL_GENERATION" if cfg.is_multimodal else "CAUSAL_LM",
|
||||||
**lora_config_kwargs,
|
**lora_config_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user