Post release fixes (#2581)
* fix missing kwarg on child
* make the runpod test shorter
* update docs
* rename runpod test json file
* typing fixes and ordering of doc
This commit is contained in:
@@ -12,22 +12,22 @@
|
|||||||
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
"base_model": "HuggingFaceTB/SmolLM2-135M",
|
||||||
"model_type": "AutoModelForCausalLM",
|
"model_type": "AutoModelForCausalLM",
|
||||||
"tokenizer_type": "AutoTokenizer",
|
"tokenizer_type": "AutoTokenizer",
|
||||||
"load_in_8bit": true,
|
"load_in_4bit": true,
|
||||||
"load_in_4bit": false,
|
|
||||||
"strict": false,
|
"strict": false,
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
"path": "mhenrichsen/alpaca_2k_test",
|
"path": "mhenrichsen/alpaca_2k_test",
|
||||||
"type": "alpaca"
|
"type": "alpaca",
|
||||||
|
"split": "train[:10%]"
|
||||||
}
|
}
|
||||||
],
|
],
|
||||||
"val_set_size": 0.05,
|
"val_set_size": 0.02,
|
||||||
"output_dir": "./outputs/lora-out",
|
"output_dir": "./outputs/lora-out",
|
||||||
"sequence_len": 4096,
|
"sequence_len": 4096,
|
||||||
"sample_packing": true,
|
"sample_packing": true,
|
||||||
"eval_sample_packing": false,
|
"eval_sample_packing": false,
|
||||||
"pad_to_sequence_len": true,
|
"pad_to_sequence_len": true,
|
||||||
"adapter": "lora",
|
"adapter": "qlora",
|
||||||
"lora_r": 32,
|
"lora_r": 32,
|
||||||
"lora_alpha": 64,
|
"lora_alpha": 64,
|
||||||
"lora_dropout": 0.05,
|
"lora_dropout": 0.05,
|
||||||
@@ -36,8 +36,8 @@
|
|||||||
"embed_tokens",
|
"embed_tokens",
|
||||||
"lm_head"
|
"lm_head"
|
||||||
],
|
],
|
||||||
"gradient_accumulation_steps": 4,
|
"gradient_accumulation_steps": 2,
|
||||||
"micro_batch_size": 2,
|
"micro_batch_size": 1,
|
||||||
"num_epochs": 1,
|
"num_epochs": 1,
|
||||||
"optimizer": "adamw_torch_fused",
|
"optimizer": "adamw_torch_fused",
|
||||||
"lr_scheduler": "cosine",
|
"lr_scheduler": "cosine",
|
||||||
@@ -56,7 +56,8 @@
|
|||||||
"weight_decay": 0.0,
|
"weight_decay": 0.0,
|
||||||
"special_tokens": {
|
"special_tokens": {
|
||||||
"pad_token": "<|endoftext|>"
|
"pad_token": "<|endoftext|>"
|
||||||
}
|
},
|
||||||
|
"max_steps": 20
|
||||||
},
|
},
|
||||||
"timeout": 100000
|
"timeout": 100000
|
||||||
},
|
},
|
||||||
@@ -184,6 +184,10 @@ datasets:
|
|||||||
# adding a system turn with empty content.
|
# adding a system turn with empty content.
|
||||||
drop_system_message:
|
drop_system_message:
|
||||||
|
|
||||||
|
# Optional[bool]. Whether to split the assistant turn based on a reasoning trace inside delimited tags
|
||||||
|
# defaults to False
|
||||||
|
split_thinking:
|
||||||
|
|
||||||
# IMPORTANT: The following fields determine which parts of the conversation to train on.
|
# IMPORTANT: The following fields determine which parts of the conversation to train on.
|
||||||
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
|
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
|
||||||
# See examples at `docs/dataset-formats/conversation.qmd`
|
# See examples at `docs/dataset-formats/conversation.qmd`
|
||||||
|
|||||||
@@ -37,6 +37,7 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
|||||||
train_on_eos=None,
|
train_on_eos=None,
|
||||||
train_on_eot=None,
|
train_on_eot=None,
|
||||||
eot_tokens=None,
|
eot_tokens=None,
|
||||||
|
split_thinking: bool | None = False,
|
||||||
logprobs_field="logprobs",
|
logprobs_field="logprobs",
|
||||||
gen_temperature=1.0,
|
gen_temperature=1.0,
|
||||||
kd_temperature=1.0,
|
kd_temperature=1.0,
|
||||||
@@ -54,6 +55,7 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
|||||||
train_on_eos=train_on_eos,
|
train_on_eos=train_on_eos,
|
||||||
train_on_eot=train_on_eot,
|
train_on_eot=train_on_eot,
|
||||||
eot_tokens=eot_tokens,
|
eot_tokens=eot_tokens,
|
||||||
|
split_thinking=split_thinking,
|
||||||
)
|
)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ HF Chat Templates prompt strategy
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import Any, Dict, List, Optional, Set, Union
|
from typing import Any, Dict, List, Set, Union
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from transformers import ProcessorMixin
|
from transformers import ProcessorMixin
|
||||||
@@ -29,12 +29,12 @@ class ChatTemplatePrompter(Prompter):
|
|||||||
chat_template: str,
|
chat_template: str,
|
||||||
processor=None,
|
processor=None,
|
||||||
max_length=2048,
|
max_length=2048,
|
||||||
message_property_mappings: Optional[Dict[str, str]] = None,
|
message_property_mappings: Dict[str, str] | None = None,
|
||||||
message_field_training: Optional[str] = None,
|
message_field_training: str | None = None,
|
||||||
message_field_training_detail: Optional[str] = None,
|
message_field_training_detail: str | None = None,
|
||||||
field_messages: str = "messages",
|
field_messages: str = "messages",
|
||||||
field_system: str = "system",
|
field_system: str = "system",
|
||||||
roles: Optional[Dict[str, List[str]]] = None,
|
roles: Dict[str, List[str]] | None = None,
|
||||||
drop_system_message: bool = False,
|
drop_system_message: bool = False,
|
||||||
):
|
):
|
||||||
# check if message_property_mappings is None or empty dict
|
# check if message_property_mappings is None or empty dict
|
||||||
@@ -65,7 +65,7 @@ class ChatTemplatePrompter(Prompter):
|
|||||||
self.field_messages = field_messages
|
self.field_messages = field_messages
|
||||||
self.field_system = field_system
|
self.field_system = field_system
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.processor: Optional[ProcessorMixin] = processor
|
self.processor: ProcessorMixin | None = processor
|
||||||
self.chat_template = chat_template
|
self.chat_template = chat_template
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
self.drop_system_message = drop_system_message
|
self.drop_system_message = drop_system_message
|
||||||
@@ -224,11 +224,11 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
tokenizer,
|
tokenizer,
|
||||||
train_on_inputs: bool,
|
train_on_inputs: bool,
|
||||||
sequence_len: int,
|
sequence_len: int,
|
||||||
roles_to_train: Optional[List[str]] = None,
|
roles_to_train: list[str] | None = None,
|
||||||
train_on_eos: Optional[str] = None,
|
train_on_eos: str | None = None,
|
||||||
train_on_eot: Optional[str] = None,
|
train_on_eot: str | None = None,
|
||||||
eot_tokens: Optional[List[str]] = None,
|
eot_tokens: list[str] | None = None,
|
||||||
split_thinking: Optional[bool] = False,
|
split_thinking: bool | None = False,
|
||||||
):
|
):
|
||||||
super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
|
super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
|
||||||
self.prompter: ChatTemplatePrompter = prompter
|
self.prompter: ChatTemplatePrompter = prompter
|
||||||
@@ -714,7 +714,7 @@ class StrategyLoader:
|
|||||||
self,
|
self,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
cfg,
|
cfg,
|
||||||
ds_cfg: Optional[Union[Dict[str, Any], DatasetConfig]] = None,
|
ds_cfg: Union[Dict[str, Any], DatasetConfig] | None = None,
|
||||||
processor=None,
|
processor=None,
|
||||||
):
|
):
|
||||||
if ds_cfg is None:
|
if ds_cfg is None:
|
||||||
|
|||||||
Reference in New Issue
Block a user