Compare commits
1 Commit
hymba_mult...optimizer-

| Author | SHA1 | Date |
|---|---|---|
|  | c7b095d77f |  |
@@ -1,58 +0,0 @@
-base_model: nvidia/Hymba-1.5B-Base
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-trust_remote_code: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 5
-evals_per_epoch: 2
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  pad_token: <|end_of_text|>
@@ -1,73 +0,0 @@
-base_model: nvidia/Hymba-1.5B-Base
-
-load_in_8bit: false
-load_in_4bit: True
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 2048
-sample_packing: true
-pad_to_sequence_len: true
-
-adapter: qlora
-lora_r: 32
-lora_alpha: 16
-lora_dropout: 0.05
-lora_target_linear: true
-lora_fan_in_fan_out:
-lora_target_modules:
-  - gate_proj
-  - down_proj
-  - up_proj
-  - q_proj
-  - v_proj
-  - k_proj
-  - o_proj
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 2
-micro_batch_size: 2
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-trust_remote_code: true
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 5
-evals_per_epoch: 2
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  pad_token: <|end_of_text|>
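The two deleted files above are the full-finetune and QLoRA example configs for Hymba-1.5B. For reference, the same kind of config can also be built programmatically, as the e2e tests further down in this diff do; a minimal sketch (not part of this diff) using values taken from the deleted FFT config:

```python
# Minimal sketch: the deleted FFT config expressed as a DictDefault,
# mirroring how the e2e tests below construct configs in code.
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "base_model": "nvidia/Hymba-1.5B-Base",
        "trust_remote_code": True,  # Hymba ships custom modeling code
        "datasets": [{"path": "tatsu-lab/alpaca", "type": "alpaca"}],
        "val_set_size": 0.05,
        "sequence_len": 2048,
        "sample_packing": True,
        "pad_to_sequence_len": True,
        "micro_batch_size": 2,
        "gradient_accumulation_steps": 2,
        "num_epochs": 1,
        "optimizer": "paged_adamw_8bit",
        "lr_scheduler": "cosine",
        "learning_rate": 2e-5,
        "flash_attention": True,
        "output_dir": "./outputs/out",
    }
)
```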
@@ -424,6 +424,11 @@ class SchedulerMixin(Trainer):
 
         return self.lr_scheduler
 
+    def _load_optimizer_and_scheduler(self, checkpoint):
+        if not checkpoint and self.args.optimizer_checkpoint is not None:
+            checkpoint = self.args.optimizer_checkpoint
+        return super()._load_optimizer_and_scheduler(checkpoint)
+
 
 class AxolotlTrainer(SchedulerMixin, Trainer):
     """
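The override added above lets a run that is not resuming from a full trainer checkpoint still restore optimizer and scheduler state from a standalone path. A self-contained sketch of the same fallback pattern (class and method bodies here are illustrative stand-ins, not axolotl's):

```python
# Illustrative fallback pattern; Base stands in for transformers.Trainer.
class Base:
    def _load_optimizer_and_scheduler(self, checkpoint):
        return f"restoring optimizer/scheduler from: {checkpoint}"


class WithOptimizerFallback(Base):
    def __init__(self, optimizer_checkpoint=None):
        self.optimizer_checkpoint = optimizer_checkpoint

    def _load_optimizer_and_scheduler(self, checkpoint):
        # Not resuming from a full checkpoint, but a standalone optimizer
        # checkpoint was configured: use it instead.
        if not checkpoint and self.optimizer_checkpoint is not None:
            checkpoint = self.optimizer_checkpoint
        return super()._load_optimizer_and_scheduler(checkpoint)


print(WithOptimizerFallback("out/checkpoint-500")._load_optimizer_and_scheduler(None))
print(WithOptimizerFallback("out/checkpoint-500")._load_optimizer_and_scheduler("out/full"))
# -> the fallback path is used only when no explicit checkpoint is given
```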
@@ -1764,6 +1769,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             ] = self.cfg.loraplus_lr_embedding
         training_arguments_kwargs["embedding_lr"] = self.cfg.embedding_lr
         training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale
+        if self.cfg.optimizer_checkpoint:
+            training_arguments_kwargs[
+                "optimizer_checkpoint"
+            ] = self.cfg.optimizer_checkpoint
 
         if self.cfg.lr_scheduler in ["one_cycle", "log_sweep"]:
             training_arguments_kwargs["lr_scheduler_type"] = "cosine"
@@ -25,7 +25,6 @@ SUPPORTED_MULTIPACK_MODEL_TYPES = [
     "gemmoe",
     "starcoder2",
     "deepseek_v2",
-    "hymba",
 ]
 
 
@@ -31,7 +31,6 @@ _CHAT_TEMPLATES = {
     "qwen_25": "{%- if tools %}\n    {{- '<|im_start|>system\\n' }}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- messages[0]['content'] }}\n    {%- else %}\n        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}\n    {%- endif %}\n    {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n    {%- for tool in tools %}\n        {{- \"\\n\" }}\n        {{- tool | tojson }}\n    {%- endfor %}\n    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n    {%- if messages[0]['role'] == 'system' %}\n        {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n    {%- else %}\n        {{- '<|im_start|>system\\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\\n' }}\n    {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n    {%- elif message.role == \"assistant\" %}\n        {{- '<|im_start|>' + message.role }}\n        {%- if message.content %}\n            {{- '\\n' + message.content }}\n        {%- endif %}\n        {%- for tool_call in message.tool_calls %}\n            {%- if tool_call.function is defined %}\n                {%- set tool_call = tool_call.function %}\n            {%- endif %}\n            {{- '\\n<tool_call>\\n{\"name\": \"' }}\n            {{- tool_call.name }}\n            {{- '\", \"arguments\": ' }}\n            {{- tool_call.arguments | tojson }}\n            {{- '}\\n</tool_call>' }}\n        {%- endfor %}\n        {{- '<|im_end|>\\n' }}\n    {%- elif message.role == \"tool\" %}\n        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n            {{- '<|im_start|>user' }}\n        {%- endif %}\n        {{- '\\n<tool_response>\\n' }}\n        {{- message.content }}\n        {{- '\\n</tool_response>' }}\n        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n            {{- '<|im_end|>\\n' }}\n        {%- endif %}\n    {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n    {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
     "exaone": "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '[|system|][|endofturn|]\n' }}{% endif %}{{ '[|' + message['role'] + '|]' + message['content'] }}{% if message['role'] == 'user' %}{{ '\n' }}{% else %}{{ '[|endofturn|]\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[|assistant|]' }}{% endif %}",
     "metharme": "{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'Enter RP mode. You shall reply to the user while staying in character. Your responses must be detailed, creative, immersive, and drive the scenario forward.' %}{% endif %}{{ '<|system|>' + system_message }}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>' + content.strip() }}{% elif message['role'] == 'assistant' %}{{ '<|model|>' + content.strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|model|>' }}{% else %}{{ eos_token }}{% endif %}",
-    "hymba": "{{'<extra_id_0>System'}}{% for message in messages %}{% if message['role'] == 'system' %}{{'\n' + message['content'].strip()}}{% if tools or contexts %}{{'\n'}}{% endif %}{% endif %}{% endfor %}{% if tools %}{% for tool in tools %}{{ '\n<tool> ' + tool|tojson + ' </tool>' }}{% endfor %}{% endif %}{% if contexts %}{% if tools %}{{'\n'}}{% endif %}{% for context in contexts %}{{ '\n<context> ' + context.strip() + ' </context>' }}{% endfor %}{% endif %}{{'\n\n'}}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<extra_id_1>User\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'assistant' %}{{ '<extra_id_1>Assistant\n' + message['content'].strip() + '\n' }}{% elif message['role'] == 'tool' %}{{ '<extra_id_1>Tool\n' + message['content'].strip() + '\n' }}{% endif %}{% endfor %}{%- if add_generation_prompt %}{{'<extra_id_1>Assistant\n'}}{%- endif %}",
 }
 
 
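The registry entries above are ordinary Jinja2 template strings, so any of them can be rendered standalone for a quick sanity check. A minimal sketch (the template string is copied verbatim from the `exaone` entry above; the rendering code is not from this diff):

```python
from jinja2 import Template

# "exaone" template, copied from the _CHAT_TEMPLATES registry above.
EXAONE = "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '[|system|][|endofturn|]\n' }}{% endif %}{{ '[|' + message['role'] + '|]' + message['content'] }}{% if message['role'] == 'user' %}{{ '\n' }}{% else %}{{ '[|endofturn|]\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[|assistant|]' }}{% endif %}"

messages = [{"role": "user", "content": "Hello!"}]
print(Template(EXAONE).render(messages=messages, add_generation_prompt=True))
# [|system|][|endofturn|]
# [|user|]Hello!
# [|assistant|]
```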
@@ -603,6 +603,8 @@ class AxolotlInputConfig(
     strict: Optional[bool] = Field(default=False)
     resume_from_checkpoint: Optional[str] = None
     auto_resume_from_checkpoints: Optional[bool] = None
+    optimizer_checkpoint: Optional[str] = None
+
     resize_token_embeddings_to_32x: Optional[bool] = None
     mean_resizing_embeddings: Optional[bool] = False
 
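With this field in place, the option flows from the validated input config into the trainer kwargs (the `HFCausalTrainerBuilder` hunk above) and finally into the `SchedulerMixin` override. A hedged end-to-end sketch, with the surrounding axolotl classes reduced to a plain pydantic model:

```python
from typing import Optional

from pydantic import BaseModel


class InputConfig(BaseModel):
    # Same shape as the field added above; all other fields omitted.
    optimizer_checkpoint: Optional[str] = None


cfg = InputConfig(optimizer_checkpoint="outputs/prev_run/checkpoint-500")

# Mirrors the builder hunk: only forward the kwarg when the option is set.
training_arguments_kwargs = {}
if cfg.optimizer_checkpoint:
    training_arguments_kwargs["optimizer_checkpoint"] = cfg.optimizer_checkpoint

print(training_arguments_kwargs)
# {'optimizer_checkpoint': 'outputs/prev_run/checkpoint-500'}
```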
@@ -1629,19 +1631,3 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
         else:
             data["torch_compile"] = False
         return data
-
-    @model_validator(mode="before")
-    @classmethod
-    def check_hymba_torch_version(cls, data):
-        if "hymba" in data.get("base_model", {}).lower():
-            env_capabilities = data.get("env_capabilities", {})
-            torch_version = env_capabilities.get("torch_version")
-
-            if torch_version is None:
-                import torch
-
-                torch_version = str(torch.__version__).split("+", maxsplit=1)[0]
-
-            if version.parse(torch_version) < version.parse("2.5.0"):
-                raise ValueError("Hymba requires torch version >= 2.5")
-        return data
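The removed validator gated Hymba behind torch >= 2.5 by comparing parsed versions. The same check works standalone; a small sketch (note the split strips local build suffixes such as `2.5.1+cu121` before comparing):

```python
from packaging import version

import torch

# "2.5.1+cu121" -> "2.5.1": drop the local build segment before comparing.
torch_version = str(torch.__version__).split("+", maxsplit=1)[0]
if version.parse(torch_version) < version.parse("2.5.0"):
    raise ValueError("Hymba requires torch version >= 2.5")
print(f"torch {torch_version} is new enough")
```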
@@ -28,10 +28,8 @@ def encode_pretraining(
     )
     # Convert to PyTorch tensors
     input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
-    targets = [torch.tensor(seq) for seq in res["input_ids"]]
     attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
     new_input_ids = []
-    new_labels = []
     new_attention_mask = []
     # Append EOS and PAD tokens to input_ids, and correct attention_mask
     for i, _ in enumerate(input_ids):
@@ -42,34 +40,22 @@ def encode_pretraining(
             ),
             dim=0,
         )
-        targets[i] = torch.cat(
-            (
-                targets[i],
-                torch.tensor([tokenizer.eos_token_id, -100]),
-            ),
-            dim=0,
-        )
         attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)
 
     # Concatenate tokens so that their lengths are less than max_tokens
     buffer_input_ids = torch.tensor([], dtype=torch.long)
-    buffer_labels = torch.tensor([], dtype=torch.long)
     buffer_attention_mask = torch.tensor([], dtype=torch.long)
 
-    for ids, labels, mask in zip(input_ids, targets, attention_mask):
+    for ids, mask in zip(input_ids, attention_mask):
         if buffer_input_ids.numel() == max_tokens:
             new_input_ids.append(buffer_input_ids)
-            new_labels.append(buffer_labels)
             new_attention_mask.append(buffer_attention_mask)
             buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_labels = torch.tensor([], dtype=torch.long)
             buffer_attention_mask = torch.tensor([], dtype=torch.long)
             buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
             buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
         elif buffer_input_ids.numel() + ids.numel() <= max_tokens:
             buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
             buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
         else:
             buffer_input_ids = torch.cat(
@@ -83,17 +69,6 @@ def encode_pretraining(
                 ),
                 dim=0,
             )
-            buffer_labels = torch.cat(
-                (
-                    buffer_labels,
-                    torch.full(
-                        (max_tokens - buffer_labels.numel(),),
-                        -100,
-                        dtype=torch.long,
-                    ),
-                ),
-                dim=0,
-            )
             buffer_attention_mask = torch.cat(
                 (
                     buffer_attention_mask,
@@ -106,14 +81,11 @@ def encode_pretraining(
                 dim=0,
             )
             new_input_ids.append(buffer_input_ids)
-            new_labels.append(buffer_labels)
             new_attention_mask.append(buffer_attention_mask)
             buffer_input_ids = torch.tensor([], dtype=torch.long)
-            buffer_labels = torch.tensor([], dtype=torch.long)
             buffer_attention_mask = torch.tensor([], dtype=torch.long)
 
             buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
-            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
             buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
 
     if buffer_input_ids.numel() > 0:  # for any leftover tokens
@@ -129,17 +101,6 @@ def encode_pretraining(
             ),
             dim=0,
         )
-        buffer_labels = torch.cat(
-            (
-                buffer_labels,
-                torch.full(
-                    (max_tokens - buffer_labels.numel(),),
-                    -100,
-                    dtype=torch.long,
-                ),
-            ),
-            dim=0,
-        )
         buffer_attention_mask = torch.cat(
             (
                 buffer_attention_mask,
@@ -152,12 +113,11 @@ def encode_pretraining(
             dim=0,
         )
         new_input_ids.append(buffer_input_ids)
-        new_labels.append(buffer_labels)
         new_attention_mask.append(buffer_attention_mask)
 
     ret = {
         "input_ids": [seq.tolist() for seq in new_input_ids],
-        "labels": [seq.tolist() for seq in new_labels],
+        "labels": [seq.tolist() for seq in new_input_ids],
         "attention_mask": [seq.tolist() for seq in new_attention_mask],
     }
 
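The net effect of these hunks is that `encode_pretraining` no longer tracks a separate `targets`/`new_labels` stream: labels in the returned dict are now plain copies of the packed `input_ids`, as is usual for causal-LM pretraining. A simplified, self-contained sketch of the greedy packing loop that remains (not axolotl's actual code):

```python
import torch


def pack(seqs, max_tokens, pad_id=0):
    """Greedily concatenate sequences into fixed-size buffers, padding the last."""
    packed, buf = [], torch.tensor([], dtype=torch.long)
    for ids in seqs:
        if buf.numel() + ids.numel() > max_tokens:
            # Flush the current buffer, padded out to max_tokens.
            pad = torch.full((max_tokens - buf.numel(),), pad_id, dtype=torch.long)
            packed.append(torch.cat((buf, pad)))
            buf = torch.tensor([], dtype=torch.long)
        buf = torch.cat((buf, ids))
    if buf.numel() > 0:  # for any leftover tokens
        pad = torch.full((max_tokens - buf.numel(),), pad_id, dtype=torch.long)
        packed.append(torch.cat((buf, pad)))
    return packed


batches = pack([torch.arange(1, 4), torch.arange(4, 9), torch.arange(9, 12)], max_tokens=8)
print([b.tolist() for b in batches])  # [[1..8], [9, 10, 11, 0, 0, 0, 0, 0]]
# After this change, "labels" would simply be copies of these packed ids.
```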
@@ -409,7 +409,6 @@ class ModelLoader:
             and self.cfg.sample_packing
         ):
             if "auto_map" in self.model_config:
-                # some model config objects are not subscriptable
                 try:
                     auto_map_config = self.model_config["auto_map"]
                 except TypeError:
@@ -67,8 +67,8 @@ class TestCustomOptimizers(unittest.TestCase):
         train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
         assert (Path(temp_dir) / "adapter_model.bin").exists()
 
-    @require_torch_2_5_1
     @with_temp_dir
+    @require_torch_2_5_1
     def test_adopt_adamw(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
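The only change here is decorator order: `@require_torch_2_5_1` moves inside `@with_temp_dir`. Ordering matters because decorators apply bottom-up, so the one closest to the function wraps the body first and the outermost runs first. A tiny illustration with hypothetical decorators:

```python
def tag(name):
    def deco(fn):
        def wrapper(*args, **kwargs):
            print(f"enter {name}")
            return fn(*args, **kwargs)
        return wrapper
    return deco


@tag("with_temp_dir")        # outermost: runs first
@tag("require_torch_2_5_1")  # innermost: wraps the test body directly
def test():
    print("test body")


test()
# enter with_temp_dir
# enter require_torch_2_5_1
# test body
```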
@@ -14,7 +14,7 @@ from axolotl.train import train
 from axolotl.utils.config import normalize_config
 from axolotl.utils.dict import DictDefault
 
-from .utils import check_tensorboard, require_torch_2_5_1, with_temp_dir
+from .utils import check_tensorboard, with_temp_dir
 
 LOG = logging.getLogger("axolotl.tests.e2e")
 os.environ["WANDB_DISABLED"] = "true"
@@ -68,129 +68,3 @@ class TestPackedLlama(unittest.TestCase):
         check_tensorboard(
             temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
         )
-
-
-class TestUnpackedHymba(unittest.TestCase):
-    """
-    Test case for Unpacked training of hymba models
-    """
-
-    @require_torch_2_5_1
-    @with_temp_dir
-    def test_loss_unpacked(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "nvidia/Hymba-1.5B-Base",
-                "trust_remote_code": True,
-                "load_in_4bit": True,
-                "adapter": "qlora",
-                "lora_r": 32,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_modules": [
-                    "gate_proj",
-                    "down_proj",
-                    "up_proj",
-                    "q_proj",
-                    "v_proj",
-                    "k_proj",
-                    "o_proj",
-                ],
-                "sequence_len": 1024,
-                "sample_packing": False,
-                "flash_attention": True,
-                "val_set_size": 0.0,
-                "datasets": [
-                    {
-                        "path": "vicgalle/alpaca-gpt4",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "use_tensorboard": True,
-            }
-        )
-        if is_torch_bf16_gpu_available():
-            cfg.bf16 = True
-        else:
-            cfg.fp16 = True
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
-        )
-
-
-class TestPackedHymba(unittest.TestCase):
-    """
-    Test case for Packed training of hymba models
-    """
-
-    @require_torch_2_5_1
-    @with_temp_dir
-    def test_loss_packed(self, temp_dir):
-        # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "nvidia/Hymba-1.5B-Base",
-                "trust_remote_code": True,
-                "load_in_4bit": True,
-                "adapter": "qlora",
-                "lora_r": 32,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_modules": [
-                    "gate_proj",
-                    "down_proj",
-                    "up_proj",
-                    "q_proj",
-                    "v_proj",
-                    "k_proj",
-                    "o_proj",
-                ],
-                "sequence_len": 1024,
-                "sample_packing": True,
-                "flash_attention": True,
-                "val_set_size": 0.0,
-                "datasets": [
-                    {
-                        "path": "vicgalle/alpaca-gpt4",
-                        "type": "alpaca",
-                    },
-                ],
-                "num_epochs": 1,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 4,
-                "output_dir": temp_dir,
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch",
-                "lr_scheduler": "cosine",
-                "max_steps": 5,
-                "use_tensorboard": True,
-            }
-        )
-        if is_torch_bf16_gpu_available():
-            cfg.bf16 = True
-        else:
-            cfg.fp16 = True
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
-
-        check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss is too high"
-        )