migrate example configs to canonical attn_implementation

This commit is contained in:
Wing Lian
2026-04-23 22:15:07 +00:00
parent 2d64d009d8
commit 39226623d2
222 changed files with 209 additions and 243 deletions

View File

@@ -42,7 +42,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
save_total_limit: 1
save_steps:

View File

@@ -36,7 +36,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -71,7 +71,6 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: false
warmup_ratio: 0.1
evals_per_epoch: 4

View File

@@ -54,7 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -51,7 +51,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -49,7 +49,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -51,7 +51,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -69,7 +69,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -40,7 +40,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
save_total_limit: 1
save_steps:

View File

@@ -53,8 +53,7 @@ tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: false
sdp_attention: true
attn_implementation: sdpa
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -59,7 +59,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -54,7 +54,7 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attn_implementation: flash_attention_2
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3