replace attention in the yaml config with an enum

This commit is contained in:
Wing Lian
2025-04-04 23:37:30 -04:00
committed by NanoCode012
parent 0d71b0aa5f
commit ba47adc24b
101 changed files with 268 additions and 122 deletions

View File

@@ -49,7 +49,8 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -34,7 +34,8 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 100
evals_per_epoch: 2

View File

@@ -61,7 +61,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -56,7 +56,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -77,7 +77,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,7 +53,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -54,7 +54,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -48,7 +48,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -55,7 +55,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,7 +48,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -49,7 +49,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -53,7 +53,8 @@ gradient_checkpointing_kwargs:
use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 20
evals_per_epoch: 4

View File

@@ -51,7 +51,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

View File

@@ -39,7 +39,8 @@ gradient_checkpointing: true
gradient_checkpointing_kwargs:
use_reentrant: true
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -48,7 +48,8 @@ gradient_checkpointing_kwargs:
use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4

View File

@@ -46,7 +46,8 @@ tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
attention: flash
warmup_steps: 10
evals_per_epoch: 4