migrate example configs to canonical attn_implementation

This commit is contained in:
Wing Lian
2026-04-23 22:15:07 +00:00
parent 2d64d009d8
commit 39226623d2
222 changed files with 209 additions and 243 deletions

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/out_gemma/
sequence_len: 8096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
wandb_entity:
wandb_watch:

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_gemma/
sequence_len: 8096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: nvfp4

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_gemma/
sequence_len: 4096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
wandb_entity:
wandb_watch:

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_gemma/
sequence_len: 4096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: nvfp4

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_gemma27/
sequence_len: 4096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
wandb_entity:
wandb_watch:

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_gemma27/
sequence_len: 4096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: nvfp4

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/out_math_72b/
sequence_len: 4096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
wandb_entity:
wandb_watch:

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_math_72b/
sequence_len: 4096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: nvfp4

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/out_qwen72b/
sequence_len: 8096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
wandb_entity:
wandb_watch:

View File

@@ -24,7 +24,7 @@ output_dir: ./outputs/qat_out_qwen72b/
sequence_len: 8096
sample_packing: true
-flash_attention: true
+attn_implementation: flash_attention_2
qat:
activation_dtype: nvfp4