# axolotl/examples/streaming/streaming-pretrain.yml
# Example configuration for streaming pretraining
# This demonstrates how to pretrain on large datasets that don't fit in memory
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Required: a streamed dataset has no known length, so the total step count
# cannot be derived from num_epochs; max_steps must be set explicitly
max_steps: 10000
# Pretraining dataset configuration
# These are automatically streamed
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    # Optional: skip N samples (useful for resuming)
    # skip: 1000000
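# Rough skip arithmetic when resuming (a sketch only; sample packing means the
# samples consumed per step are approximate): with this config on one GPU,
# each optimizer step draws about
#   micro_batch_size * gradient_accumulation_steps = 4 * 8 = 32 samples,
# so resuming after 1000 optimizer steps suggests skip: ~32000
# (multiply by the number of GPUs for multi-device runs).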
# Can also use multiple pretraining datasets:
# pretraining_dataset:
#   - path: allenai/c4
#     name: en
#     type: pretrain
#   - path: HuggingFaceFW/fineweb
#     type: pretrain
val_set_size: 0.0  # no held-out validation split; evaluation is typically skipped when streaming
# Sequence and packing configuration
sequence_len: 2048
sample_packing: true
pretrain_multipack_attn: true
pretrain_multipack_buffer_size: 10000 # buffer size (in samples) for multipack batching
# Training hyperparameters
gradient_accumulation_steps: 8
micro_batch_size: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 3e-4
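# Effective batch arithmetic (for reference; scale by GPU count on multi-device
# runs): micro_batch_size * gradient_accumulation_steps = 4 * 8 = 32 packed
# sequences per optimizer step, and since sample packing fills each sequence to
# sequence_len, that is roughly 32 * 2048 = 65,536 tokens per step per GPU.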
# Memory optimizations
bf16: auto
tf32: false
gradient_checkpointing: true
flash_attention: true
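# Note: gradient_checkpointing trades compute for memory by recomputing
# activations during the backward pass; flash_attention requires a compatible
# GPU (Ampere or newer for bf16).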
# Checkpointing and logging
output_dir: ./outputs/pretrain-streaming
logging_steps: 10
save_steps: 500
save_total_limit: 3 # Keep only last 3 checkpoints
# Warmup
warmup_ratio: 0.1
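# warmup_ratio is a fraction of total training steps, so with max_steps: 10000
# this yields 0.1 * 10000 = 1000 warmup steps.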
# Optional: enable wandb for monitoring
# wandb_project: streaming-pretrain
# wandb_entity: your-entity
# wandb_name: c4-pretrain
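# How to launch (a sketch, assuming a standard axolotl installation; the path
# matches this example's location in the repo):
#   axolotl train examples/streaming/streaming-pretrain.yml
# or, using the module-style CLI:
#   accelerate launch -m axolotl.cli.train examples/streaming/streaming-pretrain.yml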