# Example configuration for streaming SFT training.
# Streaming reads the dataset incrementally from the Hugging Face Hub, which
# allows training on datasets that are too large to fit in memory.

# Model to train, referenced by its org/name identifier.
base_model: HuggingFaceTB/SmolLM2-135M

# Enable streaming mode for datasets
streaming: true

# When using streaming, max_steps is required
# (presumably because a streamed dataset has no known length - confirm).
max_steps: 3  # Smoke-test value: just runs a few steps; raise it for real training

# Training datasets - these will be streamed.
# Disabled alternative: the same dataset declared under the `datasets:` key.
# datasets:
#   - path: tatsu-lab/alpaca
#     type: alpaca
#     split: train

# Active dataset definition; same path/type/split as the disabled block above.
# NOTE(review): the file header describes SFT, yet the active key is
# `pretraining_dataset` - confirm this is the intended key for streaming SFT.
pretraining_dataset:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train

# Sequence length, packing and tokenizer settings
sequence_len: 2048
# Multipack batching
sample_packing: true
# Multipack attention masking
pretrain_multipack_attn: true
# Buffer size used for packing (kept smaller for streaming SFT)
pretrain_multipack_buffer_size: 1000
special_tokens:
  pad_token: '<|endoftext|>'

# Training hyperparameters
gradient_accumulation_steps: 4
micro_batch_size: 1  # Always 1 for multipack - sequences are packed into single samples
num_epochs: 1  # With streaming, typically use max_steps instead
optimizer: adamw_torch
lr_scheduler: cosine
# Written with an explicit mantissa dot: YAML 1.1 loaders (e.g. PyYAML) resolve
# a bare `2e-5` as a string, while `2.0e-5` is a float under both 1.1 and 1.2.
learning_rate: 2.0e-5

# Precision and efficiency settings
# NOTE(review): `auto` presumably lets the trainer decide based on hardware
# support - confirm against the trainer's config reference.
bf16: auto
tf32: false
gradient_checkpointing: true
flash_attention: true  # Enable flash attention with multipack

# Logging and checkpointing
# max_steps is 3 in this example, so log every step; an interval of 10 would
# never be reached and no intermediate training metrics would be reported.
logging_steps: 1
# NOTE(review): the eval/save intervals below also exceed the 3-step example
# run, so they only take effect once max_steps is raised.
eval_steps: 100
save_steps: 200
output_dir: ./outputs/streaming-model

# Warmup
# Kept below max_steps (3): with 100 warmup steps the run would end while the
# learning rate is still ramping up and never reach `learning_rate`.
# Scale this up together with max_steps for a real training run.
warmup_steps: 1