base_model: HuggingFaceTB/SmolLM2-135M

# Streaming pretraining configuration
pretraining_dataset:
  - path: HuggingFaceFW/fineweb-edu
    name: sample-10BT
    type: pretrain
    text_column: text
    split: train

# Streaming-specific settings
streaming_multipack_buffer_size: 10000
shuffle_merged_datasets: true

# Training configuration
max_steps: 1000
output_dir: ./outputs/smollm2-135m-pretrain-streaming

# Sequence and packing settings
sequence_len: 1024
sample_packing: true
pretrain_multipack_attn: true  # Prevent cross-attention between packed sequences
flash_attention: true

# Batch size settings
gradient_accumulation_steps: 8
micro_batch_size: 1

# Optimizer and scheduler
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5e-4
warmup_ratio: 0.1
weight_decay: 0.01

# Precision and performance
bf16: auto
tf32: true

# Logging and checkpointing
logging_steps: 10
save_strategy: steps
save_steps: 250
save_total_limit: 3

# Weights & Biases (optional)
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

# Special tokens
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config
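
# Usage (a sketch, assuming the Axolotl CLI is installed and this file is
# saved as e.g. smollm2-135m-pretrain-streaming.yaml -- the filename is
# illustrative, not prescribed by this config):
#   axolotl train smollm2-135m-pretrain-streaming.yaml
# or, for multi-GPU runs via accelerate:
#   accelerate launch -m axolotl.cli.train smollm2-135m-pretrain-streaming.yaml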