seems to be working?

This commit is contained in:
Dan Saunders
2025-08-24 00:49:13 +00:00
parent 79ddaebe9a
commit 3a35076513
11 changed files with 1004 additions and 23 deletions

View File

@@ -0,0 +1,61 @@
# Example configuration for streaming pretraining.
# This demonstrates how to pretrain on large datasets that don't fit in memory.

base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer

# Required: max_steps for streaming pretraining
max_steps: 10000

# Pretraining dataset configuration.
# These are automatically streamed.
# The per-dataset fields (name, type, skip) must stay indented under their
# `- path:` entry; at column 0 they would parse as unrelated top-level keys.
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    # Optional: skip N samples (useful for resuming)
    # skip: 1000000

# Can also use multiple pretraining datasets:
# pretraining_dataset:
#   - path: allenai/c4
#     name: en
#     type: pretrain
#   - path: HuggingFaceFW/fineweb
#     type: pretrain

val_set_size: 0.0

# Sequence and packing configuration
sequence_len: 2048
sample_packing: true
pretrain_multipack_attn: true
pretrain_multipack_buffer_size: 10000  # Buffer size for multipack batching

# Training hyperparameters
gradient_accumulation_steps: 8
micro_batch_size: 4
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 3e-4

# Memory optimizations
bf16: auto
tf32: false
gradient_checkpointing: true
flash_attention: true

# Checkpointing and logging
output_dir: ./outputs/pretrain-streaming
logging_steps: 10
save_steps: 500
save_total_limit: 3  # Keep only last 3 checkpoints

# Warmup
warmup_ratio: 0.1

# Optional: enable wandb for monitoring
# wandb_project: streaming-pretrain
# wandb_entity: your-entity
# wandb_name: c4-pretrain

View File

@@ -0,0 +1,52 @@
# Example configuration for streaming SFT training.
# This enables training on datasets larger than memory by streaming them from
# the HuggingFace Hub.

base_model: HuggingFaceTB/SmolLM2-135M

# Enable streaming mode for datasets
streaming: true

# When using streaming, max_steps is required
max_steps: 3  # Just test a few steps

# Training datasets - these will be streamed
# datasets:
#   - path: tatsu-lab/alpaca
#     type: alpaca
#     split: train
# NOTE(review): the `datasets` form above is commented out and the same source
# is supplied through `pretraining_dataset` instead - confirm which key is
# intended for an SFT example.
# The per-dataset fields (type, split) must stay indented under their
# `- path:` entry; at column 0 they would parse as unrelated top-level keys.
pretraining_dataset:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train

# Dataset configuration
sequence_len: 2048
sample_packing: true  # Enable multipack batching
pretrain_multipack_attn: true  # Enable multipack attention masking
# Buffer size for packing (smaller for streaming SFT)
pretrain_multipack_buffer_size: 1000

# pad_token is nested under special_tokens; the value is quoted because it
# contains YAML indicator characters (| and >).
special_tokens:
  pad_token: '<|endoftext|>'

# Training hyperparameters
gradient_accumulation_steps: 4
# Always 1 for multipack - sequences are packed into single samples
micro_batch_size: 1
num_epochs: 1  # With streaming, typically use max_steps instead
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 2e-5

# Enable efficient training
bf16: auto
tf32: false
gradient_checkpointing: true
flash_attention: true  # Enable flash attention with multipack

# Logging and checkpointing
# NOTE(review): logging_steps, eval_steps, save_steps and warmup_steps below
# are all larger than max_steps (3) - raise max_steps, or lower these
# intervals, before using this for more than a smoke test.
logging_steps: 10
eval_steps: 100
save_steps: 200
output_dir: ./outputs/streaming-model

# Warmup
warmup_steps: 100