# Pretraining run using the fused AdamW optimizer (wandb run name: adamw).
# One key per line; nested keys use 2-space indentation.

base_model: Qwen/Qwen2.5-0.5B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Use random initialization for fair comparison
reinit_weights: true

load_in_8bit: false
load_in_4bit: false
strict: false

# Pretraining dataset
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    split: train

# Left empty (parses as null).
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/compare-adamw-pretrain

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

# Weights & Biases settings; keys with no value parse as null.
wandb_project: dist_muon
wandb_entity:
wandb_watch:
wandb_name: adamw
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 1
max_steps: 305

# AdamW optimizer settings (standard LR for AdamW)
optimizer: adamw_torch_fused
learning_rate: 0.0002
weight_decay: 0.01
lr_scheduler: cosine

train_on_inputs: true
group_by_length: false

bf16: auto
fp16: false
tf32: false

gradient_checkpointing: false
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1

# Reproducibility
seed: 42

# NOTE(review): fsdp_version is kept nested under fsdp_config, following the
# original key order; confirm the consumer does not expect it at top level.
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_reshard_after_forward: true

# Left empty (parses as null).
special_tokens: