# Axolotl DPO fine-tuning configuration for the prequantized Falcon-E 3B
# checkpoint, using FSDP (version 2).

# --- Model and output ---
base_model: axolotl-ai-co/Falcon-E-1.2-3B-Exp-prequantized
output_dir: ./output

# --- Plugins and kernel toggles ---
plugins:
  - axolotl.integrations.kernels.KernelsPlugin

use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

load_in_8bit: false
load_in_4bit: false

# --- Data ---
chat_template: tokenizer_default
rl: dpo
datasets:
  - path: allenai/Dolci-Think-DPO-7B
    split: train
    type: chatml.ultra

dataset_prepared_path: ./axolotl_dataset_cache
sequence_len: 8192
trust_remote_code: false

# --- Batching ---
gradient_accumulation_steps: 4  # This can run on 4 GPUs

# Very important when using gradient accumulation with FSDP:
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: true

micro_batch_size: 1
num_epochs: 3

# --- Optimizer and schedule ---
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 1.0e-5

# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

# --- Precision and attention ---
bf16: true
tf32: false

logging_steps: 1
flash_attention: true

# --- Monitoring, evaluation and checkpointing ---
loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0
save_steps: 500
save_strategy: steps
weight_decay: 0.01
shuffle_merged_datasets: true

experimental_skip_move_to_device: true

# --- FSDP ---
fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true

# --- Parallelism layout ---
# Comment to disable CP
# NOTE(review): every size below is 1, while the note on
# gradient_accumulation_steps mentions 4 GPUs - confirm the intended layout.

# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1
# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1  # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1  # (default is 1, no CP)

# --- Tokens ---
special_tokens:
  eos_token: '<|end_of_text|>'
eot_tokens:
  - '<|im_end|>'