# Axolotl training configuration for tiiuae/Falcon-E-3B-Base-prequantized.
# Loads the Axolotl kernels plugin with the onebitllms path switched on and
# trains on a chat-formatted dataset under FSDP (fsdp_version: 2).

base_model: tiiuae/Falcon-E-3B-Base-prequantized
output_dir: ./output

plugins:
  - axolotl.integrations.kernels.KernelsPlugin

# Kernel plugin switches: only the onebitllms path is enabled here.
use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

load_in_8bit: false
load_in_4bit: false

# Dataset: messages are read from `conversations`; each message's `from`
# field is mapped to the role and its `value` field to the content.
chat_template: tokenizer_default
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: ./axolotl_dataset_cache

sequence_len: 32768
trust_remote_code: false

# This can run on 4 GPUs
gradient_accumulation_steps: 4

# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: true

micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5.0e-4

# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

bf16: true
tf32: false

logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0
save_steps: 500
save_strategy: steps
weight_decay: 0.01

sample_packing: true
pad_to_sequence_len: true
shuffle_merged_datasets: true

experimental_skip_move_to_device: true

fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true

# Uncomment this to validate checkpoint saving works with your config:
# save_first_step: true

# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1
# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1  # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1  # (default is 1, no CP)

# Token strings are quoted so the `<`, `|` and `>` characters are
# unambiguously read as literal text.
special_tokens:
  eos_token: '<|end_of_text|>'
# NOTE(review): `eot_tokens` is placed at top level rather than nested under
# `special_tokens`; the original nesting was lost when the file was collapsed.
# Confirm against the Axolotl config schema.
eot_tokens:
  - '<|im_end|>'