# Base model identifier and the directory used for run outputs.
base_model: 'axolotl-ai-co/Falcon-E-1.2-3B-Exp-prequantized'
output_dir: './output'

# Plugin classes to load (single entry: the kernels integration).
plugins:
  - 'axolotl.integrations.kernels.KernelsPlugin'

# Kernel-related toggles: only the onebitllms switch is on.
use_onebitllms: true
use_kernels: false
use_scattermoe: false
use_sonicmoe: false

# Neither 8-bit nor 4-bit loading is requested.
load_in_4bit: false
load_in_8bit: false

# Chat template selection.
chat_template: 'tokenizer_default'

# RL method and the single dataset entry it uses.
rl: 'dpo'
datasets:
  - path: 'allenai/Dolci-Think-DPO-7B'
    type: 'chatml.ultra'
    split: 'train'

# Location of the prepared-dataset cache.
dataset_prepared_path: './axolotl_dataset_cache'

trust_remote_code: false
sequence_len: 8192

gradient_accumulation_steps: 4  # This can run on 4 GPUs

# Very important when using gradient accumulation with FSDP:
# https://github.com/huggingface/transformers/issues/29425
# NOTE(review): per the Accelerate docs, sync_each_batch trades some speed
# for lower memory use during accumulation -- confirm against the issue above.
accelerator_config:
  gradient_accumulation_kwargs:
    # Canonical lowercase boolean, matching every other flag in this file.
    sync_each_batch: true


# Batch size, epoch count, and learning-rate schedule.
num_epochs: 3
micro_batch_size: 1
learning_rate: 1.0e-5
lr_scheduler: 'cosine'

# Optimizer and its AdamW beta hyperparameters.
optimizer: 'adamw_torch'
adam_beta1: 0.9
adam_beta2: 0.95

# Precision flags: bf16 on, tf32 off.
tf32: false
bf16: true

logging_steps: 1

flash_attention: true

# Loss watchdog settings.
loss_watchdog_patience: 3
loss_watchdog_threshold: 15.0

evals_per_epoch: 0
warmup_steps: 128

# Step-based saving with a 500-step interval.
save_strategy: 'steps'
save_steps: 500

weight_decay: 0.01

experimental_skip_move_to_device: true
shuffle_merged_datasets: true

# FSDP settings (version 2).
fsdp_version: 2
fsdp_config:
  # Wrapping policy and the layer class it targets.
  auto_wrap_policy: 'TRANSFORMER_BASED_WRAP'
  transformer_layer_cls_to_wrap: 'LlamaDecoderLayer'
  # Remaining FSDP switches.
  reshard_after_forward: true
  activation_checkpointing: true
  offload_params: false
  state_dict_type: 'FULL_STATE_DICT'

# Parallelism layout. Every dimension below is set to 1.
# Comment out context_parallel_size to disable CP; at 1 (the default) there
# is no CP anyway.

# Context/Sequence Parallelism: number of GPUs (default is 1, no CP).
context_parallel_size: 1

# Tensor Parallelism: number of GPUs (default is 1, no TP).
tensor_parallel_size: 1

# DDP dimension: number of times the sharded model is replicated.
dp_replicate_size: 1

# FSDP dimension: number of GPUs the model parameters are sharded across.
dp_shard_size: 1

# EOS token setting. Quoted because the token text contains YAML indicator
# characters ('|', '>').
special_tokens:
  eos_token: '<|end_of_text|>'

# eot (presumably end-of-turn -- confirm) token list.
eot_tokens:
  - '<|im_end|>'