---
# Axolotl fine-tuning config: full-parameter SFT of Llama-3.1-8B on Alpaca
# with N-D parallelism (FSDP2 sharding x replication x tensor parallel).

base_model: meta-llama/Llama-3.1-8B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# Parallelism topology: dp_shard * dp_replicate * tp = 4 * 2 * 2 = 16 GPUs.
dp_shard_size: 4
dp_replicate_size: 2
tensor_parallel_size: 2
# context_parallel_size: 2

dataset_prepared_path: last_run_prepared

special_tokens:
  # Quoted so the leading '<|' can never be misread by a YAML parser.
  pad_token: '<|end_of_text|>'

fsdp_version: 2
fsdp_config:
  offload_params: false
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  reshard_after_forward: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
output_dir: ./outputs/ndp-out/

sequence_len: 2048
sample_packing: true
flash_attention: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 2

optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
# Written with a decimal point: PyYAML (YAML 1.1) resolves '2e-6' as a
# *string*, not a float — the exponent form needs a '.' to parse numerically.
learning_rate: 2.0e-6

bf16: true
tf32: true

logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1