# Base checkpoint to fine-tune (the value looks like a Hugging Face Hub repo id).
base_model: Qwen/Qwen3-8B

# Integration plugins to load, each given as a dotted Python class path.
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# Parallelism layout. The three active sizes multiply to 2 * 2 * 2 = 8, so this
# presumably expects 8 processes/GPUs in total -- confirm against the launcher.
dp_shard_size: 2
# dp_replicate_size: 1
context_parallel_size: 2
tensor_parallel_size: 2

# Location of the prepared dataset (relative path; presumably a cache of the
# preprocessed data reused between runs -- confirm).
dataset_prepared_path: last_run_prepared

# FSDP settings; `fsdp_version` selects version 2.
fsdp_version: 2
fsdp_config:
  # Wrapping: transformer-based policy keyed on the decoder layer class.
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  # Memory behaviour: resharding after forward is on, parameter offload is off.
  reshard_after_forward: true
  offload_params: false
  # State-dict format (presumably what checkpoints are saved as -- confirm).
  state_dict_type: FULL_STATE_DICT

# Training data: a single dataset entry whose `type` is `alpaca`.
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

# Directory for run outputs (relative to the working directory).
output_dir: ./outputs/ndp-out/

# Sequence handling: length 8192 (presumably tokens), with sample packing and
# flash attention both enabled.
sequence_len: 8192
sample_packing: true
flash_attention: true

# Batch sizing and optimisation schedule.
gradient_accumulation_steps: 1
micro_batch_size: 1  # must be 1 when using context parallel
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
# Written with an explicit decimal point in the mantissa: YAML 1.1 loaders such
# as PyYAML resolve a bare `2e-6` as the string "2e-6" rather than a float.
learning_rate: 2.0e-6

# Numeric precision flags: bf16 and tf32 are both switched on.
bf16: true
tf32: true

# Log every step; save once per epoch.
logging_steps: 1
saves_per_epoch: 1

# Warmup length as a ratio (0.1; presumably a fraction of total training
# steps -- confirm). Pairs with the `constant_with_warmup` scheduler.
warmup_ratio: 0.1

special_tokens: