# Fine-tuning configuration for Qwen/Qwen3-8B on the tatsu-lab/alpaca dataset,
# combining FSDP (version 2) sharding with tensor and context parallelism.
base_model: Qwen/Qwen3-8B

# Integrations loaded by the trainer.
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# Parallelism layout: data-parallel shard x context parallel x tensor parallel.
# NOTE(review): 2 x 2 x 2 presumably implies 8 devices in total - confirm
# against the launch command.
dp_shard_size: 2
# dp_replicate_size: 1
context_parallel_size: 2
tensor_parallel_size: 2

# Directory used for the prepared (preprocessed) dataset.
dataset_prepared_path: last_run_prepared

# FSDP settings; the keys under fsdp_config apply to the version chosen here.
fsdp_version: 2
fsdp_config:
  offload_params: false
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  reshard_after_forward: true

# Training data: one entry per dataset, each with its prompt format type.
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/ndp-out/

# Sequence handling and attention implementation.
sequence_len: 8192
sample_packing: true
flash_attention: true

# Batch sizing, schedule, and optimizer.
gradient_accumulation_steps: 1
micro_batch_size: 1  # must be 1 when using context parallel
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
# Written with a decimal point so YAML 1.1 loaders resolve it as a float
# rather than as the string "2e-6".
learning_rate: 2.0e-6

# Numeric precision.
bf16: true
tf32: true

# Logging and checkpoint cadence.
logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.1

# No overrides are visible in this view; any entries belong nested under
# this key.
special_tokens: