# Checkpoint the run starts from (the repository name marks it as BF16).
base_model: 'axolotl-ai-co/Mistral-Small-4-119B-2603-BF16'

# Kernel-related switches; both are enabled.
# NOTE(review): presumably read by the KernelsPlugin listed below — confirm.
use_kernels: true
use_sonicmoe: true

# Integration plugins, given as dotted import paths.
# List order is kept as-is in case load order matters.
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.kernels.KernelsPlugin

# Train only the language-model layers; the vision tower stays frozen.
# NOTE(review): entries look like parameter-name patterns — confirm the
# matching rules (regex vs. glob, anchoring) against the trainer.
unfrozen_parameters:
  - 'model.language_model.*'
  - 'lm_head'
  - 'embed_tokens'

# Output and prepared-dataset locations (both are relative paths).
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

# Single training dataset, handled with the `chat_template` dataset type.
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

# NOTE(review): presumably the fraction of the data held out for
# evaluation (0.01 = 1%) — confirm.
val_set_size: 0.01

# Sample packing is on, with a sequence length of 2048.
# NOTE(review): sequence_len is presumably measured in tokens — confirm.
sample_packing: true
sequence_len: 2048

# Weights & Biases settings, all left unset. Explicit `null` is used
# instead of bare empty values so the intent is unambiguous.
wandb_project: null
wandb_entity: null
wandb_watch: null
wandb_name: null
wandb_log_model: null

# Batch shape: one sample per micro-batch, no gradient accumulation.
micro_batch_size: 1
gradient_accumulation_steps: 1
num_epochs: 1

# Optimizer and learning-rate schedule.
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
# Written with an explicit decimal point so YAML 1.1 loaders (e.g. PyYAML)
# resolve it as a float; the bare form `2e-5` is loaded as a string there.
learning_rate: 2.0e-5

# Numeric precision flags: TF32 and BF16 are both switched on.
tf32: true
bf16: true

# Flash attention is enabled.
flash_attention: true
# NOTE(review): presumably "log every step" — confirm.
logging_steps: 1

# Regularization and warmup.
# NOTE(review): warmup_ratio is presumably a fraction of total training
# steps — confirm.
weight_decay: 0.0
warmup_ratio: 0.1

# Evaluation and checkpoint cadence: one of each per epoch.
evals_per_epoch: 1
saves_per_epoch: 1

# FSDP setup, using version 2.
fsdp_version: 2
fsdp_config:
  # Wrapping: transformer-based policy keyed on the decoder layer class.
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Mistral4DecoderLayer

  # Memory behaviour: reshard after forward and checkpoint activations;
  # parameter offload and CPU-RAM-efficient loading are both disabled.
  reshard_after_forward: true
  activation_checkpointing: true
  offload_params: false
  cpu_ram_efficient_loading: false

  # Checkpoint format.
  state_dict_type: FULL_STATE_DICT