# axolotl/examples/falcon-e/falcon-e-3b-ft.yaml

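# Example: fine-tune Falcon-E-3B with axolotl's onebitllms (BitNet) support.
# A typical launch, assuming the axolotl CLI is installed, looks like:
#   axolotl train examples/falcon-e/falcon-e-3b-ft.yaml
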
base_model: tiiuae/Falcon-E-3B-Base-prequantized
output_dir: ./output
plugins:
  - axolotl.integrations.kernels.KernelsPlugin
use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true
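# use_onebitllms turns on the onebitllms integration for BitNet-style
# 1.58-bit models; this is why the base_model above is the "prequantized"
# Falcon-E checkpoint rather than the plain release.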
load_in_8bit: false
load_in_4bit: false
chat_template: tokenizer_default
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
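# The mapping above converts ShareGPT-style records ({"from": ..., "value": ...}
# entries under "conversations") into the role/content fields the tokenizer's
# chat template expects.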
dataset_prepared_path: ./axolotl_dataset_cache
sequence_len: 32768
trust_remote_code: false
gradient_accumulation_steps: 4 # This can run on 4 GPUs
# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: true
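# sync_each_batch syncs gradients on every micro-batch rather than only the
# last accumulation step, avoiding the FSDP memory blow-up described in the
# issue linked above at the cost of extra communication.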
micro_batch_size: 1
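# Effective batch size = micro_batch_size (1) x gradient_accumulation_steps (4)
# x GPUs (4, per the note above) = 16 sequences per optimizer step.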
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5.0e-4
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95
bf16: true
tf32: false
logging_steps: 1
flash_attention: true
loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3
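# The loss watchdog aborts training if the loss stays above 15.0 for 3
# consecutive logged steps, catching divergence early instead of burning
# compute on a broken run.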
warmup_steps: 128
evals_per_epoch: 0
save_steps: 500
save_strategy: steps
weight_decay: 0.01
sample_packing: true
pad_to_sequence_len: true
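# sample_packing concatenates multiple short examples into each 32768-token
# sequence to reduce padding waste; pad_to_sequence_len keeps batch shapes
# fixed by padding whatever space remains.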
shuffle_merged_datasets: true
experimental_skip_move_to_device: true
fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true
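# FSDP v2 shards parameters, gradients, and optimizer state across GPUs,
# wrapping at each LlamaDecoderLayer (Falcon-E reuses the Llama block layout,
# per the wrap class above) and gathering a full state dict for checkpoints.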
# save_first_step: true # uncomment this to validate checkpoint saving works with your config
# N-D parallelism settings: a size of 1 leaves that dimension disabled.
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1
# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1
# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)
# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)
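# These four sizes multiply together to describe the parallel layout; with
# everything left at 1, the intent appears to be plain FSDP sharding across
# whichever GPUs the launcher provides.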
special_tokens:
  eos_token: <|end_of_text|>
eot_tokens:
  - <|im_end|>