Fix: add bitnet config (#3636)
* add bitnet config
* chore: lint

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
examples/falcon-e/falcon-e-3b-dpo.yaml (new file, 93 lines)
@@ -0,0 +1,93 @@
base_model: axolotl-ai-co/Falcon-E-1.2-3B-Exp-prequantized
output_dir: ./output

plugins:
  - axolotl.integrations.kernels.KernelsPlugin

use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

load_in_8bit: false
load_in_4bit: false

chat_template: tokenizer_default

rl: dpo
datasets:
  - path: allenai/Dolci-Think-DPO-7B
    split: train
    type: chatml.ultra

dataset_prepared_path: ./axolotl_dataset_cache

sequence_len: 8192
trust_remote_code: false

gradient_accumulation_steps: 4 # This can run on 4 GPUs

# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: True


micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 1.0e-5
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

bf16: true
tf32: false

logging_steps: 1

flash_attention: true

loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0

save_steps: 500
save_strategy: steps

weight_decay: 0.01

shuffle_merged_datasets: true
experimental_skip_move_to_device: true

fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true

# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1

# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1

# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)

# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)

special_tokens:
  eos_token: <|end_of_text|>

eot_tokens:
  - <|im_end|>
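Note on the parallelism keys above: dp_shard_size, dp_replicate_size, tensor_parallel_size and context_parallel_size are all left at 1, i.e. no sharding, replication, TP or CP. A minimal sketch of how those dials might be turned for a hypothetical 8-GPU node follows; the 2x4 split is an illustrative assumption, not part of this commit:

# hypothetical 8-GPU layout: 2 replicas x 4-way FSDP shards (2 * 4 * 1 * 1 = 8 GPUs)
dp_shard_size: 4          # shard model parameters across 4 GPUs (FSDP dimension)
dp_replicate_size: 2      # replicate the sharded model twice (DDP dimension)
tensor_parallel_size: 1   # TP left off
context_parallel_size: 1  # CP left off

With a recent Axolotl release a config like this is usually launched with "axolotl train examples/falcon-e/falcon-e-3b-dpo.yaml"; older versions go through "accelerate launch -m axolotl.cli.train" with the same YAML path.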
examples/falcon-e/falcon-e-3b-ft.yaml (new file, 100 lines)
@@ -0,0 +1,100 @@
base_model: tiiuae/Falcon-E-3B-Base-prequantized
output_dir: ./output

plugins:
  - axolotl.integrations.kernels.KernelsPlugin

use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

load_in_8bit: false
load_in_4bit: false

chat_template: tokenizer_default

datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: ./axolotl_dataset_cache

sequence_len: 32768
trust_remote_code: false


gradient_accumulation_steps: 4 # This can run on 4 GPUs

# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: True


micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5.0e-4
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

bf16: true
tf32: false

logging_steps: 1

flash_attention: true

loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0

save_steps: 500
save_strategy: steps

weight_decay: 0.01

sample_packing: true
pad_to_sequence_len: true

shuffle_merged_datasets: true
experimental_skip_move_to_device: true

fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1

# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1

# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)

# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)

special_tokens:
  eos_token: <|end_of_text|>

eot_tokens:
  - <|im_end|>
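The dataset block in this file expects ShareGPT-style records: field_messages points the loader at a conversations list, and message_property_mappings renames each turn's from/value keys to the role/content fields the chat_template loader works with. An illustrative record shape, with made-up role labels and text rather than actual dataset contents:

# illustrative ShareGPT-style record matching the mapping above; only the key names matter
conversations:
  - from: system                        # mapped to role
    value: You are a helpful assistant. # mapped to content
  - from: human
    value: Summarize the following paragraph ...
  - from: gpt
    value: Here is a short summary ...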