Fix: add bitnet config (#3636)
* add bitnet config
* chore: lint

---------

Co-authored-by: Wing Lian <wing@axolotl.ai>
examples/falcon-e/falcon-e-3b-dpo.yaml (new file, 93 lines)
@@ -0,0 +1,93 @@
base_model: axolotl-ai-co/Falcon-E-1.2-3B-Exp-prequantized
output_dir: ./output

plugins:
  - axolotl.integrations.kernels.KernelsPlugin

use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

load_in_8bit: false
load_in_4bit: false

chat_template: tokenizer_default

rl: dpo
datasets:
  - path: allenai/Dolci-Think-DPO-7B
    split: train
    type: chatml.ultra

dataset_prepared_path: ./axolotl_dataset_cache

sequence_len: 8192
trust_remote_code: false

gradient_accumulation_steps: 4 # This can run on 4 GPUs

# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: True


micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 1.0e-5
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

bf16: true
tf32: false

logging_steps: 1

flash_attention: true

loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0

save_steps: 500
save_strategy: steps

weight_decay: 0.01

shuffle_merged_datasets: true
experimental_skip_move_to_device: true

fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true

# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1

# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1

# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)

# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)

special_tokens:
  eos_token: <|end_of_text|>

eot_tokens:
  - <|im_end|>
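Note on the parallelism keys above: dp_shard_size, dp_replicate_size, tensor_parallel_size and context_parallel_size are all left at 1, i.e. no sharding, replication, TP or CP. A minimal sketch of how those dials might be turned for a hypothetical 8-GPU node follows; the 2x4 split is an illustrative assumption, not part of this commit:

# hypothetical 8-GPU layout: 2 replicas x 4-way FSDP shards (2 * 4 * 1 * 1 = 8 GPUs)
dp_shard_size: 4          # shard model parameters across 4 GPUs (FSDP dimension)
dp_replicate_size: 2      # replicate the sharded model twice (DDP dimension)
tensor_parallel_size: 1   # TP left off
context_parallel_size: 1  # CP left off

With a recent Axolotl release a config like this is usually launched with "axolotl train examples/falcon-e/falcon-e-3b-dpo.yaml"; older versions go through "accelerate launch -m axolotl.cli.train" with the same YAML path.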
examples/falcon-e/falcon-e-3b-ft.yaml (new file, 100 lines)
@@ -0,0 +1,100 @@
base_model: tiiuae/Falcon-E-3B-Base-prequantized
output_dir: ./output

plugins:
  - axolotl.integrations.kernels.KernelsPlugin

use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

load_in_8bit: false
load_in_4bit: false

chat_template: tokenizer_default

datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: ./axolotl_dataset_cache

sequence_len: 32768
trust_remote_code: false


gradient_accumulation_steps: 4 # This can run on 4 GPUs

# Very important to enable gradient accumulation with FSDP
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: True


micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5.0e-4
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

bf16: true
tf32: false

logging_steps: 1

flash_attention: true

loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0

save_steps: 500
save_strategy: steps

weight_decay: 0.01

sample_packing: true
pad_to_sequence_len: true

shuffle_merged_datasets: true
experimental_skip_move_to_device: true

fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true
# save_first_step: true # uncomment this to validate checkpoint saving works with your config

# Comment to disable CP
# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1

# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1

# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)

# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)

special_tokens:
  eos_token: <|end_of_text|>

eot_tokens:
  - <|im_end|>
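The dataset block in this file expects ShareGPT-style records: field_messages points the loader at a conversations list, and message_property_mappings renames each turn's from/value keys to the role/content fields the chat_template loader works with. An illustrative record shape, with made-up role labels and text rather than actual dataset contents:

# illustrative ShareGPT-style record matching the mapping above; only the key names matter
conversations:
  - from: system                        # mapped to role
    value: You are a helpful assistant. # mapped to content
  - from: human
    value: Summarize the following paragraph ...
  - from: gpt
    value: Here is a short summary ...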