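# Axolotl config: full fine-tune of the prequantized (1.58-bit) Falcon-E-3B
# base model on SlimOrca chat data, sharded with FSDP2.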
base_model: tiiuae/Falcon-E-3B-Base-prequantized
output_dir: ./output

plugins:
  - axolotl.integrations.kernels.KernelsPlugin

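# Kernel/MoE integration toggles. Only onebitllms is enabled: it provides
# the 1.58-bit linear layers needed to fine-tune the prequantized Falcon-E
# checkpoints.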
use_kernels: false
use_scattermoe: false
use_sonicmoe: false
use_onebitllms: true

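# The base checkpoint is already quantized, so bitsandbytes loading stays off.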
load_in_8bit: false
load_in_4bit: false

chat_template: tokenizer_default
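# SlimOrca stores each turn as {"from": ..., "value": ...}; the mappings
# below rename those fields to the role/content keys the chat template expects.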
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: ./axolotl_dataset_cache
sequence_len: 32768
trust_remote_code: false
gradient_accumulation_steps: 4 # This can run on 4 GPUs
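# Effective global batch size = micro_batch_size * gradient_accumulation_steps
# * data-parallel ranks, e.g. 1 * 4 * 4 = 16 sequences per optimizer step on 4 GPUs.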
# Very important when combining gradient accumulation with FSDP:
# https://github.com/huggingface/transformers/issues/29425
accelerator_config:
  gradient_accumulation_kwargs:
    sync_each_batch: true
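# (sync_each_batch syncs gradients on every micro-batch rather than only at
# accumulation boundaries, avoiding the FSDP memory spike described in the
# issue above at the cost of extra communication.)
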
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5.0e-4
# adamw hyperparams
adam_beta1: 0.9
adam_beta2: 0.95

bf16: true
tf32: false
logging_steps: 1
flash_attention: true
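# Abort training if the loss stays above 15.0 for 3 consecutive logged steps
# (a guard against divergence).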
loss_watchdog_threshold: 15.0
loss_watchdog_patience: 3

warmup_steps: 128
evals_per_epoch: 0
save_steps: 500
save_strategy: steps
weight_decay: 0.01
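# Pack several short examples into each 32k window to cut padding waste;
# padding to sequence_len keeps batch shapes static.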
sample_packing: true
pad_to_sequence_len: true
shuffle_merged_datasets: true
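# Keep the model off-accelerator until FSDP shards it, reducing peak memory at init.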
experimental_skip_move_to_device: true
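# FSDP2 sharding. transformer_layer_cls_to_wrap must match the model's
# decoder block class (Llama-style for this checkpoint).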
fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
  activation_checkpointing: true

# save_first_step: true # uncomment this to validate checkpoint saving works with your config

# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 1

# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 1

# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1 # (default is 1, no TP)

# Number of GPUs for Context/Sequence Parallelism. Comment this out (or
# leave it at 1, the default) to disable CP.
context_parallel_size: 1
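# These four sizes compose the device mesh; their product corresponds to the
# total number of GPUs in the run.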
special_tokens:
  eos_token: <|end_of_text|>

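# End-of-turn markers for the chat template, distinct from the sequence-level
# EOS token above.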
eot_tokens:
  - <|im_end|>