Compare commits
4 Commits
multi-gpu-
...
merge-lora
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
9aaa4b8ced | ||
|
|
8be7da8999 | ||
|
|
53e739f11e | ||
|
|
f20c8deff1 |
36
README.md
36
README.md
@@ -163,8 +163,6 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \
|
|||||||
```
|
```
|
||||||
</details>
|
</details>
|
||||||
|
|
||||||
- Windows: Please use WSL or Docker!
|
|
||||||
|
|
||||||
### Dataset
|
### Dataset
|
||||||
|
|
||||||
Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
|
Axolotl supports a variety of dataset formats. Below are some of the formats you can use.
|
||||||
@@ -330,15 +328,6 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
|||||||
name: enron_emails
|
name: enron_emails
|
||||||
type: completion # format from earlier
|
type: completion # format from earlier
|
||||||
|
|
||||||
# huggingface repo with multiple named configurations/subsets
|
|
||||||
datasets:
|
|
||||||
- path: bigcode/commitpackft
|
|
||||||
name:
|
|
||||||
- ruby
|
|
||||||
- python
|
|
||||||
- typescript
|
|
||||||
type: ... # unimplemented custom format
|
|
||||||
|
|
||||||
# local
|
# local
|
||||||
datasets:
|
datasets:
|
||||||
- path: data.jsonl # or json
|
- path: data.jsonl # or json
|
||||||
@@ -418,10 +407,6 @@ fp16: true
|
|||||||
# Use CUDA tf32
|
# Use CUDA tf32
|
||||||
tf32: true # require >=ampere
|
tf32: true # require >=ampere
|
||||||
|
|
||||||
# No AMP (automatic mixed precision)
|
|
||||||
bfloat16: true # require >=ampere
|
|
||||||
float16: true
|
|
||||||
|
|
||||||
# a list of one or more datasets to finetune the model with
|
# a list of one or more datasets to finetune the model with
|
||||||
datasets:
|
datasets:
|
||||||
# hf dataset repo | "json" for local dataset, make sure to fill data_files
|
# hf dataset repo | "json" for local dataset, make sure to fill data_files
|
||||||
@@ -474,9 +459,6 @@ dataset_shard_idx:
|
|||||||
# the maximum length of an input to train with, this should typically be less than 2048
|
# the maximum length of an input to train with, this should typically be less than 2048
|
||||||
# as most models have a token/context limit of 2048
|
# as most models have a token/context limit of 2048
|
||||||
sequence_len: 2048
|
sequence_len: 2048
|
||||||
# pad inputs so each step uses constant sized buffers
|
|
||||||
# this will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
|
|
||||||
pad_to_sequence_len:
|
|
||||||
# max sequence length to concatenate training samples together up to
|
# max sequence length to concatenate training samples together up to
|
||||||
# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
|
# inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
|
||||||
# FutureWarning: This will soon be DEPRECATED
|
# FutureWarning: This will soon be DEPRECATED
|
||||||
@@ -511,12 +493,6 @@ lora_modules_to_save:
|
|||||||
lora_out_dir:
|
lora_out_dir:
|
||||||
lora_fan_in_fan_out: false
|
lora_fan_in_fan_out: false
|
||||||
|
|
||||||
# ReLoRA configuration
|
|
||||||
# must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
|
|
||||||
relora_steps: # number of steps per ReLoRA restart
|
|
||||||
relora_warmup_steps: # number of per-restart warmup steps
|
|
||||||
relora_cpu_offload: # true to perform lora weight merges on cpu during restarts, for modest gpu memory savings
|
|
||||||
|
|
||||||
# wandb configuration if you're using it
|
# wandb configuration if you're using it
|
||||||
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
|
wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
|
||||||
wandb_project: # your wandb project name
|
wandb_project: # your wandb project name
|
||||||
@@ -539,7 +515,7 @@ lr_quadratic_warmup:
|
|||||||
logging_steps:
|
logging_steps:
|
||||||
save_strategy: # set to `no` to skip checkpoint saves
|
save_strategy: # set to `no` to skip checkpoint saves
|
||||||
save_steps: # leave empty to save at each epoch
|
save_steps: # leave empty to save at each epoch
|
||||||
eval_steps: # leave empty to eval at each epoch
|
eval_steps:
|
||||||
save_total_limit: # checkpoints saved at a time
|
save_total_limit: # checkpoints saved at a time
|
||||||
max_steps:
|
max_steps:
|
||||||
|
|
||||||
@@ -625,14 +601,12 @@ fsdp_config:
|
|||||||
# Deepspeed config path
|
# Deepspeed config path
|
||||||
deepspeed:
|
deepspeed:
|
||||||
|
|
||||||
# Advanced DDP Arguments
|
|
||||||
ddp_timeout:
|
|
||||||
ddp_bucket_cap_mb:
|
|
||||||
ddp_broadcast_buffers:
|
|
||||||
|
|
||||||
# Path to torch distx for optim 'adamw_anyprecision'
|
# Path to torch distx for optim 'adamw_anyprecision'
|
||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
|
|
||||||
|
# Set padding for data collator to 'longest'
|
||||||
|
collator_pad_to_longest:
|
||||||
|
|
||||||
# Set to an HF dataset of type 'completion' to stream it instead of pre-tokenizing
|
# Set to an HF dataset of type 'completion' to stream it instead of pre-tokenizing
|
||||||
pretraining_dataset:
|
pretraining_dataset:
|
||||||
|
|
||||||
@@ -652,7 +626,7 @@ strict:
|
|||||||
|
|
||||||
Run
|
Run
|
||||||
```bash
|
```bash
|
||||||
accelerate launch scripts/finetune.py your_config.yml
|
accelerate launch scripts/finetune.py configs/your_config.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Multi-GPU
|
#### Multi-GPU
|
||||||
|
|||||||
@@ -1,46 +0,0 @@
|
|||||||
{
|
|
||||||
"zero_optimization": {
|
|
||||||
"stage": 2,
|
|
||||||
"offload_optimizer": {
|
|
||||||
"device": "cpu"
|
|
||||||
},
|
|
||||||
"contiguous_gradients": true,
|
|
||||||
"overlap_comm": true
|
|
||||||
},
|
|
||||||
"bf16": {
|
|
||||||
"enabled": "auto"
|
|
||||||
},
|
|
||||||
"fp16": {
|
|
||||||
"enabled": "auto",
|
|
||||||
"auto_cast": false,
|
|
||||||
"loss_scale": 0,
|
|
||||||
"initial_scale_power": 32,
|
|
||||||
"loss_scale_window": 1000,
|
|
||||||
"hysteresis": 2,
|
|
||||||
"min_loss_scale": 1
|
|
||||||
},
|
|
||||||
"optimizer": {
|
|
||||||
"type": "AdamW",
|
|
||||||
"params": {
|
|
||||||
"lr": "auto",
|
|
||||||
"betas": [
|
|
||||||
0.9,
|
|
||||||
0.999
|
|
||||||
],
|
|
||||||
"eps": 1e-8,
|
|
||||||
"weight_decay": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"scheduler": {
|
|
||||||
"type": "WarmupDecayLR",
|
|
||||||
"params": {
|
|
||||||
"warmup_min_lr": "auto",
|
|
||||||
"warmup_max_lr": "auto",
|
|
||||||
"warmup_num_steps": "auto",
|
|
||||||
"total_num_steps": "auto"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"train_batch_size": "auto",
|
|
||||||
"train_micro_batch_size_per_gpu": "auto",
|
|
||||||
"wall_clock_breakdown": false
|
|
||||||
}
|
|
||||||
@@ -35,7 +35,10 @@
|
|||||||
"type": "AdamW",
|
"type": "AdamW",
|
||||||
"params": {
|
"params": {
|
||||||
"lr": "auto",
|
"lr": "auto",
|
||||||
"betas": "auto",
|
"betas": [
|
||||||
|
0.9,
|
||||||
|
0.95
|
||||||
|
],
|
||||||
"eps": 1e-8,
|
"eps": 1e-8,
|
||||||
"weight_decay": "auto"
|
"weight_decay": "auto"
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ RUN apt-get update && \
|
|||||||
|
|
||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN pip3 install "peft @ git+https://github.com/huggingface/peft.git@main"
|
RUN pip3 install --force-reinstall "peft @ git+https://github.com/huggingface/peft.git@main"
|
||||||
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
RUN git clone --depth=1 https://github.com/OpenAccess-AI-Collective/axolotl.git
|
||||||
# If AXOLOTL_EXTRAS is set, append it in brackets
|
# If AXOLOTL_EXTRAS is set, append it in brackets
|
||||||
RUN cd axolotl && \
|
RUN cd axolotl && \
|
||||||
|
|||||||
@@ -1,67 +0,0 @@
|
|||||||
base_model: codellama/CodeLlama-13b-hf
|
|
||||||
base_model_config: codellama/CodeLlama-13b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: CodeLlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: true
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./lora-out
|
|
||||||
|
|
||||||
sequence_len: 100000
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
adapter: lora
|
|
||||||
lora_model_dir:
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps:
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
base_model: codellama/CodeLlama-13b-hf
|
|
||||||
base_model_config: codellama/CodeLlama-13b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: CodeLlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./qlora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 100000
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: paged_adamw_32bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps:
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
base_model: codellama/CodeLlama-34b-hf
|
|
||||||
base_model_config: codellama/CodeLlama-34b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: CodeLlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: true
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./lora-out
|
|
||||||
|
|
||||||
sequence_len: 100000
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
adapter: lora
|
|
||||||
lora_model_dir:
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps:
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
base_model: codellama/CodeLlama-34b-hf
|
|
||||||
base_model_config: codellama/CodeLlama-34b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: CodeLlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./qlora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 100000
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: paged_adamw_32bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps:
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -1,67 +0,0 @@
|
|||||||
base_model: codellama/CodeLlama-7b-hf
|
|
||||||
base_model_config: codellama/CodeLlama-7b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: CodeLlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: true
|
|
||||||
load_in_4bit: false
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./lora-out
|
|
||||||
|
|
||||||
sequence_len: 100000
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
adapter: lora
|
|
||||||
lora_model_dir:
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps:
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -1,69 +0,0 @@
|
|||||||
base_model: codellama/CodeLlama-7b-hf
|
|
||||||
base_model_config: codellama/CodeLlama-7b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: CodeLlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./qlora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 100000
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: paged_adamw_32bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps:
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -1,22 +0,0 @@
|
|||||||
# Overview
|
|
||||||
|
|
||||||
These are example CodeLLaMA configurations for the 7b, 13b and 34b variants.
|
|
||||||
|
|
||||||
The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On an RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.
|
|
||||||
|
|
||||||
The 13b variant will fit if you change these settings to these values:
|
|
||||||
gradient_accumulation_steps: 2
|
|
||||||
micro_batch_size: 1
|
|
||||||
|
|
||||||
The 34b variant does not fit in 24GB of VRAM. You will need a GPU with 40+ GB of VRAM that also supports flash attention v2; an A6000 or A100 is a good choice.
|
|
||||||
|
|
||||||
```shell
|
|
||||||
accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml
|
|
||||||
|
|
||||||
```
|
|
||||||
or
|
|
||||||
|
|
||||||
```shell
|
|
||||||
accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml
|
|
||||||
|
|
||||||
```
|
|
||||||
@@ -57,7 +57,7 @@ weight_decay: 0.0001
|
|||||||
fsdp:
|
fsdp:
|
||||||
fsdp_config:
|
fsdp_config:
|
||||||
tokens:
|
tokens:
|
||||||
pad_token: "<pad>"
|
pad_token: "[PAD]"
|
||||||
bos_token: "<s>"
|
bos_token: "<s>"
|
||||||
eos_token: "</s>"
|
eos_token: "</s>"
|
||||||
unk_token: "<unk>"
|
unk_token: "<unk>"
|
||||||
|
|||||||
@@ -1,73 +0,0 @@
|
|||||||
base_model: meta-llama/Llama-2-7b-hf
|
|
||||||
base_model_config: meta-llama/Llama-2-7b-hf
|
|
||||||
model_type: LlamaForCausalLM
|
|
||||||
tokenizer_type: LlamaTokenizer
|
|
||||||
is_llama_derived_model: true
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
|
||||||
type: alpaca
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.01
|
|
||||||
output_dir: ./relora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 4096
|
|
||||||
sample_packing: true
|
|
||||||
|
|
||||||
lora_r: 8
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_modules:
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_fan_in_fan_out:
|
|
||||||
|
|
||||||
relora_steps: 150
|
|
||||||
relora_warmup_steps: 10
|
|
||||||
relora_cpu_offload: false
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_run_id:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 4
|
|
||||||
num_epochs: 3
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
train_on_inputs: false
|
|
||||||
group_by_length: false
|
|
||||||
bf16: true
|
|
||||||
fp16: false
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
early_stopping_patience:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
local_rank:
|
|
||||||
logging_steps: 1
|
|
||||||
xformers_attention:
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_steps: 10
|
|
||||||
eval_steps: 20
|
|
||||||
save_steps: 50
|
|
||||||
debug:
|
|
||||||
deepspeed:
|
|
||||||
weight_decay: 0.0
|
|
||||||
fsdp:
|
|
||||||
fsdp_config:
|
|
||||||
special_tokens:
|
|
||||||
bos_token: "<s>"
|
|
||||||
eos_token: "</s>"
|
|
||||||
unk_token: "<unk>"
|
|
||||||
@@ -47,3 +47,4 @@ local_rank:
|
|||||||
gradient_checkpointing: true
|
gradient_checkpointing: true
|
||||||
fsdp:
|
fsdp:
|
||||||
fsdp_config:
|
fsdp_config:
|
||||||
|
collator_pad_to_longest: true
|
||||||
|
|||||||
@@ -1,14 +1,12 @@
|
|||||||
packaging
|
|
||||||
peft @ git+https://github.com/huggingface/peft.git
|
peft @ git+https://github.com/huggingface/peft.git
|
||||||
transformers @ git+https://github.com/huggingface/transformers.git
|
transformers @ git+https://github.com/huggingface/transformers.git
|
||||||
bitsandbytes>=0.41.1
|
bitsandbytes>=0.41.1
|
||||||
accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
|
accelerate @ git+https://github.com/huggingface/accelerate@2a289f6108e77a77a4efffb3f6316bc98538413b
|
||||||
addict
|
addict
|
||||||
evaluate
|
|
||||||
fire
|
fire
|
||||||
PyYAML>=6.0
|
PyYAML==6.0
|
||||||
datasets
|
datasets
|
||||||
flash-attn>=2.0.8
|
flash-attn==2.0.8
|
||||||
sentencepiece
|
sentencepiece
|
||||||
wandb
|
wandb
|
||||||
einops
|
einops
|
||||||
@@ -17,7 +15,7 @@ optimum
|
|||||||
hf_transfer
|
hf_transfer
|
||||||
colorama
|
colorama
|
||||||
numba
|
numba
|
||||||
numpy>=1.24.4
|
numpy==1.24.4
|
||||||
# qlora things
|
# qlora things
|
||||||
bert-score==0.3.13
|
bert-score==0.3.13
|
||||||
evaluate==0.4.0
|
evaluate==0.4.0
|
||||||
@@ -25,4 +23,3 @@ rouge-score==0.1.2
|
|||||||
scipy
|
scipy
|
||||||
scikit-learn==1.2.2
|
scikit-learn==1.2.2
|
||||||
pynvml
|
pynvml
|
||||||
art
|
|
||||||
|
|||||||
@@ -4,28 +4,27 @@ import importlib
|
|||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
import random
|
import random
|
||||||
|
import signal
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Dict, List, Optional, Union
|
from typing import Any, Dict, List, Optional, Union
|
||||||
|
|
||||||
import fire
|
import fire
|
||||||
import torch
|
import torch
|
||||||
import transformers
|
|
||||||
import yaml
|
import yaml
|
||||||
|
|
||||||
# add src to the pythonpath so we don't need to pip install this
|
# add src to the pythonpath so we don't need to pip install this
|
||||||
from art import text2art
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from transformers import GenerationConfig, TextStreamer
|
from transformers import GenerationConfig, TextStreamer
|
||||||
|
|
||||||
from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
|
|
||||||
from axolotl.logging_config import configure_logging
|
from axolotl.logging_config import configure_logging
|
||||||
from axolotl.train import TrainDatasetMeta, train
|
|
||||||
from axolotl.utils.config import normalize_config, validate_config
|
from axolotl.utils.config import normalize_config, validate_config
|
||||||
from axolotl.utils.data import prepare_dataset
|
from axolotl.utils.data import prepare_dataset
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.distributed import is_main_process
|
from axolotl.utils.distributed import is_main_process
|
||||||
from axolotl.utils.models import load_model_config, load_tokenizer
|
from axolotl.utils.models import load_model, load_tokenizer
|
||||||
from axolotl.utils.tokenization import check_dataset_labels
|
from axolotl.utils.tokenization import check_dataset_labels
|
||||||
|
from axolotl.utils.trainer import setup_trainer
|
||||||
from axolotl.utils.wandb import setup_wandb_env_vars
|
from axolotl.utils.wandb import setup_wandb_env_vars
|
||||||
|
|
||||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
@@ -38,12 +37,15 @@ LOG = logging.getLogger("axolotl.scripts")
|
|||||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||||
|
|
||||||
|
|
||||||
def print_axolotl_text_art(suffix=None):
|
def print_axolotl_text_art():
|
||||||
font = "nancyj"
|
ascii_art = """
|
||||||
ascii_text = " axolotl"
|
dP dP dP
|
||||||
if suffix:
|
88 88 88
|
||||||
ascii_text += f" x {suffix}"
|
.d8888b. dP. .dP .d8888b. 88 .d8888b. d8888P 88
|
||||||
ascii_art = text2art(" axolotl", font=font)
|
88' `88 `8bd8' 88' `88 88 88' `88 88 88
|
||||||
|
88. .88 .d88b. 88. .88 88 88. .88 88 88
|
||||||
|
`88888P8 dP' `dP `88888P' dP `88888P' dP dP
|
||||||
|
"""
|
||||||
|
|
||||||
if is_main_process():
|
if is_main_process():
|
||||||
print(ascii_art)
|
print(ascii_art)
|
||||||
@@ -58,45 +60,7 @@ def get_multi_line_input() -> Optional[str]:
|
|||||||
return instruction
|
return instruction
|
||||||
|
|
||||||
|
|
||||||
def do_merge_lora(
|
def do_inference(cfg, model, tokenizer, prompter: Optional[str]):
|
||||||
*,
|
|
||||||
cfg: DictDefault,
|
|
||||||
cli_args: TrainerCliArgs,
|
|
||||||
):
|
|
||||||
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
|
||||||
safe_serialization = cfg.save_safetensors is True
|
|
||||||
|
|
||||||
LOG.info("running merge of LoRA with base model")
|
|
||||||
model = model.merge_and_unload()
|
|
||||||
model.to(dtype=torch.float16)
|
|
||||||
|
|
||||||
if cfg.local_rank == 0:
|
|
||||||
LOG.info("saving merged model")
|
|
||||||
model.save_pretrained(
|
|
||||||
str(Path(cfg.output_dir) / "merged"),
|
|
||||||
safe_serialization=safe_serialization,
|
|
||||||
)
|
|
||||||
tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
|
|
||||||
|
|
||||||
|
|
||||||
def shard(
|
|
||||||
*,
|
|
||||||
cfg: DictDefault,
|
|
||||||
cli_args: TrainerCliArgs,
|
|
||||||
):
|
|
||||||
model, _ = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
|
||||||
safe_serialization = cfg.save_safetensors is True
|
|
||||||
LOG.debug("Re-saving model w/ sharding")
|
|
||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
|
||||||
|
|
||||||
|
|
||||||
def do_inference(
|
|
||||||
*,
|
|
||||||
cfg: DictDefault,
|
|
||||||
cli_args: TrainerCliArgs,
|
|
||||||
):
|
|
||||||
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
|
||||||
prompter = cli_args.prompter
|
|
||||||
default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
|
default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
|
||||||
|
|
||||||
for token, symbol in default_tokens.items():
|
for token, symbol in default_tokens.items():
|
||||||
@@ -118,8 +82,6 @@ def do_inference(
|
|||||||
max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
|
max_seq_len=255, mem_freq=50, top_k=5, max_cache_size=None
|
||||||
)
|
)
|
||||||
|
|
||||||
model = model.to(cfg.device)
|
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
print("=" * 80)
|
print("=" * 80)
|
||||||
# support for multiline inputs
|
# support for multiline inputs
|
||||||
@@ -171,10 +133,6 @@ def choose_config(path: Path):
|
|||||||
"No YAML config files found in the specified directory. Are you using a .yml extension?"
|
"No YAML config files found in the specified directory. Are you using a .yml extension?"
|
||||||
)
|
)
|
||||||
|
|
||||||
if len(yaml_files) == 1:
|
|
||||||
print(f"Using default YAML file '{yaml_files[0]}'")
|
|
||||||
return yaml_files[0]
|
|
||||||
|
|
||||||
print("Choose a YAML file:")
|
print("Choose a YAML file:")
|
||||||
for idx, file in enumerate(yaml_files):
|
for idx, file in enumerate(yaml_files):
|
||||||
print(f"{idx + 1}. {file}")
|
print(f"{idx + 1}. {file}")
|
||||||
@@ -197,7 +155,29 @@ def check_not_in(list1: List[str], list2: Union[Dict[str, Any], List[str]]) -> b
|
|||||||
return not any(el in list2 for el in list1)
|
return not any(el in list2 for el in list1)
|
||||||
|
|
||||||
|
|
||||||
def load_cfg(config: Path = Path("examples/"), **kwargs):
|
def merge_lora(model, tokenizer, cfg):
|
||||||
|
LOG.info("running merge of LoRA with base model")
|
||||||
|
model = model.merge_and_unload()
|
||||||
|
model_dtype = torch.bfloat16 if cfg.bf16 or cfg.bfloat16 else torch.float16
|
||||||
|
model.to(dtype=model_dtype)
|
||||||
|
if cfg.hub_model_id:
|
||||||
|
model.push_to_hub("hub_model_id")
|
||||||
|
|
||||||
|
if cfg.local_rank == 0:
|
||||||
|
LOG.info("saving merged model")
|
||||||
|
model.save_pretrained(
|
||||||
|
str(Path(cfg.output_dir) / "merged"),
|
||||||
|
safe_serialization=cfg.save_safetensors is True,
|
||||||
|
)
|
||||||
|
tokenizer.save_pretrained(str(Path(cfg.output_dir) / "merged"))
|
||||||
|
|
||||||
|
|
||||||
|
def train(
|
||||||
|
config: Path = Path("configs/"),
|
||||||
|
prepare_ds_only: bool = False,
|
||||||
|
**kwargs,
|
||||||
|
):
|
||||||
|
print_axolotl_text_art()
|
||||||
if Path(config).is_dir():
|
if Path(config).is_dir():
|
||||||
config = choose_config(config)
|
config = choose_config(config)
|
||||||
|
|
||||||
@@ -216,72 +196,130 @@ def load_cfg(config: Path = Path("examples/"), **kwargs):
|
|||||||
else:
|
else:
|
||||||
cfg[k] = kwargs[k]
|
cfg[k] = kwargs[k]
|
||||||
|
|
||||||
model_config = load_model_config(cfg)
|
|
||||||
|
|
||||||
# figure out if the model is llama
|
|
||||||
cfg.is_llama_derived_model = (
|
|
||||||
(hasattr(model_config, "model_type") and model_config.model_type == "llama")
|
|
||||||
or cfg.is_llama_derived_model
|
|
||||||
or "llama" in cfg.base_model
|
|
||||||
or (cfg.model_type and "llama" in cfg.model_type.lower())
|
|
||||||
)
|
|
||||||
validate_config(cfg)
|
validate_config(cfg)
|
||||||
|
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
|
|
||||||
setup_wandb_env_vars(cfg)
|
setup_wandb_env_vars(cfg)
|
||||||
return cfg
|
|
||||||
|
|
||||||
|
# load the tokenizer first
|
||||||
def load_datasets(
|
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
||||||
*,
|
|
||||||
cfg: DictDefault,
|
|
||||||
cli_args: TrainerCliArgs,
|
|
||||||
) -> TrainDatasetMeta:
|
|
||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
|
|
||||||
train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
|
if (
|
||||||
|
check_not_in(["shard", "merge_lora"], kwargs) and not cfg.inference
|
||||||
|
): # don't need to load dataset for these
|
||||||
|
train_dataset, eval_dataset, total_num_steps = prepare_dataset(cfg, tokenizer)
|
||||||
|
|
||||||
if cli_args.debug or cfg.debug:
|
if cfg.debug or "debug" in kwargs:
|
||||||
LOG.info("check_dataset_labels...")
|
LOG.info("check_dataset_labels...")
|
||||||
check_dataset_labels(
|
check_dataset_labels(
|
||||||
train_dataset.select(
|
train_dataset.select(
|
||||||
[
|
[random.randrange(0, len(train_dataset) - 1) for _ in range(5)] # nosec
|
||||||
random.randrange(0, len(train_dataset) - 1) # nosec
|
|
||||||
for _ in range(cli_args.debug_num_examples)
|
|
||||||
]
|
|
||||||
),
|
),
|
||||||
tokenizer,
|
tokenizer,
|
||||||
num_examples=cli_args.debug_num_examples,
|
|
||||||
text_only=cli_args.debug_text_only,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
return TrainDatasetMeta(
|
if prepare_ds_only:
|
||||||
train_dataset=train_dataset,
|
LOG.info("Finished preparing dataset. Exiting...")
|
||||||
eval_dataset=eval_dataset,
|
return
|
||||||
total_num_steps=total_num_steps,
|
|
||||||
|
# Load the model and tokenizer
|
||||||
|
LOG.info("loading model and (optionally) peft_config...")
|
||||||
|
model, peft_config = load_model(cfg, tokenizer)
|
||||||
|
|
||||||
|
safe_serialization = cfg.save_safetensors is True
|
||||||
|
|
||||||
|
if "merge_lora" in kwargs and cfg.adapter is not None:
|
||||||
|
merge_lora(model, tokenizer, cfg)
|
||||||
|
return
|
||||||
|
|
||||||
|
if cfg.inference:
|
||||||
|
LOG.info("calling do_inference function")
|
||||||
|
prompter: Optional[str] = "AlpacaPrompter"
|
||||||
|
if "prompter" in kwargs:
|
||||||
|
if kwargs["prompter"] == "None":
|
||||||
|
prompter = None
|
||||||
|
else:
|
||||||
|
prompter = kwargs["prompter"]
|
||||||
|
do_inference(cfg, model, tokenizer, prompter=prompter)
|
||||||
|
return
|
||||||
|
|
||||||
|
if "shard" in kwargs:
|
||||||
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
|
return
|
||||||
|
|
||||||
|
trainer = setup_trainer(
|
||||||
|
cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
|
||||||
)
|
)
|
||||||
|
|
||||||
|
model.config.use_cache = False
|
||||||
|
|
||||||
def do_cli(config: Path = Path("examples/"), **kwargs):
|
if torch.__version__ >= "2" and sys.platform != "win32":
|
||||||
print_axolotl_text_art()
|
LOG.info("Compiling torch model")
|
||||||
parsed_cfg = load_cfg(config, **kwargs)
|
model = torch.compile(model)
|
||||||
parser = transformers.HfArgumentParser((TrainerCliArgs))
|
|
||||||
parsed_cli_args, _ = parser.parse_args_into_dataclasses(
|
# go ahead and presave, so we have the adapter config available to inspect
|
||||||
return_remaining_strings=True
|
if peft_config:
|
||||||
)
|
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
||||||
if parsed_cli_args.inference:
|
peft_config.save_pretrained(cfg.output_dir)
|
||||||
do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
|
||||||
elif parsed_cli_args.merge_lora:
|
# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
|
||||||
do_merge_lora(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
if cfg.local_rank == 0:
|
||||||
elif parsed_cli_args.shard:
|
|
||||||
shard(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
def terminate_handler(_, __, model):
|
||||||
|
if cfg.flash_optimum:
|
||||||
|
model = BetterTransformer.reverse(model)
|
||||||
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
signal.signal(
|
||||||
|
signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
|
||||||
|
)
|
||||||
|
|
||||||
|
LOG.info("Starting trainer...")
|
||||||
|
if cfg.group_by_length:
|
||||||
|
LOG.info("hang tight... sorting dataset for group_by_length")
|
||||||
|
resume_from_checkpoint = cfg.resume_from_checkpoint
|
||||||
|
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
||||||
|
possible_checkpoints = [
|
||||||
|
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
|
||||||
|
]
|
||||||
|
if len(possible_checkpoints) > 0:
|
||||||
|
sorted_paths = sorted(
|
||||||
|
possible_checkpoints,
|
||||||
|
key=lambda path: int(path.split("-")[-1]),
|
||||||
|
)
|
||||||
|
resume_from_checkpoint = sorted_paths[-1]
|
||||||
|
LOG.info(
|
||||||
|
f"Using Auto-resume functionality to start with checkpoint at {resume_from_checkpoint}"
|
||||||
|
)
|
||||||
|
|
||||||
|
if not Path(cfg.output_dir).is_dir():
|
||||||
|
os.makedirs(cfg.output_dir, exist_ok=True)
|
||||||
|
tokenizer.save_pretrained(cfg.output_dir)
|
||||||
|
if cfg.flash_optimum:
|
||||||
|
with torch.backends.cuda.sdp_kernel(
|
||||||
|
enable_flash=True, enable_math=True, enable_mem_efficient=True
|
||||||
|
):
|
||||||
|
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||||
else:
|
else:
|
||||||
dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
||||||
if parsed_cli_args.prepare_ds_only:
|
|
||||||
return
|
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
||||||
train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
|
|
||||||
|
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
||||||
|
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
|
||||||
|
if cfg.fsdp:
|
||||||
|
trainer.save_model(cfg.output_dir)
|
||||||
|
elif cfg.local_rank == 0:
|
||||||
|
if cfg.flash_optimum:
|
||||||
|
model = BetterTransformer.reverse(model)
|
||||||
|
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||||
|
|
||||||
|
if cfg.adapter is not None:
|
||||||
|
merge_lora(model, tokenizer, cfg)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
fire.Fire(do_cli)
|
fire.Fire(train)
|
||||||
|
|||||||
@@ -1,43 +0,0 @@
|
|||||||
"""
|
|
||||||
shared module for cli specific things
|
|
||||||
"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from axolotl.logging_config import configure_logging
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
from axolotl.utils.models import load_model, load_tokenizer
|
|
||||||
|
|
||||||
configure_logging()
|
|
||||||
LOG = logging.getLogger("axolotl.common.cli")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class TrainerCliArgs:
|
|
||||||
"""
|
|
||||||
dataclass representing the various non-training arguments
|
|
||||||
"""
|
|
||||||
|
|
||||||
debug: bool = field(default=False)
|
|
||||||
debug_text_only: bool = field(default=False)
|
|
||||||
debug_num_examples: int = field(default=5)
|
|
||||||
inference: bool = field(default=False)
|
|
||||||
merge_lora: bool = field(default=False)
|
|
||||||
prepare_ds_only: bool = field(default=False)
|
|
||||||
prompter: Optional[str] = field(default=None)
|
|
||||||
shard: bool = field(default=False)
|
|
||||||
|
|
||||||
|
|
||||||
def load_model_and_tokenizer(
|
|
||||||
*,
|
|
||||||
cfg: DictDefault,
|
|
||||||
cli_args: TrainerCliArgs,
|
|
||||||
):
|
|
||||||
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
|
||||||
tokenizer = load_tokenizer(cfg)
|
|
||||||
LOG.info("loading model and (optionally) peft_config...")
|
|
||||||
model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)
|
|
||||||
|
|
||||||
return model, tokenizer
|
|
||||||
@@ -1,393 +0,0 @@
|
|||||||
"""Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune."""
|
|
||||||
import glob
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import os.path
|
|
||||||
import shutil
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, List, Sequence
|
|
||||||
|
|
||||||
import bitsandbytes as bnb
|
|
||||||
import peft
|
|
||||||
import safetensors.torch as st
|
|
||||||
import torch
|
|
||||||
from huggingface_hub import snapshot_download
|
|
||||||
from torch.optim.lr_scheduler import LRScheduler
|
|
||||||
from torch.optim.optimizer import Optimizer
|
|
||||||
from transformers import (
|
|
||||||
TrainerCallback,
|
|
||||||
TrainerControl,
|
|
||||||
TrainerState,
|
|
||||||
TrainingArguments,
|
|
||||||
)
|
|
||||||
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
|
|
||||||
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
from axolotl.utils.distributed import is_main_process
|
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.relora")
|
|
||||||
|
|
||||||
|
|
||||||
def reset_optimizer(optimizer: torch.optim.Optimizer):
|
|
||||||
for group in optimizer.param_groups:
|
|
||||||
for param in group["params"]:
|
|
||||||
param_state = optimizer.state[param]
|
|
||||||
for key in param_state:
|
|
||||||
if "qmap" in key:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if key == "step" and isinstance(param_state[key], int):
|
|
||||||
param_state[key] = 0
|
|
||||||
else:
|
|
||||||
param_state[key] = torch.zeros_like(param_state[key])
|
|
||||||
|
|
||||||
|
|
||||||
class ReLoRACallback(TrainerCallback):
|
|
||||||
"""Callback to merge LoRA weights into the base model and save full-weight checkpoints"""
|
|
||||||
|
|
||||||
def __init__(self, cfg: DictDefault):
|
|
||||||
self.relora_steps = cfg.relora_steps
|
|
||||||
self.cpu_offload = cfg.relora_cpu_offload
|
|
||||||
self.quantized = cfg.load_in_4bit or cfg.load_in_8bit
|
|
||||||
self.last_full_model = cfg.base_model
|
|
||||||
self.resume_from_checkpoint = cfg.resume_from_checkpoint
|
|
||||||
|
|
||||||
if not os.path.exists(self.last_full_model):
|
|
||||||
self.last_full_model = str(Path(snapshot_download(cfg.base_model)))
|
|
||||||
|
|
||||||
assert os.path.exists(
|
|
||||||
self.last_full_model
|
|
||||||
), "for ReLORA base_model must be a local path"
|
|
||||||
|
|
||||||
self.num_lora_restarts = 0
|
|
||||||
self.need_full_save = False
|
|
||||||
|
|
||||||
def on_train_begin(
|
|
||||||
self,
|
|
||||||
_args: TrainingArguments,
|
|
||||||
_state: TrainerState,
|
|
||||||
control: TrainerControl,
|
|
||||||
model: peft.LoraModel,
|
|
||||||
**_kwargs,
|
|
||||||
):
|
|
||||||
if self.resume_from_checkpoint:
|
|
||||||
weight_path = os.path.join(self.resume_from_checkpoint, "relora")
|
|
||||||
if not os.path.exists(weight_path):
|
|
||||||
LOG.warning(
|
|
||||||
"Resuming ReLoRA from checkpoint, but no full-weight save found"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
LOG.info(f"Loading adjusted base weights from {weight_path}")
|
|
||||||
load_weight_checkpoint(model, weight_path)
|
|
||||||
return control
|
|
||||||
|
|
||||||
def on_step_begin(
|
|
||||||
self,
|
|
||||||
args: TrainingArguments,
|
|
||||||
state: TrainerState,
|
|
||||||
control: TrainerControl,
|
|
||||||
model: peft.LoraModel,
|
|
||||||
optimizer: torch.optim.Optimizer,
|
|
||||||
**_kwargs,
|
|
||||||
):
|
|
||||||
if state.global_step > 0 and state.global_step % self.relora_steps == 0:
|
|
||||||
checkpoint_folder = os.path.join(
|
|
||||||
args.output_dir,
|
|
||||||
f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
|
|
||||||
"relora",
|
|
||||||
)
|
|
||||||
|
|
||||||
with torch.no_grad():
|
|
||||||
merge_and_save(
|
|
||||||
model,
|
|
||||||
self.last_full_model,
|
|
||||||
checkpoint_folder,
|
|
||||||
reinit=True,
|
|
||||||
quantized=self.quantized,
|
|
||||||
actually_save=is_main_process(),
|
|
||||||
cpu_offload=self.cpu_offload,
|
|
||||||
)
|
|
||||||
reset_optimizer(optimizer)
|
|
||||||
|
|
||||||
if self.quantized:
|
|
||||||
self.last_full_model = checkpoint_folder
|
|
||||||
self.num_lora_restarts += 1
|
|
||||||
|
|
||||||
return control
|
|
||||||
|
|
||||||
def on_save(
|
|
||||||
self,
|
|
||||||
args: TrainingArguments,
|
|
||||||
state: TrainerState,
|
|
||||||
control: TrainerControl,
|
|
||||||
model: peft.LoraModel,
|
|
||||||
**_kwargs,
|
|
||||||
):
|
|
||||||
checkpoint_folder = os.path.join(
|
|
||||||
args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora"
|
|
||||||
)
|
|
||||||
if (
|
|
||||||
state.global_step >= self.relora_steps
|
|
||||||
and state.global_step % self.relora_steps != 0
|
|
||||||
):
|
|
||||||
if self.quantized:
|
|
||||||
if is_main_process() and self.last_full_model != checkpoint_folder:
|
|
||||||
# ensure the latest full parameter save is in the latest checkpoint
|
|
||||||
# folder, so that automatic pruning of checkpoints does not remove it
|
|
||||||
LOG.info(f"moving last full parameter save to {checkpoint_folder}")
|
|
||||||
os.makedirs(checkpoint_folder, exist_ok=True)
|
|
||||||
chunks = glob.glob(
|
|
||||||
f"{self.last_full_model}/model*.safetensors"
|
|
||||||
) + glob.glob(f"{self.last_full_model}/model*.index.json")
|
|
||||||
for path in chunks:
|
|
||||||
new_path = os.path.abspath(shutil.move(path, checkpoint_folder))
|
|
||||||
try:
|
|
||||||
os.symlink(new_path, path)
|
|
||||||
except OSError:
|
|
||||||
# probably on windows without permission to symlink
|
|
||||||
pass
|
|
||||||
|
|
||||||
self.last_full_model = checkpoint_folder
|
|
||||||
else:
|
|
||||||
model.model.save_pretrained(checkpoint_folder, safe_serialization=True)
|
|
||||||
|
|
||||||
return control
|
|
||||||
|
|
||||||
def on_log(
|
|
||||||
self,
|
|
||||||
_args: TrainingArguments,
|
|
||||||
_state: TrainerState,
|
|
||||||
control: TrainerControl,
|
|
||||||
logs: Dict[str, float],
|
|
||||||
**_kwargs,
|
|
||||||
):
|
|
||||||
logs["num_lora_restarts"] = self.num_lora_restarts
|
|
||||||
return control
|
|
||||||
|
|
||||||
def on_train_end(
|
|
||||||
self,
|
|
||||||
args: TrainingArguments,
|
|
||||||
_state: TrainerState,
|
|
||||||
control: TrainerControl,
|
|
||||||
model: peft.LoraModel,
|
|
||||||
**_kwargs,
|
|
||||||
):
|
|
||||||
if self.quantized:
|
|
||||||
# perform final merge and save
|
|
||||||
with torch.no_grad():
|
|
||||||
merge_and_save(
|
|
||||||
model,
|
|
||||||
self.last_full_model,
|
|
||||||
args.output_dir,
|
|
||||||
reinit=False,
|
|
||||||
quantized=self.quantized,
|
|
||||||
actually_save=is_main_process(),
|
|
||||||
cpu_offload=self.cpu_offload,
|
|
||||||
)
|
|
||||||
# no need to save if unquantized, as finetune.py will call merge_and_unload()
|
|
||||||
return control
|
|
||||||
|
|
||||||
|
|
||||||
class ReLoRAScheduler(LRScheduler):
|
|
||||||
"""Wraps another scheduler to apply per-lora-restart learning rate warmups."""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
optimizer: Optimizer,
|
|
||||||
inner_schedule: LRScheduler,
|
|
||||||
relora_steps: int,
|
|
||||||
warmup_steps: int,
|
|
||||||
min_lr_scale: float = 0.001,
|
|
||||||
) -> None:
|
|
||||||
self.inner_schedule = inner_schedule
|
|
||||||
self.relora_steps = relora_steps
|
|
||||||
self.warmup_steps = warmup_steps
|
|
||||||
self.min_lr_scale = min_lr_scale
|
|
||||||
super().__init__(optimizer, inner_schedule.last_epoch, inner_schedule.verbose)
|
|
||||||
|
|
||||||
def get_lr(self) -> float:
|
|
||||||
self.inner_schedule.last_epoch = self.last_epoch
|
|
||||||
|
|
||||||
original = self.inner_schedule.get_lr()
|
|
||||||
step = self.last_epoch
|
|
||||||
if step < self.relora_steps:
|
|
||||||
scale = 1
|
|
||||||
else:
|
|
||||||
cycle_t = min(1.0, (step % self.relora_steps) / self.warmup_steps)
|
|
||||||
scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale
|
|
||||||
|
|
||||||
if isinstance(original, Sequence):
|
|
||||||
return [lr * scale for lr in original]
|
|
||||||
return original * scale
|
|
||||||
|
|
||||||
|
|
||||||
def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]:
|
|
||||||
model_name = "model.safetensors"
|
|
||||||
if not os.path.exists(str(Path(path) / model_name)) and not os.path.exists(
|
|
||||||
str(Path(path) / f"{model_name}.index.json")
|
|
||||||
):
|
|
||||||
model_name = "pytorch_model.bin"
|
|
||||||
|
|
||||||
index_path = str(Path(path) / f"{model_name}.index.json")
|
|
||||||
if os.path.exists(index_path):
|
|
||||||
with open(index_path, "r", encoding="utf-8") as file:
|
|
||||||
data = json.load(file)
|
|
||||||
return data["weight_map"]
|
|
||||||
return {(module_name + ".weight"): model_name for module_name in module_names}
|
|
||||||
|
|
||||||
|
|
||||||
def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor:
|
|
||||||
if isinstance(layer, (peft.tuners.lora.Linear8bitLt, peft.tuners.lora.Linear4bit)):
|
|
||||||
adapter = layer.active_adapter
|
|
||||||
return (
|
|
||||||
peft.utils.transpose(
|
|
||||||
layer.lora_B[adapter].weight.detach().to(device)
|
|
||||||
@ layer.lora_A[adapter].weight.detach().to(device),
|
|
||||||
getattr(layer, "fan_in_fan_out", False),
|
|
||||||
)
|
|
||||||
* layer.scaling[adapter]
|
|
||||||
)
|
|
||||||
|
|
||||||
return layer.get_delta_weight().to(device)
|
|
||||||
|
|
||||||
|
|
||||||
def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]:
|
|
||||||
modules: Dict[str, peft.tuners.lora.LoraLayer] = {}
|
|
||||||
|
|
||||||
key_list = [key for key, _ in model.model.named_modules() if "lora" not in key]
|
|
||||||
for key in key_list:
|
|
||||||
try:
|
|
||||||
# pylint: disable=protected-access
|
|
||||||
_parent, target, _target_name = peft.utils._get_submodules(model.model, key)
|
|
||||||
except AttributeError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
if isinstance(target, peft.tuners.lora.LoraLayer):
|
|
||||||
modules[key] = target
|
|
||||||
|
|
||||||
return modules
|
|
||||||
|
|
||||||
|
|
||||||
def update_weights(
|
|
||||||
target: peft.tuners.lora.LoraLayer, new_weight: torch.Tensor, reinit: bool, device
|
|
||||||
):
|
|
||||||
if reinit:
|
|
||||||
for adapter_name in target.lora_A:
|
|
||||||
target.reset_lora_parameters(adapter_name)
|
|
||||||
for adapter_name in target.lora_embedding_A:
|
|
||||||
target.reset_lora_parameters(adapter_name)
|
|
||||||
|
|
||||||
if isinstance(target, peft.tuners.lora.Linear4bit):
|
|
||||||
# This could be faster, but the quantization of Linear4bit weights occurs
|
|
||||||
# when the module is moved from cpu to gpu. Without meddling *too* deeply in
|
|
||||||
# PEFT's innards or maintaining a duplicate of that codepath, this is good
|
|
||||||
# enough for now.
|
|
||||||
target.weight.quant_state = None
|
|
||||||
target.weight.data = new_weight.cpu()
|
|
||||||
target.to(device)
|
|
||||||
elif isinstance(target, peft.tuners.lora.Linear8bitLt):
|
|
||||||
target.weight = bnb.nn.Int8Params(new_weight, requires_grad=False).to(device)
|
|
||||||
else:
|
|
||||||
target.weight.data = new_weight.to(device)
|
|
||||||
|
|
||||||
|
|
||||||
def merge_and_save(
|
|
||||||
model: peft.LoraModel,
|
|
||||||
model_src: str,
|
|
||||||
model_dst: str,
|
|
||||||
reinit: bool = False,
|
|
||||||
quantized: bool = False,
|
|
||||||
cpu_offload: bool = False,
|
|
||||||
actually_save: bool = True,
|
|
||||||
):
|
|
||||||
modules = find_lora_modules(model)
|
|
||||||
|
|
||||||
if not quantized:
|
|
||||||
for module_name, target in modules.items():
|
|
||||||
update = target.get_delta_weight(target.active_adapter).detach()
|
|
||||||
target.weight.data += update
|
|
||||||
|
|
||||||
if reinit:
|
|
||||||
for adapter_name in target.lora_A:
|
|
||||||
target.reset_lora_parameters(adapter_name)
|
|
||||||
for adapter_name in target.lora_embedding_A:
|
|
||||||
target.reset_lora_parameters(adapter_name)
|
|
||||||
return
|
|
||||||
|
|
||||||
os.makedirs(model_dst, exist_ok=True)
|
|
||||||
shard_paths = sharded_paths(model_src, modules.keys())
|
|
||||||
out_shard_paths = {}
|
|
||||||
|
|
||||||
unique_shards = list(set(shard_paths.values()))
|
|
||||||
for shard_path in unique_shards:
|
|
||||||
out_tensors = {}
|
|
||||||
if shard_path.endswith(".safetensors"):
|
|
||||||
in_tensors = st.load_file(str(Path(model_src) / shard_path))
|
|
||||||
else:
|
|
||||||
in_tensors = torch.load(Path(model_src) / shard_path)
|
|
||||||
if "state_dict" in in_tensors:
|
|
||||||
in_tensors = in_tensors["state_dict"]
|
|
||||||
|
|
||||||
for module_name, target in modules.items():
|
|
||||||
key = module_name + ".weight"
|
|
||||||
if key not in shard_paths or shard_paths[key] != shard_path:
|
|
||||||
continue
|
|
||||||
|
|
||||||
orig_weight = in_tensors[key]
|
|
||||||
old_dev = target.weight.device
|
|
||||||
math_dev = "cpu" if cpu_offload else old_dev
|
|
||||||
|
|
||||||
delta_weight = lora_delta_weight(target, math_dev)
|
|
||||||
new_weight = orig_weight.to(math_dev) + delta_weight
|
|
||||||
del delta_weight
|
|
||||||
|
|
||||||
if actually_save:
|
|
||||||
out_tensors[key] = new_weight.half().cpu()
|
|
||||||
|
|
||||||
update_weights(target, new_weight, reinit=reinit, device=old_dev)
|
|
||||||
|
|
||||||
if actually_save:
|
|
||||||
out_shard_name = shard_path
|
|
||||||
if out_shard_name.startswith("pytorch_model"):
|
|
||||||
out_shard_name = (
|
|
||||||
out_shard_name.replace("pytorch_model", "model").rstrip(".bin")
|
|
||||||
+ ".safetensors"
|
|
||||||
)
|
|
||||||
|
|
||||||
for module_name in in_tensors:
|
|
||||||
if module_name not in out_tensors:
|
|
||||||
out_tensors[module_name] = in_tensors[module_name].half()
|
|
||||||
out_shard_paths[module_name] = out_shard_name
|
|
||||||
|
|
||||||
shard_fn = str(Path(model_dst) / out_shard_name)
|
|
||||||
LOG.info(f"saving tensors to {shard_fn}")
|
|
||||||
st.save_file(out_tensors, shard_fn, metadata={"format": "pt"})
|
|
||||||
|
|
||||||
del in_tensors
|
|
||||||
del out_tensors
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
if actually_save and len(unique_shards) > 1:
|
|
||||||
with open(
|
|
||||||
str(Path(model_dst, "model.safetensors.index.json")), "w", encoding="utf-8"
|
|
||||||
) as file:
|
|
||||||
json.dump({"metadata": {}, "weight_map": out_shard_paths}, file)
|
|
||||||
|
|
||||||
|
|
||||||
def load_weight_checkpoint(model: peft.LoraModel, checkpoint_path: str):
|
|
||||||
modules = find_lora_modules(model)
|
|
||||||
shard_paths = sharded_paths(checkpoint_path, modules.keys())
|
|
||||||
unique_shards = list(set(shard_paths.values()))
|
|
||||||
|
|
||||||
for shard_path in unique_shards:
|
|
||||||
tensors = st.load_file(os.path.join(checkpoint_path, shard_path))
|
|
||||||
|
|
||||||
for module_name, target in modules.items():
|
|
||||||
key = module_name + ".weight"
|
|
||||||
if key not in shard_paths or shard_paths[key] != shard_path:
|
|
||||||
continue
|
|
||||||
|
|
||||||
new_weight = tensors[key]
|
|
||||||
update_weights(
|
|
||||||
target, new_weight, reinit=False, device=target.weight.device
|
|
||||||
)
|
|
||||||
@@ -13,7 +13,7 @@ from axolotl.prompters import IGNORE_TOKEN_ID
|
|||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
IGNORE_INDEX = -100
|
IGNORE_INDEX = -100
|
||||||
LLAMA_DEFAULT_PAD_TOKEN = "<pad>" # nosec
|
LLAMA_DEFAULT_PAD_TOKEN = "[PAD]" # nosec
|
||||||
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
|
LLAMA_DEFAULT_EOS_TOKEN = "</s>" # nosec
|
||||||
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
|
LLAMA_DEFAULT_BOS_TOKEN = "<s>" # nosec
|
||||||
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec
|
LLAMA_DEFAULT_UNK_TOKEN = "<unk>" # nosec
|
||||||
|
|||||||
@@ -1,139 +0,0 @@
|
|||||||
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
import sys
|
|
||||||
from dataclasses import dataclass
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import torch
|
|
||||||
|
|
||||||
# add src to the pythonpath so we don't need to pip install this
|
|
||||||
from datasets import Dataset
|
|
||||||
from optimum.bettertransformer import BetterTransformer
|
|
||||||
|
|
||||||
from axolotl.common.cli import TrainerCliArgs
|
|
||||||
from axolotl.logging_config import configure_logging
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
from axolotl.utils.models import load_model, load_tokenizer
|
|
||||||
from axolotl.utils.trainer import setup_trainer
|
|
||||||
|
|
||||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
|
||||||
src_dir = os.path.join(project_root, "src")
|
|
||||||
sys.path.insert(0, src_dir)
|
|
||||||
|
|
||||||
configure_logging()
|
|
||||||
LOG = logging.getLogger("axolotl.train")
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class TrainDatasetMeta:
|
|
||||||
"""
|
|
||||||
dataclass to capture the dataset specific options for training
|
|
||||||
"""
|
|
||||||
|
|
||||||
train_dataset: Dataset
|
|
||||||
eval_dataset: Optional[Dataset] = None
|
|
||||||
total_num_steps: Optional[int] = None
|
|
||||||
|
|
||||||
|
|
||||||
def train(
|
|
||||||
*,
|
|
||||||
cfg: DictDefault,
|
|
||||||
cli_args: TrainerCliArgs,
|
|
||||||
dataset_meta: TrainDatasetMeta,
|
|
||||||
):
|
|
||||||
# load the tokenizer first
|
|
||||||
LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
|
||||||
tokenizer = load_tokenizer(cfg)
|
|
||||||
|
|
||||||
train_dataset = dataset_meta.train_dataset
|
|
||||||
eval_dataset = dataset_meta.eval_dataset
|
|
||||||
total_num_steps = dataset_meta.total_num_steps
|
|
||||||
|
|
||||||
# Load the model and tokenizer
|
|
||||||
LOG.info("loading model and (optionally) peft_config...")
|
|
||||||
model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
|
|
||||||
|
|
||||||
safe_serialization = cfg.save_safetensors is True
|
|
||||||
|
|
||||||
if cfg.resume_from_checkpoint is None and cfg.auto_resume_from_checkpoints:
|
|
||||||
possible_checkpoints = [
|
|
||||||
str(cp) for cp in Path(cfg.output_dir).glob("checkpoint-*")
|
|
||||||
]
|
|
||||||
if len(possible_checkpoints) > 0:
|
|
||||||
sorted_paths = sorted(
|
|
||||||
possible_checkpoints,
|
|
||||||
key=lambda path: int(path.split("-")[-1]),
|
|
||||||
)
|
|
||||||
cfg.resume_from_checkpoint = sorted_paths[-1]
|
|
||||||
LOG.info(
|
|
||||||
f"Using Auto-resume functionality to start with checkpoint at {cfg.resume_from_checkpoint}"
|
|
||||||
)
|
|
||||||
resume_from_checkpoint = cfg.resume_from_checkpoint
|
|
||||||
|
|
||||||
trainer = setup_trainer(
|
|
||||||
cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps
|
|
||||||
)
|
|
||||||
|
|
||||||
model.config.use_cache = False
|
|
||||||
|
|
||||||
if torch.__version__ >= "2" and sys.platform != "win32":
|
|
||||||
LOG.info("Compiling torch model")
|
|
||||||
model = torch.compile(model)
|
|
||||||
|
|
||||||
# go ahead and presave, so we have the adapter config available to inspect
|
|
||||||
if peft_config:
|
|
||||||
LOG.info(f"Pre-saving adapter config to {cfg.output_dir}")
|
|
||||||
peft_config.save_pretrained(cfg.output_dir)
|
|
||||||
|
|
||||||
# In case we want to stop early with ctrl+c, this is a nice to have to save the pretrained model
|
|
||||||
if cfg.local_rank == 0:
|
|
||||||
|
|
||||||
def terminate_handler(_, __, model):
|
|
||||||
if cfg.flash_optimum:
|
|
||||||
model = BetterTransformer.reverse(model)
|
|
||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
|
||||||
sys.exit(0)
|
|
||||||
|
|
||||||
signal.signal(
|
|
||||||
signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG.info("Starting trainer...")
|
|
||||||
if cfg.group_by_length:
|
|
||||||
LOG.info("hang tight... sorting dataset for group_by_length")
|
|
||||||
|
|
||||||
if not Path(cfg.output_dir).is_dir():
|
|
||||||
os.makedirs(cfg.output_dir, exist_ok=True)
|
|
||||||
tokenizer.save_pretrained(cfg.output_dir)
|
|
||||||
if cfg.flash_optimum:
|
|
||||||
with torch.backends.cuda.sdp_kernel(
|
|
||||||
enable_flash=True, enable_math=True, enable_mem_efficient=True
|
|
||||||
):
|
|
||||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
|
||||||
else:
|
|
||||||
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
|
|
||||||
|
|
||||||
LOG.info(f"Training Completed!!! Saving pre-trained model to {cfg.output_dir}")
|
|
||||||
|
|
||||||
if cfg.relora_steps:
|
|
||||||
if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
|
|
||||||
model = model.merge_and_unload()
|
|
||||||
else:
|
|
||||||
# final model weights have already been saved by `ReLoRACallback.on_train_end`
|
|
||||||
return model, tokenizer
|
|
||||||
|
|
||||||
# TODO do we need this fix? https://huggingface.co/docs/accelerate/usage_guides/fsdp#saving-and-loading
|
|
||||||
# only save on rank 0, otherwise it corrupts output on multi-GPU when multiple processes attempt to write the same file
|
|
||||||
if cfg.fsdp:
|
|
||||||
trainer.save_model(cfg.output_dir)
|
|
||||||
elif cfg.local_rank == 0:
|
|
||||||
if cfg.flash_optimum:
|
|
||||||
model = BetterTransformer.reverse(model)
|
|
||||||
|
|
||||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
|
||||||
|
|
||||||
return model, tokenizer
|
|
||||||
@@ -1,20 +1,9 @@
|
|||||||
"""Callbacks for Trainer class"""
|
"""Callbacks for Trainer class"""
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from typing import TYPE_CHECKING, Dict, List
|
|
||||||
|
|
||||||
import evaluate
|
|
||||||
import numpy as np
|
|
||||||
import pandas as pd
|
|
||||||
import torch
|
|
||||||
import torch.distributed as dist
|
|
||||||
from accelerate.state import PartialState
|
|
||||||
from datasets import load_dataset
|
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from tqdm import tqdm
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
TrainerCallback,
|
TrainerCallback,
|
||||||
TrainerControl,
|
TrainerControl,
|
||||||
@@ -24,18 +13,8 @@ from transformers import (
|
|||||||
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
||||||
|
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
from axolotl.utils.distributed import (
|
|
||||||
gather_scalar_from_all_ranks,
|
|
||||||
get_world_size,
|
|
||||||
is_main_process,
|
|
||||||
)
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from axolotl.utils.trainer import AxolotlTrainingArguments
|
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl.callbacks")
|
LOG = logging.getLogger("axolotl.callbacks")
|
||||||
IGNORE_INDEX = -100
|
|
||||||
dist_state = PartialState()
|
|
||||||
|
|
||||||
|
|
||||||
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
||||||
@@ -54,9 +33,7 @@ class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-
|
|||||||
)
|
)
|
||||||
|
|
||||||
peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
|
peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
|
||||||
kwargs["model"].save_pretrained(
|
kwargs["model"].save_pretrained(peft_model_path)
|
||||||
peft_model_path, save_safetensors=args.save_safetensors
|
|
||||||
)
|
|
||||||
|
|
||||||
return control
|
return control
|
||||||
|
|
||||||
@@ -117,199 +94,3 @@ class GPUStatsCallback(
|
|||||||
log_gpu_memory_usage(LOG, "while training", self.cfg.device)
|
log_gpu_memory_usage(LOG, "while training", self.cfg.device)
|
||||||
self.logged = True
|
self.logged = True
|
||||||
return control
|
return control
|
||||||
|
|
||||||
|
|
||||||
def bench_eval_callback_factory(trainer, tokenizer):
|
|
||||||
accuracy = evaluate.load("accuracy")
|
|
||||||
abcd_idx = [
|
|
||||||
tokenizer("A", add_special_tokens=False).input_ids[0],
|
|
||||||
tokenizer("B", add_special_tokens=False).input_ids[0],
|
|
||||||
tokenizer("C", add_special_tokens=False).input_ids[0],
|
|
||||||
tokenizer("D", add_special_tokens=False).input_ids[0],
|
|
||||||
tokenizer("E", add_special_tokens=False).input_ids[0],
|
|
||||||
tokenizer("F", add_special_tokens=False).input_ids[0],
|
|
||||||
tokenizer("G", add_special_tokens=False).input_ids[0],
|
|
||||||
]
|
|
||||||
bench_split = "eval"
|
|
||||||
|
|
||||||
def transform_bench_subject(example):
|
|
||||||
# Split on ':' and trim whitespace
|
|
||||||
parts = example["subject"].split(":")
|
|
||||||
first_part = (
|
|
||||||
parts[0].strip().lower().replace("-", "_")
|
|
||||||
) # Lowercase the first part
|
|
||||||
second_part = (
|
|
||||||
parts[1].strip().replace("-", "_") if len(parts) > 1 else "all"
|
|
||||||
) # Replace hyphens with underscores
|
|
||||||
|
|
||||||
# Return the transformed values
|
|
||||||
return {"name": first_part, "subject": second_part}
|
|
||||||
|
|
||||||
if trainer.args.bench_dataset == "mmlu-zs":
|
|
||||||
bench_dataset = load_dataset(
|
|
||||||
"openaccess-ai-collective/mmlu-evals",
|
|
||||||
data_files={
|
|
||||||
"eval": "zero_shot_mmlu_val.json",
|
|
||||||
"test": "zero_shot_mmlu_test.json",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
# bench_dataset = bench_dataset.remove_columns("subject")
|
|
||||||
# MMLU Five-shot (Eval/Test only)
|
|
||||||
elif trainer.args.bench_dataset in ["mmlu", "mmlu-fs"]:
|
|
||||||
bench_dataset = load_dataset(
|
|
||||||
"openaccess-ai-collective/mmlu-evals",
|
|
||||||
data_files={
|
|
||||||
"eval": "five_shot_mmlu_val.json",
|
|
||||||
"test": "five_shot_mmlu_test.json",
|
|
||||||
},
|
|
||||||
)
|
|
||||||
# bench_dataset = bench_dataset.remove_columns('subject')
|
|
||||||
elif "/" in trainer.args.bench_dataset:
|
|
||||||
bench_ds = trainer.args.bench_dataset
|
|
||||||
bench_ds_name = "/".join(bench_ds.split("/", 2)[:2])
|
|
||||||
bench_ds_data_file = "/".join(bench_ds.split("/", 2)[2:])
|
|
||||||
bench_dataset = load_dataset(
|
|
||||||
bench_ds_name,
|
|
||||||
data_files={
|
|
||||||
"eval": bench_ds_data_file,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
bench_dataset["eval"] = bench_dataset["eval"].map(transform_bench_subject)
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
f"unhandled value `{trainer.args.bench_dataset}` for bench_dataset training args"
|
|
||||||
)
|
|
||||||
bench_dataset = bench_dataset[trainer.args.bench_split]
|
|
||||||
if trainer.args.max_bench_samples is not None:
|
|
||||||
bench_dataset = bench_dataset.select(range(trainer.args.max_bench_samples))
|
|
||||||
|
|
||||||
def tokenize_evals(example):
|
|
||||||
source = f"{tokenizer.bos_token}{example['input']}"
|
|
||||||
target = f"{example['output']}{tokenizer.eos_token}"
|
|
||||||
|
|
||||||
tokenized_source = tokenizer(
|
|
||||||
source,
|
|
||||||
max_length=2048,
|
|
||||||
truncation=True,
|
|
||||||
add_special_tokens=False,
|
|
||||||
)
|
|
||||||
tokenized_target = tokenizer(
|
|
||||||
target,
|
|
||||||
max_length=2048,
|
|
||||||
truncation=True,
|
|
||||||
add_special_tokens=False,
|
|
||||||
)
|
|
||||||
input_ids = tokenized_source["input_ids"] + tokenized_target["input_ids"]
|
|
||||||
labels = [IGNORE_INDEX] * len(tokenized_source["input_ids"]) + tokenized_target[
|
|
||||||
"input_ids"
|
|
||||||
]
|
|
||||||
|
|
||||||
return {
|
|
||||||
"input_ids": input_ids,
|
|
||||||
"labels": labels,
|
|
||||||
"subject": example["subject"],
|
|
||||||
}
|
|
||||||
|
|
||||||
with dist_state.main_process_first():
|
|
||||||
bench_dataset = bench_dataset.map(tokenize_evals)
|
|
||||||
bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)
|
|
||||||
|
|
||||||
class BenchEvalCallback(TrainerCallback):
|
|
||||||
"""
|
|
||||||
TrainerCallback that runs the MMLU evals
|
|
||||||
"""
|
|
||||||
|
|
||||||
def on_evaluate(
|
|
||||||
self,
|
|
||||||
args: AxolotlTrainingArguments,
|
|
||||||
state: TrainerState, # pylint: disable=unused-argument
|
|
||||||
control: TrainerControl, # pylint: disable=unused-argument
|
|
||||||
metrics: Dict[str, float], # pylint: disable=unused-argument
|
|
||||||
**kwargs, # pylint: disable=unused-argument
|
|
||||||
):
|
|
||||||
data_loader = trainer.get_bench_dataloader(
|
|
||||||
bench_dataset.remove_columns(["input", "subject", "output", "name"])
|
|
||||||
)
|
|
||||||
trainer.model.eval()
|
|
||||||
preds, refs = [], []
|
|
||||||
loss_bench = 0
|
|
||||||
for batch in tqdm(data_loader, total=len(data_loader)):
|
|
||||||
(loss, logits, labels) = trainer.prediction_step(
|
|
||||||
trainer.model,
|
|
||||||
batch,
|
|
||||||
prediction_loss_only=False,
|
|
||||||
)
|
|
||||||
# There are two tokens, the output, and eos token.
|
|
||||||
for i, logit in enumerate(logits):
|
|
||||||
label_non_zero_id = (batch["labels"][i] != IGNORE_INDEX).nonzero()[
|
|
||||||
0
|
|
||||||
][0]
|
|
||||||
logit_abcd = logit[label_non_zero_id - 1][abcd_idx]
|
|
||||||
preds.append(torch.argmax(logit_abcd).item())
|
|
||||||
labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0]
|
|
||||||
refs += [
|
|
||||||
abcd_idx.index(label) if label in abcd_idx else -1
|
|
||||||
for label in labels.tolist()
|
|
||||||
]
|
|
||||||
loss_bench += loss.item()
|
|
||||||
# Extract results by subject.
|
|
||||||
bench_name = bench_dataset["name"]
|
|
||||||
bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)}
|
|
||||||
for s, p, r in zip(bench_name, preds, refs): # pylint: disable=invalid-name
|
|
||||||
bench_names[s]["preds"].append(p)
|
|
||||||
bench_names[s]["refs"].append(r)
|
|
||||||
dist_state.wait_for_everyone()
|
|
||||||
local_bench_names = bench_names
|
|
||||||
gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]
|
|
||||||
# Gather results from all GPUs to GPU 0
|
|
||||||
|
|
||||||
loss_bench_ranks = gather_scalar_from_all_ranks(
|
|
||||||
lambda: loss_bench, get_world_size()
|
|
||||||
)
|
|
||||||
len_data_loader_ranks = gather_scalar_from_all_ranks(
|
|
||||||
lambda: len(data_loader), get_world_size()
|
|
||||||
)
|
|
||||||
|
|
||||||
if not is_main_process():
|
|
||||||
dist.gather_object(local_bench_names, dst=0)
|
|
||||||
else:
|
|
||||||
dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
|
|
||||||
bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
|
|
||||||
results = {f"{bench_split}_bench_loss": bench_loss}
|
|
||||||
|
|
||||||
# Combine results from all GPUs
|
|
||||||
combined_bench_names: Dict[str, Dict[str, List]] = {}
|
|
||||||
for bench_name in gathered_bench_names:
|
|
||||||
for name, data in bench_name.items():
|
|
||||||
if name not in combined_bench_names:
|
|
||||||
combined_bench_names[name] = {"refs": [], "preds": []}
|
|
||||||
combined_bench_names[name]["refs"].extend(data["refs"])
|
|
||||||
combined_bench_names[name]["preds"].extend(data["preds"])
|
|
||||||
|
|
||||||
bench_scores = []
|
|
||||||
bench_refs = []
|
|
||||||
bench_preds = []
|
|
||||||
for (
|
|
||||||
bench_name
|
|
||||||
) in combined_bench_names: # pylint: disable=consider-using-dict-items
|
|
||||||
bench_score = accuracy.compute(
|
|
||||||
references=combined_bench_names[bench_name]["refs"],
|
|
||||||
predictions=combined_bench_names[bench_name]["preds"],
|
|
||||||
)["accuracy"]
|
|
||||||
bench_refs.extend(combined_bench_names[bench_name]["refs"])
|
|
||||||
bench_preds.extend(combined_bench_names[bench_name]["preds"])
|
|
||||||
if not pd.isna(bench_score):
|
|
||||||
results[
|
|
||||||
f"{bench_split}_bench_accuracy_{bench_name}"
|
|
||||||
] = bench_score
|
|
||||||
bench_scores.append(bench_score)
|
|
||||||
else:
|
|
||||||
results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0
|
|
||||||
bench_scores.append(0.0)
|
|
||||||
results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores)
|
|
||||||
results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute(
|
|
||||||
references=bench_refs, predictions=bench_preds
|
|
||||||
)["accuracy"]
|
|
||||||
trainer.log(results)
|
|
||||||
|
|
||||||
return BenchEvalCallback
|
|
||||||
|
|||||||
@@ -126,19 +126,6 @@ def validate_config(cfg):
|
|||||||
if not cfg.load_in_8bit and cfg.adapter == "lora":
|
if not cfg.load_in_8bit and cfg.adapter == "lora":
|
||||||
LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
|
LOG.warning("We recommend setting `load_in_8bit: true` for LORA finetuning")
|
||||||
|
|
||||||
if cfg.relora_steps:
|
|
||||||
if cfg.adapter not in ("lora", "qlora"):
|
|
||||||
raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA")
|
|
||||||
|
|
||||||
if cfg.fsdp:
|
|
||||||
raise ValueError("fsdp not supported with ReLoRA")
|
|
||||||
|
|
||||||
if cfg.deepspeed:
|
|
||||||
raise ValueError("deepspeed not supported with ReLoRA")
|
|
||||||
|
|
||||||
if cfg.lr_scheduler == "one_cycle":
|
|
||||||
raise ValueError("ReLoRA is not compatible with the one_cycle scheduler")
|
|
||||||
|
|
||||||
if cfg.trust_remote_code:
|
if cfg.trust_remote_code:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
|
"`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ from pathlib import Path
|
|||||||
from typing import Tuple, Union
|
from typing import Tuple, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from accelerate.state import PartialState
|
|
||||||
from datasets import (
|
from datasets import (
|
||||||
Dataset,
|
Dataset,
|
||||||
DatasetDict,
|
DatasetDict,
|
||||||
@@ -43,6 +42,7 @@ from axolotl.prompters import (
|
|||||||
SummarizeTLDRPrompter,
|
SummarizeTLDRPrompter,
|
||||||
)
|
)
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
from axolotl.utils.distributed import is_main_process, zero_first
|
||||||
from axolotl.utils.trainer import (
|
from axolotl.utils.trainer import (
|
||||||
calculate_total_num_steps,
|
calculate_total_num_steps,
|
||||||
process_datasets_for_packing,
|
process_datasets_for_packing,
|
||||||
@@ -50,15 +50,13 @@ from axolotl.utils.trainer import (
|
|||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
||||||
state = PartialState()
|
|
||||||
|
|
||||||
|
|
||||||
def prepare_dataset(cfg, tokenizer):
|
def prepare_dataset(cfg, tokenizer):
|
||||||
if not cfg.pretraining_dataset:
|
if not cfg.pretraining_dataset:
|
||||||
with state.main_process_first():
|
train_dataset, eval_dataset = load_prepare_datasets(
|
||||||
train_dataset, eval_dataset = load_prepare_datasets(
|
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
)
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
train_dataset = load_pretraining_dataset(
|
train_dataset = load_pretraining_dataset(
|
||||||
cfg.pretraining_dataset,
|
cfg.pretraining_dataset,
|
||||||
@@ -70,7 +68,7 @@ def prepare_dataset(cfg, tokenizer):
|
|||||||
train_dataset = train_dataset.with_format("torch")
|
train_dataset = train_dataset.with_format("torch")
|
||||||
eval_dataset = None
|
eval_dataset = None
|
||||||
|
|
||||||
with state.main_process_first():
|
with zero_first(is_main_process()):
|
||||||
train_dataset, eval_dataset = process_datasets_for_packing(
|
train_dataset, eval_dataset = process_datasets_for_packing(
|
||||||
cfg, train_dataset, eval_dataset
|
cfg, train_dataset, eval_dataset
|
||||||
)
|
)
|
||||||
@@ -135,17 +133,8 @@ def load_tokenized_prepared_datasets(
|
|||||||
seed = 42
|
seed = 42
|
||||||
|
|
||||||
datasets = []
|
datasets = []
|
||||||
|
|
||||||
def for_d_in_datasets(dataset_configs):
|
|
||||||
for dataset in dataset_configs:
|
|
||||||
if dataset.name and isinstance(dataset.name, list):
|
|
||||||
for name in dataset.name:
|
|
||||||
yield DictDefault({**dataset, "name": name})
|
|
||||||
else:
|
|
||||||
yield dataset
|
|
||||||
|
|
||||||
# pylint: disable=invalid-name
|
# pylint: disable=invalid-name
|
||||||
for d in for_d_in_datasets(cfg.datasets):
|
for d in cfg.datasets:
|
||||||
ds: Union[Dataset, DatasetDict] = None
|
ds: Union[Dataset, DatasetDict] = None
|
||||||
ds_from_hub = False
|
ds_from_hub = False
|
||||||
try:
|
try:
|
||||||
@@ -508,7 +497,7 @@ def load_prepare_datasets(
|
|||||||
to_hash_test.encode(), usedforsecurity=False
|
to_hash_test.encode(), usedforsecurity=False
|
||||||
).hexdigest()
|
).hexdigest()
|
||||||
|
|
||||||
with state.main_process_first():
|
with zero_first(is_main_process()):
|
||||||
dataset = dataset.train_test_split(
|
dataset = dataset.train_test_split(
|
||||||
test_size=cfg.val_set_size,
|
test_size=cfg.val_set_size,
|
||||||
shuffle=False,
|
shuffle=False,
|
||||||
|
|||||||
@@ -243,18 +243,6 @@ class MultipackDistributedDataloader:
|
|||||||
len_remaining -= 1
|
len_remaining -= 1
|
||||||
if not len_remaining:
|
if not len_remaining:
|
||||||
return
|
return
|
||||||
# yield a no-op for cases where we don't have any data left to pack
|
|
||||||
for i in range(0, len_remaining):
|
|
||||||
yield self.collate_fn(
|
|
||||||
[
|
|
||||||
{
|
|
||||||
"input_ids": [0],
|
|
||||||
"labels": [-100],
|
|
||||||
"attention_mask": [True],
|
|
||||||
"position_ids": [0],
|
|
||||||
}
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
def _len_est(self):
|
def _len_est(self):
|
||||||
lengths_sum = np.sum(self.lengths)
|
lengths_sum = np.sum(self.lengths)
|
||||||
|
|||||||
@@ -1,27 +1,27 @@
|
|||||||
"""
|
"""
|
||||||
utility helpers for distributed checks
|
utility helpers for distributed checks
|
||||||
"""
|
"""
|
||||||
import torch
|
from contextlib import contextmanager
|
||||||
|
|
||||||
import torch.distributed as dist
|
import torch.distributed as dist
|
||||||
from accelerate import DistributedType
|
from accelerate import Accelerator
|
||||||
from accelerate.state import PartialState
|
|
||||||
from accelerate.utils import wait_for_everyone
|
|
||||||
|
|
||||||
accelerate = None # pylint: disable=invalid-name
|
accelerate = None # pylint: disable=invalid-name
|
||||||
|
|
||||||
state = PartialState()
|
|
||||||
|
def load_accelerate():
|
||||||
|
global accelerate # pylint: disable=global-statement
|
||||||
|
accelerate = Accelerator()
|
||||||
|
|
||||||
|
|
||||||
def is_distributed():
|
def is_distributed():
|
||||||
"""
|
"""
|
||||||
Check if distributed training is initialized.
|
Check if distributed training is initialized.
|
||||||
"""
|
"""
|
||||||
return state.distributed_type in (
|
global accelerate # pylint: disable=global-statement
|
||||||
DistributedType.MULTI_GPU,
|
if not accelerate:
|
||||||
DistributedType.MULTI_CPU,
|
accelerate = Accelerator()
|
||||||
DistributedType.DEEPSPEED,
|
return dist.is_available() and dist.is_initialized()
|
||||||
DistributedType.FSDP,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def barrier():
|
def barrier():
|
||||||
@@ -29,48 +29,27 @@ def barrier():
|
|||||||
Acts as a barrier to wait for all processes. This ensures that all processes
|
Acts as a barrier to wait for all processes. This ensures that all processes
|
||||||
reach the barrier before proceeding further.
|
reach the barrier before proceeding further.
|
||||||
"""
|
"""
|
||||||
wait_for_everyone()
|
if is_distributed():
|
||||||
|
dist.barrier()
|
||||||
|
|
||||||
|
|
||||||
def is_main_process() -> bool:
|
def is_main_process():
|
||||||
"""
|
"""
|
||||||
Check if the current process is the main process.
|
Check if the current process is the main process.
|
||||||
If not in distributed mode, always return True.
|
If not in distributed mode, always return True.
|
||||||
"""
|
"""
|
||||||
return state.is_main_process
|
if not is_distributed():
|
||||||
|
return True
|
||||||
|
return dist.get_rank() == 0
|
||||||
|
|
||||||
|
|
||||||
def get_world_size() -> int:
|
@contextmanager
|
||||||
return state.num_processes
|
def zero_first(is_main):
|
||||||
|
|
||||||
|
|
||||||
def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
|
|
||||||
"""
|
"""
|
||||||
Run a callable 'fn' on all ranks and gather the results on the specified rank.
|
runs the wrapped context so that rank 0 runs first before other ranks
|
||||||
|
|
||||||
Args:
|
|
||||||
- fn (callable): A function that computes the value. This should not have any side effects.
|
|
||||||
- rank (int, optional): The rank that gathers the values. Default is 0.
|
|
||||||
- world_size (int, optional): Total number of processes in the current distributed setup.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
- A list of computed values from all ranks if on the gathering rank, otherwise None.
|
|
||||||
"""
|
"""
|
||||||
value_scalar = fn()
|
if not is_main: # other ranks wait first
|
||||||
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
barrier()
|
||||||
|
yield
|
||||||
if not state.is_main_process:
|
if is_main: # then rank 0 waits after it has run the context
|
||||||
dist.gather(value_tensor, dst=0)
|
barrier()
|
||||||
else:
|
|
||||||
gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
|
|
||||||
dist.gather(value_tensor, gather_list=gathered_tensors, dst=0)
|
|
||||||
|
|
||||||
# Convert tensors back to their original type (int or float)
|
|
||||||
gathered_values = []
|
|
||||||
for tensor in gathered_tensors:
|
|
||||||
if tensor == tensor.int():
|
|
||||||
gathered_values.append(int(tensor.item()))
|
|
||||||
else:
|
|
||||||
gathered_values.append(float(tensor.item()))
|
|
||||||
return gathered_values
|
|
||||||
return None
|
|
||||||
|
|||||||
@@ -5,13 +5,13 @@ import logging
|
|||||||
import math
|
import math
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Tuple # noqa: F401
|
from typing import TYPE_CHECKING, Optional, Tuple # noqa: F401
|
||||||
|
|
||||||
import bitsandbytes as bnb
|
import bitsandbytes as bnb
|
||||||
import torch
|
import torch
|
||||||
import transformers
|
import transformers
|
||||||
from optimum.bettertransformer import BetterTransformer
|
from optimum.bettertransformer import BetterTransformer
|
||||||
from peft import PeftConfig
|
from peft.tuners.lora import LoraLayer
|
||||||
from transformers import ( # noqa: F401
|
from transformers import ( # noqa: F401
|
||||||
AutoConfig,
|
AutoConfig,
|
||||||
AutoModelForCausalLM,
|
AutoModelForCausalLM,
|
||||||
@@ -22,19 +22,15 @@ from transformers import ( # noqa: F401
|
|||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
)
|
)
|
||||||
|
|
||||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_PAD_TOKEN
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from peft import PeftConfig # noqa: F401
|
||||||
|
|
||||||
def load_model_config(cfg):
|
from axolotl.utils.dict import DictDefault # noqa: F401
|
||||||
model_config_name = cfg.base_model_config or cfg.base_model
|
|
||||||
trust_remote_code: bool = False or cfg.trust_remote_code
|
|
||||||
return AutoConfig.from_pretrained(
|
|
||||||
model_config_name, trust_remote_code=trust_remote_code
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def load_tokenizer(cfg):
|
def load_tokenizer(cfg):
|
||||||
@@ -59,18 +55,11 @@ def load_tokenizer(cfg):
|
|||||||
**tokenizer_kwargs,
|
**tokenizer_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if tokenizer.__class__.__name__ in [
|
||||||
tokenizer.__class__.__name__
|
"LlamaTokenizer",
|
||||||
in [
|
"LlamaTokenizerFast",
|
||||||
"LlamaTokenizer",
|
]:
|
||||||
"LlamaTokenizerFast",
|
tokenizer.pad_token = LLAMA_DEFAULT_PAD_TOKEN
|
||||||
"CodeLlamaTokenizer",
|
|
||||||
]
|
|
||||||
and hasattr(tokenizer, "pad_token")
|
|
||||||
and not tokenizer.pad_token
|
|
||||||
):
|
|
||||||
# set a pad_token, but use eos_token so we don't add a new token
|
|
||||||
tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN
|
|
||||||
|
|
||||||
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
|
||||||
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
|
||||||
@@ -91,10 +80,8 @@ def load_tokenizer(cfg):
|
|||||||
|
|
||||||
|
|
||||||
def load_model(
|
def load_model(
|
||||||
cfg: DictDefault,
|
cfg, tokenizer
|
||||||
tokenizer: PreTrainedTokenizerBase,
|
): # type: (DictDefault, PreTrainedTokenizerBase) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
||||||
inference: bool = False,
|
|
||||||
) -> Tuple[PreTrainedModel, Optional[PeftConfig]]:
|
|
||||||
"""
|
"""
|
||||||
Load a model for a given configuration and tokenizer.
|
Load a model for a given configuration and tokenizer.
|
||||||
"""
|
"""
|
||||||
@@ -104,9 +91,14 @@ def load_model(
|
|||||||
|
|
||||||
# TODO refactor as a kwarg
|
# TODO refactor as a kwarg
|
||||||
load_in_8bit = cfg.load_in_8bit
|
load_in_8bit = cfg.load_in_8bit
|
||||||
|
cfg.is_llama_derived_model = (
|
||||||
|
"llama" in base_model
|
||||||
|
or (cfg.model_type and "llama" in cfg.model_type.lower())
|
||||||
|
or cfg.is_llama_derived_model
|
||||||
|
)
|
||||||
|
|
||||||
if cfg.is_llama_derived_model and cfg.flash_attention:
|
if cfg.is_llama_derived_model and cfg.flash_attention:
|
||||||
if cfg.device not in ["mps", "cpu"] and not inference:
|
if cfg.device not in ["mps", "cpu"] and not cfg.inference:
|
||||||
from axolotl.monkeypatch.llama_attn_hijack_flash import (
|
from axolotl.monkeypatch.llama_attn_hijack_flash import (
|
||||||
replace_llama_attn_with_flash_attn,
|
replace_llama_attn_with_flash_attn,
|
||||||
)
|
)
|
||||||
@@ -148,7 +140,7 @@ def load_model(
|
|||||||
if (
|
if (
|
||||||
cfg.is_llama_derived_model
|
cfg.is_llama_derived_model
|
||||||
and (cfg.max_packed_sequence_len or cfg.sample_packing)
|
and (cfg.max_packed_sequence_len or cfg.sample_packing)
|
||||||
and not inference
|
and not cfg.inference
|
||||||
):
|
):
|
||||||
from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask
|
from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask
|
||||||
|
|
||||||
@@ -350,15 +342,6 @@ def load_model(
|
|||||||
if model.device.type == "cuda":
|
if model.device.type == "cuda":
|
||||||
log_gpu_memory_usage(LOG, "after model load", model.device)
|
log_gpu_memory_usage(LOG, "after model load", model.device)
|
||||||
|
|
||||||
# make sure these are fp32 per Ramesh et al. (2021)
|
|
||||||
for name, module in model.named_modules():
|
|
||||||
if "norm" in name:
|
|
||||||
module.to(torch.float32)
|
|
||||||
if "lm_head" in name or "embed_tokens" in name:
|
|
||||||
if hasattr(module, "weight"):
|
|
||||||
module.to(torch.float32)
|
|
||||||
|
|
||||||
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
|
||||||
if not cfg.gptq and (
|
if not cfg.gptq and (
|
||||||
(cfg.adapter == "lora" and load_in_8bit)
|
(cfg.adapter == "lora" and load_in_8bit)
|
||||||
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
or (cfg.adapter == "qlora" and cfg.load_in_4bit)
|
||||||
@@ -367,18 +350,6 @@ def load_model(
|
|||||||
model = prepare_model_for_kbit_training(
|
model = prepare_model_for_kbit_training(
|
||||||
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
model, use_gradient_checkpointing=cfg.gradient_checkpointing
|
||||||
)
|
)
|
||||||
needs_fa2_dtype = True
|
|
||||||
|
|
||||||
# LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so we need to
|
|
||||||
# convert them back to fp16/bf16 for flash-attn compatibility.
|
|
||||||
if needs_fa2_dtype or (cfg.flash_attention and cfg.is_llama_derived_model):
|
|
||||||
LOG.info("converting modules to %s for flash attention", cfg.torch_dtype)
|
|
||||||
for name, module in model.named_modules():
|
|
||||||
if "norm" in name:
|
|
||||||
module.to(cfg.torch_dtype)
|
|
||||||
if "lm_head" in name or "embed_tokens" in name:
|
|
||||||
if hasattr(module, "weight"):
|
|
||||||
module.to(cfg.torch_dtype)
|
|
||||||
|
|
||||||
model, lora_config = load_adapter(model, cfg, cfg.adapter)
|
model, lora_config = load_adapter(model, cfg, cfg.adapter)
|
||||||
|
|
||||||
@@ -426,15 +397,15 @@ def load_model(
|
|||||||
return model, lora_config
|
return model, lora_config
|
||||||
|
|
||||||
|
|
||||||
def load_adapter(model, cfg, adapter, inference=False):
|
def load_adapter(model, cfg, adapter):
|
||||||
# type: (PreTrainedModel, DictDefault, Optional[str], bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
# type: (PreTrainedModel, DictDefault, Optional[str]) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
||||||
|
|
||||||
if adapter is None:
|
if adapter is None:
|
||||||
return model, None
|
return model, None
|
||||||
if hasattr(model, "enable_input_require_grads"):
|
if hasattr(model, "enable_input_require_grads"):
|
||||||
model.enable_input_require_grads()
|
model.enable_input_require_grads()
|
||||||
if adapter in ["lora", "qlora"]:
|
if adapter in ["lora", "qlora"]:
|
||||||
return load_lora(model, cfg, inference=inference)
|
return load_lora(model, cfg)
|
||||||
if adapter == "llama-adapter":
|
if adapter == "llama-adapter":
|
||||||
return load_llama_adapter(model, cfg)
|
return load_llama_adapter(model, cfg)
|
||||||
|
|
||||||
@@ -466,8 +437,12 @@ def load_llama_adapter(model, cfg):
|
|||||||
return model, peft_config
|
return model, peft_config
|
||||||
|
|
||||||
|
|
||||||
def find_all_linear_names(model):
|
def find_all_linear_names(bits, model):
|
||||||
cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
|
cls = (
|
||||||
|
bnb.nn.Linear4bit
|
||||||
|
if bits == 4
|
||||||
|
else (bnb.nn.Linear8bitLt if bits == 8 else torch.nn.Linear)
|
||||||
|
)
|
||||||
lora_module_names = set()
|
lora_module_names = set()
|
||||||
for name, module in model.named_modules():
|
for name, module in model.named_modules():
|
||||||
if isinstance(module, cls):
|
if isinstance(module, cls):
|
||||||
@@ -480,15 +455,21 @@ def find_all_linear_names(model):
|
|||||||
return list(lora_module_names)
|
return list(lora_module_names)
|
||||||
|
|
||||||
|
|
||||||
def load_lora(model, cfg, inference=False):
|
def load_lora(model, cfg):
|
||||||
# type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
# type: (PreTrainedModel, DictDefault) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
||||||
|
|
||||||
from peft import LoraConfig, PeftModel, get_peft_model
|
from peft import LoraConfig, PeftModel, get_peft_model
|
||||||
|
|
||||||
lora_target_modules = list(cfg.lora_target_modules or [])
|
lora_target_modules = list(cfg.lora_target_modules or [])
|
||||||
|
|
||||||
if cfg.lora_target_linear:
|
if cfg.lora_target_linear:
|
||||||
linear_names = find_all_linear_names(model)
|
bits = None
|
||||||
|
if cfg.load_in_4bit:
|
||||||
|
bits = 4
|
||||||
|
elif cfg.load_in_8bit:
|
||||||
|
bits = 8
|
||||||
|
|
||||||
|
linear_names = find_all_linear_names(bits, model)
|
||||||
LOG.info(f"found linear modules: {repr(linear_names)}")
|
LOG.info(f"found linear modules: {repr(linear_names)}")
|
||||||
lora_target_modules = list(set(lora_target_modules + linear_names))
|
lora_target_modules = list(set(lora_target_modules + linear_names))
|
||||||
|
|
||||||
@@ -508,11 +489,27 @@ def load_lora(model, cfg, inference=False):
|
|||||||
model = PeftModel.from_pretrained(
|
model = PeftModel.from_pretrained(
|
||||||
model,
|
model,
|
||||||
cfg.lora_model_dir,
|
cfg.lora_model_dir,
|
||||||
is_trainable=(not inference),
|
is_trainable=not cfg.inference,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
model = get_peft_model(model, lora_config)
|
model = get_peft_model(model, lora_config)
|
||||||
|
|
||||||
|
for name, module in model.named_modules():
|
||||||
|
if isinstance(module, LoraLayer):
|
||||||
|
module = module.to(cfg.torch_dtype)
|
||||||
|
if "norm" in name:
|
||||||
|
module = module.to(torch.float32)
|
||||||
|
if "lm_head" in name or "embed_tokens" in name:
|
||||||
|
if hasattr(module, "weight"):
|
||||||
|
module = module.to(cfg.torch_dtype)
|
||||||
|
|
||||||
|
# LlamaRMSNorm layers are in fp32 after kbit_training, so we need to
|
||||||
|
# convert them back to fp16/bf16 for flash-attn compatibility.
|
||||||
|
if cfg.flash_attention and cfg.is_llama_derived_model:
|
||||||
|
for name, module in model.named_modules():
|
||||||
|
if "norm" in name:
|
||||||
|
module = module.to(cfg.torch_dtype)
|
||||||
|
|
||||||
model.print_trainable_parameters()
|
model.print_trainable_parameters()
|
||||||
|
|
||||||
return model, lora_config
|
return model, lora_config
|
||||||
|
|||||||
@@ -8,13 +8,13 @@ from termcolor import colored
|
|||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
|
|
||||||
def check_dataset_labels(dataset, tokenizer, num_examples=5, text_only=False):
|
def check_dataset_labels(dataset, tokenizer):
|
||||||
# the dataset is already shuffled, so let's just check the first 5 elements
|
# the dataset is already shuffled, so let's just check the first 5 elements
|
||||||
for idx in range(num_examples):
|
for idx in range(5):
|
||||||
check_example_labels(dataset[idx], tokenizer, text_only=text_only)
|
check_example_labels(dataset[idx], tokenizer)
|
||||||
|
|
||||||
|
|
||||||
def check_example_labels(example, tokenizer, text_only=False):
|
def check_example_labels(example, tokenizer):
|
||||||
# Get the input_ids, labels, and attention_mask from the dataset
|
# Get the input_ids, labels, and attention_mask from the dataset
|
||||||
input_ids = example["input_ids"]
|
input_ids = example["input_ids"]
|
||||||
labels = example["labels"]
|
labels = example["labels"]
|
||||||
@@ -29,10 +29,8 @@ def check_example_labels(example, tokenizer, text_only=False):
|
|||||||
decoded_input_token = tokenizer.decode(input_id)
|
decoded_input_token = tokenizer.decode(input_id)
|
||||||
# Choose the color based on whether the label has the ignore value or not
|
# Choose the color based on whether the label has the ignore value or not
|
||||||
color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
|
color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
|
||||||
colored_token = colored(decoded_input_token, color) + (
|
colored_token = colored(decoded_input_token, color) + colored(
|
||||||
not text_only
|
f"({label_id}, {mask}, {input_id})", "white"
|
||||||
and colored(f"({label_id}, {mask}, {input_id})", "white")
|
|
||||||
or ""
|
|
||||||
)
|
)
|
||||||
colored_tokens.append(colored_token)
|
colored_tokens.append(colored_token)
|
||||||
|
|
||||||
|
|||||||
@@ -10,30 +10,31 @@ from functools import partial
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Union
|
from typing import Optional, Union
|
||||||
|
|
||||||
|
import bitsandbytes as bnb
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch.cuda
|
import torch.cuda
|
||||||
import transformers
|
import transformers
|
||||||
from datasets import Dataset, set_caching_enabled
|
from datasets import Dataset, set_caching_enabled
|
||||||
|
from torch import nn
|
||||||
from torch.optim.lr_scheduler import OneCycleLR
|
from torch.optim.lr_scheduler import OneCycleLR
|
||||||
from torch.utils.data import (
|
from torch.utils.data import DataLoader, DistributedSampler, RandomSampler
|
||||||
DataLoader,
|
|
||||||
DistributedSampler,
|
|
||||||
RandomSampler,
|
|
||||||
SequentialSampler,
|
|
||||||
)
|
|
||||||
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
|
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
|
||||||
from transformers.trainer_pt_utils import SequentialDistributedSampler
|
from transformers.trainer_pt_utils import (
|
||||||
|
SequentialDistributedSampler,
|
||||||
|
get_parameter_names,
|
||||||
|
)
|
||||||
|
|
||||||
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
|
|
||||||
from axolotl.utils.callbacks import (
|
from axolotl.utils.callbacks import (
|
||||||
GPUStatsCallback,
|
GPUStatsCallback,
|
||||||
SaveBetterTransformerModelCallback,
|
SaveBetterTransformerModelCallback,
|
||||||
SavePeftModelCallback,
|
SavePeftModelCallback,
|
||||||
bench_eval_callback_factory,
|
|
||||||
)
|
)
|
||||||
from axolotl.utils.collators import DataCollatorForSeq2Seq
|
from axolotl.utils.collators import DataCollatorForSeq2Seq
|
||||||
from axolotl.utils.dataloader import MultipackDistributedDataloader
|
from axolotl.utils.dataloader import MultipackDistributedDataloader
|
||||||
from axolotl.utils.schedulers import get_cosine_schedule_with_quadratic_warmup
|
from axolotl.utils.schedulers import (
|
||||||
|
InterpolatingLogScheduler,
|
||||||
|
get_cosine_schedule_with_quadratic_warmup,
|
||||||
|
)
|
||||||
|
|
||||||
LOG = logging.getLogger("axolotl")
|
LOG = logging.getLogger("axolotl")
|
||||||
|
|
||||||
@@ -126,35 +127,6 @@ class AxolotlTrainingArguments(TrainingArguments):
|
|||||||
default=1,
|
default=1,
|
||||||
metadata={"help": "the multiplier for the max len for packed sequences"},
|
metadata={"help": "the multiplier for the max len for packed sequences"},
|
||||||
)
|
)
|
||||||
relora_steps: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "how often to reset for ReLoRA"},
|
|
||||||
)
|
|
||||||
relora_warmup_steps: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
|
||||||
)
|
|
||||||
bench_split: Optional[str] = field(
|
|
||||||
default="eval", metadata={"help": "The benchmark split to run on"}
|
|
||||||
)
|
|
||||||
bench_dataset: Optional[str] = field(
|
|
||||||
default="pharaouk/dharma-1/dharma_1_mini.json",
|
|
||||||
metadata={
|
|
||||||
"help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
do_bench_eval: Optional[bool] = field(
|
|
||||||
default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
|
|
||||||
)
|
|
||||||
max_bench_samples: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
bench_source_max_len: int = field(
|
|
||||||
default=2048, metadata={"help": "Maximum source sequence length for bench."}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class AxolotlTrainer(Trainer):
|
class AxolotlTrainer(Trainer):
|
||||||
@@ -164,10 +136,6 @@ class AxolotlTrainer(Trainer):
|
|||||||
|
|
||||||
args = None # type: AxolotlTrainingArguments
|
args = None # type: AxolotlTrainingArguments
|
||||||
|
|
||||||
def __init__(self, *args, bench_data_collator=None, **kwargs):
|
|
||||||
self.bench_data_collator = bench_data_collator
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
def create_scheduler(
|
def create_scheduler(
|
||||||
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
|
self, num_training_steps: int, optimizer: torch.optim.Optimizer = None
|
||||||
):
|
):
|
||||||
@@ -258,31 +226,6 @@ class AxolotlTrainer(Trainer):
|
|||||||
)
|
)
|
||||||
return super().get_eval_dataloader(eval_dataset)
|
return super().get_eval_dataloader(eval_dataset)
|
||||||
|
|
||||||
def _get_bench_sampler(
|
|
||||||
self, bench_dataset: Dataset
|
|
||||||
) -> Optional[torch.utils.data.Sampler]:
|
|
||||||
if self.args.world_size <= 1:
|
|
||||||
return SequentialSampler(bench_dataset)
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_bench_dataloader(
|
|
||||||
self,
|
|
||||||
bench_dataset: Dataset,
|
|
||||||
) -> Union[DataLoader, MultipackDistributedDataloader]:
|
|
||||||
dataloader_params = {
|
|
||||||
"batch_size": self.args.eval_batch_size,
|
|
||||||
"collate_fn": self.bench_data_collator,
|
|
||||||
"num_workers": self.args.dataloader_num_workers,
|
|
||||||
"pin_memory": self.args.dataloader_pin_memory,
|
|
||||||
}
|
|
||||||
|
|
||||||
if not isinstance(bench_dataset, torch.utils.data.IterableDataset):
|
|
||||||
dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset)
|
|
||||||
dataloader_params["drop_last"] = self.args.dataloader_drop_last
|
|
||||||
|
|
||||||
return DataLoader(bench_dataset, **dataloader_params)
|
|
||||||
# return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params))
|
|
||||||
|
|
||||||
def compute_loss(self, model, inputs, return_outputs=False):
|
def compute_loss(self, model, inputs, return_outputs=False):
|
||||||
# use one's weighted cross entropy loss calc
|
# use one's weighted cross entropy loss calc
|
||||||
# if self.args.sample_packing:
|
# if self.args.sample_packing:
|
||||||
@@ -322,46 +265,13 @@ class OneCycleLRSchedulerTrainer(AxolotlTrainer):
|
|||||||
return self.lr_scheduler
|
return self.lr_scheduler
|
||||||
|
|
||||||
|
|
||||||
class ReLoRATrainer(AxolotlTrainer):
|
|
||||||
"""
|
|
||||||
Trainer subclass that uses the OneCycleLR scheduler
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
self.lr_scheduler = None
|
|
||||||
|
|
||||||
def create_scheduler(
|
|
||||||
self,
|
|
||||||
num_training_steps: int,
|
|
||||||
optimizer: Optional[torch.optim.Optimizer] = None,
|
|
||||||
):
|
|
||||||
optimizer = self.optimizer if optimizer is None else optimizer
|
|
||||||
lr_scheduler = super().create_scheduler(num_training_steps, optimizer)
|
|
||||||
|
|
||||||
if self.args.relora_steps:
|
|
||||||
warmup_steps = (
|
|
||||||
self.args.relora_warmup_steps if self.args.relora_warmup_steps else 10
|
|
||||||
)
|
|
||||||
self.lr_scheduler = ReLoRAScheduler(
|
|
||||||
optimizer,
|
|
||||||
lr_scheduler,
|
|
||||||
self.args.relora_steps,
|
|
||||||
warmup_steps,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
self.lr_scheduler = lr_scheduler
|
|
||||||
|
|
||||||
return self.lr_scheduler
|
|
||||||
|
|
||||||
|
|
||||||
def add_position_ids(sample):
|
def add_position_ids(sample):
|
||||||
sample["position_ids"] = torch.arange(len(sample["input_ids"]))
|
sample["position_ids"] = torch.arange(len(sample["input_ids"]))
|
||||||
return sample
|
return sample
|
||||||
|
|
||||||
|
|
||||||
def drop_long_seq(sample, sequence_len=2048):
|
def drop_long_seq(sample, sequence_len=2048):
|
||||||
return len(sample["input_ids"]) <= sequence_len and len(sample["input_ids"]) > 0
|
return len(sample["input_ids"]) <= sequence_len
|
||||||
|
|
||||||
|
|
||||||
@contextmanager
|
@contextmanager
|
||||||
@@ -401,16 +311,6 @@ def calculate_total_num_steps(cfg, train_dataset, tokenizer):
|
|||||||
LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`")
|
LOG.info(f"📝 UPDATE CONFIG WITH: `total_num_tokens: {total_num_tokens}`")
|
||||||
cfg.total_num_tokens = total_num_tokens
|
cfg.total_num_tokens = total_num_tokens
|
||||||
|
|
||||||
if not cfg.total_supervised_tokens:
|
|
||||||
total_supervised_tokens = (
|
|
||||||
train_dataset.data.column("labels")
|
|
||||||
.to_pandas()
|
|
||||||
.apply(lambda x: np.sum(np.array(x) != -100))
|
|
||||||
.sum()
|
|
||||||
)
|
|
||||||
LOG.info(f"`total_supervised_tokens: {total_supervised_tokens}`")
|
|
||||||
cfg.total_supervised_tokens = total_supervised_tokens
|
|
||||||
|
|
||||||
if cfg.sample_packing_eff_est:
|
if cfg.sample_packing_eff_est:
|
||||||
total_num_steps = (
|
total_num_steps = (
|
||||||
# match count to len est in dataloader
|
# match count to len est in dataloader
|
||||||
@@ -584,20 +484,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
"steps" if cfg.save_steps else "epoch"
|
"steps" if cfg.save_steps else "epoch"
|
||||||
)
|
)
|
||||||
|
|
||||||
if cfg.do_bench_eval:
|
|
||||||
training_arguments_kwargs["do_bench_eval"] = cfg.do_bench_eval
|
|
||||||
if cfg.bench_dataset:
|
|
||||||
training_arguments_kwargs["bench_dataset"] = cfg.bench_dataset
|
|
||||||
|
|
||||||
# DDP Config
|
|
||||||
if cfg.ddp_timeout:
|
|
||||||
training_arguments_kwargs["ddp_timeout"] = cfg.ddp_timeout
|
|
||||||
# see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
|
|
||||||
if cfg.ddp_bucket_cap_mb:
|
|
||||||
training_arguments_kwargs["ddp_bucket_cap_mb"] = cfg.ddp_bucket_cap_mb
|
|
||||||
if cfg.ddp_broadcast_buffers is not None:
|
|
||||||
training_arguments_kwargs["ddp_broadcast_buffers"] = cfg.ddp_broadcast_buffers
|
|
||||||
|
|
||||||
training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
|
training_args = AxolotlTrainingArguments( # pylint: disable=unexpected-keyword-arg
|
||||||
max_steps=total_num_steps if cfg.max_steps else -1,
|
max_steps=total_num_steps if cfg.max_steps else -1,
|
||||||
max_seq_length=cfg.sequence_len,
|
max_seq_length=cfg.sequence_len,
|
||||||
@@ -631,8 +517,6 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0,
|
weight_decay=cfg.weight_decay if cfg.weight_decay is not None else 0.0,
|
||||||
sample_packing=cfg.sample_packing if cfg.sample_packing else False,
|
sample_packing=cfg.sample_packing if cfg.sample_packing else False,
|
||||||
sample_packing_seq_len_multiplier=cfg.micro_batch_size,
|
sample_packing_seq_len_multiplier=cfg.micro_batch_size,
|
||||||
relora_steps=cfg.relora_steps,
|
|
||||||
relora_warmup_steps=cfg.relora_warmup_steps,
|
|
||||||
**training_arguments_kwargs,
|
**training_arguments_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -642,13 +526,69 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
if Path(cfg.torchdistx_path).exists():
|
if Path(cfg.torchdistx_path).exists():
|
||||||
sys.path.append(cfg.torchdistx_path)
|
sys.path.append(cfg.torchdistx_path)
|
||||||
importlib.import_module("torchdistx")
|
importlib.import_module("torchdistx")
|
||||||
|
if (
|
||||||
|
cfg.optimizer == "adamw_bnb_8bit"
|
||||||
|
and not cfg.gptq
|
||||||
|
and "deepspeed" not in training_arguments_kwargs
|
||||||
|
and not cfg.fsdp
|
||||||
|
):
|
||||||
|
decay_parameters = get_parameter_names(model, [nn.LayerNorm])
|
||||||
|
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||||
|
optimizer_grouped_parameters = [
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
p
|
||||||
|
for n, p in model.named_parameters()
|
||||||
|
if (n in decay_parameters and p.requires_grad)
|
||||||
|
],
|
||||||
|
"weight_decay": training_args.weight_decay,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"params": [
|
||||||
|
p
|
||||||
|
for n, p in model.named_parameters()
|
||||||
|
if (n not in decay_parameters and p.requires_grad)
|
||||||
|
],
|
||||||
|
"weight_decay": 0.0,
|
||||||
|
},
|
||||||
|
]
|
||||||
|
|
||||||
|
optimizer = bnb.optim.Adam8bit(
|
||||||
|
optimizer_grouped_parameters,
|
||||||
|
betas=(training_args.adam_beta1, training_args.adam_beta2),
|
||||||
|
eps=training_args.adam_epsilon,
|
||||||
|
lr=training_args.learning_rate,
|
||||||
|
)
|
||||||
|
|
||||||
|
if cfg.lr_scheduler == "one_cycle":
|
||||||
|
lr_scheduler_kwargs = (
|
||||||
|
cfg.lr_scheduler_kwargs if cfg.lr_scheduler_kwargs else {}
|
||||||
|
)
|
||||||
|
lr_scheduler = OneCycleLR(
|
||||||
|
optimizer,
|
||||||
|
cfg.learning_rate,
|
||||||
|
total_steps=total_num_steps,
|
||||||
|
epochs=cfg.num_epochs,
|
||||||
|
div_factor=cfg.lr_div_factor if cfg.lr_div_factor else 6,
|
||||||
|
**lr_scheduler_kwargs,
|
||||||
|
)
|
||||||
|
elif cfg.lr_scheduler == "log_sweep":
|
||||||
|
lr_scheduler = InterpolatingLogScheduler(
|
||||||
|
optimizer,
|
||||||
|
cfg.warmup_steps,
|
||||||
|
cfg.log_sweep_min_lr if cfg.log_sweep_min_lr else 1e-10,
|
||||||
|
cfg.log_sweep_max_lr if cfg.log_sweep_max_lr else 10,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
lr_scheduler = transformers.get_cosine_schedule_with_warmup(
|
||||||
|
optimizer,
|
||||||
|
training_args.warmup_steps,
|
||||||
|
total_num_steps,
|
||||||
|
)
|
||||||
|
trainer_kwargs["optimizers"] = (optimizer, lr_scheduler)
|
||||||
|
|
||||||
callbacks = []
|
callbacks = []
|
||||||
callbacks.append(GPUStatsCallback(cfg))
|
callbacks.append(GPUStatsCallback(cfg))
|
||||||
|
|
||||||
if cfg.relora_steps:
|
|
||||||
callbacks.append(ReLoRACallback(cfg))
|
|
||||||
|
|
||||||
# TODO on_save callback to sync checkpoints to GCP/AWS in background
|
# TODO on_save callback to sync checkpoints to GCP/AWS in background
|
||||||
if cfg.early_stopping_patience:
|
if cfg.early_stopping_patience:
|
||||||
early_stop_cb = EarlyStoppingCallback(
|
early_stop_cb = EarlyStoppingCallback(
|
||||||
@@ -666,12 +606,10 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
callbacks.append(SaveBetterTransformerModelCallback)
|
callbacks.append(SaveBetterTransformerModelCallback)
|
||||||
|
|
||||||
data_collator_kwargs = {
|
data_collator_kwargs = {
|
||||||
"padding": True, # True/"longest" is the default
|
"padding": True,
|
||||||
}
|
}
|
||||||
if cfg.pad_to_sequence_len:
|
if cfg.collator_pad_to_longest:
|
||||||
data_collator_kwargs["pad_to_multiple_of"] = 64 * math.ceil(
|
data_collator_kwargs["padding"] = "longest"
|
||||||
cfg.sequence_len / 64
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
|
# A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
|
||||||
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
|
# https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
|
||||||
@@ -695,11 +633,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
num_proc=32,
|
num_proc=32,
|
||||||
)
|
)
|
||||||
|
|
||||||
trainer_cls = AxolotlTrainer
|
trainer_cls = (
|
||||||
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora"):
|
OneCycleLRSchedulerTrainer
|
||||||
trainer_cls = OneCycleLRSchedulerTrainer
|
if cfg.lr_scheduler == "one_cycle" and (cfg.fsdp or cfg.adapter == "qlora")
|
||||||
elif cfg.relora_steps:
|
else AxolotlTrainer
|
||||||
trainer_cls = ReLoRATrainer
|
)
|
||||||
trainer = trainer_cls(
|
trainer = trainer_cls(
|
||||||
model=model,
|
model=model,
|
||||||
train_dataset=train_dataset,
|
train_dataset=train_dataset,
|
||||||
@@ -710,16 +648,8 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
|||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
**data_collator_kwargs,
|
**data_collator_kwargs,
|
||||||
),
|
),
|
||||||
bench_data_collator=transformers.DataCollatorForSeq2Seq(
|
|
||||||
tokenizer,
|
|
||||||
return_tensors="pt",
|
|
||||||
**data_collator_kwargs,
|
|
||||||
),
|
|
||||||
callbacks=callbacks,
|
callbacks=callbacks,
|
||||||
**trainer_kwargs,
|
**trainer_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
if cfg.do_bench_eval:
|
|
||||||
trainer.add_callback(bench_eval_callback_factory(trainer, tokenizer))
|
|
||||||
|
|
||||||
return trainer
|
return trainer
|
||||||
|
|||||||
Reference in New Issue
Block a user