Compare commits

1 commit: optimizer-... → feat/pref_...

Commit SHA: 8428b3f2c7

.gitignore (vendored, 1 change)
@@ -1,7 +1,6 @@
**/axolotl.egg-info
configs
last_run_prepared/
outputs
.vscode
_site/
@@ -5,6 +5,6 @@ python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ /workspace/axolotl/tests/
# pytest -v --durations=10 -n8 --dist loadfile /workspace/axolotl/tests/patched/
pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/
pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
pytest -v --durations=10 -n1 --dist loadfile /workspace/axolotl/tests/e2e/patched/
pytest -v --durations=10 -n1 --dist loadfile /workspace/axolotl/tests/e2e/integrations/
pytest -v --durations=10 --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ /workspace/axolotl/tests/e2e/
@@ -1,27 +0,0 @@
{
  "zero_optimization": {
    "stage": 1,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "compile": {
    "disable": false,
    "backend": "inductor"
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}
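The hunk above deletes a ZeRO-1 DeepSpeed JSON; the file's name is not visible in this capture. For reference, such a file is normally wired into a run through the training YAML's `deepspeed` key. A minimal sketch, assuming a hypothetical path:

```yaml
# hypothetical path; the deleted file's real name is not shown above
deepspeed: deepspeed_configs/zero1.json
```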
@@ -127,40 +127,34 @@ datasets:
# - tokenizer_default_fallback_*: where * is the name of the chat template to fall back to if the tokenizer does not have a chat template, else default to the tokenizer's. E.g. tokenizer_default_fallback_chatml.
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
chat_template: tokenizer_default

# Custom jinja chat template. Used only if `chat_template: jinja` or empty.
# Custom jinja template for chat template. This will only be used if `chat_template` is set to `jinja` or empty (in which case chat_template is automatically set to `jinja`).
chat_template_jinja:

# Key containing the messages (default: "messages")
# The key in the data example that contains the messages. Default is "messages".
field_messages: messages
# Key for role in each message (default: "role")
# The key in the message turn that contains the role. Default is "role".
message_field_role: role
# Key for content in each message (default: "content")
# The key in the message turn that contains the content. Default is "content".
message_field_content: content

# Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
# Optional[Dict[str, List]]. Roles mapping for the messages.
roles:
  user: ["human", "user"]
  assistant: ["gpt", "assistant"]
  assistant: ["gpt", "assistant", "ai"]
  system: ["system"]
  tool: ["tool"]

# IMPORTANT: The following fields determine which parts of the conversation to train on.
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
# See examples at `docs/dataset-formats/conversation.qmd`
# Note: If the below 4 fields are empty, defaults to training only on the last message.
## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only the last message is trained on.

# Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
roles_to_train: ["assistant"] # default
roles_to_train: ["gpt", "assistant"]
# Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
# - all: train on all EOS tokens
# - turn (default): train on the EOS token at the end of each trainable turn
# - turn: train on the EOS token at the end of each trainable turn
# - last: train on the last EOS token in the conversation
train_on_eos: last
# The key in the message turn that indicates via a boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
message_field_training: training
# The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
# The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
# See example at `docs/dataset-formats/conversation.qmd`
message_field_training_detail: train_detail
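To make the fields above concrete, here is a minimal sketch of a `chat_template` dataset entry; the dataset path is a placeholder (not from this diff) and the values simply restate the documented defaults:

```yaml
chat_template: tokenizer_default
datasets:
  - path: ./data/conversations.jsonl  # placeholder path
    type: chat_template
    field_messages: messages
    message_field_role: role
    message_field_content: content
    roles_to_train: ["assistant"]
    train_on_eos: turn
```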
@@ -245,9 +239,6 @@ sample_packing_group_size: 100000
# The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples.
sample_packing_bin_size: 200

# Use batch flattening for speedups when not using sample_packing
batch_flattening:

# Passed through to transformers when loading the model when launched without accelerate
# Use `sequential` when training w/ model parallelism to limit memory
device_map:
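For orientation, a hedged sketch of how these packing knobs combine in a config; the values are illustrative, and `sample_packing: true` is assumed to be the usual switch that enables packing:

```yaml
sample_packing: true
sample_packing_group_size: 100000  # samples grouped per packing pass
sample_packing_bin_size: 200       # max samples packed into one sequence
# alternatively, when not packing:
# batch_flattening: true
```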
@@ -340,8 +331,7 @@ comet_experiment_config: # Dictionary for additional configuration settings, see
output_dir: ./completed-model

# Whether to use torch.compile and which backend to use
# setting to `auto` will enable torch compile when torch>=2.5.1
torch_compile: # Optional[Union[Literal["auto"], bool]]
torch_compile: # bool
torch_compile_backend: # Optional[str]
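The hunk above trades the `auto`-aware annotation for a plain boolean. A sketch of both spellings, assuming the semantics documented in the removed comment:

```yaml
torch_compile: true             # plain boolean form
torch_compile_backend: inductor # optional backend string
# old-side form: torch_compile: auto  (enables compile when torch>=2.5.1)
```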

# Training hyperparameters
@@ -373,10 +363,6 @@ eval_table_size: # Approximate number of predictions sent to wandb depending on
eval_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
eval_causal_lm_metrics: # HF evaluate metrics used during evaluation. Default is ["sacrebleu", "comet", "ter", "chrf", "perplexity"]

profiler_steps: # enable the pytorch profiler to capture the first N steps of training to the output_dir.
# see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information
# snapshots can be visualized @ https://pytorch.org/memory_viz

loss_watchdog_threshold: # High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)
loss_watchdog_patience: # Number of high-loss steps in a row before the trainer aborts (default: 3)
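A one-line illustration of the profiler option described above; the step count is arbitrary:

```yaml
profiler_steps: 5  # capture the first 5 training steps to output_dir
```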
@@ -68,8 +68,6 @@ We recommend checking the below examples for other usecases.
datasets:
  - path: ...
    type: chat_template
    roles_to_train:
    train_on_eos:
```

2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

@@ -79,7 +77,7 @@ chat_template: gemma # this overwrites the tokenizer's chat_template
datasets:
  - path: ...
    type: chat_template
    roles_to_train: ["assistant"] # default value
    roles_to_train: ["assistant"]
```

3. Using the tokenizer_config.json's chat template, or `chatml` as a fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.

@@ -89,6 +87,7 @@ chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer
datasets:
  - path: ...
    type: chat_template
    roles_to_train: ["assistant"]
```

4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.

@@ -100,6 +99,7 @@ chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message
datasets:
  - path: ...
    type: chat_template
    roles_to_train: ["assistant"]
```

5. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
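Example 5's body is not captured in this compare view; a hedged sketch of what the fine-grained fields documented earlier look like in use (the message record and offsets are invented for illustration):

```yaml
datasets:
  - path: ...
    type: chat_template
    message_field_training: training
    message_field_training_detail: train_detail
# illustrative message record using the detail field:
# {"role": "assistant", "content": "Paris.",
#  "train_detail": [{"begin_offset": 0, "end_offset": 5, "train": true}]}
```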
@@ -1,10 +1,6 @@
base_model: cerebras/btlm-3b-8k-base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: GPT2Tokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true
tokenizer_use_fast: true
tokenizer_legacy: true

@@ -1,7 +1,4 @@
base_model: cerebras/Cerebras-GPT-1.3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
strict: false

@@ -1,9 +1,6 @@
base_model: codellama/CodeLlama-13b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: codellama/CodeLlama-13b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: codellama/CodeLlama-34b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: codellama/CodeLlama-34b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: codellama/CodeLlama-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: codellama/CodeLlama-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,7 +1,4 @@
base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,7 +1,4 @@
base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: true

@@ -1,7 +1,4 @@
base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,6 +1,4 @@
base_model: deepseek-ai/DeepSeek-V2-Lite
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true

load_in_8bit: false
@@ -1,7 +1,4 @@
base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,12 +1,7 @@
base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true

load_in_8bit: true
load_in_4bit: false

@@ -1,15 +1,10 @@
# 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true

model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
# enable 4bit for QLoRA

@@ -1,12 +1,7 @@
base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true

load_in_8bit: false
load_in_4bit: false

@@ -1,10 +1,7 @@
# use google/gemma-7b if you have access
base_model: mhenrichsen/gemma-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: google/gemma-2-9b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: google/gemma-2-2b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForSequenceClassification
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,7 +1,4 @@
base_model: EleutherAI/gpt-j-6b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
strict: false

@@ -1,7 +1,4 @@
base_model: ai21labs/Jamba-v0.1
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,6 +1,4 @@
base_model: ai21labs/Jamba-v0.1
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true

load_in_8bit: false

@@ -1,8 +1,5 @@
base_model: ai21labs/AI21-Jamba-1.5-Large
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_4bit: true
strict: false
@@ -1,10 +1,6 @@
base_model: huggyllama/llama-7b
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
datasets:
  - path: openaccess-ai-collective/jeopardy

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,13 +1,8 @@
base_model: TheBloke/Llama-2-7B-GPTQ
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

gptq: true
gptq_disable_exllama: true

model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
tokenizer_use_fast: true
tokenizer_legacy: true
load_in_8bit: false

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
@@ -1,9 +1,5 @@
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
# optionally might have model_type or tokenizer_type or processor_type
processor_type: AutoProcessor
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

strict: false

# these 3 lines are needed for now to handle vision chat templates w images

@@ -1,6 +1,4 @@
base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin

@@ -1,6 +1,4 @@
base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: meta-llama/Meta-Llama-3-8B-Instruct
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: NousResearch/Meta-Llama-3-8B-Instruct
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,6 +1,4 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: NousResearch/Meta-Llama-3-8B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,6 +1,4 @@
base_model: meta-llama/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,6 +1,4 @@
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,8 +1,5 @@
base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_4bit: true
strict: false
@@ -1,9 +1,6 @@
base_model: casperhansen/llama-3-70b-fp16
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: NousResearch/Meta-Llama-3-8B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,10 +1,7 @@
base_model: state-spaces/mamba-2.8b
# optionally might have model_type or tokenizer_type or tokenizer_config
model_type: MambaLMHeadModel
tokenizer_type: AutoTokenizer
tokenizer_config: EleutherAI/gpt-neox-20b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,10 +1,6 @@
base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -4,11 +4,8 @@
#face problems with the special tokens.

base_model: mistralai/Mistral-7B-Instruct-v0.2
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
@@ -1,10 +1,6 @@
base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,10 +1,6 @@
base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,10 +1,6 @@
base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,10 +1,6 @@
base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,9 +1,6 @@
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,5 @@
base_model: mosaicml/mpt-7b
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true # required for mpt as their model class is not merged into transformers yet
load_in_8bit: false
datasets:

@@ -1,10 +1,6 @@
base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

@@ -1,10 +1,6 @@
base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

@@ -1,10 +1,6 @@
base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
strict: false
@@ -1,9 +1,6 @@
base_model: microsoft/Phi-3.5-mini-instruct
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: microsoft/phi-1_5
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: microsoft/phi-1_5
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: microsoft/phi-2
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: microsoft/Phi-3-mini-4k-instruct
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,11 +1,7 @@
base_model: microsoft/Phi-3-mini-4k-instruct
# optionally might have model_type or tokenizer_type
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

chat_template: phi_3

load_in_8bit: false

@@ -1,11 +1,7 @@
base_model: EleutherAI/pythia-12b-deduped
base_model_ignore_patterns: pytorch* # prefer safetensors
# optionally might have model_type or tokenizer_type
model_type: GPTNeoXForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
gptq: false

@@ -1,7 +1,4 @@
base_model: EleutherAI/pythia-1.4b-deduped
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
datasets:
  - path: teknium/GPT4-LLM-Cleaned

@@ -1,9 +1,6 @@
base_model: Qwen/Qwen-7B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

@@ -1,9 +1,6 @@
base_model: Qwen/Qwen-7B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true
@@ -1,7 +1,4 @@
base_model: Qwen/Qwen1.5-MoE-A2.7B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,7 +1,4 @@
base_model: Qwen/Qwen1.5-MoE-A2.7B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,6 +1,4 @@
base_model: Qwen/Qwen2.5-0.5B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

strict: false

@@ -1,7 +1,4 @@
base_model: Qwen/Qwen2-7B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,10 +1,6 @@
base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
# optionally might have model_type or tokenizer_type
model_type: GPTNeoXForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code:
load_in_8bit: false
datasets:

@@ -1,7 +1,4 @@
base_model: replit/replit-code-v1-3b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true
load_in_8bit: false
datasets:

@@ -1,10 +1,6 @@
base_model: stabilityai/stablelm-2-1_6b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false

@@ -1,10 +1,6 @@
base_model: stabilityai/stablelm-2-1_6b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: true

@@ -1,6 +1,4 @@
base_model: bigcode/starcoder2-3b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: TinyLlama/TinyLlama_v1.1
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,8 +1,5 @@
base_model: TinyLlama/TinyLlama_v1.1
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

@@ -1,9 +1,7 @@
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
# optionally might have model_type or tokenizer_type

model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false

@@ -1,9 +1,6 @@
base_model: TinyLlama/TinyLlama_v1.1
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

@@ -1,14 +1,9 @@
# An example finetuning Saleforce's XGen-7b model with 8k context using qlora
# on Tim Dettmer's Guanaco dataset.
base_model: Salesforce/xgen-7b-8k-base
# optionally might have model_type or tokenizer_type
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true

@@ -1,9 +1,6 @@
base_model: 01-ai/Yi-34B-Chat
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
@@ -7,31 +7,26 @@ mamba-ssm==1.2.0.post1
flash-attn==2.7.0.post2
xformers>=0.0.23.post1
autoawq==0.2.7.post3
liger-kernel==0.5.2
liger-kernel==0.4.2
# END section

packaging==23.2

peft==0.14.0
transformers==4.47.1
transformers>=4.46.3
tokenizers>=0.20.1
accelerate==1.2.1
accelerate==1.2.0
datasets==3.1.0
deepspeed==0.16.1
trl==0.12.1

optimum==1.16.2
hf_transfer
sentencepiece
gradio==3.50.2

pydantic==2.6.3
addict
fire
PyYAML>=6.0
requests
sentencepiece
wandb
einops
optimum==1.16.2
hf_transfer
colorama
numba
numpy>=1.24.4,<=2.0.1

@@ -41,6 +36,7 @@ scipy
scikit-learn==1.4.2
nvidia-ml-py==12.560.30
art
gradio==3.50.2
tensorboard
python-dotenv==1.0.1

@@ -49,6 +45,7 @@ s3fs>=2024.5.0
gcsfs>=2024.5.0
# adlfs

trl==0.12.1
zstandard==0.22.0
fastcore

@@ -58,7 +55,5 @@ langdetect==1.0.9
immutabledict==4.2.0
antlr4-python3-runtime==4.13.2

torchao==0.7.0
torchao==0.5.0
schedulefree==1.3.0

axolotl-contribs-lgpl==0.0.2
@@ -32,5 +32,5 @@ else:
    raise RuntimeError(f"Torch = {v} too new!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(
    f'pip install unsloth-zoo==2024.12.1 && pip install --no-deps "unsloth[{x}]==2024.12.4"'
    f'pip install unsloth-zoo==2024.11.7 && pip install --no-deps "unsloth[{x}]==2024.11.9"'
)
@@ -1,7 +1,3 @@
"""Axolotl - Train and fine-tune large language models"""

import pkgutil

__path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package

__version__ = "0.6.0"
@@ -1,52 +0,0 @@
"""
CLI to run training on a model
"""
import logging
from pathlib import Path
from typing import Union

import fire
from dotenv import load_dotenv
from transformers.hf_argparser import HfArgumentParser

from axolotl.cli import (
    check_accelerate_default_config,
    check_user_token,
    load_cfg,
    load_datasets,
    load_rl_datasets,
    print_axolotl_text_art,
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.evaluate import evaluate

LOG = logging.getLogger("axolotl.cli.evaluate")


def do_evaluate(cfg, cli_args) -> None:
    # pylint: disable=duplicate-code
    print_axolotl_text_art()
    check_accelerate_default_config()
    check_user_token()

    if cfg.rl:  # and cfg.rl != "orpo":
        dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    evaluate(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)


def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
    # pylint: disable=duplicate-code
    parsed_cfg = load_cfg(config, **kwargs)
    parser = HfArgumentParser(TrainerCliArgs)
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )
    do_evaluate(parsed_cfg, parsed_cli_args)


if __name__ == "__main__":
    load_dotenv()
    fire.Fire(do_cli)
@@ -12,8 +12,7 @@ from axolotl.cli.utils import (
    build_command,
    fetch_from_github,
)
from axolotl.common.cli import EvaluateCliArgs, PreprocessCliArgs, TrainerCliArgs
from axolotl.utils import set_pytorch_cuda_alloc_conf
from axolotl.common.cli import PreprocessCliArgs, TrainerCliArgs
from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig


@@ -49,9 +48,6 @@ def train(config: str, accelerate: bool, **kwargs):
    """Train or fine-tune a model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    # Enable expandable segments for cuda allocation to improve VRAM usage
    set_pytorch_cuda_alloc_conf()

    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.train"]
        if config:

@@ -69,31 +65,6 @@ def train(config: str, accelerate: bool, **kwargs):
@click.option(
    "--accelerate/--no-accelerate",
    default=True,
    help="Use accelerate launch for multi-GPU training",
)
@add_options_from_dataclass(EvaluateCliArgs)
@add_options_from_config(AxolotlInputConfig)
def evaluate(config: str, accelerate: bool, **kwargs):
    """Evaluate a model."""
    kwargs = {k: v for k, v in kwargs.items() if v is not None}

    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.evaluate"]
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.evaluate import do_cli

        do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--accelerate/--no-accelerate",
    default=False,
    help="Use accelerate launch for multi-GPU inference",
)
@click.option(

@@ -124,7 +95,7 @@ def inference(
    if lora_model_dir:
        kwargs["lora_model_dir"] = lora_model_dir
    if base_model:
        kwargs["base_model"] = base_model
        kwargs["output_dir"] = base_model

    if accelerate:
        base_cmd = ["accelerate", "launch", "-m", "axolotl.cli.inference"]
@@ -15,19 +15,6 @@ configure_logging()
LOG = logging.getLogger("axolotl.common.cli")


@dataclass
class PreprocessCliArgs:
    """
    dataclass representing arguments for preprocessing only
    """

    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=1)
    prompter: Optional[str] = field(default=None)
    download: Optional[bool] = field(default=True)


@dataclass
class TrainerCliArgs:
    """

@@ -44,14 +31,16 @@ class TrainerCliArgs:


@dataclass
class EvaluateCliArgs:
class PreprocessCliArgs:
    """
    dataclass representing the various evaluation arguments
    dataclass representing arguments for preprocessing only
    """

    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=0)
    debug_num_examples: int = field(default=1)
    prompter: Optional[str] = field(default=None)
    download: Optional[bool] = field(default=True)


def load_model_and_tokenizer(

@@ -61,9 +50,7 @@ def load_model_and_tokenizer(
):
    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)

    LOG.info("loading model and (optionally) peft_config...")
    inference = getattr(cli_args, "inference", False)
    model, _ = load_model(cfg, tokenizer, inference=inference)
    model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)

    return model, tokenizer
src/axolotl/core/tokenizer_utils.py (new file, 272 lines)
@@ -0,0 +1,272 @@
|
||||
"""
|
||||
helper functions for fixing the embeddings/tokenizer
|
||||
"""
|
||||
|
||||
# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
|
||||
# GNU LESSER GENERAL PUBLIC LICENSE
|
||||
# Version 3, 29 June 2007
|
||||
#
|
||||
# Copyright (C) 2007 Free Software Foundation, Inc. <https://fsf.org/>
|
||||
# Everyone is permitted to copy and distribute verbatim copies
|
||||
# of this license document, but changing it is not allowed.
|
||||
|
||||
import gc
|
||||
import itertools
|
||||
import logging
|
||||
from collections import Counter
|
||||
|
||||
import datasets
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
LOG = logging.getLogger("axolotl.core.tokenizer_utils")
|
||||
|
||||
|
||||
@torch.inference_mode()
|
||||
def fix_untrained_tokens( # pylint: disable=too-many-return-statements
|
||||
model, tokenizer, train_dataset, ignored_tokenizer_names=None, eps=1e-16
|
||||
):
|
||||
"""
|
||||
Llama-3 for eg has untrained vectors in the base model.
|
||||
These include <|eot_id|>, <|start_header_id|>, <|end_header_id|>
|
||||
We reset them to the mean of the rest of the tokens
|
||||
"""
|
||||
# Code licensed under LGPL
|
||||
embedding_matrix = model.get_input_embeddings().weight
|
||||
lm_head_matrix = model.get_output_embeddings().weight
|
||||
chat_template = getattr(tokenizer, "chat_template", None)
|
||||
tokenizer = tokenizer.tokenizer if hasattr(tokenizer, "tokenizer") else tokenizer
|
||||
|
||||
# Ignore some model checks for now
|
||||
if not ignored_tokenizer_names:
|
||||
ignored_tokenizer_names = []
|
||||
if (
|
||||
model.config._name_or_path # pylint: disable=protected-access
|
||||
in ignored_tokenizer_names
|
||||
):
|
||||
return
|
||||
|
||||
# Sometimes the sizes can be different like in vision models
|
||||
# Ie <image> is in input, but not in output
|
||||
min_size = min(embedding_matrix.shape[1], lm_head_matrix.shape[1])
|
||||
embedding_matrix = embedding_matrix[:, :min_size]
|
||||
lm_head_matrix = lm_head_matrix[:, :min_size]
|
||||
|
||||
# Get untrained tokens
|
||||
indicator_untrained1 = torch.amax(embedding_matrix, axis=1) <= eps
|
||||
# Check lm_head as well
|
||||
|
||||
# Does NOT work for Llama 3.1!!
|
||||
indicator_untrained2 = torch.amax(lm_head_matrix, axis=1) <= eps
|
||||
|
||||
# We instead check for repeated vectors
|
||||
lm_head_where = torch.where(indicator_untrained1)[0]
|
||||
lm_head_bad = lm_head_matrix[lm_head_where]
|
||||
lm_head_bad = lm_head_bad.cpu().float().numpy().round(3)
|
||||
counter = Counter()
|
||||
for row in lm_head_bad:
|
||||
counter[hash(row.data.tobytes())] += 1
|
||||
counter = Counter({k: c for k, c in counter.items() if c >= 2})
|
||||
|
||||
lm_head_where = lm_head_where.cpu().numpy()
|
||||
final_bad_lm_head = []
|
||||
for j, row in enumerate(lm_head_bad):
|
||||
if hash(row.data.tobytes()) in counter:
|
||||
final_bad_lm_head.append(lm_head_where[j])
|
||||
indicator_untrained2 = indicator_untrained2 | torch.zeros_like(indicator_untrained2)
|
||||
indicator_untrained2[final_bad_lm_head] = True
|
||||
|
||||
# Combine both checks
|
||||
indicator_untrained = indicator_untrained1 & indicator_untrained2
|
||||
|
||||
# Remove pad token possibility
|
||||
if hasattr(tokenizer, "pad_token_id"):
|
||||
pad_token_id = tokenizer.pad_token_id
|
||||
if pad_token_id is not None and pad_token_id < indicator_untrained.shape[0]:
|
||||
indicator_untrained[pad_token_id] = False
|
||||
|
||||
where_untrained = torch.where(indicator_untrained)[0]
|
||||
n_untrained = where_untrained.shape[0]
|
||||
n_trained = embedding_matrix.shape[0] - n_untrained
|
||||
|
||||
# Get set and actual tokens
|
||||
where_untrained = where_untrained.tolist()
|
||||
if len(where_untrained) == 0:
|
||||
return
|
||||
|
||||
# Remove untrained indices where it's longer
|
||||
where_untrained_set = frozenset(where_untrained)
|
||||
actual_bad_tokens = tokenizer.convert_ids_to_tokens(where_untrained)
|
||||
# Remove None items in actual_bad_tokens
|
||||
actual_bad_tokens = [x for x in actual_bad_tokens if x is not None]
|
||||
|
||||
# Check if tokenizer and training datasets have bad tokens
|
||||
if_bad_first = False
|
||||
if_bad_second = False
|
||||
# Check tokenizer's chat template for any untrained tokens
|
||||
if chat_template is not None:
|
||||
if_bad_first = any(x in chat_template for x in actual_bad_tokens)
|
||||
|
||||
if isinstance(train_dataset, datasets.IterableDataset):
|
||||
# Skip the check, since the code below assumes
|
||||
    # an indexable dataset
    return

    # Check the first 250, last 250 input_ids
    size_dataset = len(train_dataset)
    size = min(size_dataset, 250)
    for j in range(size):
        input_ids = train_dataset[j]
        if "input_ids" in input_ids:
            input_ids = input_ids["input_ids"]
            if_bad = any(item in where_untrained_set for item in input_ids)
            if if_bad:
                if_bad_second = True
                break

    # Check the last 250
    if not if_bad_second:
        left = max(size_dataset - 250, 0)
        for j in range(left, size_dataset):
            input_ids = train_dataset[j]
            if "input_ids" in input_ids:
                input_ids = input_ids["input_ids"]
                if_bad = any(item in where_untrained_set for item in input_ids)
                if if_bad:
                    if_bad_second = True
                    break

    # Check whether any bad tokens exist
    if not if_bad_first and not if_bad_second:
        return

    # Check whether lm_head / embed_tokens are trainable
    bad_not_trainable = False
    if not embedding_matrix.requires_grad:
        bad_not_trainable = True
    if not lm_head_matrix.requires_grad:
        bad_not_trainable = True

    if bad_not_trainable:  # pylint: disable=too-many-nested-blocks
        final_bad_items = []

        # Re-check the first 250, last 250 input_ids
        size_dataset = len(train_dataset)
        size = min(size_dataset, 250)
        for j in range(size):
            input_ids = train_dataset[j]
            if "input_ids" in input_ids:
                input_ids = input_ids["input_ids"]
                for item in input_ids:
                    if item in where_untrained_set:
                        final_bad_items.append(item)

        # Re-check the last 250
        left = max(size_dataset - 250, 0)
        for j in range(left, size_dataset):
            input_ids = train_dataset[j]
            if "input_ids" in input_ids:
                input_ids = input_ids["input_ids"]
                for item in input_ids:
                    if item in where_untrained_set:
                        final_bad_items.append(item)

        # If no bad tokens were found, the chat template itself may be the
        # issue, so widen the search window
        if len(final_bad_items) == 0:
            # Re-check the first 2000 and last 2000 items
            size_dataset = len(train_dataset)
            size = min(size_dataset, 2000)
            for j in range(size):
                input_ids = train_dataset[j]
                if "input_ids" in input_ids:
                    input_ids = input_ids["input_ids"]
                    for item in input_ids:
                        if item in where_untrained_set:
                            final_bad_items.append(item)

            # Re-check the last 2000
            left = max(size_dataset - 2000, 0)
            for j in range(left, size_dataset):
                input_ids = train_dataset[j]
                if "input_ids" in input_ids:
                    input_ids = input_ids["input_ids"]
                    for item in input_ids:
                        if item in where_untrained_set:
                            final_bad_items.append(item)

            # Most likely a false signal
            if len(final_bad_items) == 0:
                return

        raise ValueError(
            f"Untrained tokens {list(set(final_bad_items))} found, but embed_tokens & "
            "lm_head are not trainable, causing NaNs."
        )

    # Count occurrences of every possible bad token
    final_counts = np.zeros(
        max(len(tokenizer), embedding_matrix.shape[0]), dtype=np.int64
    )

    def mapping(examples):
        input_ids = examples["input_ids"]
        counter = np.fromiter(itertools.chain.from_iterable(input_ids), dtype=np.int32)
        np.add.at(final_counts, counter, 1)

    train_dataset.map(mapping, batched=True, desc="Counting untrained tokens")

    # Get counts for untrained tokens
    counts_untrained = final_counts[where_untrained]
    # Identify untrained tokens seen in train_dataset
    indices_seen_in_train = np.where(counts_untrained > 0)[0]
    tokens_to_update = [where_untrained[i] for i in indices_seen_in_train]

    if len(tokens_to_update) == 0:
        LOG.info(
            "No untrained tokens found in train_dataset. No embeddings were modified."
        )
        return

    # Log the token IDs that are being rescaled
    LOG.info(
        f"Rescaling embeddings for tokens seen in train_dataset: {tokens_to_update}"
    )

    # Sum all rows of the embedding and lm_head matrices
    sum_embedding = torch.sum(embedding_matrix, dtype=torch.float32, axis=0)
    sum_lm_head = torch.sum(lm_head_matrix, dtype=torch.float32, axis=0)

    # Remove the contribution of the untrained rows
    sum_embedding -= torch.sum(
        embedding_matrix[where_untrained], dtype=torch.float32, axis=0
    )
    sum_lm_head -= torch.sum(
        lm_head_matrix[where_untrained], dtype=torch.float32, axis=0
    )

    # Find the correct average by dividing by the number of trained tokens
    mean_embedding = sum_embedding / n_trained
    mean_lm_head = sum_lm_head / n_trained

    # Compute scaling for tokens to update, proportional to how often each appears
    scaling = counts_untrained[indices_seen_in_train] / max(final_counts.max(), 1)
    scaling = torch.tensor(scaling, device=mean_embedding.device).unsqueeze(1)

    # Prepare mean embeddings for tokens to update
    mean_embedding_repeated = (
        mean_embedding.unsqueeze(0).repeat(len(tokens_to_update), 1) * scaling
    )
    mean_lm_head_repeated = (
        mean_lm_head.unsqueeze(0).repeat(len(tokens_to_update), 1) * scaling
    )

    # Update embeddings only for tokens seen in train_dataset
    embedding_matrix[tokens_to_update] = mean_embedding_repeated.to(
        embedding_matrix.dtype
    )
    lm_head_matrix[tokens_to_update] = mean_lm_head_repeated.to(lm_head_matrix.dtype)

    # Clean up
    for _ in range(3):
        gc.collect()
        torch.cuda.empty_cache()
    return
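The rescaling logic above can be sanity-checked in isolation. Below is a minimal sketch of the same mean-embedding update on toy tensors; all names, shapes, and values are illustrative assumptions, not part of this diff:

    import torch

    vocab_size, hidden = 8, 4
    embedding_matrix = torch.randn(vocab_size, hidden)
    where_untrained = [2, 5]           # token ids flagged as untrained (illustrative)
    counts = torch.tensor([3.0, 1.0])  # how often each appears in the dataset
    n_trained = vocab_size - len(where_untrained)

    # Mean over trained rows only: total sum minus the untrained rows
    mean_embedding = (
        embedding_matrix.sum(dim=0) - embedding_matrix[where_untrained].sum(dim=0)
    ) / n_trained

    # Scale each replacement row by the token's relative frequency
    scaling = (counts / counts.max()).unsqueeze(1)
    embedding_matrix[where_untrained] = (
        mean_embedding.unsqueeze(0).repeat(len(where_untrained), 1) * scaling
    )

Each untrained row ends up as a frequency-weighted copy of the average trained embedding, which is what the function above does to both embed_tokens and lm_head.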
@@ -14,21 +14,25 @@ import os
import sys
from abc import abstractmethod
from collections import defaultdict
from contextlib import nullcontext
from dataclasses import dataclass, field
from functools import wraps
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Type, Union

import torch
import torch.nn.functional as F
import transformers
from datasets import Dataset
from liger_kernel.chunked_loss.fused_linear_preference import (
    LigerFusedLinearPreferenceBase,
)
from packaging import version
from peft.optimizers import create_loraplus_optimizer
from torch import nn
from torch import amp, nn
from torch.optim.lr_scheduler import OneCycleLR
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
from transformers import (
    DataCollatorWithFlattening,
    EarlyStoppingCallback,
    Trainer,
    TrainerCallback,
@@ -56,7 +60,6 @@ from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
from axolotl.utils import is_comet_available, is_mlflow_available
from axolotl.utils.callbacks import (
    EvalFirstStepCallback,
    GCCallback,
    GPUStatsCallback,
    LossWatchDogCallback,
    SaveAxolotlConfigtoWandBCallback,
@@ -67,8 +70,7 @@ from axolotl.utils.callbacks import (
    log_prediction_callback_factory,
)
from axolotl.utils.callbacks.lisa import lisa_callback_factory
from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.chat_templates import get_chat_template
from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
@@ -424,11 +426,6 @@ class SchedulerMixin(Trainer):

        return self.lr_scheduler

    def _load_optimizer_and_scheduler(self, checkpoint):
        # Fall back to the configured optimizer checkpoint when no resume
        # checkpoint was passed in
        if not checkpoint and self.args.optimizer_checkpoint is not None:
            checkpoint = self.args.optimizer_checkpoint
        return super()._load_optimizer_and_scheduler(checkpoint)


class AxolotlTrainer(SchedulerMixin, Trainer):
    """
@@ -1085,6 +1082,15 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
        self.dataset_tags = dataset_tags
        self.optimizer = None

        from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss

        self.liger_loss = LigerFusedLinearDPOLoss(
            ignore_index=self.label_pad_token_id,
            beta=self.beta,
            compute_nll_loss=True,  # not gated on rpo_alpha, i.e. hasattr(self.args, "rpo_alpha") and self.args.rpo_alpha is not None
            use_ref_model=not self.reference_free,
        )

    def create_optimizer(self):
        if self.args.loraplus_lr_ratio is None:
            return super().create_optimizer()
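For reference, the fused DPO loss can be exercised outside the trainer. A minimal sketch using the constructor arguments and call signature that appear in this diff; the shapes and values are illustrative assumptions (chosen and rejected sequences are concatenated along the batch dimension):

    import torch
    from liger_kernel.chunked_loss import LigerFusedLinearDPOLoss

    B, T, H, V = 2, 16, 64, 128  # per-side batch, seq len, hidden size, vocab
    loss_fn = LigerFusedLinearDPOLoss(ignore_index=-100, beta=0.1, use_ref_model=True)

    hidden_states = torch.randn(2 * B, T, H, requires_grad=True)  # chosen + rejected
    labels = torch.randint(0, V, (2 * B, T))
    lm_head_weight = torch.randn(V, H, requires_grad=True)

    loss, return_vars = loss_fn(
        lin_weight=lm_head_weight,
        _input=hidden_states,
        target=labels,
        bias=None,
        ref_input=torch.randn(2 * B, T, H),  # frozen reference activations
        ref_weight=torch.randn(V, H),
        ref_bias=None,
    )
    loss.backward()
    # return_vars unpacks to (chosen_logps, rejected_logps, chosen_logits_mean,
    # rejected_logits_mean, nll_loss), as consumed in get_batch_loss_metrics below.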
@@ -1188,6 +1194,309 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
            # transformers<=4.46
            return super(DPOTrainer, self).log(logs)  # pylint: disable=bad-super-call

    def get_batch_loss_metrics(
        self,
        model,
        batch: dict[str, Union[list, torch.LongTensor]],
        train_eval: Literal["train", "eval"] = "train",
    ):
        """Compute the DPO loss and other metrics using the Liger kernel."""
        # return super().get_batch_loss_metrics(model, batch, train_eval)
        if not self.liger_loss:
            raise ValueError("Liger loss not initialized")

        metrics = {}

        model_output = self.concatenated_forward(model, batch)

        # Get the lm_head weights and bias
        lin_weight = model.lm_head.weight
        lin_bias = getattr(model.lm_head, "bias", None)

        hidden_states = model_output["hidden_states"]
        labels = model_output["labels"]

        if not self.reference_free:
            # Adapted from DPO's compute_ref_log_probs
            compute_ref_context_manager = (
                amp.autocast("cuda")
                if self._peft_has_been_casted_to_bf16
                else nullcontext()
            )
            with torch.no_grad(), compute_ref_context_manager:  # type: ignore
                if self.ref_model is None:
                    with self.null_ref_context():
                        ref_model_output = self.concatenated_forward(self.model, batch)
                        ref_weight = self.model.lm_head.weight
                        ref_bias = getattr(self.model.lm_head, "bias", None)

                        ref_hidden_states = ref_model_output["hidden_states"]

                else:
                    ref_model_output = self.concatenated_forward(self.ref_model, batch)
                    ref_weight = self.ref_model.lm_head.weight
                    ref_bias = getattr(self.ref_model.lm_head, "bias", None)

                    ref_hidden_states = ref_model_output["hidden_states"]
                (
                    ref_chosen_logps,
                    ref_rejected_logps,
                    _ref_chosen_logits,
                    _ref_rejected_logits,
                    _ref_chosen_nll_loss,
                ) = LigerFusedLinearPreferenceBase.chunk_forward(
                    input_chunk=ref_hidden_states,
                    weight=ref_weight,
                    target_chunk=labels,
                    bias=ref_bias,
                    # ignore_index=ignore_index,
                    compute_nll_loss=False,
                )

        else:
            ref_hidden_states = None
            ref_weight = None
            ref_bias = None

        # Compute loss using the Liger kernel
        loss, return_vars = self.liger_loss(
            lin_weight=lin_weight,
            _input=hidden_states,
            target=labels,
            bias=lin_bias,  # TODO: check whether to pass bias as FCLE doesn't
            ref_input=ref_hidden_states,
            ref_weight=ref_weight,
            ref_bias=ref_bias,
        )

        (
            policy_chosen_logps,
            policy_rejected_logps,
            policy_chosen_logits_mean,
            policy_rejected_logits_mean,
            policy_nll_loss,
        ) = return_vars

        # Calculate rewards
        if not self.reference_free:
            chosen_rewards = (
                self.beta * (policy_chosen_logps - ref_chosen_logps).detach()
            )
            rejected_rewards = (
                self.beta * (policy_rejected_logps - ref_rejected_logps).detach()
            )

        else:
            chosen_rewards = self.beta * policy_chosen_logps
            rejected_rewards = self.beta * policy_rejected_logps

        reward_accuracies = (chosen_rewards > rejected_rewards).float()

        prefix = "eval_" if train_eval == "eval" else ""
        metrics.update(
            {
                f"{prefix}rewards/chosen": chosen_rewards.mean().cpu(),
                f"{prefix}rewards/rejected": rejected_rewards.mean().cpu(),
                f"{prefix}rewards/accuracies": reward_accuracies.mean().cpu(),
                f"{prefix}rewards/margins": (chosen_rewards - rejected_rewards)
                .mean()
                .cpu(),
                f"{prefix}logps/chosen": policy_chosen_logps.mean().cpu(),
                f"{prefix}logps/rejected": policy_rejected_logps.mean().cpu(),
                f"{prefix}logits/chosen": policy_chosen_logits_mean.cpu(),
                f"{prefix}logits/rejected": policy_rejected_logits_mean.cpu(),
            }
        )

        if hasattr(self.args, "rpo_alpha") and self.args.rpo_alpha is not None:
            metrics[f"{prefix}nll_loss"] = policy_nll_loss.cpu()

        # TODO: Handle use_weighting, aux_loss_enabled as in upstream

        return loss, metrics

    def concatenated_forward(
        self, model: nn.Module, batch: dict[str, Union[list, torch.LongTensor]]
    ):
        """Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.

        We do this to avoid doing two forward passes, because it's faster for FSDP.

        Overridden base function to return the hidden states and labels for the loss calculation.
        """
        num_examples = batch["prompt_input_ids"].shape[0]  # type: ignore

        concatenated_batch = self.concatenated_inputs(
            batch, padding_value=self.padding_value
        )

        model_kwargs = {}
        if self.aux_loss_enabled:
            model_kwargs["output_router_logits"] = True

        # Request hidden states, which the fused loss consumes instead of logits
        model_kwargs["output_hidden_states"] = True

        # Add the pixel values and attention masks for vision models
        if "pixel_values" in concatenated_batch:
            model_kwargs["pixel_values"] = concatenated_batch["pixel_values"]
        if "pixel_attention_mask" in concatenated_batch:
            model_kwargs["pixel_attention_mask"] = concatenated_batch[
                "pixel_attention_mask"
            ]
        if "image_sizes" in concatenated_batch:
            model_kwargs["image_sizes"] = concatenated_batch["image_sizes"]

        prompt_input_ids = concatenated_batch["prompt_input_ids"]
        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
        completion_input_ids = concatenated_batch["completion_input_ids"]
        completion_attention_mask = concatenated_batch["completion_attention_mask"]
        if self.is_encoder_decoder:
            labels = completion_input_ids
            labels[completion_attention_mask == 0] = self.label_pad_token_id
            outputs = model(
                input_ids=prompt_input_ids,
                attention_mask=prompt_attention_mask,
                labels=labels,  # we need the labels for the logits to be returned
                **model_kwargs,
            )
            logits = outputs.logits
            hidden_states = outputs.decoder_hidden_states[-1]
            loss_mask = completion_attention_mask.bool()
        else:
            # Concatenate the prompt and completion inputs
            input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1)
            attention_mask = torch.cat(
                (prompt_attention_mask, completion_attention_mask), dim=1
            )
            # Mask the prompt but not the completion for the loss
            loss_mask = torch.cat(
                (torch.zeros_like(prompt_attention_mask), completion_attention_mask),
                dim=1,
            )

            # Flush left to reduce the memory usage
            # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
            #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
            for i in range(attention_mask.size(0)):
                first_one_idx = torch.nonzero(attention_mask[i])[0].item()
                input_ids[i] = torch.roll(input_ids[i], shifts=-first_one_idx)  # type: ignore
                attention_mask[i] = torch.roll(attention_mask[i], shifts=-first_one_idx)  # type: ignore
                loss_mask[i] = torch.roll(loss_mask[i], shifts=-first_one_idx)  # type: ignore

            # Get the first column idx that is all zeros and remove every column after that
            empty_cols = torch.sum(attention_mask, dim=0) == 0
            first_empty_col = (
                torch.nonzero(empty_cols)[0].item()
                if empty_cols.any()
                else attention_mask.size(1)
            )
            input_ids = input_ids[:, :first_empty_col]  # type: ignore
            attention_mask = attention_mask[:, :first_empty_col]  # type: ignore
            loss_mask = loss_mask[:, :first_empty_col]  # type: ignore

            # Truncate right
            if self.args.max_length is not None:
                input_ids = input_ids[:, : self.args.max_length]
                attention_mask = attention_mask[:, : self.args.max_length]
                loss_mask = loss_mask[:, : self.args.max_length]

            # if self.use_num_logits_to_keep:
            #     # Compute num_logits_to_keep based on loss_mask pattern:
            #     # [[0, 0, 0, x, x, x, x],
            #     #  [0, 0, 0, x, x, x, 0]]
            #     #           ^ start computing logits from here ([:, -(7-3+1):])
            #     first_compute_index = loss_mask.nonzero(as_tuple=True)[1].min()
            #     num_logits_to_keep = loss_mask.shape[1] - first_compute_index
            #     model_kwargs["num_logits_to_keep"] = num_logits_to_keep.item() + 1  # +1 for the first label

            outputs = model(
                input_ids=input_ids, attention_mask=attention_mask, **model_kwargs
            )

            # Offset the logits by one to align with the labels
            logits = outputs.logits[:, :-1, :]
            hidden_states = outputs.hidden_states[-1][:, :-1, :]
            labels = input_ids[:, 1:].clone()
            loss_mask = loss_mask[:, 1:].bool()

            # if self.use_num_logits_to_keep:
            #     # Align labels with logits
            #     # logits:    -,  -, [x2, x3, x4, x5, x6]
            #     #                     ^ --------- ^ after logits[:, :-1, :]
            #     # labels:  [y0, y1, y2, y3, y4, y5, y6]
            #     #                     ^ --------- ^ with num_logits_to_keep=4, [:, -4:]
            #     # loss_mask: [0,  0,  0,  1,  1,  1,  1]
            #     labels = labels[:, -num_logits_to_keep:]
            #     loss_mask = loss_mask[:, -num_logits_to_keep:]
            #     hidden_states = hidden_states[:, -num_logits_to_keep:, :]

        if logits.shape[:2] != labels.shape[:2]:
            # for llava, the returned logits include the image tokens (placed before the text tokens)
            seq_len = labels.shape[1]
            logits = logits[:, -seq_len:]
            hidden_states = hidden_states[:, -seq_len:]

        # Compute the log probabilities of the labels
        labels[
            ~loss_mask
        ] = 0  # dummy token; we'll ignore the losses on these tokens later
        per_token_logps = torch.gather(
            logits.log_softmax(-1), dim=2, index=labels.unsqueeze(2)
        ).squeeze(2)
        per_token_logps[~loss_mask] = 0
        all_logps = per_token_logps.sum(-1)

        output = {}

        if self.use_weighting:
            with torch.no_grad():
                # Eq (2) of the WPO paper: https://huggingface.co/papers/2406.11827
                logprobs = F.log_softmax(logits, dim=-1)
                weights_adjustment_factor = torch.logsumexp(
                    2 * logprobs, dim=-1
                )  # same as sum(probs**2) in log space
                per_token_logps_adjusted = per_token_logps - weights_adjustment_factor
                all_weights = (per_token_logps_adjusted * loss_mask).sum(
                    -1
                ) / loss_mask.sum(-1)
                chosen_weights = all_weights[:num_examples]
                rejected_weights = all_weights[num_examples:]
                output["policy_weights"] = torch.clamp(
                    torch.exp(chosen_weights + rejected_weights), max=1
                )

        if self.args.rpo_alpha is not None:
            # Only use the chosen logits for the RPO loss
            chosen_logits = logits[:num_examples]
            chosen_labels = labels[:num_examples]

            # Compute the log probabilities of the labels
            output["nll_loss"] = F.cross_entropy(
                torch.flatten(chosen_logits, end_dim=1),
                torch.flatten(chosen_labels, end_dim=1),
                ignore_index=0,
            )

        if self.loss_type == "ipo":
            all_logps = all_logps / loss_mask.sum(-1)

        output["chosen_logps"] = all_logps[:num_examples]
        output["rejected_logps"] = all_logps[num_examples:]
        output["mean_chosen_logits"] = logits[:num_examples][
            loss_mask[:num_examples]
        ].mean()
        output["mean_rejected_logits"] = logits[num_examples:][
            loss_mask[num_examples:]
        ].mean()
        output["hidden_states"] = hidden_states
        output["labels"] = labels

        if self.aux_loss_enabled:
            output["aux_loss"] = outputs.aux_loss

        return output


class AxolotlORPOTrainer(SchedulerMixin, ORPOTrainer):
    """
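The flush-left step in concatenated_forward above is easy to verify on a toy batch. A small self-contained sketch of the same roll-and-trim logic (tensor values are illustrative):

    import torch

    attention_mask = torch.tensor([[0, 0, 1, 1, 1, 1],
                                   [0, 1, 1, 1, 0, 0]])
    input_ids = torch.arange(12).reshape(2, 6)

    # Roll each row left so the first attended token sits at column 0
    for i in range(attention_mask.size(0)):
        first_one_idx = torch.nonzero(attention_mask[i])[0].item()
        input_ids[i] = torch.roll(input_ids[i], shifts=-first_one_idx)
        attention_mask[i] = torch.roll(attention_mask[i], shifts=-first_one_idx)

    # Drop trailing columns that are now entirely padding
    empty_cols = attention_mask.sum(dim=0) == 0
    first_empty_col = (
        torch.nonzero(empty_cols)[0].item()
        if empty_cols.any()
        else attention_mask.size(1)
    )
    print(attention_mask[:, :first_empty_col])
    # tensor([[1, 1, 1, 1],
    #         [1, 1, 1, 0]])

This matches the diagram in the code: left-padding is shifted out and the batch is trimmed to the longest remaining row, reducing memory for the forward pass.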
@@ -1371,13 +1680,6 @@ class TrainerBuilderBase(abc.ABC):
                plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
            )

        if self.cfg.profiler_steps:
            callbacks.append(
                PytorchProfilerCallback(
                    steps_to_profile=self.cfg.profiler_steps,
                )
            )

        if self.cfg.use_wandb:
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
@@ -1458,8 +1760,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        if self.cfg.loss_watchdog_threshold is not None:
            callbacks.append(LossWatchDogCallback(self.cfg))

        if self.cfg.gc_steps:
            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))
        callbacks.append(SaveModelCallback())

        return callbacks
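Both callbacks above are gated on config flags. A hypothetical YAML snippet; the key names come from the cfg fields referenced in this diff, while the values and comment semantics are illustrative assumptions:

    profiler_steps: 5   # number of steps to profile via PytorchProfilerCallback
    gc_steps: 100       # GCCallback interval, in training steps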
@@ -1769,10 +2069,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        ] = self.cfg.loraplus_lr_embedding
        training_arguments_kwargs["embedding_lr"] = self.cfg.embedding_lr
        training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale
        if self.cfg.optimizer_checkpoint:
            training_arguments_kwargs[
                "optimizer_checkpoint"
            ] = self.cfg.optimizer_checkpoint

        if self.cfg.lr_scheduler in ["one_cycle", "log_sweep"]:
            training_arguments_kwargs["lr_scheduler_type"] = "cosine"
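The optimizer-checkpoint wiring above maps one-to-one onto a config key. A hypothetical YAML snippet (the key name comes from cfg.optimizer_checkpoint in this diff; the path is purely illustrative):

    # resume optimizer/scheduler state from an earlier run's checkpoint
    optimizer_checkpoint: ./outputs/previous-run/checkpoint-500

Per the SchedulerMixin._load_optimizer_and_scheduler override earlier in this diff, this value is only consulted when no resume checkpoint is otherwise passed in.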
@@ -1843,8 +2139,8 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
        training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
        if self.cfg.chat_template:
            training_arguments_kwargs["chat_template"] = get_chat_template_from_config(
                cfg=self.cfg,
            training_arguments_kwargs["chat_template"] = get_chat_template(
                self.cfg.chat_template,
                tokenizer=self.tokenizer,
            )

@@ -2002,11 +2298,9 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                V2BatchSamplerDataCollatorForSeq2Seq,
                BatchSamplerDataCollatorForSeq2Seq,
                DataCollatorForSeq2Seq,
                DataCollatorWithFlattening,
                RewardDataCollatorWithPadding,
            ]
        ]
        collator_args = [self.tokenizer]
        if self.cfg.reward_model:
            collator = RewardDataCollatorWithPadding
            if "max_length" in kwargs:
@@ -2026,18 +2320,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                collator = MultiModalChatDataCollator
                kwargs["processor"] = self.processor
                kwargs["chat_template"] = training_args.chat_template
            elif self.cfg.batch_flattening:
                collator = DataCollatorWithFlattening
                collator_args.pop(0)
                kwargs.pop("pad_to_multiple_of", None)
                kwargs.pop("padding", None)
            else:
                collator = DataCollatorForSeq2Seq

            kwargs["return_tensors"] = "pt"

            return collator(
                *collator_args,
                self.tokenizer,
                return_tensors="pt",
                **kwargs,
            )

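When batch_flattening selects DataCollatorWithFlattening, the tokenizer is popped from collator_args because that collator packs examples instead of padding them. A minimal sketch of the expected behavior (example values are illustrative, and the exact output keys are an assumption based on the transformers collator):

    from transformers import DataCollatorWithFlattening

    collator = DataCollatorWithFlattening()  # no tokenizer: it packs rather than pads
    batch = collator(
        [
            {"input_ids": [1, 2, 3], "labels": [1, 2, 3]},
            {"input_ids": [4, 5], "labels": [4, 5]},
        ]
    )
    # Examples are concatenated into a single row; position_ids restart at each
    # example boundary so the attention kernel can recover document boundaries.
    print(batch["input_ids"].shape)           # torch.Size([1, 5])
    print(batch["position_ids"][0].tolist())  # [0, 1, 2, 0, 1]

This is also why the pad_to_multiple_of and padding kwargs are dropped for this collator.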
@@ -2192,6 +2480,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
        if self.cfg.dpo_use_weighting is not None:
            training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting

        report_to = []
        if self.cfg.use_wandb:
            report_to.append("wandb")
            if self.cfg.wandb_name:
                training_args_kwargs["run_name"] = self.cfg.wandb_name

        training_args_kwargs["report_to"] = report_to

        training_args = training_args_cls(  # pylint: disable=unexpected-keyword-arg
            output_dir=self.cfg.output_dir,
            per_device_train_batch_size=self.cfg.micro_batch_size,
Some files were not shown because too many files have changed in this diff.