Merge pull request #7 from NanoCode012/cj_tokenizer_default_prompt_template
Feat: merge latest, update docs, fix dropped config bug, add unit test
8  .github/workflows/base.yml (vendored)

@@ -28,7 +28,13 @@ jobs:
             cuda_version: 12.4.1
             cudnn_version: ""
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
+          - cuda: "124"
+            cuda_version: 12.4.1
+            cudnn_version: ""
+            python_version: "3.11"
+            pytorch: 2.4.1
+            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
     steps:
       - name: Checkout
4  .github/workflows/main.yml (vendored)

@@ -27,7 +27,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -84,7 +84,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
4  .github/workflows/nightlies.yml (vendored)

@@ -26,7 +26,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
@@ -83,7 +83,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
4  .github/workflows/tests-nightly.yml (vendored)

@@ -25,7 +25,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.0"]
+        pytorch_version: ["2.3.1", "2.4.1"]
     timeout-minutes: 20

     steps:
@@ -91,7 +91,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             num_gpus: 1
             axolotl_extras:
             nightly_build: "true"
4  .github/workflows/tests.yml (vendored)

@@ -36,7 +36,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.10", "3.11"]
-        pytorch_version: ["2.3.1", "2.4.0"]
+        pytorch_version: ["2.3.1", "2.4.1"]
     timeout-minutes: 20

     steps:
@@ -94,7 +94,7 @@ jobs:
           - cuda: 124
             cuda_version: 12.4.1
             python_version: "3.11"
-            pytorch: 2.4.0
+            pytorch: 2.4.1
             num_gpus: 1
             axolotl_extras:
     steps:
@@ -1,3 +1,3 @@
 [settings]
 profile=black
-known_third_party=wandb
+known_third_party=wandb,comet_ml
18  README.md

@@ -14,7 +14,7 @@ Features:
 - Integrated with xformer, flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
 - Works with single GPU or multiple GPUs via FSDP or Deepspeed
 - Easily run with Docker locally or on the cloud
-- Log results and optionally checkpoints to wandb or mlflow
+- Log results and optionally checkpoints to wandb, mlflow or Comet
 - And more!

 <a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
@@ -515,6 +515,22 @@ wandb_name:
wandb_log_model:
```

##### Comet Logging

Make sure your `COMET_API_KEY` environment variable is set (recommended) or log in to Comet with `comet login`.

- comet options
```yaml
use_comet:
comet_api_key:
comet_workspace:
comet_project_name:
comet_experiment_key:
comet_mode:
comet_online:
comet_experiment_config:
```

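For example, a minimal Comet setup might look like the following (the project and workspace names are placeholders):

```yaml
use_comet: true
comet_project_name: my-axolotl-project   # placeholder
comet_workspace: my-workspace            # placeholder
comet_mode: get_or_create
comet_online: true
```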
##### Special Tokens

It is important to have special tokens like delimiters, end-of-sequence, beginning-of-sequence in your tokenizer's vocabulary. This will help you avoid tokenization issues and help your model train better. You can do this in axolotl like this:
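For example, a typical `special_tokens` block looks like this (the exact token strings depend on your base model's tokenizer; the values below are illustrative):

```yaml
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
```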
@@ -83,7 +83,7 @@ lora_on_cpu: true
datasets:
  # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
  - path: vicgalle/alpaca-gpt4
    # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
    type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
    ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
    data_files: # Optional[str] path to source data files
@@ -123,6 +123,48 @@ datasets:
    # For `completion` datasets only, uses the provided field instead of `text` column
    field:

  # Using chat template
  - path: ...
    # Set type to `chat_template` to use this strategy
    type: chat_template
    # Specify the name of the chat template to use
    # The name of the chat template to use for training, following values are supported:
    # - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default.
    # - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
    # - tokenizer_default_fallback_*: where * is the name of the chat template to fall back to when the tokenizer does not have one; otherwise the tokenizer's own template is used. E.g. tokenizer_default_fallback_chatml.
    # - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
    chat_template: tokenizer_default
    # Custom jinja template for the chat template. This will only be used if `chat_template` is set to `jinja` or empty (in which case chat_template is automatically set to `jinja`).
    chat_template_jinja:
    # The key in the data example that contains the messages. Default is "messages".
    field_messages: messages
    # The key in the message turn that contains the role. Default is "role".
    message_field_role: role
    # The key in the message turn that contains the content. Default is "content".
    message_field_content: content
    # Optional[Dict[str, List]]. Roles mapping for the messages.
    roles:
      user: ["human", "user"]
      assistant: ["gpt", "assistant", "ai"]
      system: ["system"]

    ## NOTE: Leaving the below empty will default to using the simple legacy tokenization strategy where only the last message is trained on.

    # Optional[List[str]]. Roles to train on. The tokens from these roles will be considered for the loss.
    roles_to_train: ["gpt", "assistant"]
    # Optional[str]. Which EOS tokens to train on in the conversation. Possible values are:
    # - all: train on all EOS tokens
    # - turn: train on the EOS token at the end of each trainable turn
    # - last: train on the last EOS token in the conversation
    train_on_eos: last
    # The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`.
    message_field_training: training
    # The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn.
    # The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train).
    # See example at `docs/dataset-formats/conversation.qmd`
    message_field_training_detail: train_detail

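  # A minimal chat_template entry, for illustration (the dataset path below is a placeholder):
  # it uses the tokenizer's own template and falls back to chatml when the tokenizer has none.
  - path: your-org/your-dataset
    type: chat_template
    chat_template: tokenizer_default_fallback_chatml
    field_messages: messages
    roles_to_train: ["assistant"]
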
# If false, the datasets will not be shuffled and will keep their original order in `datasets`.
# The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true.
shuffle_merged_datasets: true
@@ -274,6 +316,18 @@ mlflow_tracking_uri: # URI to mlflow
mlflow_experiment_name: # Your experiment name
hf_mlflow_log_artifacts: # set to true to copy each saved checkpoint on each save to mlflow artifact registry

# Comet configuration if you're using it
# Make sure your `COMET_API_KEY` environment variable is set (recommended) or log in to Comet with `comet login`.
# Check out our documentation for more details https://www.comet.com/docs/v2/api-and-sdk/python-sdk/reference/Experiment-Creation/#comet_ml.start
use_comet: # Enable or disable Comet integration.
comet_api_key: # API key for Comet. Recommended to set via `comet login`.
comet_workspace: # Workspace name in Comet. Defaults to the user's default workspace.
comet_project_name: # Project name in Comet. Defaults to Uncategorized.
comet_experiment_key: # Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Defaults to a random key.
comet_mode: # Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration.
comet_online: # Set to True to log data to Comet server, or False for offline storage. Default is True.
comet_experiment_config: # Dictionary for additional configuration settings, see the doc for more details.

# Where to save the full-finetuned model to
output_dir: ./completed-model

@@ -73,81 +73,37 @@ creates a chat where bot is asked to tell a joke, then explain why the joke is f
|
||||
|
||||
## chat_template
|
||||
|
||||
Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Usually this chat template is stored in tokenizer_config.json under the key `chat_template`.
|
||||
|
||||
Conversational data would normally look like follows:
|
||||
Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. It supports using the tokenizer's template, one of the templates bundled with axolotl, or a custom jinja2 template.
|
||||
|
||||
```{.json filename="data.jsonl"}
|
||||
{"conversations": [{"from": "...", "value": "..."}]}
|
||||
{"conversations": [{"role": "...", "content": "..."}]}
|
||||
```
|
||||
|
||||
with roles usually being system, user, assistant, etc.
|
||||
However, all fields can be customized using the following configuration:
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: ...
|
||||
# Set type to `chat_template` to use this strategy
|
||||
type: chat_template
|
||||
# Specify the name of the chat template to use
|
||||
# The name of the chat template to use for training, following values are supported:
|
||||
# - tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value.
|
||||
# - alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py
|
||||
# - tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer.
|
||||
# - jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field.
|
||||
chat_template: tokenizer_default
|
||||
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
|
||||
chat_template_jinja: null
|
||||
# The key in the data example that contains the messages. Default is "conversations".
|
||||
field_messages: conversations
|
||||
# The key in the message turn that contains the role. Default is "from".
|
||||
message_field_role: from
|
||||
# The key in the message turn that contains the content. Default is "value".
|
||||
message_field_content: value
|
||||
# Role mapping for the messages. This can be useful if you are combining data from multiple sources and the roles are different.
|
||||
roles:
|
||||
human: user
|
||||
user: user
|
||||
assistant: assistant
|
||||
gpt: assistant
|
||||
system: system
|
||||
# Roles to train on. The tokens from these roles will be considered for the loss. Default is ["gpt", "assistant"]
|
||||
roles_to_train: ["gpt", "assistant"]
|
||||
# Which EOS tokens to train on in the conversation. Possible values are:
|
||||
# - all: train on all EOS tokens
|
||||
# - turn: train on the EOS token at the end of each trainable turn
|
||||
# - last: train on the last EOS token in the conversation
|
||||
# - none: do not train on EOS tokens
|
||||
# Default is "turn".
|
||||
train_on_eos: turn
|
||||
# The key in the message turn that indicates if tokens of a turn should be considered for training. This is an advanced option useful to selectively train on certain turns besides the `roles_to_train`. Default is "training".
|
||||
message_field_training: training
|
||||
# The key in the message turn that contains the training details. This is an advanced option useful to selectively train on certain tokens in a turn. Default is "train_detail".
|
||||
message_field_training_detail: train_detail
|
||||
```
|
||||
See `config.qmd` for full configs and supported templates.
|
||||
|
||||
### Examples
|
||||
|
||||
1. Using the default chat template in the tokenizer_config.json on OpenAI messages format
|
||||
1. Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only the last message.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
- path: ...
|
||||
type: chat_template
|
||||
chat_template: tokenizer_default
|
||||
field_messages: messages
|
||||
message_field_role: role
|
||||
message_field_content: content
|
||||
roles:
|
||||
user: user
|
||||
assistant: assistant
|
||||
human: user
|
||||
gpt: assistant
|
||||
system: system
|
||||
```
|
||||
|
||||
2. Using the `gemma` chat template (overriding the one in the tokenizer_config.json) on OpenAI messages format, training on all assistant messages.
|
||||
|
||||
```yaml
|
||||
chat_template: gemma
|
||||
datasets:
|
||||
- path: ...
|
||||
type: chat_template
|
||||
chat_template: gemma
|
||||
roles_to_train: ["assistant"]
|
||||
```
|
||||
|
||||
2. Using a custom jinja template on OpenAI messages format
|
||||
3. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
|
||||
|
||||
```yaml
|
||||
datasets:
|
||||
@@ -155,20 +111,10 @@ datasets:
|
||||
type: chat_template
|
||||
chat_template: jinja
|
||||
chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"
|
||||
field_messages: messages
|
||||
message_field_role: role
|
||||
message_field_content: content
|
||||
roles:
|
||||
user: user
|
||||
assistant: assistant
|
||||
human: user
|
||||
gpt: assistant
|
||||
system: system
|
||||
roles_to_train: ["assistant"]
|
||||
```
|
||||
|
||||
3. Using fine-grained control over tokens and turns to train in a conversation
|
||||
|
||||
4. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
|
||||
|
||||
For a data sample that looks like:
|
||||
|
||||
@@ -207,14 +153,10 @@ datasets:
|
||||
field_messages: conversations
|
||||
message_field_role: from
|
||||
message_field_content: value
|
||||
roles:
|
||||
human: human
|
||||
user: human
|
||||
assistant: assistant
|
||||
gpt: assistant
|
||||
system: system
|
||||
roles_to_train: []
|
||||
train_on_eos: turn
|
||||
message_field_training: train
|
||||
message_field_training_detail: train_detail
|
||||
```
|
||||
|
||||
Tip: It is not necessary to use both `message_field_training` and `message_field_training_detail` at the same time.
|
||||
|
||||
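For reference, a single turn that uses these fields could look like the following in the data (key names follow the config above; the offsets are illustrative):

```yaml
- from: gpt
  value: "Hello, how are you?"
  train: false          # message_field_training: do not train on this turn by default
  train_detail:         # message_field_training_detail: train only on the span covered by the offsets
    - begin_offset: 0
      end_offset: 5
      train: true
```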
@@ -205,7 +205,7 @@ ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
|
||||
hi there!. goodbye farewell</s>
|
||||
```
|
||||
|
||||
We can check that the right tokens are ingored by comparing the labels
|
||||
We can check that the right tokens are ignored by comparing the labels
|
||||
to each token:
|
||||
|
||||
```python
|
||||
|
||||
28  docs/multimodal.qmd (new file)

@@ -0,0 +1,28 @@
# MultiModal / Vision Language Models (BETA)

### Supported Models

- Mllama, i.e. llama with vision models

### Usage

Currently multimodal support is limited and doesn't have full feature parity. To finetune a multimodal Llama with LoRA,
you'll need to use the following YAML in combination with the rest of the required hyperparams.

```yaml
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
processor_type: AutoProcessor
skip_prepare_dataset: true

chat_template: llama3_2_vision
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages
remove_unused_columns: false
sample_packing: false

# only finetune the Language model, leave the vision model and vision tower frozen
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
```
83
examples/deepseek-v2/qlora-fsdp-2_5.yaml
Normal file
@@ -0,0 +1,83 @@
|
||||
base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
strict: false
|
||||
|
||||
|
||||
plugins:
|
||||
- axolotl.integrations.liger.LigerPlugin
|
||||
liger_rms_norm: true
|
||||
liger_swiglu: true
|
||||
liger_fused_linear_cross_entropy: true
|
||||
|
||||
chat_template: deepseek_v2
|
||||
datasets:
|
||||
- path: mlabonne/FineTome-100k
|
||||
type: chat_template
|
||||
split: train
|
||||
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
adapter: qlora
|
||||
lora_r: 256
|
||||
lora_alpha: 256
|
||||
lora_target_linear: true
|
||||
peft_use_rslora: true
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 8
|
||||
num_epochs: 1
|
||||
optimizer: adamw_torch
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 2e-5
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: false
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 2
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
special_tokens:
|
||||
fsdp:
|
||||
- full_shard
|
||||
- auto_wrap
|
||||
fsdp_config:
|
||||
fsdp_limit_all_gathers: true
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_offload_params: true
|
||||
fsdp_use_orig_params: false
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
63
examples/llama-3-vision/lora-11b.yaml
Normal file
@@ -0,0 +1,63 @@
|
||||
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
|
||||
processor_type: AutoProcessor
|
||||
strict: false
|
||||
|
||||
# these 3 lines are needed for now to handle vision chat templates w images
|
||||
skip_prepare_dataset: true
|
||||
remove_unused_columns: false
|
||||
sample_packing: false
|
||||
|
||||
chat_template: llama3_2_vision
|
||||
datasets:
|
||||
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||
type: chat_template
|
||||
split: train[:1%]
|
||||
field_messages: messages
|
||||
dataset_prepared_path: last_run_prepared
|
||||
val_set_size: 0.0
|
||||
output_dir: ./outputs/out
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
|
||||
sequence_len: 8192
|
||||
pad_to_sequence_len: false
|
||||
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 1
|
||||
num_epochs: 1
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
flash_attention: true
|
||||
eager_attention:
|
||||
|
||||
warmup_ratio: 0.1
|
||||
evals_per_epoch: 1
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
76
examples/phi/lora-3.5.yaml
Normal file
@@ -0,0 +1,76 @@
|
||||
base_model: microsoft/Phi-3.5-mini-instruct
|
||||
model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
chat_template: phi_3
|
||||
datasets:
|
||||
- path: fozziethebeat/alpaca_messages_2k_test
|
||||
type: chat_template
|
||||
chat_template: phi_3
|
||||
field_messages: messages
|
||||
message_field_role: role
|
||||
message_field_content: content
|
||||
roles:
|
||||
user:
|
||||
- user
|
||||
assistant:
|
||||
- assistant
|
||||
|
||||
dataset_prepared_path:
|
||||
val_set_size: 0.05
|
||||
output_dir: ./outputs/lora-out
|
||||
|
||||
sequence_len: 4096
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: lora
|
||||
lora_model_dir:
|
||||
lora_r: 32
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_linear: true
|
||||
lora_fan_in_fan_out:
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 4
|
||||
micro_batch_size: 4
|
||||
num_epochs: 2
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bfloat16: true
|
||||
bf16: true
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_max_new_tokens: 128
|
||||
saves_per_epoch: 4
|
||||
debug:
|
||||
deepspeed:
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
@@ -1,11 +1,11 @@
 --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
 packaging==23.2
-peft==0.12.0
-transformers==4.44.2
+peft==0.13.0
+transformers==4.45.1
 tokenizers>=0.19.1
-bitsandbytes==0.43.3
+bitsandbytes==0.44.0
 accelerate==0.34.2
-datasets==2.20.0
+datasets==2.21.0
 deepspeed==0.14.4
 pydantic==2.6.3
 addict
@@ -16,7 +16,7 @@ flash-attn==2.6.3
 sentencepiece
 wandb
 einops
-xformers==0.0.27
+xformers==0.0.28.post1
 optimum==1.16.2
 hf_transfer
 colorama
@@ -34,7 +34,7 @@ tensorboard
 python-dotenv==1.0.1
 autoawq>=0.2.5
 triton>=2.3.0
-liger-kernel==0.2.1
+liger-kernel==0.3.0

 mamba-ssm==1.2.0.post1

@@ -46,3 +46,9 @@ gcsfs>=2024.5.0
 trl==0.9.6
 zstandard==0.22.0
 fastcore
+
+# lm eval harness
+lm_eval==0.4.4
+langdetect==1.0.9
+immutabledict==4.2.0
+antlr4-python3-runtime==4.13.2
7
setup.py
@@ -49,10 +49,17 @@ def parse_requirements():
|
||||
else:
|
||||
raise ValueError("Invalid version format")
|
||||
|
||||
if (major, minor) >= (2, 4):
|
||||
if patch == 0:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers>=0.0.27")
|
||||
if (major, minor) >= (2, 3):
|
||||
if patch == 0:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers>=0.0.26.post1")
|
||||
else:
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers>=0.0.27")
|
||||
elif (major, minor) >= (2, 2):
|
||||
_install_requires.pop(_install_requires.index(xformers_version))
|
||||
_install_requires.append("xformers>=0.0.25.post1")
|
||||
|
||||
@@ -30,6 +30,8 @@ from axolotl.common.cli import TrainerCliArgs, load_model_and_tokenizer
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.logging_config import configure_logging
|
||||
from axolotl.train import TrainDatasetMeta
|
||||
from axolotl.utils.chat_templates import get_chat_template
|
||||
from axolotl.utils.comet_ import setup_comet_env_vars
|
||||
from axolotl.utils.config import (
|
||||
normalize_cfg_datasets,
|
||||
normalize_config,
|
||||
@@ -39,7 +41,7 @@ from axolotl.utils.data import load_prepare_dpo_datasets, prepare_dataset
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import is_main_process
|
||||
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
|
||||
from axolotl.utils.models import load_tokenizer
|
||||
from axolotl.utils.models import load_processor, load_tokenizer
|
||||
from axolotl.utils.tokenization import check_dataset_labels
|
||||
from axolotl.utils.trainer import prepare_opinionated_env, prepare_optim_env
|
||||
from axolotl.utils.wandb_ import setup_wandb_env_vars
|
||||
@@ -53,8 +55,22 @@ LOG = logging.getLogger("axolotl.scripts")
|
||||
|
||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||
|
||||
AXOLOTL_LOGO = """
|
||||
#@@ #@@ @@# @@#
|
||||
@@ @@ @@ @@ =@@# @@ #@ =@@#.
|
||||
@@ #@@@@@@@@@ @@ #@#@= @@ #@ .=@@
|
||||
#@@@@@@@@@@@@@@@@@ =@# @# ##= ## =####=+ @@ =#####+ =#@@###. @@
|
||||
@@@@@@@@@@/ +@@/ +@@ #@ =@= #@= @@ =@#+ +#@# @@ =@#+ +#@# #@. @@
|
||||
@@@@@@@@@@ ##@@ ##@@ =@# @# =@# @# @@ @@ @@ @@ #@ #@ @@
|
||||
@@@@@@@@@@@@@@@@@@@@ #@=+++#@= =@@# @@ @@ @@ @@ #@ #@ @@
|
||||
=@#=====@@ =@# @# @@ @@ @@ @@ #@ #@ @@
|
||||
@@@@@@@@@@@@@@@@ @@@@ #@ #@= #@= +@@ #@# =@# @@. =@# =@# #@. @@
|
||||
=@# @# #@= #@ =#@@@@#= +#@@= +#@@@@#= .##@@+ @@
|
||||
@@@@ @@@@@@@@@@@@@@@@
|
||||
"""
|
||||
|
||||
def print_axolotl_text_art(suffix=None):
|
||||
|
||||
def print_legacy_axolotl_text_art(suffix=None):
|
||||
font = "nancyj"
|
||||
ascii_text = " axolotl"
|
||||
if suffix:
|
||||
@@ -67,6 +83,13 @@ def print_axolotl_text_art(suffix=None):
|
||||
print_dep_versions()
|
||||
|
||||
|
||||
def print_axolotl_text_art(
|
||||
**kwargs, # pylint: disable=unused-argument
|
||||
):
|
||||
if is_main_process():
|
||||
print(AXOLOTL_LOGO)
|
||||
|
||||
|
||||
def print_dep_versions():
|
||||
packages = ["accelerate", "peft", "transformers", "trl", "torch", "bitsandbytes"]
|
||||
max_len = max(len(pkg) for pkg in packages)
|
||||
@@ -234,7 +257,8 @@ def do_inference_gradio(
|
||||
|
||||
model, tokenizer = load_model_and_tokenizer(cfg=cfg, cli_args=cli_args)
|
||||
prompter = cli_args.prompter
|
||||
default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
|
||||
# default_tokens = {"unk_token": "<unk>", "bos_token": "<s>", "eos_token": "</s>"}
|
||||
default_tokens: Dict[str, str] = {}
|
||||
|
||||
for token, symbol in default_tokens.items():
|
||||
# If the token isn't already specified in the config, add it
|
||||
@@ -242,10 +266,13 @@ def do_inference_gradio(
|
||||
tokenizer.add_special_tokens({token: symbol})
|
||||
|
||||
prompter_module = None
|
||||
chat_template_str = None
|
||||
if prompter:
|
||||
prompter_module = getattr(
|
||||
importlib.import_module("axolotl.prompters"), prompter
|
||||
)
|
||||
elif cfg.chat_template:
|
||||
chat_template_str = get_chat_template(cfg.chat_template)
|
||||
|
||||
model = model.to(cfg.device, dtype=cfg.torch_dtype)
|
||||
|
||||
@@ -259,7 +286,24 @@ def do_inference_gradio(
|
||||
)
|
||||
else:
|
||||
prompt = instruction.strip()
|
||||
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
|
||||
|
||||
if chat_template_str:
|
||||
batch = tokenizer.apply_chat_template(
|
||||
[
|
||||
{
|
||||
"role": "user",
|
||||
"content": prompt,
|
||||
}
|
||||
],
|
||||
return_tensors="pt",
|
||||
add_special_tokens=True,
|
||||
add_generation_prompt=True,
|
||||
chat_template=chat_template_str,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
)
|
||||
else:
|
||||
batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)
|
||||
|
||||
model.eval()
|
||||
with torch.no_grad():
|
||||
@@ -282,6 +326,7 @@ def do_inference_gradio(
|
||||
streamer = TextIteratorStreamer(tokenizer)
|
||||
generation_kwargs = {
|
||||
"inputs": batch["input_ids"].to(cfg.device),
|
||||
"attention_mask": batch["attention_mask"].to(cfg.device),
|
||||
"generation_config": generation_config,
|
||||
"streamer": streamer,
|
||||
}
|
||||
@@ -398,6 +443,8 @@ def load_cfg(config: Union[str, Path] = Path("examples/"), **kwargs):
|
||||
|
||||
setup_mlflow_env_vars(cfg)
|
||||
|
||||
setup_comet_env_vars(cfg)
|
||||
|
||||
return cfg
|
||||
|
||||
|
||||
@@ -407,9 +454,12 @@ def load_datasets(
|
||||
cli_args: TrainerCliArgs,
|
||||
) -> TrainDatasetMeta:
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
|
||||
|
||||
train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset(
|
||||
cfg, tokenizer
|
||||
cfg,
|
||||
tokenizer,
|
||||
processor=processor,
|
||||
)
|
||||
|
||||
if cli_args.debug or cfg.debug:
|
||||
|
||||
@@ -3,13 +3,11 @@ CLI to run training on a model
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Tuple, Union
|
||||
from typing import Union
|
||||
|
||||
import fire
|
||||
from dotenv import load_dotenv
|
||||
from transformers.hf_argparser import HfArgumentParser
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
from transformers.tokenization_utils import PreTrainedTokenizer
|
||||
|
||||
from axolotl.cli import (
|
||||
check_accelerate_default_config,
|
||||
@@ -20,6 +18,7 @@ from axolotl.cli import (
|
||||
print_axolotl_text_art,
|
||||
)
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.integrations.base import PluginManager
|
||||
from axolotl.prompt_strategies.sharegpt import (
|
||||
register_chatml_template,
|
||||
register_llama3_template,
|
||||
@@ -39,7 +38,7 @@ def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
|
||||
return do_train(parsed_cfg, parsed_cli_args)
|
||||
|
||||
|
||||
def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
|
||||
def do_train(cfg, cli_args) -> None:
|
||||
print_axolotl_text_art()
|
||||
check_accelerate_default_config()
|
||||
check_user_token()
|
||||
@@ -64,7 +63,13 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
|
||||
else:
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
model, tokenizer = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
plugin_manager = PluginManager.get_instance()
|
||||
|
||||
del model
|
||||
del tokenizer
|
||||
|
||||
plugin_manager.post_train_unload(cfg)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -21,6 +21,7 @@ from typing import Any, Dict, List, Literal, Optional, Type, Union
|
||||
import torch
|
||||
import transformers
|
||||
from datasets import Dataset
|
||||
from peft.optimizers import create_loraplus_optimizer
|
||||
from torch import nn
|
||||
from torch.optim.lr_scheduler import OneCycleLR
|
||||
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
|
||||
@@ -45,10 +46,9 @@ from trl import (
|
||||
)
|
||||
from trl.trainer.utils import pad_to_length
|
||||
|
||||
from axolotl.loraplus import create_loraplus_optimizer
|
||||
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
|
||||
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
|
||||
from axolotl.utils import is_mlflow_available
|
||||
from axolotl.utils import is_comet_available, is_mlflow_available
|
||||
from axolotl.utils.callbacks import (
|
||||
EvalFirstStepCallback,
|
||||
GPUStatsCallback,
|
||||
@@ -61,12 +61,14 @@ from axolotl.utils.callbacks import (
|
||||
log_prediction_callback_factory,
|
||||
)
|
||||
from axolotl.utils.callbacks.lisa import lisa_callback_factory
|
||||
from axolotl.utils.chat_templates import get_chat_template
|
||||
from axolotl.utils.collators import (
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
MambaDataCollator,
|
||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||
)
|
||||
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
|
||||
from axolotl.utils.models import ensure_dtype
|
||||
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
||||
from axolotl.utils.schedulers import (
|
||||
@@ -250,6 +252,10 @@ class AxolotlTrainingMixins:
|
||||
"help": "workaround to pass an alternate lr scheduler to the HF trainer"
|
||||
},
|
||||
)
|
||||
chat_template: Optional[str] = field(
|
||||
default=None,
|
||||
metadata={"help": "Chat template converting chat messages to text"},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
@@ -456,14 +462,14 @@ class AxolotlTrainer(SchedulerMixin, Trainer):
|
||||
if self.args.loraplus_lr_ratio is not None:
|
||||
loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
|
||||
loraplus_lr_embedding = getattr(
|
||||
self.args, "loraplus_lr_embedding", None
|
||||
self.args, "loraplus_lr_embedding", 1e-6
|
||||
)
|
||||
self.optimizer = create_loraplus_optimizer( # pylint: disable=attribute-defined-outside-init
|
||||
opt_model,
|
||||
optimizer_cls,
|
||||
optimizer_kwargs,
|
||||
loraplus_lr_ratio,
|
||||
loraplus_lr_embedding,
|
||||
loraplus_lr_ratio=loraplus_lr_ratio,
|
||||
loraplus_lr_embedding=loraplus_lr_embedding,
|
||||
**optimizer_kwargs,
|
||||
)
|
||||
elif self.args.alternate_optimizer == "optimi_adamw":
|
||||
from optimi import AdamW
|
||||
@@ -969,9 +975,9 @@ class AxolotlDPOTrainer(SchedulerMixin, DPOTrainer):
|
||||
self.optimizer = create_loraplus_optimizer( # pylint: disable=attribute-defined-outside-init
|
||||
opt_model,
|
||||
optimizer_cls,
|
||||
optimizer_kwargs,
|
||||
loraplus_lr_ratio,
|
||||
loraplus_lr_embedding,
|
||||
loraplus_lr_ratio=loraplus_lr_ratio,
|
||||
loraplus_lr_embedding=loraplus_lr_embedding,
|
||||
**optimizer_kwargs,
|
||||
)
|
||||
|
||||
if is_sagemaker_mp_enabled():
|
||||
@@ -1043,10 +1049,11 @@ class TrainerBuilderBase(abc.ABC):
|
||||
_model_ref = None
|
||||
_peft_config = None
|
||||
|
||||
def __init__(self, cfg, model, tokenizer):
|
||||
def __init__(self, cfg, model, tokenizer, processor=None):
|
||||
self.cfg = cfg
|
||||
self.model = model
|
||||
self.tokenizer = tokenizer
|
||||
self.processor = processor
|
||||
|
||||
# in case the model supports tagging, add the axolotl tag.
|
||||
# This makes sure the tag is correctly pushed even if a user calls
|
||||
@@ -1104,6 +1111,12 @@ class TrainerBuilderBase(abc.ABC):
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
if self.cfg.use_comet and is_comet_available():
|
||||
from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback
|
||||
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
|
||||
return callbacks
|
||||
|
||||
@@ -1172,6 +1185,11 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
trainer, self.tokenizer, "mlflow"
|
||||
)
|
||||
callbacks.append(LogPredictionCallback(self.cfg))
|
||||
if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0:
|
||||
LogPredictionCallback = log_prediction_callback_factory(
|
||||
trainer, self.tokenizer, "comet_ml"
|
||||
)
|
||||
callbacks.append(LogPredictionCallback(self.cfg))
|
||||
|
||||
if self.cfg.do_bench_eval:
|
||||
callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer))
|
||||
@@ -1417,10 +1435,14 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
report_to = []
|
||||
if self.cfg.use_wandb:
|
||||
report_to.append("wandb")
|
||||
if self.cfg.wandb_name:
|
||||
training_arguments_kwargs["run_name"] = self.cfg.wandb_name
|
||||
if self.cfg.use_mlflow:
|
||||
report_to.append("mlflow")
|
||||
if self.cfg.use_tensorboard:
|
||||
report_to.append("tensorboard")
|
||||
if self.cfg.use_comet:
|
||||
report_to.append("comet_ml")
|
||||
|
||||
training_arguments_kwargs["report_to"] = report_to
|
||||
training_arguments_kwargs["run_name"] = (
|
||||
@@ -1513,6 +1535,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
)
|
||||
training_arguments_kwargs["model_type"] = self.cfg.model_config_type
|
||||
training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
|
||||
if self.cfg.chat_template:
|
||||
training_arguments_kwargs["chat_template"] = get_chat_template(
|
||||
self.cfg.chat_template
|
||||
)
|
||||
|
||||
if self.cfg.rl == "orpo":
|
||||
training_arguments_kwargs["orpo_alpha"] = self.cfg.orpo_alpha
|
||||
@@ -1574,6 +1600,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
)
|
||||
training_args = self.hook_post_create_training_args(training_args)
|
||||
|
||||
# unset run_name so wandb sets up experiment names
|
||||
if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
|
||||
training_args.run_name = ( # pylint: disable=attribute-defined-outside-init
|
||||
None
|
||||
)
|
||||
|
||||
data_collator_kwargs = {
|
||||
"padding": True, # True/"longest" is the default
|
||||
}
|
||||
@@ -1653,7 +1685,12 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
else:
|
||||
collator = BatchSamplerDataCollatorForSeq2Seq
|
||||
else:
|
||||
collator = DataCollatorForSeq2Seq
|
||||
if self.cfg.processor_type and self.processor:
|
||||
collator = MultiModalChatDataCollator
|
||||
kwargs["processor"] = self.processor
|
||||
kwargs["chat_template"] = training_args.chat_template
|
||||
else:
|
||||
collator = DataCollatorForSeq2Seq
|
||||
|
||||
return collator(
|
||||
self.tokenizer,
|
||||
|
||||
@@ -159,6 +159,29 @@ class BasePlugin:
|
||||
List[callable]: A list of callback functions to be added to the TrainingArgs
|
||||
"""
|
||||
|
||||
def post_train(self, cfg, model):
|
||||
"""
|
||||
Performs actions after training is complete.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The axolotl configuration
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
def post_train_unload(self, cfg):
|
||||
"""
|
||||
Performs actions after training is complete and the model is unloaded.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugin.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
|
||||
|
||||
def load_plugin(plugin_name: str) -> BasePlugin:
|
||||
"""
|
||||
@@ -381,3 +404,17 @@ class PluginManager:
|
||||
for plugin in self.plugins:
|
||||
callbacks.extend(plugin.add_callbacks_post_trainer(cfg, trainer))
|
||||
return callbacks
|
||||
|
||||
def post_train_unload(self, cfg):
|
||||
"""
|
||||
Calls the post_train_unload method of all registered plugins.
|
||||
|
||||
Parameters:
|
||||
cfg (dict): The configuration for the plugins.
|
||||
model (object): The loaded model.
|
||||
|
||||
Returns:
|
||||
None
|
||||
"""
|
||||
for plugin in self.plugins:
|
||||
plugin.post_train_unload(cfg)
|
||||
|
||||
13  src/axolotl/integrations/lm_eval/README.md (new file)

@@ -0,0 +1,13 @@
# LM Eval Harness

### Usage

```yaml
plugins:
  - axolotl.integrations.lm_eval.LMEvalPlugin

lm_eval_tasks:
  - gsm8k
  - hellaswag
  - arc_easy
```
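The evaluation batch size can also be overridden from the config (it defaults to 8, per `LMEvalArgs` below); the value here is illustrative:

```yaml
lm_eval_batch_size: 16
```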
42
src/axolotl/integrations/lm_eval/__init__.py
Normal file
@@ -0,0 +1,42 @@
|
||||
"""
|
||||
Module for the Plugin for LM Eval Harness
|
||||
"""
|
||||
import subprocess # nosec
|
||||
from datetime import datetime
|
||||
|
||||
from axolotl.integrations.base import BasePlugin
|
||||
|
||||
from .args import LMEvalArgs # pylint: disable=unused-import. # noqa: F401
|
||||
|
||||
|
||||
class LMEvalPlugin(BasePlugin):
|
||||
"""
|
||||
Plugin for LM Evaluation Harness integration with Axolotl.
|
||||
"""
|
||||
|
||||
def get_input_args(self):
|
||||
return "axolotl.integrations.lm_eval.LMEvalArgs"
|
||||
|
||||
def post_train_unload(self, cfg):
|
||||
tasks = ",".join(cfg.lm_eval_tasks)
|
||||
fa2 = ",attn_implementation=flash_attention_2" if cfg.flash_attention else ""
|
||||
dtype = ",dtype=bfloat16" if cfg.bf16 else ",dtype=float16"
|
||||
output_path = cfg.output_dir
|
||||
output_path += "" if cfg.output_dir.endswith("/") else "/"
|
||||
output_path += "lm_eval_results/" + datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
subprocess.run( # nosec
|
||||
[
|
||||
"lm_eval",
|
||||
"--model",
|
||||
"hf",
|
||||
"--model_args",
|
||||
f"pretrained={cfg.output_dir}{fa2}{dtype}",
|
||||
"--tasks",
|
||||
tasks,
|
||||
"--batch_size",
|
||||
str(cfg.lm_eval_batch_size),
|
||||
"--output_path",
|
||||
output_path,
|
||||
],
|
||||
check=True,
|
||||
)
|
||||
15
src/axolotl/integrations/lm_eval/args.py
Normal file
@@ -0,0 +1,15 @@
|
||||
"""
|
||||
Module for handling lm eval harness input arguments.
|
||||
"""
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class LMEvalArgs(BaseModel):
|
||||
"""
|
||||
Input args for lm eval harness
|
||||
"""
|
||||
|
||||
lm_eval_tasks: List[str] = []
|
||||
lm_eval_batch_size: Optional[int] = 8
|
||||
@@ -1,133 +0,0 @@
|
||||
"""Module for LoRA+"""
|
||||
|
||||
# MIT License
|
||||
#
|
||||
# Copyright (c) 2024 nikhil-ghosh-berkeley
|
||||
# https://github.com/nikhil-ghosh-berkeley/loraplus
|
||||
|
||||
import logging
|
||||
from functools import reduce
|
||||
|
||||
from peft.tuners import lora
|
||||
from torch import nn
|
||||
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
|
||||
from transformers.trainer_pt_utils import get_parameter_names
|
||||
|
||||
LOG = logging.getLogger("axolotl.loraplus")
|
||||
|
||||
|
||||
def get_module(name, opt_model):
|
||||
"""
|
||||
Retrieve a module from a model using its parameter name.
|
||||
Args:
|
||||
name (str): Full name of the parameter, typically including module path.
|
||||
opt_model (torch.nn.Module): The model from which to retrieve the module.
|
||||
|
||||
Returns:
|
||||
Module corresponding to the given name.
|
||||
"""
|
||||
parent_idx = 2 if "lora" in name else 1
|
||||
module_names = name.split(sep=".")[:-parent_idx]
|
||||
module = reduce(getattr, module_names, opt_model)
|
||||
return module
|
||||
|
||||
|
||||
def create_loraplus_optimizer(
|
||||
opt_model,
|
||||
optimizer_cls,
|
||||
optimizer_kwargs,
|
||||
loraplus_lr_ratio,
|
||||
loraplus_lr_embedding=None,
|
||||
):
|
||||
"""
|
||||
Creates an optimizer for the given model, applying LoRA-specific learning rate adjustments to different parameter groups.
|
||||
|
||||
Args:
|
||||
opt_model (torch.nn.Module): The model for which the optimizer is being created.
|
||||
optimizer_cls (class): The class of the optimizer to be used (e.g., torch.optim.Adam).
|
||||
optimizer_kwargs (dict): A dictionary of keyword arguments for the optimizer's initialization.
|
||||
loraplus_lr_ratio (float): The learning rate ratio to be applied to LoRA parameters.
|
||||
loraplus_lr_embedding (float, optional): A specific learning rate for embedding parameters, with a default value if not provided.
|
||||
|
||||
Returns:
|
||||
An instance of the specified optimizer class configured with the model's parameters organized into groups with custom learning rates.
|
||||
"""
|
||||
|
||||
assert loraplus_lr_ratio is not None, "loraplus_lr_ratio must be provided."
|
||||
|
||||
if loraplus_lr_embedding is None:
|
||||
loraplus_lr_embedding = 1e-6
|
||||
|
||||
decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
|
||||
decay_parameters = [name for name in decay_parameters if "bias" not in name]
|
||||
param_groups = {
|
||||
"groupA": {},
|
||||
"groupB": {},
|
||||
"groupB_no_decay": {},
|
||||
"embedding": {},
|
||||
}
|
||||
|
||||
for name, param in opt_model.named_parameters():
|
||||
if not param.requires_grad:
|
||||
continue
|
||||
|
||||
module = get_module(name, opt_model)
|
||||
if isinstance(module, lora.Embedding):
|
||||
param_groups["embedding"][name] = param
|
||||
elif "lora_B" in name or param.ndim == 1:
|
||||
if name in decay_parameters:
|
||||
param_groups["groupB"][name] = param
|
||||
else:
|
||||
param_groups["groupB_no_decay"][name] = param
|
||||
else:
|
||||
param_groups["groupA"][name] = param
|
||||
|
||||
assigned_param_groups = ""
|
||||
for group, group_params in param_groups.items():
|
||||
assigned_param_groups += f"{group}\n {list(group_params.keys())}\n\n"
|
||||
LOG.info(assigned_param_groups)
|
||||
|
||||
lr = optimizer_kwargs["lr"] # pylint: disable=invalid-name
|
||||
weight_decay = optimizer_kwargs.get("weight_decay", 0.0)
|
||||
|
||||
optimizer_grouped_parameters = [
|
||||
{
|
||||
"params": list(param_groups["groupA"].values()),
|
||||
"weight_decay": weight_decay,
|
||||
"lr": lr,
|
||||
},
|
||||
{
|
||||
"params": list(param_groups["embedding"].values()),
|
||||
"weight_decay": weight_decay,
|
||||
"lr": loraplus_lr_embedding,
|
||||
},
|
||||
{
|
||||
"params": list(param_groups["groupB"].values()),
|
||||
"weight_decay": weight_decay,
|
||||
"lr": lr * loraplus_lr_ratio,
|
||||
},
|
||||
{
|
||||
"params": list(param_groups["groupB_no_decay"].values()),
|
||||
"weight_decay": 0.0,
|
||||
"lr": lr * loraplus_lr_ratio,
|
||||
},
|
||||
]
|
||||
|
||||
optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
|
||||
if optimizer_cls.__name__ == "Adam8bit":
|
||||
import bitsandbytes
|
||||
|
||||
manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
|
||||
|
||||
skipped = 0
|
||||
for module in opt_model.modules():
|
||||
if isinstance(module, nn.Embedding):
|
||||
skipped += sum(
|
||||
{p.data_ptr(): p.numel() for p in module.parameters()}.values()
|
||||
)
|
||||
LOG.info(f"skipped {module}: {skipped/2**20}M params")
|
||||
manager.register_module_override(module, "weight", {"optim_bits": 32})
|
||||
LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
|
||||
LOG.info(f"skipped: {skipped/2**20}M params")
|
||||
|
||||
return optimizer
|
||||
229
src/axolotl/monkeypatch/attention/mllama.py
Normal file
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
Monkeypatch for Vision Llama for FA2 support
|
||||
"""
|
||||
# pylint: disable=duplicate-code
|
||||
|
||||
from typing import Optional, Tuple
|
||||
|
||||
import torch
|
||||
from flash_attn.flash_attn_interface import flash_attn_func
|
||||
from transformers.cache_utils import Cache
|
||||
from transformers.modeling_flash_attention_utils import _flash_attention_forward
|
||||
from transformers.models.mllama.configuration_mllama import MllamaTextConfig
|
||||
from transformers.models.mllama.modeling_mllama import (
|
||||
MllamaTextCrossAttention,
|
||||
MllamaTextSelfAttention,
|
||||
apply_rotary_pos_emb,
|
||||
repeat_kv,
|
||||
)
|
||||
from transformers.utils import is_flash_attn_greater_or_equal_2_10
|
||||
|
||||
|
||||
class MllamaTextCrossFlashAttention2(MllamaTextCrossAttention):
|
||||
"""
|
||||
Mllama flash cross-attention module. This module inherits from `MllamaTextCrossAttention` and
|
||||
implements the forward pass using Flash Attention for improved performance.
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
|
||||
# Check if flash attention version is greater or equal to 2.1
|
||||
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
cross_attention_states: Optional[torch.Tensor] = None,
|
||||
past_key_value: Optional[Cache] = None,
|
||||
attention_mask: Optional[ # pylint: disable=unused-argument
|
||||
torch.Tensor
|
||||
] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False, # pylint: disable=unused-argument
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)
|
||||
query_states = query_states.view(
|
||||
bsz, q_len, self.num_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
query_states = self.q_norm(query_states)
|
||||
|
||||
if cross_attention_states is not None:
|
||||
key_states = self.k_proj(cross_attention_states)
|
||||
value_states = self.v_proj(cross_attention_states)
|
||||
key_states = key_states.view(
|
||||
bsz, -1, self.num_key_value_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
value_states = value_states.view(
|
||||
bsz, -1, self.num_key_value_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
||||
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
||||
|
||||
key_states = self.k_norm(key_states)
|
||||
if past_key_value is not None:
|
||||
key_states, value_states = past_key_value.update(
|
||||
key_states,
|
||||
value_states,
|
||||
self.layer_idx,
|
||||
{"cache_position": cache_position},
|
||||
)
|
||||
elif cache_position[0] != 0:
|
||||
key_states, value_states = (
|
||||
past_key_value.key_cache[self.layer_idx],
|
||||
past_key_value.value_cache[self.layer_idx],
|
||||
)
|
||||
else:
|
||||
raise ValueError(
|
||||
"Cross attention layer can't find neither `cross_attn_states` nor cached values for key/values!"
|
||||
)
|
||||
|
||||
# Transpose to get the expected layout for flash attention
|
||||
query_states = query_states.transpose(1, 2)
|
||||
key_states = key_states.transpose(1, 2)
|
||||
value_states = value_states.transpose(1, 2)
|
||||
|
||||
# Apply Flash Attention
|
||||
dropout_rate = self.dropout if self.training else 0.0
|
||||
output = flash_attn_func(
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
dropout_p=dropout_rate,
|
||||
softmax_scale=None,
|
||||
causal=False,
|
||||
return_attn_probs=output_attentions,
|
||||
)
|
||||
|
||||
attn_output = output.contiguous().view(bsz, q_len, -1)
|
||||
attn_output = self.o_proj(attn_output)
|
||||
|
||||
if not output_attentions:
|
||||
attn_weights = None
|
||||
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
class MllamaTextSelfFlashAttention2(MllamaTextSelfAttention):
|
||||
"""
|
||||
Mllama flash self-attention module. This module inherits from `MllamaTextSelfAttention` and
|
||||
implements the forward pass using Flash Attention for improved performance.
|
||||
"""
|
||||
|
||||
def __init__(self, config: MllamaTextConfig, layer_idx: int, *args, **kwargs):
|
||||
super().__init__(config, layer_idx, *args, **kwargs)
|
||||
|
||||
# Check if flash attention version is greater or equal to 2.1
|
||||
self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_embeddings: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False, # pylint: disable=unused-argument
|
||||
past_key_value=None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
**kwargs, # pylint: disable=unused-argument
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
output_attentions = False
|
||||
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)
|
||||
key_states = self.k_proj(hidden_states)
|
||||
value_states = self.v_proj(hidden_states)
|
||||
|
||||
# Flash attention requires the input to have the shape
|
||||
# batch_size x seq_length x num_heads x head_dim
|
||||
query_states = query_states.view(
|
||||
bsz, q_len, self.num_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
key_states = key_states.view(
|
||||
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
value_states = value_states.view(
|
||||
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
|
||||
cos, sin = position_embeddings
|
||||
query_states, key_states = apply_rotary_pos_emb(
|
||||
query_states, key_states, cos, sin
|
||||
)
|
||||
|
||||
if past_key_value is not None:
|
||||
# sin and cos are specific to RoPE models; cache_position needed for the static cache
|
||||
cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
|
||||
key_states, value_states = past_key_value.update(
|
||||
key_states, value_states, self.layer_idx, cache_kwargs
|
||||
)
|
||||
|
||||
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
||||
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
||||
|
||||
# Transpose to get the expected layout for flash attention
|
||||
query_states = query_states.transpose(1, 2)
|
||||
key_states = key_states.transpose(1, 2)
|
||||
value_states = value_states.transpose(1, 2)
|
||||
|
||||
dropout_rate = self.dropout if self.training else 0.0
|
||||
|
||||
# Handle potential silent casting to float32
|
||||
input_dtype = query_states.dtype
|
||||
if input_dtype == torch.float32:
|
||||
if torch.is_autocast_enabled():
|
||||
target_dtype = torch.get_autocast_gpu_dtype()
|
||||
elif hasattr(self.config, "_pre_quantization_dtype"):
|
||||
target_dtype = (
|
||||
self.config._pre_quantization_dtype # pylint: disable=protected-access
|
||||
)
|
||||
else:
|
||||
target_dtype = self.q_proj.weight.dtype
|
||||
|
||||
query_states = query_states.to(target_dtype)
|
||||
key_states = key_states.to(target_dtype)
|
||||
value_states = value_states.to(target_dtype)
|
||||
|
||||
attn_output = _flash_attention_forward(
|
||||
query_states,
|
||||
key_states,
|
||||
value_states,
|
||||
attention_mask,
|
||||
q_len,
|
||||
dropout=dropout_rate,
|
||||
use_top_left_mask=self._flash_attn_uses_top_left_mask,
|
||||
is_causal=True,
|
||||
)
|
||||
|
||||
attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
|
||||
if not output_attentions:
|
||||
attn_weights = None
|
||||
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
def patch_mllama():
    from transformers.models.mllama.modeling_mllama import (
        MLLAMA_TEXT_ATTENTION_CLASSES,
        MLLAMA_TEXT_CROSS_ATTENTION_CLASSES,
        MLLAMA_VISION_ATTENTION_CLASSES,
        MllamaPreTrainedModel,
    )

    MllamaPreTrainedModel._supports_flash_attn_2 = (  # pylint: disable=protected-access
        True
    )
    MLLAMA_TEXT_ATTENTION_CLASSES["flash_attention_2"] = MllamaTextSelfFlashAttention2
    MLLAMA_TEXT_CROSS_ATTENTION_CLASSES[
        "flash_attention_2"
    ] = MllamaTextCrossFlashAttention2
    # fallback to SDPA
    MLLAMA_VISION_ATTENTION_CLASSES[
        "flash_attention_2"
    ] = MLLAMA_VISION_ATTENTION_CLASSES["sdpa"]
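The patch is invoked at load time when flash attention is enabled for an mllama model; the load_model changes further down in this diff call it the same way. A minimal sketch of that call site, assuming flash-attn is installed:

# Sketch only: swap in the flash-attention classes for mllama before loading the model.
# This mirrors the call added to load_model() later in this diff.
from axolotl.monkeypatch.attention.mllama import patch_mllama

patch_mllama()  # registers MllamaTextSelfFlashAttention2 / MllamaTextCrossFlashAttention2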
|
||||
@@ -10,6 +10,7 @@ from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
|
||||
from axolotl.monkeypatch.utils import get_unpad_data
|
||||
|
||||
SUPPORTED_MULTIPACK_MODEL_TYPES = [
|
||||
"mllama_text_model",
|
||||
"llama",
|
||||
"mistral",
|
||||
"mixtral",
|
||||
|
||||
@@ -44,8 +44,8 @@ def magnitude_pruning_(tensor, prune_ratio):
|
||||
def reset_optimizer(
|
||||
optimizer: torch.optim.Optimizer,
|
||||
*,
|
||||
reset_params: list[str], # where str is the key to a torch.nn.Parameter
|
||||
optimizer_state_keys: list[str],
|
||||
reset_params: List[str], # where str is the key to a torch.nn.Parameter
|
||||
optimizer_state_keys: List[str],
|
||||
prune_ratio: float = 0.9,
|
||||
):
|
||||
pruning_fn = partial(magnitude_pruning_, prune_ratio=prune_ratio)
|
||||
|
||||
@@ -16,6 +16,7 @@
|
||||
# This code is based off the following work:
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
|
||||
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py
|
||||
# pylint: disable=duplicate-code
|
||||
""" PyTorch StableLM Epoch model. """
|
||||
import importlib
|
||||
import math
|
||||
|
||||
@@ -1,51 +0,0 @@
|
||||
"""Patch transformers.dynamic_module_utils.get_class_in_module to avoid reloading models from disk"""
|
||||
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
import typing
|
||||
from pathlib import Path
|
||||
|
||||
from transformers.file_utils import HF_MODULES_CACHE
|
||||
|
||||
|
||||
def _patched_get_class_in_module(
|
||||
class_name: str, module_path: typing.Union[str, os.PathLike]
|
||||
) -> typing.Type:
|
||||
"""
|
||||
Import a module on the cache directory for modules and extract a class from it.
|
||||
|
||||
Args:
|
||||
class_name (`str`): The name of the class to import.
|
||||
module_path (`str` or `os.PathLike`): The path to the module to import.
|
||||
|
||||
Returns:
|
||||
`typing.Type`: The class looked for.
|
||||
"""
|
||||
name = os.path.normpath(module_path)
|
||||
if name.endswith(".py"):
|
||||
name = name[:-3]
|
||||
name = name.replace(os.path.sep, ".")
|
||||
module_spec = importlib.util.spec_from_file_location(
|
||||
name, location=Path(HF_MODULES_CACHE) / module_path
|
||||
)
|
||||
module = sys.modules.get(name)
|
||||
if module is None:
|
||||
module = importlib.util.module_from_spec(module_spec)
|
||||
# insert it into sys.modules before any loading begins
|
||||
sys.modules[name] = module
|
||||
# load in initial case only
|
||||
module_spec.loader.exec_module(module)
|
||||
return getattr(module, class_name)
|
||||
|
||||
|
||||
def patch_transformers_dynamic_module_utils():
|
||||
"""
|
||||
Recently, transformers started reloading modeling code from disk for models marked trust_remote_code=True.
|
||||
This causes monkey-patches for multipack and liger to be removed.
|
||||
We replace the original function with a version that does not reload the module from disk.
|
||||
See https://github.com/huggingface/transformers/pull/30370#pullrequestreview-2264361581
|
||||
"""
|
||||
import transformers
|
||||
|
||||
transformers.dynamic_module_utils.get_class_in_module = _patched_get_class_in_module
|
||||
@@ -9,7 +9,7 @@ from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig
|
||||
LOG = logging.getLogger("axolotl.prompt_strategies")
|
||||
|
||||
|
||||
def load(strategy, tokenizer, cfg, ds_cfg):
|
||||
def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
|
||||
try:
|
||||
load_fn = "load"
|
||||
if strategy.split(".")[-1].startswith("load_"):
|
||||
@@ -24,6 +24,8 @@ def load(strategy, tokenizer, cfg, ds_cfg):
|
||||
sig = inspect.signature(func)
|
||||
if "ds_cfg" in sig.parameters:
|
||||
load_kwargs["ds_cfg"] = ds_cfg
|
||||
if "processor" in sig.parameters:
|
||||
load_kwargs["processor"] = processor
|
||||
return func(tokenizer, cfg, **load_kwargs)
|
||||
except ModuleNotFoundError:
|
||||
return None
|
||||
|
||||
@@ -5,6 +5,8 @@ HF Chat Templates prompt strategy
|
||||
import logging
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from transformers import ProcessorMixin
|
||||
|
||||
from axolotl.prompt_tokenizers import PromptTokenizingStrategy
|
||||
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
|
||||
from axolotl.utils.chat_templates import get_chat_template_from_config
|
||||
@@ -20,12 +22,13 @@ class ChatTemplatePrompter(Prompter):
|
||||
def __init__(
|
||||
self,
|
||||
tokenizer,
|
||||
processor=None,
|
||||
chat_template=None,
|
||||
max_length=2048,
|
||||
message_field_role: str = "from",
|
||||
message_field_content: str = "value",
|
||||
message_field_training: str = "train",
|
||||
message_field_training_detail: str = "train_detail",
|
||||
message_field_training: Optional[str] = None,
|
||||
message_field_training_detail: Optional[str] = None,
|
||||
roles: Optional[Dict[str, List[str]]] = None,
|
||||
drop_system_message: bool = False,
|
||||
):
|
||||
@@ -44,11 +47,12 @@ class ChatTemplatePrompter(Prompter):
|
||||
self.message_field_training = message_field_training
|
||||
self.message_field_training_detail = message_field_training_detail
|
||||
self.tokenizer = tokenizer
|
||||
self.processor: ProcessorMixin = processor
|
||||
self.chat_template = chat_template
|
||||
self.max_length = max_length
|
||||
self.drop_system_message = drop_system_message
|
||||
|
||||
def build_prompt(self, conversation, add_generation_prompt=False):
|
||||
def build_prompt(self, conversation, add_generation_prompt=False, images=None):
|
||||
turns = [
|
||||
{
|
||||
"role": self.roles[t[self.message_field_role]],
|
||||
@@ -61,6 +65,28 @@ class ChatTemplatePrompter(Prompter):
|
||||
if self.drop_system_message and turns[0]["role"] == "system":
|
||||
turns = turns[1:]
|
||||
|
||||
if self.processor:
|
||||
text = self.processor.apply_chat_template(
|
||||
turns,
|
||||
chat_template=self.chat_template,
|
||||
tokenize=False,
|
||||
add_generation_prompt=add_generation_prompt,
|
||||
)
|
||||
batch = self.processor(
|
||||
text=text,
|
||||
images=images,
|
||||
return_tensors="pt",
|
||||
truncation=True,
|
||||
max_length=self.max_length,
|
||||
)
|
||||
# workaround since processor works in batches instead of single examples
|
||||
for k, val in batch.items():
|
||||
if k in ["pixel_values"]:
|
||||
batch[k] = val.tolist()
|
||||
else:
|
||||
batch[k] = val.squeeze().tolist()
|
||||
return batch
|
||||
|
||||
return self.tokenizer.apply_chat_template(
|
||||
turns,
|
||||
truncation=True,
|
||||
@@ -186,11 +212,12 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
||||
train_on_inputs,
|
||||
sequence_len,
|
||||
roles_to_train=None,
|
||||
train_on_eos="last",
|
||||
train_on_eos=None,
|
||||
):
|
||||
super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
|
||||
self.roles_to_train = roles_to_train if roles_to_train is not None else []
|
||||
self.train_on_eos = train_on_eos
|
||||
self.images = "images"
|
||||
|
||||
@property
|
||||
def messages(self):
|
||||
@@ -201,6 +228,40 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
||||
self._messages = messages
|
||||
|
||||
def tokenize_prompt(self, prompt):
|
||||
# Old simple legacy behavior that works reliably.
|
||||
if (
|
||||
not self.roles_to_train
|
||||
and not self.train_on_eos
|
||||
and not self.prompter.message_field_training
|
||||
and not self.prompter.message_field_training_detail
|
||||
):
|
||||
turns = self.get_conversation_thread(prompt)
|
||||
images = self.get_images(prompt)
|
||||
prompt_ids = self.prompter.build_prompt(
|
||||
turns[:-1],
|
||||
add_generation_prompt=True,
|
||||
images=images,
|
||||
)
|
||||
tokenized_res = self.prompter.build_prompt(turns, images=images)
|
||||
tokenized_prompt = {}
|
||||
if isinstance(tokenized_res, list):
|
||||
input_ids = prompt_ids + tokenized_res[len(prompt_ids) :]
|
||||
tokenized_prompt["input_ids"] = input_ids
|
||||
tokenized_prompt["attention_mask"] = [1] * len(input_ids)
|
||||
else:
|
||||
input_ids = tokenized_res["input_ids"]
|
||||
tokenized_prompt = tokenized_res
|
||||
|
||||
if not self.train_on_inputs:
|
||||
user_prompt_len = len(prompt_ids)
|
||||
labels = [-100] * user_prompt_len + input_ids[user_prompt_len:]
|
||||
else:
|
||||
labels = input_ids
|
||||
|
||||
tokenized_prompt["labels"] = labels
|
||||
|
||||
return tokenized_prompt
|
||||
|
||||
turns = prompt[self.messages]
|
||||
input_ids = self.prompter.build_prompt(turns)
|
||||
labels = [IGNORE_TOKEN_ID] * len(input_ids)
|
||||
@@ -219,9 +280,11 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
||||
should_train = (
|
||||
train_turn
|
||||
if train_turn is not None
|
||||
else bool(train_detail is not None)
|
||||
if train_detail is not None
|
||||
else self.train_on_inputs or role in self.roles_to_train
|
||||
else (
|
||||
bool(train_detail is not None)
|
||||
if train_detail is not None
|
||||
else self.train_on_inputs or role in self.roles_to_train
|
||||
)
|
||||
)
|
||||
|
||||
LOG.debug(f"Should train: {should_train}")
|
||||
@@ -335,8 +398,11 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
||||
def get_conversation_thread(self, prompt):
|
||||
return prompt[self.messages]
|
||||
|
||||
def get_images(self, prompt):
|
||||
return prompt.get(self.images, None)
|
||||
|
||||
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
|
||||
|
||||
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, processor=None):
|
||||
ds_cfg = ds_cfg or {}
|
||||
chat_template_string = get_chat_template_from_config(
|
||||
cfg=cfg, ds_cfg=ds_cfg, tokenizer=tokenizer
|
||||
@@ -346,23 +412,25 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
|
||||
prompter_params = {
|
||||
"tokenizer": tokenizer,
|
||||
"chat_template": chat_template_string,
|
||||
"message_field_role": ds_cfg.get("message_field_role", "from"),
|
||||
"message_field_content": ds_cfg.get("message_field_content", "value"),
|
||||
"message_field_training": ds_cfg.get("message_field_training", "training"),
|
||||
"message_field_role": ds_cfg.get("message_field_role", "role"),
|
||||
"message_field_content": ds_cfg.get("message_field_content", "content"),
|
||||
"message_field_training": ds_cfg.get("message_field_training", None),
|
||||
"message_field_training_detail": ds_cfg.get(
|
||||
"message_field_training_detail", "train_detail"
|
||||
"message_field_training_detail",
|
||||
None,
|
||||
),
|
||||
"roles": ds_cfg.get("roles"),
|
||||
"drop_system_message": ds_cfg.get("drop_system_message", False),
|
||||
# we need to add one for detecting sequences that exceed the `sequence_len` limit.
|
||||
"max_length": cfg.sequence_len + 1,
|
||||
"processor": processor,
|
||||
}
|
||||
|
||||
strategy_params = {
|
||||
"train_on_inputs": cfg.train_on_inputs,
|
||||
"sequence_len": cfg.sequence_len,
|
||||
"roles_to_train": ds_cfg.get("roles_to_train", ["gpt", "assistant"]),
|
||||
"train_on_eos": ds_cfg.get("train_on_eos", "turn"),
|
||||
"roles_to_train": ds_cfg.get("roles_to_train", []),
|
||||
"train_on_eos": ds_cfg.get("train_on_eos", None),
|
||||
}
|
||||
|
||||
strategy = ChatTemplateStrategy(
|
||||
|
||||
@@ -24,7 +24,7 @@ from axolotl.core.tokenizer_utils import fix_untrained_tokens
|
||||
from axolotl.logging_config import configure_logging
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.freeze import freeze_layers_except
|
||||
from axolotl.utils.models import load_model, load_tokenizer
|
||||
from axolotl.utils.models import load_model, load_processor, load_tokenizer
|
||||
from axolotl.utils.trainer import setup_trainer
|
||||
|
||||
try:
|
||||
@@ -69,6 +69,9 @@ def train(
|
||||
main_process_only=True,
|
||||
)
|
||||
tokenizer = load_tokenizer(cfg)
|
||||
processor = None
|
||||
if cfg.is_multimodal:
|
||||
processor = load_processor(cfg, tokenizer)
|
||||
|
||||
train_dataset = dataset_meta.train_dataset
|
||||
eval_dataset = dataset_meta.eval_dataset
|
||||
@@ -96,7 +99,9 @@ def train(
|
||||
LOG.debug(msg)
|
||||
# we wait until the last possible moment to set up the Accelerator
|
||||
Accelerator()
|
||||
model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
|
||||
model, peft_config = load_model(
|
||||
cfg, tokenizer, processor=processor, inference=cli_args.inference
|
||||
)
|
||||
model.generation_config.do_sample = True
|
||||
|
||||
model_ref = None
|
||||
@@ -122,6 +127,7 @@ def train(
|
||||
eval_dataset,
|
||||
(model, model_ref, peft_config),
|
||||
tokenizer,
|
||||
processor,
|
||||
total_num_steps,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,8 +1,12 @@
|
||||
"""
|
||||
Basic utils for Axolotl
|
||||
"""
|
||||
import importlib
|
||||
import importlib.util
|
||||
|
||||
|
||||
def is_mlflow_available():
|
||||
return importlib.util.find_spec("mlflow") is not None
|
||||
|
||||
|
||||
def is_comet_available():
|
||||
return importlib.util.find_spec("comet_ml") is not None
|
||||
|
||||
@@ -29,7 +29,7 @@ from transformers import (
|
||||
)
|
||||
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, IntervalStrategy
|
||||
|
||||
from axolotl.utils import is_mlflow_available
|
||||
from axolotl.utils import is_comet_available, is_mlflow_available
|
||||
from axolotl.utils.bench import log_gpu_memory_usage
|
||||
from axolotl.utils.callbacks.perplexity import Perplexity
|
||||
from axolotl.utils.config.models.input.v0_4_1 import AxolotlInputConfig
|
||||
@@ -462,7 +462,7 @@ def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
|
||||
references=[[r] for r in references],
|
||||
predictions=predictions,
|
||||
)
|
||||
scores[metric_name] = score
|
||||
scores["eval_" + metric_name] = score
|
||||
return scores
|
||||
|
||||
def predict_with_generate():
|
||||
@@ -747,6 +747,15 @@ def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
|
||||
artifact_file="PredictionsVsGroundTruth.json",
|
||||
tracking_uri=tracking_uri,
|
||||
)
|
||||
elif logger == "comet_ml" and is_comet_available():
|
||||
import comet_ml
|
||||
|
||||
experiment = comet_ml.get_running_experiment()
|
||||
if experiment:
|
||||
experiment.log_table(
|
||||
f"{name} - Predictions vs Ground Truth.csv",
|
||||
pd.DataFrame(table_data),
|
||||
)
|
||||
|
||||
if is_main_process():
|
||||
log_table_from_dataloader("Eval", eval_dataloader)
|
||||
|
||||
43
src/axolotl/utils/callbacks/comet_.py
Normal file
@@ -0,0 +1,43 @@
"""Comet module for trainer callbacks"""

import logging
from typing import TYPE_CHECKING

import comet_ml
from transformers import TrainerCallback, TrainerControl, TrainerState

from axolotl.utils.distributed import is_main_process

if TYPE_CHECKING:
    from axolotl.core.trainer_builder import AxolotlTrainingArguments

LOG = logging.getLogger("axolotl.callbacks")


class SaveAxolotlConfigtoCometCallback(TrainerCallback):
    """Callback to save axolotl config to comet"""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: "AxolotlTrainingArguments",  # pylint: disable=unused-argument
        state: TrainerState,  # pylint: disable=unused-argument
        control: TrainerControl,
        **kwargs,  # pylint: disable=unused-argument
    ):
        if is_main_process():
            try:
                comet_experiment = comet_ml.start(source="axolotl")
                comet_experiment.log_other("Created from", "axolotl")
                comet_experiment.log_asset(
                    self.axolotl_config_path,
                    file_name="axolotl-config",
                )
                LOG.info(
                    "The Axolotl config has been saved to the Comet Experiment under assets."
                )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to Comet: {err}")
        return control
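The wiring that registers this callback lives in trainer_builder.py, whose diff is suppressed below; a rough sketch of the equivalent manual registration, assuming an existing transformers Trainer instance:

# Sketch only: attach the Comet callback to an existing Trainer.
# `trainer` and `cfg.axolotl_config_path` are assumed to exist; the real wiring
# happens inside axolotl's trainer builder (diff suppressed below).
from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback

trainer.add_callback(SaveAxolotlConfigtoCometCallback(cfg.axolotl_config_path))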
|
||||
File diff suppressed because one or more lines are too long
10
src/axolotl/utils/collators/__init__.py
Normal file
@@ -0,0 +1,10 @@
"""
shared axolotl collators for multipack, mamba, multimodal
"""
from .batching import (  # noqa: F401
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
    PretrainingBatchSamplerDataCollatorForSeq2Seq,
    V2BatchSamplerDataCollatorForSeq2Seq,
)
from .mamba import MambaDataCollator  # noqa: F401
|
||||
@@ -1,17 +1,14 @@
|
||||
"""
|
||||
DataCollator for axolotl to pad labels and position_ids for packed sequences
|
||||
"""
|
||||
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, Optional, Sequence, Union
|
||||
from typing import Any, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import transformers
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
from transformers.utils import PaddingStrategy
|
||||
|
||||
IGNORE_INDEX = -100
|
||||
|
||||
|
||||
@dataclass
|
||||
class DataCollatorForSeq2Seq:
|
||||
@@ -183,34 +180,6 @@ class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
return super().__call__(out_features, return_tensors=return_tensors)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MambaDataCollator:
|
||||
"""
|
||||
Collator for State Space Models (Mamba)
|
||||
"""
|
||||
|
||||
tokenizer: transformers.PreTrainedTokenizer
|
||||
|
||||
def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
|
||||
input_ids, labels = tuple(
|
||||
[torch.LongTensor(instance[key]) for instance in instances]
|
||||
for key in ("input_ids", "labels")
|
||||
)
|
||||
input_ids = torch.nn.utils.rnn.pad_sequence(
|
||||
input_ids,
|
||||
batch_first=True,
|
||||
padding_value=self.tokenizer.pad_token_id,
|
||||
)
|
||||
labels = torch.nn.utils.rnn.pad_sequence(
|
||||
labels, batch_first=True, padding_value=IGNORE_INDEX
|
||||
)
|
||||
|
||||
return {
|
||||
"input_ids": input_ids,
|
||||
"labels": labels,
|
||||
}
|
||||
|
||||
|
||||
@dataclass
|
||||
class PretrainingBatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
"""
|
||||
4
src/axolotl/utils/collators/core.py
Normal file
@@ -0,0 +1,4 @@
"""
basic shared collator constants
"""
IGNORE_INDEX = -100
|
||||
38
src/axolotl/utils/collators/mamba.py
Normal file
@@ -0,0 +1,38 @@
"""
collators for Mamba
"""
from dataclasses import dataclass
from typing import Dict, Sequence

import torch
import transformers

from axolotl.utils.collators.core import IGNORE_INDEX


@dataclass
class MambaDataCollator:
    """
    Collator for State Space Models (Mamba)
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple(
            [torch.LongTensor(instance[key]) for instance in instances]
            for key in ("input_ids", "labels")
        )
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id,
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX
        )

        return {
            "input_ids": input_ids,
            "labels": labels,
        }
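A short usage sketch of the relocated collator, padding two variable-length examples (a loaded tokenizer with a pad_token_id is assumed):

# Sketch: MambaDataCollator right-pads input_ids with the tokenizer's pad token
# and pads labels with IGNORE_INDEX (-100) so the padding is ignored in the loss.
from axolotl.utils.collators.mamba import MambaDataCollator

collator = MambaDataCollator(tokenizer=tokenizer)  # tokenizer assumed to exist
batch = collator(
    [
        {"input_ids": [1, 2, 3], "labels": [1, 2, 3]},
        {"input_ids": [4, 5], "labels": [4, 5]},
    ]
)
# batch["input_ids"].shape == batch["labels"].shape == (2, 3)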
|
||||
77
src/axolotl/utils/collators/mm_chat.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""
|
||||
Collators for multi-modal chat messages and packing
|
||||
"""
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from transformers import PreTrainedTokenizerBase, ProcessorMixin
|
||||
from transformers.data.data_collator import DataCollatorMixin
|
||||
from transformers.utils import PaddingStrategy
|
||||
|
||||
|
||||
@dataclass
|
||||
class MultiModalChatDataCollator(DataCollatorMixin):
|
||||
"""
|
||||
Collator for multi-modal chat messages
|
||||
"""
|
||||
|
||||
tokenizer: PreTrainedTokenizerBase
|
||||
processor: ProcessorMixin
|
||||
return_tensors: str = "pt"
|
||||
chat_template: Optional[str] = None
|
||||
packing: bool = False
|
||||
max_images: int = -1
|
||||
padding: Union[bool, str, PaddingStrategy] = True
|
||||
pad_to_multiple_of: Optional[int] = None
|
||||
|
||||
def __post_init__(self):
|
||||
if self.packing:
|
||||
raise ValueError("Packing is currently not supported.")
|
||||
|
||||
def torch_call(
|
||||
self, examples: List[Union[List[int], Any, Dict[str, Any]]]
|
||||
) -> Dict[str, Any]:
|
||||
# Handle dict or lists with proper padding and conversion to tensor.
|
||||
|
||||
return self.__class__.process_rows(
|
||||
examples, self.processor, self.chat_template, self.max_images
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def process_rows(examples, processor, chat_template, max_images, length_only=False):
|
||||
# HINT: use `_torch_collate_batch` to stack and pad tensors
|
||||
# see also DataCollatorWithFlattening and DefaultDataCollator
|
||||
|
||||
# *** This is COPIED from the trl example sft_vlm.py code ***
|
||||
# use this as a starting point
|
||||
|
||||
# Get the texts and images, and apply the chat template
|
||||
texts = [
|
||||
processor.apply_chat_template(
|
||||
example["messages"], chat_template=chat_template, tokenize=False
|
||||
)
|
||||
for example in examples
|
||||
]
|
||||
images = [example["images"] for example in examples]
|
||||
|
||||
if max_images > 0:
|
||||
images = [img_batch[:max_images] for img_batch in images]
|
||||
|
||||
# Tokenize the texts and process the images
|
||||
batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
|
||||
|
||||
# The labels are the input_ids, and we mask the padding tokens in the loss computation
|
||||
labels = batch["input_ids"].clone()
|
||||
labels[labels == processor.tokenizer.pad_token_id] = -100
|
||||
# Ignore the image token index in the loss computation (model specific)
|
||||
image_token_id = processor.tokenizer.convert_tokens_to_ids(
|
||||
processor.image_token
|
||||
)
|
||||
labels[labels == image_token_id] = -100
|
||||
batch["labels"] = labels
|
||||
|
||||
if length_only:
|
||||
return {
|
||||
"length": [len(sample["input_ids"]) for sample in batch["input_ids"]]
|
||||
}
|
||||
return batch
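A hedged usage sketch: the collator is built around a multimodal processor and called on raw examples carrying "messages" and "images"; torch_call delegates to process_rows above. The processor, chat template string, and examples below are assumed to already exist.

# Sketch only: collate two multimodal chat examples into a padded tensor batch.
# `processor` is a loaded multimodal processor (e.g. from load_processor), and
# each example matches the structure process_rows() above expects.
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator

collator = MultiModalChatDataCollator(
    tokenizer=processor.tokenizer,
    processor=processor,
    chat_template=chat_template,  # assumed: resolved chat template string
    max_images=1,
)
batch = collator(examples)  # examples: list of {"messages": [...], "images": [...]}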
|
||||
93
src/axolotl/utils/comet_.py
Normal file
@@ -0,0 +1,93 @@
|
||||
"""Module for wandb utilities"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
LOG = logging.getLogger("axolotl.utils.comet_")
|
||||
|
||||
COMET_ENV_MAPPING_OVERRIDE = {
|
||||
"comet_mode": "COMET_START_MODE",
|
||||
"comet_online": "COMET_START_ONLINE",
|
||||
}
|
||||
COMET_EXPERIMENT_CONFIG_ENV_MAPPING_OVERRIDE = {
|
||||
"auto_histogram_activation_logging": "COMET_AUTO_LOG_HISTOGRAM_ACTIVATIONS",
|
||||
"auto_histogram_epoch_rate": "COMET_AUTO_LOG_HISTOGRAM_EPOCH_RATE",
|
||||
"auto_histogram_gradient_logging": "COMET_AUTO_LOG_HISTOGRAM_GRADIENTS",
|
||||
"auto_histogram_tensorboard_logging": "COMET_AUTO_LOG_HISTOGRAM_TENSORBOARD",
|
||||
"auto_histogram_weight_logging": "COMET_AUTO_LOG_HISTOGRAM_WEIGHTS",
|
||||
"auto_log_co2": "COMET_AUTO_LOG_CO2",
|
||||
"auto_metric_logging": "COMET_AUTO_LOG_METRICS",
|
||||
"auto_metric_step_rate": "COMET_AUTO_LOG_METRIC_STEP_RATE",
|
||||
"auto_output_logging": "COMET_AUTO_LOG_OUTPUT_LOGGER",
|
||||
"auto_param_logging": "COMET_AUTO_LOG_PARAMETERS",
|
||||
"comet_disabled": "COMET_AUTO_LOG_DISABLE",
|
||||
"display_summary_level": "COMET_DISPLAY_SUMMARY_LEVEL",
|
||||
"distributed_node_identifier": "COMET_DISTRIBUTED_NODE_IDENTIFIER",
|
||||
"log_code": "COMET_AUTO_LOG_CODE",
|
||||
"log_env_cpu": "COMET_AUTO_LOG_ENV_CPU",
|
||||
"log_env_details": "COMET_AUTO_LOG_ENV_DETAILS",
|
||||
"log_env_disk": "COMET_AUTO_LOG_ENV_DISK",
|
||||
"log_env_gpu": "COMET_AUTO_LOG_ENV_GPU",
|
||||
"log_env_host": "COMET_AUTO_LOG_ENV_HOST",
|
||||
"log_env_network": "COMET_AUTO_LOG_ENV_NETWORK",
|
||||
"log_git_metadata": "COMET_AUTO_LOG_GIT_METADATA",
|
||||
"log_git_patch": "COMET_AUTO_LOG_GIT_PATCH",
|
||||
"log_graph": "COMET_AUTO_LOG_GRAPH",
|
||||
"name": "COMET_START_EXPERIMENT_NAME",
|
||||
"offline_directory": "COMET_OFFLINE_DIRECTORY",
|
||||
"parse_args": "COMET_AUTO_LOG_CLI_ARGUMENTS",
|
||||
"tags": "COMET_START_EXPERIMENT_TAGS",
|
||||
}
|
||||
|
||||
|
||||
def python_value_to_environ_value(python_value):
|
||||
if isinstance(python_value, bool):
|
||||
if python_value is True:
|
||||
return "true"
|
||||
|
||||
return "false"
|
||||
|
||||
if isinstance(python_value, int):
|
||||
return str(python_value)
|
||||
|
||||
if isinstance(python_value, list):  # Comet only has a single list-of-strings parameter
|
||||
return ",".join(map(str, python_value))
|
||||
|
||||
return python_value
|
||||
|
||||
|
||||
def setup_comet_env_vars(cfg: DictDefault):
|
||||
# TODO: we need to convert the Axolotl configuration to environment variables,
# as the Transformers integration is called first and would otherwise
# create an Experiment before these settings take effect
|
||||
|
||||
for key in cfg.keys():
|
||||
if key.startswith("comet_") and key != "comet_experiment_config":
|
||||
value = cfg.get(key, "")
|
||||
|
||||
if value is not None and value != "":
|
||||
env_variable_name = COMET_ENV_MAPPING_OVERRIDE.get(key, key.upper())
|
||||
final_value = python_value_to_environ_value(value)
|
||||
os.environ[env_variable_name] = final_value
|
||||
|
||||
if cfg.comet_experiment_config:
|
||||
for key, value in cfg.comet_experiment_config.items():
|
||||
if value is not None and value != "":
|
||||
config_env_variable_name = (
|
||||
COMET_EXPERIMENT_CONFIG_ENV_MAPPING_OVERRIDE.get(key)
|
||||
)
|
||||
|
||||
if config_env_variable_name is None:
|
||||
LOG.warning(
|
||||
f"Unknown Comet Experiment Config name {key}, ignoring it"
|
||||
)
|
||||
continue
|
||||
|
||||
final_value = python_value_to_environ_value(value)
|
||||
os.environ[config_env_variable_name] = final_value
|
||||
|
||||
# Enable comet if project name is present
|
||||
if cfg.comet_project_name and len(cfg.comet_project_name) > 0:
|
||||
cfg.use_comet = True
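As a concrete illustration of the mapping above (the project name and mode values are only examples), a minimal sketch of what setup_comet_env_vars does to a config:

# Illustrative only: how setup_comet_env_vars() translates config keys to env vars.
import os

from axolotl.utils.comet_ import setup_comet_env_vars
from axolotl.utils.dict import DictDefault

cfg = DictDefault(
    {
        "comet_project_name": "axolotl-ft",          # hypothetical project name
        "comet_mode": "get_or_create",
        "comet_experiment_config": {"log_code": True},
    }
)
setup_comet_env_vars(cfg)
# os.environ["COMET_PROJECT_NAME"] == "axolotl-ft"     (default: key upper-cased)
# os.environ["COMET_START_MODE"] == "get_or_create"    (from COMET_ENV_MAPPING_OVERRIDE)
# os.environ["COMET_AUTO_LOG_CODE"] == "true"          (bool converted to "true"/"false")
# cfg.use_comet is now True because a project name was set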
|
||||
@@ -121,15 +121,36 @@ def normalize_config(cfg):
|
||||
cfg.base_model_config = cfg.base_model
|
||||
|
||||
model_config = load_model_config(cfg)
|
||||
cfg.model_config_type = model_config.model_type
|
||||
|
||||
cfg.tokenizer_config = (
|
||||
cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
|
||||
)
|
||||
|
||||
cfg.is_multimodal = (
|
||||
hasattr(model_config, "model_type")
|
||||
and model_config.model_type in ["llava", "mllama"]
|
||||
or any(
|
||||
multimodal_name in cfg.base_model.lower()
|
||||
for multimodal_name in [
|
||||
"pixtral",
|
||||
]
|
||||
)
|
||||
or cfg.is_multimodal
|
||||
)
|
||||
if cfg.is_multimodal:
|
||||
cfg.processor_config = (
|
||||
cfg.processor_config or cfg.base_model_config or cfg.base_model
|
||||
)
|
||||
model_config = model_config.text_config
|
||||
|
||||
cfg.model_config_type = model_config.model_type
|
||||
|
||||
# figure out if the model is llama
|
||||
cfg.is_llama_derived_model = (
|
||||
(hasattr(model_config, "model_type") and model_config.model_type == "llama")
|
||||
(
|
||||
hasattr(model_config, "model_type")
|
||||
and model_config.model_type in ["llama", "mllama_text_model"]
|
||||
)
|
||||
or cfg.is_llama_derived_model
|
||||
or "llama" in cfg.base_model.lower()
|
||||
or (cfg.type_of_model and "llama" in cfg.type_of_model.lower())
|
||||
|
||||
@@ -28,19 +28,31 @@ LOG = logging.getLogger("axolotl.utils.config.models.input")
|
||||
SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"}
|
||||
|
||||
|
||||
class RLType(str, Enum):
|
||||
"""RL trainer type configuration subset"""
|
||||
|
||||
dpo = "dpo" # pylint: disable=invalid-name
|
||||
ipo = "ipo" # pylint: disable=invalid-name
|
||||
orpo = "orpo" # pylint: disable=invalid-name
|
||||
kto = "kto" # pylint: disable=invalid-name
|
||||
simpo = "simpo" # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class ChatTemplate(str, Enum):
|
||||
"""Chat templates configuration subset"""
|
||||
|
||||
jinja = "jinja" # pylint: disable=invalid-name
|
||||
alpaca = "alpaca" # pylint: disable=invalid-name
|
||||
chatml = "chatml" # pylint: disable=invalid-name
|
||||
inst = "inst" # pylint: disable=invalid-name
|
||||
gemma = "gemma" # pylint: disable=invalid-name
|
||||
cohere = "cohere" # pylint: disable=invalid-name
|
||||
llama3 = "llama3" # pylint: disable=invalid-name
|
||||
llama3_2_vision = "llama3_2_vision" # pylint: disable=invalid-name
|
||||
phi_3 = "phi_3" # pylint: disable=invalid-name
|
||||
phi_35 = "phi_35" # pylint: disable=invalid-name
|
||||
deepseek_v2 = "deepseek_v2" # pylint: disable=invalid-name
|
||||
jamba = "jamba" # pylint: disable=invalid-name
|
||||
jinja = "jinja" # pylint: disable=invalid-name
|
||||
tokenizer_default = "tokenizer_default" # pylint: disable=invalid-name
|
||||
|
||||
|
||||
@@ -127,10 +139,13 @@ class SFTDataset(BaseModel):
|
||||
type: Optional[Union[str, UserDefinedPrompterType]] = None
|
||||
shards: Optional[int] = None
|
||||
conversation: Optional[str] = None
|
||||
chat_template: Union[
|
||||
ChatTemplate,
|
||||
Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")],
|
||||
] = ChatTemplate.tokenizer_default
|
||||
# Do not make this too strict or it will break the validator to choose different dataset class
|
||||
chat_template: Optional[
|
||||
Union[
|
||||
ChatTemplate,
|
||||
str,
|
||||
]
|
||||
] = None
|
||||
chat_template_jinja: Optional[str] = None
|
||||
data_files: Optional[Union[str, List[str]]] = None
|
||||
name: Optional[str] = None
|
||||
@@ -153,6 +168,10 @@ class SFTDataset(BaseModel):
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_chat_template_config(cls, data):
|
||||
# Set chat_template to tokenizer_default if not set
|
||||
if data.get("type") == "chat_template" and not data.get("chat_template"):
|
||||
data["chat_template"] = ChatTemplate.tokenizer_default
|
||||
|
||||
# if chat_template is set to jinja, chat_template_jinja is required
|
||||
if data.get("chat_template") == ChatTemplate.jinja and not data.get(
|
||||
"chat_template_jinja"
|
||||
@@ -210,16 +229,6 @@ class KTODataset(BaseModel):
|
||||
trust_remote_code: Optional[bool] = False
|
||||
|
||||
|
||||
class RLType(str, Enum):
|
||||
"""RL trainer type configuration subset"""
|
||||
|
||||
dpo = "dpo" # pylint: disable=invalid-name
|
||||
ipo = "ipo" # pylint: disable=invalid-name
|
||||
orpo = "orpo" # pylint: disable=invalid-name
|
||||
kto = "kto" # pylint: disable=invalid-name
|
||||
simpo = "simpo" # pylint: disable=invalid-name
|
||||
|
||||
|
||||
class LoftQConfig(BaseModel):
|
||||
"""LoftQ configuration subset"""
|
||||
|
||||
@@ -254,11 +263,12 @@ class LoraConfig(BaseModel):
|
||||
lora_r: Optional[int] = None
|
||||
lora_alpha: Optional[int] = None
|
||||
lora_fan_in_fan_out: Optional[bool] = None
|
||||
lora_target_modules: Optional[List[str]] = None
|
||||
lora_target_modules: Optional[Union[str, List[str]]] = None
|
||||
lora_target_linear: Optional[bool] = None
|
||||
lora_modules_to_save: Optional[List[str]] = None
|
||||
lora_dropout: Optional[float] = 0.0
|
||||
peft_layers_to_transform: Optional[List[int]] = None
|
||||
peft_layers_pattern: Optional[List[str]] = None
|
||||
peft: Optional[PeftConfig] = None
|
||||
peft_use_dora: Optional[bool] = None
|
||||
peft_use_rslora: Optional[bool] = None
|
||||
@@ -324,6 +334,13 @@ class LoraConfig(BaseModel):
|
||||
raise ValueError("Require cfg.load_in_4bit to be True for qlora")
|
||||
return self
|
||||
|
||||
@field_validator("loraplus_lr_embedding")
|
||||
@classmethod
|
||||
def convert_loraplus_lr_embedding(cls, loraplus_lr_embedding):
|
||||
if loraplus_lr_embedding and isinstance(loraplus_lr_embedding, str):
|
||||
loraplus_lr_embedding = float(loraplus_lr_embedding)
|
||||
return loraplus_lr_embedding
|
||||
|
||||
|
||||
class ReLoRAConfig(BaseModel):
|
||||
"""ReLoRA configuration subset"""
|
||||
@@ -347,6 +364,9 @@ class ModelInputConfig(BaseModel):
|
||||
tokenizer_type: Optional[str] = Field(
|
||||
default=None, metadata={"help": "transformers tokenizer class"}
|
||||
)
|
||||
processor_type: Optional[str] = Field(
|
||||
default=None, metadata={"help": "transformers processor class"}
|
||||
)
|
||||
trust_remote_code: Optional[bool] = None
|
||||
|
||||
model_kwargs: Optional[Dict[str, Any]] = None
|
||||
@@ -503,6 +523,19 @@ class WandbConfig(BaseModel):
|
||||
return data
|
||||
|
||||
|
||||
class CometConfig(BaseModel):
|
||||
"""Comet configuration subset"""
|
||||
|
||||
use_comet: Optional[bool] = None
|
||||
comet_api_key: Optional[str] = None
|
||||
comet_workspace: Optional[str] = None
|
||||
comet_project_name: Optional[str] = None
|
||||
comet_experiment_key: Optional[str] = None
|
||||
comet_mode: Optional[str] = None
|
||||
comet_online: Optional[bool] = None
|
||||
comet_experiment_config: Optional[Dict[str, Any]] = None
|
||||
|
||||
|
||||
class GradioConfig(BaseModel):
|
||||
"""Gradio configuration subset"""
|
||||
|
||||
@@ -523,6 +556,7 @@ class AxolotlInputConfig(
|
||||
HyperparametersConfig,
|
||||
WandbConfig,
|
||||
MLFlowConfig,
|
||||
CometConfig,
|
||||
LISAConfig,
|
||||
GradioConfig,
|
||||
RemappedParameters,
|
||||
@@ -549,6 +583,7 @@ class AxolotlInputConfig(
|
||||
dataset_prepared_path: Optional[str] = None
|
||||
dataset_shard_num: Optional[int] = None
|
||||
dataset_shard_idx: Optional[int] = None
|
||||
skip_prepare_dataset: Optional[bool] = False
|
||||
|
||||
pretraining_dataset: Optional[ # type: ignore
|
||||
conlist(Union[PretrainingDataset, SFTDataset], min_length=1)
|
||||
@@ -707,10 +742,12 @@ class AxolotlInputConfig(
|
||||
gpu_memory_limit: Optional[Union[int, str]] = None
|
||||
low_cpu_mem_usage: Optional[bool] = None
|
||||
|
||||
chat_template: Union[
|
||||
ChatTemplate,
|
||||
Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")],
|
||||
] = ChatTemplate.chatml
|
||||
chat_template: Optional[
|
||||
Union[
|
||||
ChatTemplate,
|
||||
Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")],
|
||||
]
|
||||
] = None
|
||||
chat_template_jinja: Optional[str] = None
|
||||
default_system_message: Optional[str] = None
|
||||
|
||||
@@ -753,6 +790,20 @@ class AxolotlInputConfig(
|
||||
)
|
||||
return datasets
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def set_default_chat_template(cls, data):
|
||||
if data.get("chat_template") is None:
|
||||
use_chat_template = any(
|
||||
dataset["type"] == "chat_template"
|
||||
for dataset in data.get("datasets", [])
|
||||
)
|
||||
|
||||
if use_chat_template:
|
||||
data["chat_template"] = ChatTemplate.tokenizer_default
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_batch_size_fields(cls, data):
|
||||
@@ -1000,6 +1051,26 @@ class AxolotlInputConfig(
|
||||
"evaluation_strategy must be empty or set to `steps` when used with evals_per_epoch."
|
||||
)
|
||||
|
||||
if data.get("do_bench_eval") and not (
|
||||
data.get("evals_per_epoch") or data.get("eval_steps")
|
||||
):
|
||||
raise ValueError(
|
||||
"do_bench_eval requires evals_per_epoch or eval_steps to be set."
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_test_datasets_bench(cls, data):
|
||||
if (
|
||||
data.get("do_bench_eval")
|
||||
and not data.get("test_datasets")
|
||||
and not data.get("val_set_size")
|
||||
):
|
||||
LOG.warning(
|
||||
"`do_bench_eval` needs a test dataset to run evals, adding an empty test_dataset."
|
||||
)
|
||||
data["test_datasets"] = [{"path": "axolotl-ai-co/empty-test-ds"}]
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@@ -1037,6 +1108,18 @@ class AxolotlInputConfig(
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_mm_prepare(cls, data):
|
||||
if data.get("skip_prepare_dataset"):
|
||||
if data.get("remove_unused_columns") is None:
|
||||
LOG.info(
|
||||
"setting `remove_unused_columns: false` for skip_prepare_dataset"
|
||||
)
|
||||
data["remove_unused_columns"] = False
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_warmup(cls, data):
|
||||
@@ -1064,12 +1147,20 @@ class AxolotlInputConfig(
|
||||
return neftune_noise_alpha
|
||||
|
||||
@model_validator(mode="after")
|
||||
def check(self):
|
||||
def check_rl_beta(self):
|
||||
if self.dpo_beta and not self.rl_beta:
|
||||
self.rl_beta = self.dpo_beta
|
||||
del self.dpo_beta
|
||||
return self
|
||||
|
||||
@model_validator(mode="after")
|
||||
def check_simpo_warmup(self):
|
||||
if self.rl == "simpo" and self.warmup_ratio:
|
||||
raise ValueError(
|
||||
"warmup_ratio is not supported with the simpo trainer. Please use `warmup_steps` instead"
|
||||
)
|
||||
return self
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_frozen(cls, data):
|
||||
@@ -1084,6 +1175,15 @@ class AxolotlInputConfig(
|
||||
|
||||
return data
|
||||
|
||||
@model_validator(mode="before")
|
||||
@classmethod
|
||||
def check_peft_layers_pattern(cls, data):
|
||||
if data.get("peft_layers_pattern") and not data.get("peft_layers_to_transform"):
|
||||
raise ValueError(
|
||||
"peft_layers_pattern requires peft_layers_to_transform to be set"
|
||||
)
|
||||
return data
|
||||
|
||||
@model_validator(mode="after")
|
||||
def check_fft_possible_bad_config(self):
|
||||
if (
|
||||
|
||||
@@ -51,20 +51,31 @@ from axolotl.utils.trainer import (
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
def prepare_dataset(cfg, tokenizer):
|
||||
def prepare_dataset(cfg, tokenizer, processor=None):
|
||||
prompters = []
|
||||
if not cfg.pretraining_dataset:
|
||||
with zero_first(is_local_main_process()):
|
||||
if cfg.test_datasets:
|
||||
train_dataset, _, prompters = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH, split="train"
|
||||
tokenizer,
|
||||
cfg,
|
||||
DEFAULT_DATASET_PREPARED_PATH,
|
||||
split="train",
|
||||
processor=processor,
|
||||
)
|
||||
_, eval_dataset, _ = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH, split="test"
|
||||
tokenizer,
|
||||
cfg,
|
||||
DEFAULT_DATASET_PREPARED_PATH,
|
||||
split="test",
|
||||
processor=processor,
|
||||
)
|
||||
else:
|
||||
train_dataset, eval_dataset, prompters = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||
tokenizer,
|
||||
cfg,
|
||||
DEFAULT_DATASET_PREPARED_PATH,
|
||||
processor=processor,
|
||||
)
|
||||
else:
|
||||
path = cfg.pretraining_dataset
|
||||
@@ -123,6 +134,7 @@ def load_tokenized_prepared_datasets(
|
||||
cfg,
|
||||
default_dataset_prepared_path,
|
||||
split="train",
|
||||
processor=None,
|
||||
) -> Tuple[DatasetDict, List[Prompter]]:
|
||||
cfg_datasets = cfg.test_datasets if split == "test" else cfg.datasets
|
||||
tokenizer_name = cfg.tokenizer_config
|
||||
@@ -180,6 +192,7 @@ def load_tokenized_prepared_datasets(
|
||||
cfg.dataset_prepared_path
|
||||
and any(prepared_ds_path.glob("*"))
|
||||
and not cfg.is_preprocess
|
||||
and not cfg.skip_prepare_dataset
|
||||
):
|
||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||
dataset = load_from_disk(str(prepared_ds_path))
|
||||
@@ -423,12 +436,16 @@ def load_tokenized_prepared_datasets(
|
||||
dataset=ds,
|
||||
d_base_type=d_base_type,
|
||||
d_prompt_style=d_prompt_style,
|
||||
processor=processor,
|
||||
)
|
||||
datasets.append(dataset_wrapper)
|
||||
prompters.append(dataset_prompter)
|
||||
|
||||
LOG.info("merging datasets")
|
||||
dataset = concatenate_datasets(datasets)
|
||||
if len(datasets) == 1:
|
||||
dataset = datasets[0]
|
||||
else:
|
||||
LOG.info("merging datasets")
|
||||
dataset = concatenate_datasets(datasets)
|
||||
|
||||
if len(datasets) > 1:
|
||||
if cfg.shuffle_merged_datasets:
|
||||
@@ -437,9 +454,10 @@ def load_tokenized_prepared_datasets(
|
||||
else:
|
||||
LOG.debug("NOT shuffling merged datasets")
|
||||
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
if not cfg.skip_prepare_dataset:
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
if cfg.local_rank == 0 and not cfg.skip_prepare_dataset:
|
||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||
dataset.save_to_disk(str(prepared_ds_path))
|
||||
if cfg.push_dataset_to_hub:
|
||||
@@ -478,9 +496,14 @@ def load_prepare_datasets(
|
||||
cfg,
|
||||
default_dataset_prepared_path,
|
||||
split="train",
|
||||
processor=None,
|
||||
) -> Tuple[Dataset, Dataset, List[Prompter]]:
|
||||
dataset, prompters = load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path, split=split
|
||||
tokenizer,
|
||||
cfg,
|
||||
default_dataset_prepared_path,
|
||||
split=split,
|
||||
processor=processor,
|
||||
)
|
||||
|
||||
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
|
||||
@@ -546,6 +569,7 @@ def get_dataset_wrapper(
|
||||
d_base_type,
|
||||
dataset,
|
||||
d_prompt_style=None,
|
||||
processor=None,
|
||||
):
|
||||
dataset_wrapper = None
|
||||
dataset_prompter = None
|
||||
@@ -578,7 +602,11 @@ def get_dataset_wrapper(
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
elif ds_strategy := load(config_dataset.type, tokenizer, cfg, config_dataset):
|
||||
elif cfg.skip_prepare_dataset:
|
||||
dataset_wrapper = dataset
|
||||
elif ds_strategy := load(
|
||||
config_dataset.type, tokenizer, cfg, config_dataset, processor=processor
|
||||
):
|
||||
dataset_prompter = UnsupportedPrompter()
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy,
|
||||
|
||||
@@ -28,12 +28,17 @@ from transformers import ( # noqa: F401
|
||||
AddedToken,
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
AutoModelForVision2Seq,
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
AwqConfig,
|
||||
BitsAndBytesConfig,
|
||||
GPTQConfig,
|
||||
LlavaForConditionalGeneration,
|
||||
MllamaForConditionalGeneration,
|
||||
PreTrainedModel,
|
||||
PreTrainedTokenizerBase,
|
||||
ProcessorMixin,
|
||||
)
|
||||
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
|
||||
|
||||
@@ -43,9 +48,6 @@ from axolotl.monkeypatch.multipack import (
|
||||
SUPPORTED_MULTIPACK_MODEL_TYPES,
|
||||
patch_for_multipack,
|
||||
)
|
||||
from axolotl.monkeypatch.transformers_dynamic_module_utils import (
|
||||
patch_transformers_dynamic_module_utils,
|
||||
)
|
||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
||||
from axolotl.utils.bench import log_gpu_memory_usage
|
||||
from axolotl.utils.chat_templates import get_chat_template_from_config
|
||||
@@ -57,8 +59,6 @@ from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_mod
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
patch_transformers_dynamic_module_utils()
|
||||
|
||||
|
||||
# copied from accelerator.FullyShardedDataParallelPlugin
|
||||
def get_module_class_from_name(module, name):
|
||||
@@ -85,6 +85,9 @@ def get_module_class_from_name(module, name):
|
||||
|
||||
|
||||
def check_model_config(cfg: DictDefault, model_config: Union[AutoConfig, DictDefault]):
|
||||
if cfg.is_multimodal:
|
||||
model_config = model_config.text_config
|
||||
|
||||
quant_config_exists = (
|
||||
hasattr(model_config, "quantization_config")
|
||||
and model_config.quantization_config
|
||||
@@ -307,11 +310,31 @@ def load_tokenizer(cfg):
|
||||
return tokenizer
|
||||
|
||||
|
||||
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
|
||||
processor_kwargs: Dict[str, Any] = {} # do we actually need this?
|
||||
|
||||
processor_cls = AutoProcessor
|
||||
if cfg.processor_type:
|
||||
processor_cls = getattr(transformers, cfg.processor_type)
|
||||
|
||||
processor = processor_cls.from_pretrained(
|
||||
cfg.processor_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
tokenizer=tokenizer,
|
||||
**processor_kwargs,
|
||||
)
|
||||
|
||||
return processor
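A hedged config sketch for the new loader: once normalize_config marks the run as multimodal, processor_config falls back to the base model, and processor_type can pin a specific transformers processor class (otherwise AutoProcessor is used). The repo id and class below are placeholders, not values from this PR.

# Sketch: what load_processor() consumes. Field values are illustrative only.
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer

cfg = DictDefault(
    {
        "processor_config": "example-org/mllama-checkpoint",  # placeholder repo id
        "processor_type": "MllamaProcessor",  # optional; AutoProcessor if unset
        "trust_remote_code": False,
    }
)
tokenizer = load_tokenizer(cfg)          # assumes cfg also carries tokenizer fields
processor = load_processor(cfg, tokenizer)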
|
||||
|
||||
|
||||
def load_model(
|
||||
cfg: DictDefault,
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
*,
|
||||
processor: ProcessorMixin = None, # pylint: disable=unused-argument
|
||||
inference: bool = False,
|
||||
reference_model: bool = False,
|
||||
**kwargs, # pylint: disable=unused-argument
|
||||
) -> Tuple[PreTrainedModel, Optional[PeftConfig]]:
|
||||
"""
|
||||
Load a model for a given configuration and tokenizer.
|
||||
@@ -327,12 +350,23 @@ def load_model(
|
||||
plugin_manager = PluginManager.get_instance()
|
||||
plugin_manager.pre_model_load(cfg)
|
||||
|
||||
if cfg.is_multimodal:
|
||||
text_model_config = model_config.text_config
|
||||
else:
|
||||
text_model_config = model_config
|
||||
|
||||
# TODO refactor as a kwarg
|
||||
load_in_8bit = cfg.load_in_8bit
|
||||
|
||||
if cfg.gradient_checkpointing == "unsloth":
|
||||
transformers.modeling_utils.checkpoint = hf_grad_checkpoint_unsloth_wrapper
|
||||
|
||||
if hasattr(model_config, "model_type") and model_config.model_type == "mllama":
|
||||
if cfg.flash_attention:
|
||||
from axolotl.monkeypatch.attention.mllama import patch_mllama
|
||||
|
||||
patch_mllama()
|
||||
|
||||
if hasattr(model_config, "model_type") and model_config.model_type == "btlm":
|
||||
if cfg.flash_attention:
|
||||
from axolotl.monkeypatch.btlm_attn_hijack_flash import (
|
||||
@@ -469,6 +503,19 @@ def load_model(
|
||||
max_memory = cfg.max_memory
|
||||
device_map = cfg.device_map
|
||||
|
||||
AutoModelLoader = AutoModelForCausalLM # pylint: disable=invalid-name
|
||||
if cfg.is_multimodal:
|
||||
if model_config.model_type == "llava":
|
||||
AutoModelLoader = ( # pylint: disable=invalid-name
|
||||
LlavaForConditionalGeneration
|
||||
)
|
||||
elif model_config.model_type == "mllama":
|
||||
AutoModelLoader = ( # pylint: disable=invalid-name
|
||||
MllamaForConditionalGeneration
|
||||
)
|
||||
else:
|
||||
AutoModelLoader = AutoModelForVision2Seq # pylint: disable=invalid-name
|
||||
|
||||
if cfg.gpu_memory_limit:
|
||||
gpu_memory_limit = (
|
||||
str(cfg.gpu_memory_limit) + "GiB"
|
||||
@@ -486,7 +533,7 @@ def load_model(
|
||||
from accelerate import infer_auto_device_map
|
||||
|
||||
with init_empty_weights():
|
||||
model_canvas = AutoModelForCausalLM.from_config(
|
||||
model_canvas = AutoModelLoader.from_config(
|
||||
model_config, trust_remote_code=cfg.trust_remote_code or False
|
||||
)
|
||||
model_canvas.tie_weights()
|
||||
@@ -641,6 +688,8 @@ def load_model(
|
||||
quantization_config = (
|
||||
quantization_config or model_kwargs["quantization_config"]
|
||||
)
|
||||
if cfg.is_multimodal:
|
||||
model_config.text_config = text_model_config
|
||||
model = load_sharded_model_quant(
|
||||
base_model,
|
||||
model_config,
|
||||
@@ -659,7 +708,9 @@ def load_model(
|
||||
if "device_map" in model_kwargs:
|
||||
del model_kwargs["device_map"]
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
if cfg.is_multimodal:
|
||||
model_config.text_config = text_model_config
|
||||
model = AutoModelLoader.from_pretrained(
|
||||
base_model,
|
||||
config=model_config,
|
||||
**model_kwargs,
|
||||
@@ -698,13 +749,17 @@ def load_model(
|
||||
and not cfg.trust_remote_code
|
||||
):
|
||||
if cfg.gptq:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
if cfg.is_multimodal:
|
||||
model_config.text_config = text_model_config
|
||||
model = AutoModelLoader.from_pretrained(
|
||||
base_model,
|
||||
config=model_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
**model_kwargs,
|
||||
)
|
||||
else:
|
||||
if cfg.is_multimodal:
|
||||
model_config.text_config = text_model_config
|
||||
model = getattr(transformers, model_type).from_pretrained(
|
||||
base_model,
|
||||
config=model_config,
|
||||
@@ -715,21 +770,23 @@ def load_model(
|
||||
# Shouldn't be a problem most of the time. will obviously error if the model doesn't support this
|
||||
# when training starts
|
||||
if (
|
||||
hasattr(model_config, "max_seq_len")
|
||||
and model_config.max_seq_len
|
||||
hasattr(text_model_config, "max_seq_len")
|
||||
and text_model_config.max_seq_len
|
||||
and cfg.sequence_len > model_config.max_seq_len
|
||||
):
|
||||
model_config.max_seq_len = cfg.sequence_len
|
||||
text_model_config.max_seq_len = cfg.sequence_len
|
||||
LOG.warning(f"increasing context length to {cfg.sequence_len}")
|
||||
elif (
|
||||
hasattr(model_config, "max_sequence_length")
|
||||
and model_config.max_sequence_length
|
||||
and cfg.sequence_len > model_config.max_sequence_length
|
||||
hasattr(text_model_config, "max_sequence_length")
|
||||
and text_model_config.max_sequence_length
|
||||
and cfg.sequence_len > text_model_config.max_sequence_length
|
||||
):
|
||||
model_config.max_sequence_length = cfg.sequence_len
|
||||
text_model_config.max_sequence_length = cfg.sequence_len
|
||||
LOG.warning(f"increasing context length to {cfg.sequence_len}")
|
||||
if cfg.gptq:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
if cfg.is_multimodal:
|
||||
model_config.text_config = text_model_config
|
||||
model = AutoModelLoader.from_pretrained(
|
||||
base_model,
|
||||
config=model_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
@@ -742,7 +799,9 @@ def load_model(
|
||||
if "device_map" in model_kwargs:
|
||||
del model_kwargs["device_map"]
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
if cfg.is_multimodal:
|
||||
model_config.text_config = text_model_config
|
||||
model = AutoModelLoader.from_pretrained(
|
||||
base_model,
|
||||
config=model_config,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
@@ -1024,12 +1083,17 @@ def load_lora(model, cfg, inference=False, config_only=False):
|
||||
|
||||
from peft import LoraConfig, get_peft_model
|
||||
|
||||
lora_target_modules = list(cfg.lora_target_modules or [])
|
||||
lora_target_modules = cfg.lora_target_modules or []
|
||||
|
||||
if cfg.lora_target_linear:
|
||||
linear_names = find_all_linear_names(model)
|
||||
LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
|
||||
lora_target_modules = list(set(lora_target_modules + linear_names))
|
||||
lora_target_modules_as_list = (
|
||||
lora_target_modules
|
||||
if isinstance(lora_target_modules, list)
|
||||
else [lora_target_modules]
|
||||
)
|
||||
lora_target_modules = list(set(lora_target_modules_as_list + linear_names))
|
||||
|
||||
lora_config_kwargs = {}
|
||||
loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
|
||||
@@ -1048,6 +1112,7 @@ def load_lora(model, cfg, inference=False, config_only=False):
|
||||
lora_alpha=cfg.lora_alpha,
|
||||
target_modules=lora_target_modules,
|
||||
layers_to_transform=cfg.peft_layers_to_transform,
|
||||
layers_pattern=cfg.peft_layers_pattern,
|
||||
lora_dropout=cfg.lora_dropout,
|
||||
fan_in_fan_out=cfg.lora_fan_in_fan_out,
|
||||
modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
|
||||
|
||||
@@ -306,7 +306,7 @@ def process_pretraining_datasets_for_packing(
|
||||
|
||||
|
||||
def calculate_total_num_steps(cfg, train_dataset, update=True):
|
||||
if not cfg.total_num_tokens:
|
||||
if not cfg.total_num_tokens and not cfg.skip_prepare_dataset:
|
||||
total_num_tokens = np.sum(
|
||||
train_dataset.data.column("input_ids")
|
||||
.to_pandas()
|
||||
@@ -319,7 +319,11 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
|
||||
|
||||
skip_estimates = cfg.model_config_type == "mamba"
|
||||
|
||||
if not skip_estimates and not cfg.total_supervised_tokens:
|
||||
if (
|
||||
not skip_estimates
|
||||
and not cfg.total_supervised_tokens
|
||||
and not cfg.skip_prepare_dataset
|
||||
):
|
||||
total_supervised_tokens = (
|
||||
train_dataset.data.column("labels")
|
||||
.to_pandas()
|
||||
@@ -478,13 +482,15 @@ def prepare_opinionated_env(cfg):
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
|
||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||
def setup_trainer(
|
||||
cfg, train_dataset, eval_dataset, model, tokenizer, processor, total_num_steps
|
||||
):
|
||||
if cfg.rl in ["dpo", "ipo", "orpo", "kto", "simpo"]:
|
||||
trainer_builder = HFRLTrainerBuilder(cfg, model[0], tokenizer)
|
||||
trainer_builder = HFRLTrainerBuilder(cfg, model[0], tokenizer, processor)
|
||||
trainer_builder.model_ref = model[1]
|
||||
trainer_builder.peft_config = model[2]
|
||||
else:
|
||||
trainer_builder = HFCausalTrainerBuilder(cfg, model[0], tokenizer)
|
||||
trainer_builder = HFCausalTrainerBuilder(cfg, model[0], tokenizer, processor)
|
||||
|
||||
trainer_builder.train_dataset = train_dataset
|
||||
trainer_builder.eval_dataset = eval_dataset
|
||||
|
||||
71
tests/prompt_strategies/conftest.py
Normal file
@@ -0,0 +1,71 @@
|
||||
"""
|
||||
shared fixtures for prompt strategies tests
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from datasets import Dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
|
||||
@pytest.fixture(name="assistant_dataset")
|
||||
def fixture_assistant_dataset():
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"messages": [
|
||||
{"role": "user", "content": "hello"},
|
||||
{"role": "assistant", "content": "hello"},
|
||||
{"role": "user", "content": "goodbye"},
|
||||
{"role": "assistant", "content": "goodbye"},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="sharegpt_dataset")
|
||||
def fixture_sharegpt_dataset():
|
||||
# pylint: disable=duplicate-code
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"conversations": [
|
||||
{"from": "human", "value": "hello"},
|
||||
{"from": "gpt", "value": "hello"},
|
||||
{"from": "human", "value": "goodbye"},
|
||||
{"from": "gpt", "value": "goodbye"},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="basic_dataset")
|
||||
def fixture_basic_dataset():
|
||||
# pylint: disable=duplicate-code
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"conversations": [
|
||||
{"from": "system", "value": "You are an AI assistant."},
|
||||
{"from": "human", "value": "Hello"},
|
||||
{"from": "assistant", "value": "Hi there!"},
|
||||
{"from": "human", "value": "How are you?"},
|
||||
{"from": "assistant", "value": "I'm doing well, thank you!"},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="llama3_tokenizer")
|
||||
def fixture_llama3_tokenizer():
|
||||
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
@pytest.fixture(name="phi35_tokenizer")
|
||||
def fixture_phi35_tokenizer():
|
||||
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
|
||||
return tokenizer
|
||||
@@ -5,10 +5,6 @@ tests for chat_template prompt strategy
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
from datasets import Dataset
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from axolotl.prompt_strategies.chat_template import (
|
||||
ChatTemplatePrompter,
|
||||
ChatTemplateStrategy,
|
||||
@@ -22,657 +18,6 @@ logging.basicConfig(level=logging.DEBUG)
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
@pytest.fixture(name="assistant_dataset")
|
||||
def fixture_assistant_dataset():
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"messages": [
|
||||
{"role": "user", "content": "hello"},
|
||||
{"role": "assistant", "content": "hello"},
|
||||
{"role": "user", "content": "goodbye"},
|
||||
{"role": "assistant", "content": "goodbye"},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="sharegpt_dataset")
|
||||
def fixture_sharegpt_dataset():
|
||||
# pylint: disable=duplicate-code
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"conversations": [
|
||||
{"from": "human", "value": "hello"},
|
||||
{"from": "gpt", "value": "hello"},
|
||||
{"from": "human", "value": "goodbye"},
|
||||
{"from": "gpt", "value": "goodbye"},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="basic_dataset")
|
||||
def fixture_basic_dataset():
|
||||
# pylint: disable=duplicate-code
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"conversations": [
|
||||
{"from": "system", "value": "You are an AI assistant."},
|
||||
{"from": "human", "value": "Hello"},
|
||||
{"from": "assistant", "value": "Hi there!"},
|
||||
{"from": "human", "value": "How are you?"},
|
||||
{"from": "assistant", "value": "I'm doing well, thank you!"},
|
||||
]
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="llama3_tokenizer")
|
||||
def fixture_llama3_tokenizer():
|
||||
tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
class TestChatTemplateConfigurations:
|
||||
"""
|
||||
Test class for various configurations of ChatTemplateStrategy.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def find_sublist(full_list, sub_list):
|
||||
token_count = len(sub_list)
|
||||
for index in range(len(full_list) - token_count + 1):
|
||||
if full_list[index : index + token_count] == sub_list:
|
||||
return index
|
||||
return -1
|
||||
|
||||
def test_train_on_inputs_true(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_inputs=True")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=True,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that assistant responses are labeled
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Assistant response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert start_idx != -1, f"Could not find '{response}' in input_ids"
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
# Check the behavior of human inputs
|
||||
human_inputs = ["Hello", "How are you?"]
|
||||
for input_text in human_inputs:
|
||||
input_ids = llama3_tokenizer.encode(input_text, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, input_ids)
|
||||
labeled = all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(input_ids)]
|
||||
)
|
||||
LOG.debug(
|
||||
f"Human input '{input_text}' is {'labeled' if labeled else 'not labeled'}, expected IDs: {input_ids}, found at: {start_idx}"
|
||||
)
|
||||
|
||||
LOG.debug("Full labels: %s", labels)
|
||||
LOG.debug("Full input_ids: %s", input_ids)
|
||||
|
||||
def test_train_on_inputs_false(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_inputs=False")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that only assistant responses are labeled
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Assistant response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert start_idx != -1, f"Could not find '{response}' in input_ids"
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
# Verify that human inputs are not labeled
|
||||
human_inputs = ["Hello", "How are you?"]
|
||||
for input_text in human_inputs:
|
||||
input_ids = llama3_tokenizer.encode(input_text, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, input_ids)
|
||||
LOG.debug(
|
||||
f"Human input '{input_text}' expected IDs: {input_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert start_idx != -1, f"Could not find '{input_text}' in input_ids"
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(input_ids)]
|
||||
), f"Expected labels for human input '{input_text}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:start_idx+len(input_ids)]}"
|
||||
|
||||
def test_roles_to_train_assistant_only(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing roles_to_train with assistant only")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that only assistant responses are labeled
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Assistant response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
def test_roles_to_train_all(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing roles_to_train with all roles")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=True,
|
||||
sequence_len=512,
|
||||
roles_to_train=["human", "assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that all responses are labeled (except for special tokens)
|
||||
all_responses = [
|
||||
"Hello",
|
||||
"Hi there!",
|
||||
"How are you?",
|
||||
"I'm doing well, thank you!",
|
||||
]
|
||||
for response in all_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
def test_empty_roles_to_train(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with empty roles_to_train")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=[],
|
||||
train_on_eos="none", # Add this line
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
|
||||
# Verify that no labels are set when roles_to_train is empty
|
||||
LOG.debug("Full labels: %s", labels)
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID for label in labels
|
||||
), "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty"
|
||||
|
||||
def test_train_on_eos_all(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='all'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="all",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
eos_indices = [
|
||||
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
|
||||
]
|
||||
|
||||
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
|
||||
for eos_idx in eos_indices:
|
||||
assert (
|
||||
labels[eos_idx] != IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token at index {eos_idx} to be labeled"
|
||||
|
||||
def test_train_on_eos_turn(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='turn'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="turn",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
assert start_idx != -1, f"Could not find '{response}' in input_ids"
|
||||
|
||||
eos_idx = start_idx + len(response_ids)
|
||||
while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id:
|
||||
eos_idx += 1
|
||||
|
||||
assert eos_idx < len(
|
||||
input_ids
|
||||
), f"Could not find EOS token after '{response}'"
|
||||
assert (
|
||||
labels[eos_idx] != IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token after assistant response '{response}' to be labeled"
|
||||
|
||||
# Check that EOS tokens after human inputs are not labeled
|
||||
human_inputs = ["Hello", "How are you?"]
|
||||
for input_text in human_inputs:
|
||||
input_ids = llama3_tokenizer.encode(input_text, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, input_ids)
|
||||
assert start_idx != -1, f"Could not find '{input_text}' in input_ids"
|
||||
|
||||
eos_idx = start_idx + len(input_ids)
|
||||
while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id:
|
||||
eos_idx += 1
|
||||
|
||||
assert (
|
||||
labels[eos_idx] == IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token after human input '{input_text}' to not be labeled"
|
||||
|
||||
def test_train_on_eos_last(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='last'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="last",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
eos_indices = [
|
||||
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
|
||||
]
|
||||
|
||||
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
|
||||
last_eos_idx = eos_indices[-1]
|
||||
|
||||
# Check that only the last EOS token is labeled
|
||||
for idx in eos_indices[:-1]:
|
||||
assert (
|
||||
labels[idx] == IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token at index {idx} to not be labeled"
|
||||
assert (
|
||||
labels[last_eos_idx] != IGNORE_TOKEN_ID
|
||||
), f"Expected last EOS token at index {last_eos_idx} to be labeled"
|
||||
|
||||
def test_train_on_eos_none(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='none'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="none",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
eos_indices = [
|
||||
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
|
||||
]
|
||||
|
||||
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
|
||||
for eos_idx in eos_indices:
|
||||
assert (
|
||||
labels[eos_idx] == IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token at index {eos_idx} to not be labeled"
|
||||
|
||||
def test_drop_system_message(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with drop_system_message=True")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, get_chat_template("llama3"), drop_system_message=True
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Check if system message is not present in input_ids
|
||||
system_message = "You are an AI assistant."
|
||||
system_ids = llama3_tokenizer.encode(system_message, add_special_tokens=False)
|
||||
assert (
|
||||
self.find_sublist(input_ids, system_ids) == -1
|
||||
), "Expected system message to be dropped"
|
||||
|
||||
def test_custom_roles(self, llama3_tokenizer):
|
||||
LOG.info("Testing with custom roles mapping")
|
||||
custom_roles = {
|
||||
"user": ["human", "user"],
|
||||
"assistant": ["ai", "assistant"],
|
||||
"system": ["context"],
|
||||
}
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, get_chat_template("llama3"), roles=custom_roles
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["ai"],
|
||||
)
|
||||
|
||||
# Create a new dataset with modified role names
|
||||
modified_conversations = [
|
||||
{"from": "context", "value": "You are an AI assistant."},
|
||||
{"from": "human", "value": "Hello"},
|
||||
{"from": "ai", "value": "Hi there!"},
|
||||
{"from": "human", "value": "How are you?"},
|
||||
{"from": "ai", "value": "I'm doing well, thank you!"},
|
||||
]
|
||||
|
||||
modified_dataset = Dataset.from_dict(
|
||||
{"conversations": [modified_conversations]}
|
||||
)
|
||||
|
||||
res = strategy.tokenize_prompt(modified_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Check if AI responses are labeled correctly
|
||||
ai_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in ai_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
assert start_idx != -1, f"Could not find response '{response}' in input_ids"
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for AI response '{response}' to be set"
|
||||
|
||||
# Check if human messages are not labeled
|
||||
human_messages = ["Hello", "How are you?"]
|
||||
for message in human_messages:
|
||||
message_ids = llama3_tokenizer.encode(message, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, message_ids)
|
||||
assert start_idx != -1, f"Could not find message '{message}' in input_ids"
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(message_ids)]
|
||||
), f"Expected labels for human message '{message}' to be IGNORE_TOKEN_ID"
|
||||
|
||||
def test_message_field_training(self, llama3_tokenizer):
|
||||
LOG.info("Testing with message_field_training")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer,
|
||||
get_chat_template("llama3"),
|
||||
message_field_training="train",
|
||||
message_field_training_detail="train_detail",
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=[],
|
||||
)
|
||||
|
||||
# Create a new dataset with the train and train_detail fields
|
||||
modified_conversation = [
|
||||
{"from": "system", "value": "You are an AI assistant.", "train": False},
|
||||
{"from": "human", "value": "Hello", "train": False},
|
||||
{"from": "assistant", "value": "Hello", "train": True},
|
||||
{"from": "human", "value": "How are you?", "train": True},
|
||||
{
|
||||
"from": "assistant",
|
||||
"value": "I'm doing very well, thank you!",
|
||||
"train_detail": [
|
||||
{"begin_offset": 0, "end_offset": 8, "train": False},
|
||||
{"begin_offset": 9, "end_offset": 18, "train": True},
|
||||
{"begin_offset": 19, "end_offset": 30, "train": False},
|
||||
],
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "I'm doing very well, thank you!",
|
||||
"train": False,
|
||||
},
|
||||
{"from": "assistant", "value": "Hi there!", "train": True},
|
||||
]
|
||||
|
||||
modified_dataset = Dataset.from_dict({"conversations": [modified_conversation]})
|
||||
|
||||
res = strategy.tokenize_prompt(modified_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Function to find all occurrences of a sublist
|
||||
def find_all_sublists(full_list, sub_list):
|
||||
indices = []
|
||||
for index in range(len(full_list) - len(sub_list) + 1):
|
||||
if full_list[index : index + len(sub_list)] == sub_list:
|
||||
indices.append(index)
|
||||
return indices
|
||||
|
||||
# Keep track of which occurrences we've processed
|
||||
processed_occurrences = {}
|
||||
# Check if messages are labeled correctly based on train or train_detail
|
||||
for i, turn in enumerate(modified_conversation):
|
||||
turn_tokens = llama3_tokenizer.encode(
|
||||
turn["value"], add_special_tokens=False
|
||||
)
|
||||
occurrences = find_all_sublists(input_ids, turn_tokens)
|
||||
turn_key = turn["value"]
|
||||
if turn_key not in processed_occurrences:
|
||||
processed_occurrences[turn_key] = 0
|
||||
current_occurrence = processed_occurrences[turn_key]
|
||||
|
||||
if current_occurrence >= len(occurrences):
|
||||
assert (
|
||||
False
|
||||
), f"Not enough occurrences found for message: {turn['value']}"
|
||||
|
||||
start_idx = occurrences[current_occurrence]
|
||||
processed_occurrences[turn_key] += 1
|
||||
end_idx = start_idx + len(turn_tokens)
|
||||
|
||||
LOG.debug(
|
||||
f"Processing turn {i}: role={turn['from']}, content='{turn['value']}', start_idx={start_idx}, end_idx={end_idx}"
|
||||
)
|
||||
|
||||
if "train_detail" in turn:
|
||||
# Get token offsets
|
||||
tokenized_output = llama3_tokenizer(
|
||||
turn["value"], return_offsets_mapping=True, add_special_tokens=False
|
||||
)
|
||||
token_offsets = tokenized_output["offset_mapping"]
|
||||
|
||||
# Adjust token offsets as done in the implementation
|
||||
for i in range(len(token_offsets) - 1):
|
||||
token_offsets[i] = (
|
||||
token_offsets[i][0],
|
||||
token_offsets[i + 1][0] - 1,
|
||||
)
|
||||
token_offsets[-1] = (token_offsets[-1][0], len(turn["value"]) - 1)
|
||||
|
||||
# Adjust train_details
|
||||
adjusted_train_details = strategy.prompter.adjust_train_details(
|
||||
turn["train_detail"], token_offsets
|
||||
)
|
||||
|
||||
LOG.debug(f"Original train_details: {turn['train_detail']}")
|
||||
LOG.debug(f"Adjusted train_details: {adjusted_train_details}")
|
||||
|
||||
# Handle train_detail
|
||||
token_offsets = strategy.prompter.get_offsets_for_train_detail(
|
||||
text=turn["value"],
|
||||
train_details=adjusted_train_details,
|
||||
mask_untrainable=False,
|
||||
)
|
||||
token_offsets_masked = strategy.prompter.get_offsets_for_train_detail(
|
||||
text=turn["value"],
|
||||
train_details=adjusted_train_details,
|
||||
mask_untrainable=True,
|
||||
)
|
||||
LOG.debug(f"Token offsets: {token_offsets_masked}")
|
||||
|
||||
expected_labels = [IGNORE_TOKEN_ID] * len(turn_tokens)
|
||||
for i, offset in enumerate(token_offsets_masked):
|
||||
if offset != IGNORE_TOKEN_ID:
|
||||
expected_labels[i] = turn_tokens[i]
|
||||
actual_labels = labels[
|
||||
start_idx : start_idx + len(token_offsets_masked)
|
||||
]
|
||||
assert (
|
||||
actual_labels == expected_labels
|
||||
), f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}"
|
||||
|
||||
for detail in adjusted_train_details:
|
||||
# Find the token indices that correspond to the character offsets
|
||||
detail_start = start_idx + next(
|
||||
i
|
||||
for i, offset in enumerate(token_offsets)
|
||||
if offset >= detail["begin_offset"]
|
||||
)
|
||||
detail_end = start_idx + next(
|
||||
(
|
||||
i
|
||||
for i, offset in enumerate(token_offsets)
|
||||
if offset > detail["end_offset"]
|
||||
),
|
||||
len(token_offsets),
|
||||
)
|
||||
|
||||
detail_text = turn["value"][
|
||||
detail["begin_offset"] : detail["end_offset"] + 1
|
||||
]
|
||||
detail_labels = labels[detail_start:detail_end]
|
||||
detail_input_ids = input_ids[detail_start:detail_end]
|
||||
|
||||
LOG.debug(
|
||||
f"Detail: '{detail_text}', Start: {detail_start}, End: {detail_end}"
|
||||
)
|
||||
LOG.debug(f"Detail input_ids: {detail_input_ids}")
|
||||
LOG.debug(f"Detail labels: {detail_labels}")
|
||||
LOG.debug(
|
||||
f"Decoded detail: {llama3_tokenizer.decode(detail_input_ids)}"
|
||||
)
|
||||
LOG.debug(
|
||||
f"Token offsets for this detail: {token_offsets[detail_start-start_idx:detail_end-start_idx]}"
|
||||
)
|
||||
|
||||
if detail["train"]:
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID for label in detail_labels
|
||||
), (
|
||||
f"Expected labels for trainable detail '{detail_text}' to be set, but some were IGNORE_TOKEN_ID. "
|
||||
f"Labels({detail_start}:{detail_end}): {detail_labels}, "
|
||||
f"InputIDs: {detail_input_ids}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(detail_input_ids)}'"
|
||||
)
|
||||
else:
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID for label in detail_labels
|
||||
), (
|
||||
f"Expected all labels for non-trainable detail '{detail_text}' to be IGNORE_TOKEN_ID, but some were not. "
|
||||
f"Labels({detail_start}:{detail_end}): {detail_labels}, "
|
||||
f"InputIDs: {detail_input_ids}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(detail_input_ids)}'"
|
||||
)
|
||||
else:
|
||||
should_train = turn.get("train", False)
|
||||
turn_labels = labels[start_idx:end_idx]
|
||||
|
||||
LOG.debug(f"Should train: {should_train}")
|
||||
LOG.debug(f"Turn indices: start={start_idx}, end={end_idx}")
|
||||
LOG.debug(f"Turn labels: {turn_labels}")
|
||||
LOG.debug(f"Turn input IDs: {input_ids[start_idx:end_idx]}")
|
||||
LOG.debug(
|
||||
f"Decoded turn: {llama3_tokenizer.decode(input_ids[start_idx:end_idx])}"
|
||||
)
|
||||
|
||||
if should_train:
|
||||
assert all(label != IGNORE_TOKEN_ID for label in turn_labels), (
|
||||
f"Expected all labels for '{turn['value']}' to be set\n"
|
||||
f"Labels({start_idx}:{end_idx}): {turn_labels}, "
|
||||
f"InputIDs: {input_ids[start_idx:end_idx]}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(input_ids[start_idx:end_idx])}'"
|
||||
)
|
||||
else:
|
||||
assert all(label == IGNORE_TOKEN_ID for label in turn_labels), (
|
||||
f"Expected all labels for '{turn['value']}' to be IGNORE_TOKEN_ID\n"
|
||||
f"Labels({start_idx}:{end_idx}): {turn_labels}, "
|
||||
f"InputIDs: {input_ids[start_idx:end_idx]}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(input_ids[start_idx:end_idx])}'"
|
||||
)
|
||||
|
||||
LOG.debug(
|
||||
f"Processed turn: {turn['from']}, content: '{turn['value']}', "
|
||||
f"start_idx: {start_idx}, end_idx: {end_idx}, "
|
||||
f"labels: {labels[start_idx:end_idx]}"
|
||||
)
|
||||
|
||||
LOG.debug(f"Final labels: {labels}")
|
||||
LOG.debug(f"Final input_ids: {input_ids}")
|
||||
|
||||
|
||||
class TestAssistantChatTemplateLlama3:
|
||||
"""
|
||||
Test class for assistant style datasets with llama-3 prompts using the chat_template strategy.
|
||||
@@ -728,7 +73,7 @@ class TestAssistantChatTemplateLlama3:
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer,
|
||||
get_chat_template("llama3"),
|
||||
chat_template=get_chat_template("llama3"),
|
||||
message_field_role="role",
|
||||
message_field_content="content",
|
||||
roles={
|
||||
@@ -740,7 +85,6 @@ class TestAssistantChatTemplateLlama3:
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
strategy.messages = "messages"
|
||||
res = strategy.tokenize_prompt(assistant_dataset[0])
|
||||
@@ -764,12 +108,70 @@ class TestAssistantChatTemplateLlama3:
|
||||
input_ids == expected_input_ids
|
||||
), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
|
||||
|
||||
def test_phi35(self, phi35_tokenizer, assistant_dataset):
|
||||
LOG.info("Testing phi-3.5 with assistant dataset")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
phi35_tokenizer,
|
||||
chat_template=get_chat_template("phi_35"),
|
||||
message_field_role="role",
|
||||
message_field_content="content",
|
||||
roles={
|
||||
"user": ["user"],
|
||||
"assistant": ["assistant"],
|
||||
"system": ["system"],
|
||||
},
|
||||
),
|
||||
tokenizer=phi35_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
)
|
||||
strategy.messages = "messages"
|
||||
res = strategy.tokenize_prompt(assistant_dataset[0])
|
||||
input_ids = res["input_ids"]
|
||||
labels = res["labels"]
|
||||
# fmt: off
|
||||
expected_input_ids = [
|
||||
32010, # user
|
||||
22172, 32007, # user eot
|
||||
32001, # assistant
|
||||
22172, 32007, # assistant eot
|
||||
32010, # user
|
||||
1781, 26966, 32007, # user eot
|
||||
32001, # assistant
|
||||
1781, 26966, 32007, # assistant eot
|
||||
32000, # eos
|
||||
]
|
||||
expected_labels = [
|
||||
-100, # user
|
||||
-100, -100, # user eot
|
||||
-100, # assistant
|
||||
-100, -100, # assistant eot,
|
||||
-100, # user
|
||||
-100, -100, -100, # user eot
|
||||
-100, # assistant
|
||||
1781, 26966, 32007, # assistant eot
|
||||
32000, # eos
|
||||
]
|
||||
# fmt: on
|
||||
LOG.debug(f"Expected input_ids: {expected_input_ids}")
|
||||
LOG.debug(f"Actual input_ids: {input_ids}")
|
||||
assert (
|
||||
input_ids == expected_input_ids
|
||||
), f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
|
||||
|
||||
LOG.debug(f"Expected labels : {expected_labels}")
|
||||
LOG.debug(f"Actual labels : {labels}")
|
||||
assert (
|
||||
labels == expected_labels
|
||||
), f"Input IDs mismatch: {labels} != {expected_labels}"
|
||||
|
||||
def test_llama3_with_training_data(self, llama3_tokenizer, assistant_dataset):
|
||||
LOG.info("Testing llama-3 with assistant dataset including training data")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer,
|
||||
get_chat_template("llama3"),
|
||||
chat_template=get_chat_template("llama3"),
|
||||
message_field_role="role",
|
||||
message_field_content="content",
|
||||
message_field_training="training",
|
||||
@@ -825,8 +227,11 @@ class TestSharegptChatTemplateLlama3:
|
||||
|
||||
def test_llama3_assistant(self, llama3_tokenizer, sharegpt_dataset):
|
||||
LOG.info("Testing ShareGPT style datasets with llama-3 assistant prompts")
|
||||
# pylint: disable=duplicate-code
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
train_on_eos="none",
|
||||
@@ -875,8 +280,11 @@ class TestSharegptChatTemplateLlama3:
|
||||
|
||||
def test_llama3_human(self, llama3_tokenizer, sharegpt_dataset):
|
||||
LOG.info("Testing ShareGPT style datasets with llama-3 human prompts")
|
||||
# pylint: disable=duplicate-code
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
train_on_eos="none",
|
||||
@@ -925,8 +333,11 @@ class TestSharegptChatTemplateLlama3:
|
||||
|
||||
def test_llama3_system_human(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing ShareGPT style datasets with llama-3 system/human prompts")
|
||||
# pylint: disable=duplicate-code
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(llama3_tokenizer, get_chat_template("llama3")),
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
train_on_eos="none",
|
||||
|
||||
637
tests/prompt_strategies/test_chat_templates_advanced.py
Normal file
@@ -0,0 +1,637 @@
|
||||
"""
|
||||
tests for chat_template prompt strategy
|
||||
"""
|
||||
|
||||
import logging
|
||||
import unittest
|
||||
|
||||
from datasets import Dataset
|
||||
|
||||
from axolotl.prompt_strategies.chat_template import (
|
||||
ChatTemplatePrompter,
|
||||
ChatTemplateStrategy,
|
||||
)
|
||||
from axolotl.prompters import IGNORE_TOKEN_ID
|
||||
from axolotl.utils.chat_templates import get_chat_template
|
||||
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
class TestChatTemplateConfigurations:
|
||||
"""
|
||||
Test class for various configurations of ChatTemplateStrategy.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def find_sublist(full_list, sub_list):
|
||||
token_count = len(sub_list)
|
||||
for index in range(len(full_list) - token_count + 1):
|
||||
if full_list[index : index + token_count] == sub_list:
|
||||
return index
|
||||
return -1
|
||||
|
||||
def test_train_on_inputs_true(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_inputs=True")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=True,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that assistant responses are labeled
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Assistant response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert start_idx != -1, f"Could not find '{response}' in input_ids"
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
# Check the behavior of human inputs
|
||||
human_inputs = ["Hello", "How are you?"]
|
||||
for input_text in human_inputs:
|
||||
input_ids = llama3_tokenizer.encode(input_text, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, input_ids)
|
||||
labeled = all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(input_ids)]
|
||||
)
|
||||
LOG.debug(
|
||||
f"Human input '{input_text}' is {'labeled' if labeled else 'not labeled'}, expected IDs: {input_ids}, found at: {start_idx}"
|
||||
)
|
||||
|
||||
LOG.debug("Full labels: %s", labels)
|
||||
LOG.debug("Full input_ids: %s", input_ids)
|
||||
|
||||
def test_train_on_inputs_false(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_inputs=False")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that only assistant responses are labeled
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Assistant response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert start_idx != -1, f"Could not find '{response}' in input_ids"
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
# Verify that human inputs are not labeled
|
||||
human_inputs = ["Hello", "How are you?"]
|
||||
for input_text in human_inputs:
|
||||
input_ids = llama3_tokenizer.encode(input_text, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, input_ids)
|
||||
LOG.debug(
|
||||
f"Human input '{input_text}' expected IDs: {input_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert start_idx != -1, f"Could not find '{input_text}' in input_ids"
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(input_ids)]
|
||||
), f"Expected labels for human input '{input_text}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:start_idx+len(input_ids)]}"
|
||||
|
||||
def test_roles_to_train_assistant_only(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing roles_to_train with assistant only")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that only assistant responses are labeled
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Assistant response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
def test_roles_to_train_all(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing roles_to_train with all roles")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=True,
|
||||
sequence_len=512,
|
||||
roles_to_train=["human", "assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Verify that all responses are labeled (except for special tokens)
|
||||
all_responses = [
|
||||
"Hello",
|
||||
"Hi there!",
|
||||
"How are you?",
|
||||
"I'm doing well, thank you!",
|
||||
]
|
||||
for response in all_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
LOG.debug(
|
||||
f"Response '{response}' expected IDs: {response_ids}, found at: {start_idx}"
|
||||
)
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for response '{response}' to be set, but got {labels[start_idx:start_idx+len(response_ids)]}"
|
||||
|
||||
def test_empty_roles_to_train(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with empty roles_to_train")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=[],
|
||||
train_on_eos="none", # Add this line
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
|
||||
# Verify that no labels are set when roles_to_train is empty
|
||||
LOG.debug("Full labels: %s", labels)
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID for label in labels
|
||||
), "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty"
|
||||
|
||||
def test_train_on_eos_all(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='all'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="all",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
eos_indices = [
|
||||
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
|
||||
]
|
||||
|
||||
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
|
||||
for eos_idx in eos_indices:
|
||||
assert (
|
||||
labels[eos_idx] != IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token at index {eos_idx} to be labeled"
|
||||
|
||||
def test_train_on_eos_turn(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='turn'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="turn",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
assistant_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
|
||||
for response in assistant_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
assert start_idx != -1, f"Could not find '{response}' in input_ids"
|
||||
|
||||
eos_idx = start_idx + len(response_ids)
|
||||
while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id:
|
||||
eos_idx += 1
|
||||
|
||||
assert eos_idx < len(
|
||||
input_ids
|
||||
), f"Could not find EOS token after '{response}'"
|
||||
assert (
|
||||
labels[eos_idx] != IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token after assistant response '{response}' to be labeled"
|
||||
|
||||
# Check that EOS tokens after human inputs are not labeled
|
||||
human_inputs = ["Hello", "How are you?"]
|
||||
for input_text in human_inputs:
|
||||
input_ids = llama3_tokenizer.encode(input_text, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, input_ids)
|
||||
assert start_idx != -1, f"Could not find '{input_text}' in input_ids"
|
||||
|
||||
eos_idx = start_idx + len(input_ids)
|
||||
while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id:
|
||||
eos_idx += 1
|
||||
|
||||
assert (
|
||||
labels[eos_idx] == IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token after human input '{input_text}' to not be labeled"
|
||||
|
||||
def test_train_on_eos_last(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='last'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="last",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
eos_indices = [
|
||||
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
|
||||
]
|
||||
|
||||
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
|
||||
last_eos_idx = eos_indices[-1]
|
||||
|
||||
# Check that only the last EOS token is labeled
|
||||
for idx in eos_indices[:-1]:
|
||||
assert (
|
||||
labels[idx] == IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token at index {idx} to not be labeled"
|
||||
assert (
|
||||
labels[last_eos_idx] != IGNORE_TOKEN_ID
|
||||
), f"Expected last EOS token at index {last_eos_idx} to be labeled"
|
||||
|
||||
def test_train_on_eos_none(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with train_on_eos='none'")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer, chat_template=get_chat_template("llama3")
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
train_on_eos="none",
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
eos_token_id = llama3_tokenizer.eos_token_id
|
||||
eos_indices = [
|
||||
i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
|
||||
]
|
||||
|
||||
assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
|
||||
for eos_idx in eos_indices:
|
||||
assert (
|
||||
labels[eos_idx] == IGNORE_TOKEN_ID
|
||||
), f"Expected EOS token at index {eos_idx} to not be labeled"
|
||||
|
||||
def test_drop_system_message(self, llama3_tokenizer, basic_dataset):
|
||||
LOG.info("Testing with drop_system_message=True")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer,
|
||||
chat_template=get_chat_template("llama3"),
|
||||
drop_system_message=True,
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["assistant"],
|
||||
)
|
||||
res = strategy.tokenize_prompt(basic_dataset[0])
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Check if system message is not present in input_ids
|
||||
system_message = "You are an AI assistant."
|
||||
system_ids = llama3_tokenizer.encode(system_message, add_special_tokens=False)
|
||||
assert (
|
||||
self.find_sublist(input_ids, system_ids) == -1
|
||||
), "Expected system message to be dropped"
|
||||
|
||||
def test_custom_roles(self, llama3_tokenizer):
|
||||
LOG.info("Testing with custom roles mapping")
|
||||
custom_roles = {
|
||||
"user": ["human", "user"],
|
||||
"assistant": ["ai", "assistant"],
|
||||
"system": ["context"],
|
||||
}
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer,
|
||||
chat_template=get_chat_template("llama3"),
|
||||
roles=custom_roles,
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=["ai"],
|
||||
)
|
||||
|
||||
# Create a new dataset with modified role names
|
||||
modified_conversations = [
|
||||
{"from": "context", "value": "You are an AI assistant."},
|
||||
{"from": "human", "value": "Hello"},
|
||||
{"from": "ai", "value": "Hi there!"},
|
||||
{"from": "human", "value": "How are you?"},
|
||||
{"from": "ai", "value": "I'm doing well, thank you!"},
|
||||
]
|
||||
|
||||
modified_dataset = Dataset.from_dict(
|
||||
{"conversations": [modified_conversations]}
|
||||
)
|
||||
|
||||
res = strategy.tokenize_prompt(modified_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Check if AI responses are labeled correctly
|
||||
ai_responses = ["Hi there!", "I'm doing well, thank you!"]
|
||||
for response in ai_responses:
|
||||
response_ids = llama3_tokenizer.encode(response, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, response_ids)
|
||||
assert start_idx != -1, f"Could not find response '{response}' in input_ids"
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(response_ids)]
|
||||
), f"Expected labels for AI response '{response}' to be set"
|
||||
|
||||
# Check if human messages are not labeled
|
||||
human_messages = ["Hello", "How are you?"]
|
||||
for message in human_messages:
|
||||
message_ids = llama3_tokenizer.encode(message, add_special_tokens=False)
|
||||
start_idx = self.find_sublist(input_ids, message_ids)
|
||||
assert start_idx != -1, f"Could not find message '{message}' in input_ids"
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID
|
||||
for label in labels[start_idx : start_idx + len(message_ids)]
|
||||
), f"Expected labels for human message '{message}' to be IGNORE_TOKEN_ID"
|
||||
|
||||
def test_message_field_training(self, llama3_tokenizer):
|
||||
LOG.info("Testing with message_field_training")
|
||||
strategy = ChatTemplateStrategy(
|
||||
ChatTemplatePrompter(
|
||||
llama3_tokenizer,
|
||||
chat_template=get_chat_template("llama3"),
|
||||
message_field_training="train",
|
||||
message_field_training_detail="train_detail",
|
||||
),
|
||||
tokenizer=llama3_tokenizer,
|
||||
train_on_inputs=False,
|
||||
sequence_len=512,
|
||||
roles_to_train=[],
|
||||
)
|
||||
|
||||
# Create a new dataset with the train and train_detail fields
|
||||
modified_conversation = [
|
||||
{"from": "system", "value": "You are an AI assistant.", "train": False},
|
||||
{"from": "human", "value": "Hello", "train": False},
|
||||
{"from": "assistant", "value": "Hello", "train": True},
|
||||
{"from": "human", "value": "How are you?", "train": True},
|
||||
{
|
||||
"from": "assistant",
|
||||
"value": "I'm doing very well, thank you!",
|
||||
"train_detail": [
|
||||
{"begin_offset": 0, "end_offset": 8, "train": False},
|
||||
{"begin_offset": 9, "end_offset": 18, "train": True},
|
||||
{"begin_offset": 19, "end_offset": 30, "train": False},
|
||||
],
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "I'm doing very well, thank you!",
|
||||
"train": False,
|
||||
},
|
||||
{"from": "assistant", "value": "Hi there!", "train": True},
|
||||
]
|
||||
|
||||
modified_dataset = Dataset.from_dict({"conversations": [modified_conversation]})
|
||||
|
||||
res = strategy.tokenize_prompt(modified_dataset[0])
|
||||
labels = res["labels"]
|
||||
input_ids = res["input_ids"]
|
||||
|
||||
# Function to find all occurrences of a sublist
|
||||
def find_all_sublists(full_list, sub_list):
|
||||
indices = []
|
||||
for index in range(len(full_list) - len(sub_list) + 1):
|
||||
if full_list[index : index + len(sub_list)] == sub_list:
|
||||
indices.append(index)
|
||||
return indices
|
||||
|
||||
# Keep track of which occurrences we've processed
|
||||
processed_occurrences = {}
|
||||
# Check if messages are labeled correctly based on train or train_detail
|
||||
for i, turn in enumerate(modified_conversation):
|
||||
turn_tokens = llama3_tokenizer.encode(
|
||||
turn["value"], add_special_tokens=False
|
||||
)
|
||||
occurrences = find_all_sublists(input_ids, turn_tokens)
|
||||
turn_key = turn["value"]
|
||||
if turn_key not in processed_occurrences:
|
||||
processed_occurrences[turn_key] = 0
|
||||
current_occurrence = processed_occurrences[turn_key]
|
||||
|
||||
if current_occurrence >= len(occurrences):
|
||||
assert (
|
||||
False
|
||||
), f"Not enough occurrences found for message: {turn['value']}"
|
||||
|
||||
start_idx = occurrences[current_occurrence]
|
||||
processed_occurrences[turn_key] += 1
|
||||
end_idx = start_idx + len(turn_tokens)
|
||||
|
||||
LOG.debug(
|
||||
f"Processing turn {i}: role={turn['from']}, content='{turn['value']}', start_idx={start_idx}, end_idx={end_idx}"
|
||||
)
|
||||
|
||||
if "train_detail" in turn:
|
||||
# Get token offsets
|
||||
tokenized_output = llama3_tokenizer(
|
||||
turn["value"], return_offsets_mapping=True, add_special_tokens=False
|
||||
)
|
||||
token_offsets = tokenized_output["offset_mapping"]
|
||||
|
||||
# Adjust token offsets as done in the implementation
|
||||
for i in range(len(token_offsets) - 1):
|
||||
token_offsets[i] = (
|
||||
token_offsets[i][0],
|
||||
token_offsets[i + 1][0] - 1,
|
||||
)
|
||||
token_offsets[-1] = (token_offsets[-1][0], len(turn["value"]) - 1)
|
||||
|
||||
# Adjust train_details
|
||||
adjusted_train_details = strategy.prompter.adjust_train_details(
|
||||
turn["train_detail"], token_offsets
|
||||
)
|
||||
|
||||
LOG.debug(f"Original train_details: {turn['train_detail']}")
|
||||
LOG.debug(f"Adjusted train_details: {adjusted_train_details}")
|
||||
|
||||
# Handle train_detail
|
||||
token_offsets = strategy.prompter.get_offsets_for_train_detail(
|
||||
text=turn["value"],
|
||||
train_details=adjusted_train_details,
|
||||
mask_untrainable=False,
|
||||
)
|
||||
token_offsets_masked = strategy.prompter.get_offsets_for_train_detail(
|
||||
text=turn["value"],
|
||||
train_details=adjusted_train_details,
|
||||
mask_untrainable=True,
|
||||
)
|
||||
LOG.debug(f"Token offsets: {token_offsets_masked}")
|
||||
|
||||
expected_labels = [IGNORE_TOKEN_ID] * len(turn_tokens)
|
||||
for i, offset in enumerate(token_offsets_masked):
|
||||
if offset != IGNORE_TOKEN_ID:
|
||||
expected_labels[i] = turn_tokens[i]
|
||||
actual_labels = labels[
|
||||
start_idx : start_idx + len(token_offsets_masked)
|
||||
]
|
||||
assert (
|
||||
actual_labels == expected_labels
|
||||
), f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}"
|
||||
|
||||
for detail in adjusted_train_details:
|
||||
# Find the token indices that correspond to the character offsets
|
||||
detail_start = start_idx + next(
|
||||
i
|
||||
for i, offset in enumerate(token_offsets)
|
||||
if offset >= detail["begin_offset"]
|
||||
)
|
||||
detail_end = start_idx + next(
|
||||
(
|
||||
i
|
||||
for i, offset in enumerate(token_offsets)
|
||||
if offset > detail["end_offset"]
|
||||
),
|
||||
len(token_offsets),
|
||||
)
|
||||
|
||||
detail_text = turn["value"][
|
||||
detail["begin_offset"] : detail["end_offset"] + 1
|
||||
]
|
||||
detail_labels = labels[detail_start:detail_end]
|
||||
detail_input_ids = input_ids[detail_start:detail_end]
|
||||
|
||||
LOG.debug(
|
||||
f"Detail: '{detail_text}', Start: {detail_start}, End: {detail_end}"
|
||||
)
|
||||
LOG.debug(f"Detail input_ids: {detail_input_ids}")
|
||||
LOG.debug(f"Detail labels: {detail_labels}")
|
||||
LOG.debug(
|
||||
f"Decoded detail: {llama3_tokenizer.decode(detail_input_ids)}"
|
||||
)
|
||||
LOG.debug(
|
||||
f"Token offsets for this detail: {token_offsets[detail_start-start_idx:detail_end-start_idx]}"
|
||||
)
|
||||
|
||||
if detail["train"]:
|
||||
assert all(
|
||||
label != IGNORE_TOKEN_ID for label in detail_labels
|
||||
), (
|
||||
f"Expected labels for trainable detail '{detail_text}' to be set, but some were IGNORE_TOKEN_ID. "
|
||||
f"Labels({detail_start}:{detail_end}): {detail_labels}, "
|
||||
f"InputIDs: {detail_input_ids}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(detail_input_ids)}'"
|
||||
)
|
||||
else:
|
||||
assert all(
|
||||
label == IGNORE_TOKEN_ID for label in detail_labels
|
||||
), (
|
||||
f"Expected all labels for non-trainable detail '{detail_text}' to be IGNORE_TOKEN_ID, but some were not. "
|
||||
f"Labels({detail_start}:{detail_end}): {detail_labels}, "
|
||||
f"InputIDs: {detail_input_ids}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(detail_input_ids)}'"
|
||||
)
|
||||
else:
|
||||
should_train = turn.get("train", False)
|
||||
turn_labels = labels[start_idx:end_idx]
|
||||
|
||||
LOG.debug(f"Should train: {should_train}")
|
||||
LOG.debug(f"Turn indices: start={start_idx}, end={end_idx}")
|
||||
LOG.debug(f"Turn labels: {turn_labels}")
|
||||
LOG.debug(f"Turn input IDs: {input_ids[start_idx:end_idx]}")
|
||||
LOG.debug(
|
||||
f"Decoded turn: {llama3_tokenizer.decode(input_ids[start_idx:end_idx])}"
|
||||
)
|
||||
|
||||
if should_train:
|
||||
assert all(label != IGNORE_TOKEN_ID for label in turn_labels), (
|
||||
f"Expected all labels for '{turn['value']}' to be set\n"
|
||||
f"Labels({start_idx}:{end_idx}): {turn_labels}, "
|
||||
f"InputIDs: {input_ids[start_idx:end_idx]}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(input_ids[start_idx:end_idx])}'"
|
||||
)
|
||||
else:
|
||||
assert all(label == IGNORE_TOKEN_ID for label in turn_labels), (
|
||||
f"Expected all labels for '{turn['value']}' to be IGNORE_TOKEN_ID\n"
|
||||
f"Labels({start_idx}:{end_idx}): {turn_labels}, "
|
||||
f"InputIDs: {input_ids[start_idx:end_idx]}, "
|
||||
f"Decoded: '{llama3_tokenizer.decode(input_ids[start_idx:end_idx])}'"
|
||||
)
|
||||
|
||||
LOG.debug(
|
||||
f"Processed turn: {turn['from']}, content: '{turn['value']}', "
|
||||
f"start_idx: {start_idx}, end_idx: {end_idx}, "
|
||||
f"labels: {labels[start_idx:end_idx]}"
|
||||
)
|
||||
|
||||
LOG.debug(f"Final labels: {labels}")
|
||||
LOG.debug(f"Final input_ids: {input_ids}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
@@ -86,6 +86,13 @@ def fixture_llama3_tokenizer():
return tokenizer


@pytest.fixture(name="phi3_tokenizer")
def fixture_phi3_tokenizer():
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")

return tokenizer


class TestAssistantDPOChatTemplateLlama3:
"""
Test class for assistant style datasets with llama-3 prompts using the chat_template strategy.
@@ -152,5 +159,36 @@ class TestAssistantDPOChatTemplateLlama3:
assert result["rejected"] == "party on<|eot_id|>"


class TestAssistantDPOChatTemplatePhi3:
"""
Test class for assistant style datasets with phi-3 prompts using the tokenizer's chat_template strategy.
"""

def test_phi3_defaults(self, phi3_tokenizer, assistant_dataset):
# pylint: disable=duplicate-code
transform_fn = default(
DictDefault(
{
"chat_template": "tokenizer_default",
"datasets": [
{
"type": "chat_template",
"chat_template": "tokenizer_default",
}
],
}
)
)
result = transform_fn(assistant_dataset[0], tokenizer=phi3_tokenizer)
assert result["prompt"] == (
"<|user|>\nhello<|end|>\n"
+ "<|assistant|>\nhello<|end|>\n"
+ "<|user|>\ngoodbye<|end|>\n"
+ "<|assistant|>\n"
)
assert result["chosen"] == "goodbye<|end|>"
assert result["rejected"] == "party on<|end|>"


if __name__ == "__main__":
unittest.main()
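Note on the new Phi-3 DPO test above: with both the top-level and dataset-level chat_template set to tokenizer_default, the DPO transform renders prompts with the template shipped inside the tokenizer itself, which is why the expected strings use Phi-3's <|user|>/<|assistant|>/<|end|> markers. A rough YAML equivalent of that config, with a hypothetical dataset path and the rl: dpo key assumed rather than taken from this diff:

chat_template: tokenizer_default
rl: dpo                          # assumed; not part of this test
datasets:
  - path: my-org/my-dpo-dataset  # placeholder path
    type: chat_template
    chat_template: tokenizer_default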
@@ -9,6 +9,7 @@ from typing import Optional
import pytest
from pydantic import ValidationError

from axolotl.utils import is_comet_available
from axolotl.utils.config import validate_config
from axolotl.utils.config.models.input.v0_4_1 import AxolotlConfigWCapabilities
from axolotl.utils.dict import DictDefault
@@ -1329,3 +1330,105 @@ class TestValidationWandb(BaseValidation):

os.environ.pop("WANDB_PROJECT", None)
os.environ.pop("WANDB_DISABLED", None)


@pytest.mark.skipif(is_comet_available() is False, reason="comet_ml is not installed")
class TestValidationComet(BaseValidation):
"""
Validation test for comet
"""

def test_comet_sets_env(self, minimal_cfg):
from axolotl.utils.comet_ import setup_comet_env_vars

comet_config = {
"comet_api_key": "foo",
"comet_workspace": "some_workspace",
"comet_project_name": "some_project",
"comet_experiment_key": "some_experiment_key",
"comet_mode": "get_or_create",
"comet_online": False,
"comet_experiment_config": {
"auto_histogram_activation_logging": False,
"auto_histogram_epoch_rate": 2,
"auto_histogram_gradient_logging": True,
"auto_histogram_tensorboard_logging": False,
"auto_histogram_weight_logging": True,
"auto_log_co2": False,
"auto_metric_logging": True,
"auto_metric_step_rate": 15,
"auto_output_logging": False,
"auto_param_logging": True,
"comet_disabled": False,
"display_summary_level": 2,
"distributed_node_identifier": "some_distributed_node_identifier",
"log_code": True,
"log_env_cpu": False,
"log_env_details": True,
"log_env_disk": False,
"log_env_gpu": True,
"log_env_host": False,
"log_env_network": True,
"log_git_metadata": False,
"log_git_patch": True,
"log_graph": False,
"name": "some_name",
"offline_directory": "some_offline_directory",
"parse_args": True,
"tags": ["tag1", "tag2"],
},
}

cfg = DictDefault(comet_config) | minimal_cfg

new_cfg = validate_config(cfg)

setup_comet_env_vars(new_cfg)

comet_env = {
key: value for key, value in os.environ.items() if key.startswith("COMET_")
}

assert (
len(comet_env)
== len(comet_config) + len(comet_config["comet_experiment_config"]) - 1
)

assert comet_env == {
"COMET_API_KEY": "foo",
"COMET_AUTO_LOG_CLI_ARGUMENTS": "true",
"COMET_AUTO_LOG_CO2": "false",
"COMET_AUTO_LOG_CODE": "true",
"COMET_AUTO_LOG_DISABLE": "false",
"COMET_AUTO_LOG_ENV_CPU": "false",
"COMET_AUTO_LOG_ENV_DETAILS": "true",
"COMET_AUTO_LOG_ENV_DISK": "false",
"COMET_AUTO_LOG_ENV_GPU": "true",
"COMET_AUTO_LOG_ENV_HOST": "false",
"COMET_AUTO_LOG_ENV_NETWORK": "true",
"COMET_AUTO_LOG_GIT_METADATA": "false",
"COMET_AUTO_LOG_GIT_PATCH": "true",
"COMET_AUTO_LOG_GRAPH": "false",
"COMET_AUTO_LOG_HISTOGRAM_ACTIVATIONS": "false",
"COMET_AUTO_LOG_HISTOGRAM_EPOCH_RATE": "2",
"COMET_AUTO_LOG_HISTOGRAM_GRADIENTS": "true",
"COMET_AUTO_LOG_HISTOGRAM_TENSORBOARD": "false",
"COMET_AUTO_LOG_HISTOGRAM_WEIGHTS": "true",
"COMET_AUTO_LOG_METRIC_STEP_RATE": "15",
"COMET_AUTO_LOG_METRICS": "true",
"COMET_AUTO_LOG_OUTPUT_LOGGER": "false",
"COMET_AUTO_LOG_PARAMETERS": "true",
"COMET_DISPLAY_SUMMARY_LEVEL": "2",
"COMET_DISTRIBUTED_NODE_IDENTIFIER": "some_distributed_node_identifier",
"COMET_EXPERIMENT_KEY": "some_experiment_key",
"COMET_OFFLINE_DIRECTORY": "some_offline_directory",
"COMET_PROJECT_NAME": "some_project",
"COMET_START_EXPERIMENT_NAME": "some_name",
"COMET_START_EXPERIMENT_TAGS": "tag1,tag2",
"COMET_START_MODE": "get_or_create",
"COMET_START_ONLINE": "false",
"COMET_WORKSPACE": "some_workspace",
}

for key in comet_env.keys():
os.environ.pop(key, None)
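The assertions above pin down how setup_comet_env_vars maps each Comet option onto a COMET_* environment variable (for example comet_workspace becomes COMET_WORKSPACE, and nested comet_experiment_config entries such as auto_metric_step_rate become COMET_AUTO_LOG_METRIC_STEP_RATE). A sketch of the same options as a user would write them in an axolotl YAML config, assuming the keys carry over one-to-one from the test's dict; all values below are placeholders:

comet_api_key: "xxx"            # often supplied via the COMET_API_KEY env var instead
comet_workspace: my_workspace
comet_project_name: my_project
comet_mode: get_or_create
comet_online: false
comet_experiment_config:
  auto_metric_logging: true
  auto_metric_step_rate: 15
  log_code: true
  tags:
    - tag1
    - tag2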
238
tests/test_validation_dataset.py
Normal file
@@ -0,0 +1,238 @@
"""Module for testing the validation module for the dataset config"""

import warnings
from typing import Optional

import pytest

from axolotl.utils.config import validate_config
from axolotl.utils.config.models.input.v0_4_1 import ChatTemplate
from axolotl.utils.dict import DictDefault

warnings.filterwarnings("error")


@pytest.fixture(name="minimal_cfg")
def fixture_cfg():
return DictDefault(
{
"base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
"learning_rate": 0.000001,
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
}
)


# pylint: disable=too-many-public-methods (duplicate-code)
class BaseValidation:
"""
Base validation module to set up the log capture
"""
|
||||
|
||||
_caplog: Optional[pytest.LogCaptureFixture] = None
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def inject_fixtures(self, caplog):
|
||||
self._caplog = caplog
|
||||
|
||||
|
||||
class TestValidationCheckDatasetConfig(BaseValidation):
|
||||
"""
|
||||
Test the validation for the dataset config to ensure correct parameters are not dropped
"""
|
||||
|
||||
def test_dataset_config_no_drop_param(self, minimal_cfg):
|
||||
cfg = DictDefault(
|
||||
minimal_cfg
|
||||
| {
|
||||
"datasets": [
|
||||
{
|
||||
"path": "LDJnr/Puffin",
|
||||
"type": "sharegpt",
|
||||
"conversation": "chatml",
|
||||
"shards": 10,
|
||||
}
|
||||
]
|
||||
}
|
||||
)
|
||||
|
||||
checked_cfg = validate_config(cfg)
|
||||
|
||||
def _check_config():
|
||||
assert checked_cfg.datasets[0].path == cfg.datasets[0].path
|
||||
assert checked_cfg.datasets[0].type == cfg.datasets[0].type
|
||||
assert checked_cfg.datasets[0].conversation == cfg.datasets[0].conversation
|
||||
assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
|
||||
|
||||
_check_config()
|
||||
|
||||
checked_cfg = validate_config(
|
||||
cfg,
|
||||
capabilities={
|
||||
"bf16": "false",
|
||||
"n_gpu": 1,
|
||||
"compute_capability": "8.0",
|
||||
},
|
||||
)
|
||||
|
||||
_check_config()
|
||||
|
||||
def test_dataset_default_chat_template_no_drop_param(self, minimal_cfg):
|
||||
cfg = DictDefault(
|
||||
minimal_cfg
|
||||
| {
|
||||
"datasets": [
|
||||
{
|
||||
"path": "LDJnr/Puffin",
|
||||
"type": "chat_template",
|
||||
"field_messages": "conversations",
|
||||
"shards": 10,
|
||||
"message_field_role": "from",
|
||||
"message_field_content": "value",
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
checked_cfg = validate_config(cfg)
|
||||
|
||||
def _check_config():
|
||||
assert checked_cfg.datasets[0].path == cfg.datasets[0].path
|
||||
assert checked_cfg.datasets[0].type == cfg.datasets[0].type
|
||||
assert checked_cfg.chat_template == ChatTemplate.tokenizer_default
|
||||
assert (
|
||||
checked_cfg.datasets[0].chat_template == ChatTemplate.tokenizer_default
|
||||
)
|
||||
assert (
|
||||
checked_cfg.datasets[0].field_messages == cfg.datasets[0].field_messages
|
||||
)
|
||||
assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
|
||||
assert (
|
||||
checked_cfg.datasets[0].message_field_role
|
||||
== cfg.datasets[0].message_field_role
|
||||
)
|
||||
assert (
|
||||
checked_cfg.datasets[0].message_field_content
|
||||
== cfg.datasets[0].message_field_content
|
||||
)
|
||||
|
||||
_check_config()
|
||||
|
||||
checked_cfg = validate_config(
|
||||
cfg,
|
||||
capabilities={
|
||||
"bf16": "false",
|
||||
"n_gpu": 1,
|
||||
"compute_capability": "8.0",
|
||||
},
|
||||
)
|
||||
|
||||
_check_config()
|
||||
|
||||
def test_dataset_partial_default_chat_template_no_drop_param(self, minimal_cfg):
|
||||
cfg = DictDefault(
|
||||
minimal_cfg
|
||||
| {
|
||||
"chat_template": "chatml",
|
||||
"datasets": [
|
||||
{
|
||||
"path": "LDJnr/Puffin",
|
||||
"type": "chat_template",
|
||||
"field_messages": "conversations",
|
||||
"shards": 10,
|
||||
"message_field_role": "from",
|
||||
"message_field_content": "value",
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
checked_cfg = validate_config(cfg)
|
||||
|
||||
def _check_config():
|
||||
assert checked_cfg.datasets[0].path == cfg.datasets[0].path
|
||||
assert checked_cfg.datasets[0].type == cfg.datasets[0].type
|
||||
assert checked_cfg.chat_template == ChatTemplate.chatml
|
||||
assert (
|
||||
checked_cfg.datasets[0].chat_template == ChatTemplate.tokenizer_default
|
||||
)
|
||||
assert (
|
||||
checked_cfg.datasets[0].field_messages == cfg.datasets[0].field_messages
|
||||
)
|
||||
assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
|
||||
assert (
|
||||
checked_cfg.datasets[0].message_field_role
|
||||
== cfg.datasets[0].message_field_role
|
||||
)
|
||||
assert (
|
||||
checked_cfg.datasets[0].message_field_content
|
||||
== cfg.datasets[0].message_field_content
|
||||
)
|
||||
|
||||
_check_config()
|
||||
|
||||
checked_cfg = validate_config(
|
||||
cfg,
|
||||
capabilities={
|
||||
"bf16": "false",
|
||||
"n_gpu": 1,
|
||||
"compute_capability": "8.0",
|
||||
},
|
||||
)
|
||||
|
||||
_check_config()
|
||||
|
||||
def test_dataset_chatml_chat_template_no_drop_param(self, minimal_cfg):
|
||||
cfg = DictDefault(
|
||||
minimal_cfg
|
||||
| {
|
||||
"chat_template": "chatml",
|
||||
"datasets": [
|
||||
{
|
||||
"path": "LDJnr/Puffin",
|
||||
"type": "chat_template",
|
||||
"chat_template": "gemma",
|
||||
"field_messages": "conversations",
|
||||
"shards": 10,
|
||||
"message_field_role": "from",
|
||||
"message_field_content": "value",
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
checked_cfg = validate_config(cfg)
|
||||
|
||||
def _check_config():
|
||||
assert checked_cfg.datasets[0].path == cfg.datasets[0].path
|
||||
assert checked_cfg.datasets[0].type == cfg.datasets[0].type
|
||||
assert checked_cfg.chat_template == cfg.chat_template
|
||||
assert (
|
||||
checked_cfg.datasets[0].chat_template == cfg.datasets[0].chat_template
|
||||
)
|
||||
assert (
|
||||
checked_cfg.datasets[0].field_messages == cfg.datasets[0].field_messages
|
||||
)
|
||||
assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
|
||||
assert (
|
||||
checked_cfg.datasets[0].message_field_role
|
||||
== cfg.datasets[0].message_field_role
|
||||
)
|
||||
assert (
|
||||
checked_cfg.datasets[0].message_field_content
|
||||
== cfg.datasets[0].message_field_content
|
||||
)
|
||||
|
||||
_check_config()
|
||||
|
||||
checked_cfg = validate_config(
|
||||
cfg,
|
||||
capabilities={
|
||||
"bf16": "false",
|
||||
"n_gpu": 1,
|
||||
"compute_capability": "8.0",
|
||||
},
|
||||
)
|
||||
|
||||
_check_config()
|
||||
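Taken together, these tests check that per-dataset options survive validate_config unchanged, that a dataset-level chat_template ("gemma") can coexist with a different top-level one ("chatml"), and that a dataset entry which omits chat_template falls back to tokenizer_default. A YAML sketch of the config shape being exercised, with values copied from the test rather than recommended settings:

chat_template: chatml            # global default
datasets:
  - path: LDJnr/Puffin
    type: chat_template
    chat_template: gemma         # per-dataset override
    field_messages: conversations
    message_field_role: from
    message_field_content: value
    shards: 10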