Compare commits

..

11 Commits

Author SHA1 Message Date
Wing Lian
1a538be9c2 add a prelim test for expading the 4d mask 2024-01-26 00:41:24 -05:00
Wing Lian
74c72ca5eb drop py39 docker images, add py311, upgrade pytorch to 2.1.2 (#1205)
* drop py39 docker images, add py311, upgrade pytorch to 2.1.2

* also allow the main build to be manually triggered

* fix workflow_dispatch in yaml
2024-01-26 00:38:49 -05:00
Wing Lian
e923e62d24 more checks and fixes for deepspeed and fsdp (#1208) [skip ci] 2024-01-25 20:01:45 -05:00
Wing Lian
ba944e6554 workaround for transformers bug requireing do_sample for saveing pretrained (#1206) 2024-01-25 11:34:41 -05:00
Wing Lian
badda3783b make sure to register the base chatml template even if no system message is provided (#1207) 2024-01-25 10:38:08 -05:00
Wing Lian
a01b998c0f Update deps 202401 (#1204) [skip ci]
* update deps

* xformers fix too
2024-01-25 10:11:49 -05:00
Wing Lian
33e117088f precompute dpo logprobs setting and fixes (#1199) [skip ci]
* add support for precompute_ref_log_probs for dpo

* add chatml.icr type for argilla orca dpo

* update inline doc

* also set use_reentrant to false for dpo when not set

* don't set use_reentrant to true for rl

* make sure to set gradient checkpointing too
2024-01-25 09:31:55 -05:00
Ricardo Dominguez-Olmedo
b4ac96adef fix learning rate scheduler's warnings (#1135) [skip ci]
* fix schedulers warnings

* chore: lint

---------

Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-25 07:09:34 -05:00
mhenrichsen
98b4762077 Feat/chatml add system message (#1117)
* add system message to template

* readme update

* added code to register new system message

* register chatml template for test

---------

Co-authored-by: Mads Henrichsen <mads@BrbartiendeMads.lan>
Co-authored-by: Wing Lian <wing.lian@gmail.com>
2024-01-25 08:24:27 +01:00
JohanWork
ee0b5f60e5 add colab example (#1196) [skip ci] 2024-01-24 20:09:09 -05:00
NanoCode012
08719b9609 fix(log): improve warning to clarify that lora_modules_to_save expect a list (#1197) 2024-01-24 20:08:34 -05:00
23 changed files with 411 additions and 125 deletions

View File

@@ -1,10 +1,7 @@
name: ci-cd-base
on:
push:
branches:
- "main-base"
- "dev-base"
workflow_dispatch:
jobs:
build-base:
@@ -15,11 +12,6 @@ jobs:
fail-fast: false
matrix:
include:
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.9"
pytorch: 2.0.1
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
@@ -28,12 +20,17 @@ jobs:
- cuda: "118"
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
- cuda: "121"
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.1.2
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 9.0+PTX"
steps:
- name: Checkout
@@ -56,7 +53,7 @@ jobs:
context: .
file: ./docker/Dockerfile-base
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
labels: ${{ steps.metadata.outputs.labels }}
build-args: |
CUDA_VERSION=${{ matrix.cuda_version }}

View File

@@ -4,6 +4,7 @@ on:
push:
branches:
- "main"
workflow_dispatch:
jobs:
build-axolotl:
@@ -15,24 +16,24 @@ jobs:
include:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.9"
python_version: "3.10"
pytorch: 2.0.1
axolotl_extras:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.1
pytorch: 2.1.2
axolotl_extras:
is_latest: true
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.1
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.1.2
axolotl_extras:
runs-on: [self-hosted, gpu, docker]
steps:
@@ -86,24 +87,24 @@ jobs:
include:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.9"
python_version: "3.10"
pytorch: 2.0.1
axolotl_extras:
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.0.1
pytorch: 2.1.2
axolotl_extras:
is_latest: true
- cuda: 118
cuda_version: 11.8.0
python_version: "3.10"
pytorch: 2.1.1
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.10"
pytorch: 2.1.1
pytorch: 2.1.2
axolotl_extras:
- cuda: 121
cuda_version: 12.1.0
python_version: "3.11"
pytorch: 2.1.2
axolotl_extras:
runs-on: [self-hosted, gpu, docker]
steps:

View File

@@ -106,3 +106,7 @@ jobs:
- name: GPU Unit Tests monkeypatched w docker image
run: |
docker run --privileged --gpus "all" --env WANDB_DISABLED=true --rm ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }} pytest /workspace/axolotl/tests/e2e/patched/
- name: Prune image from docker
if: github.ref != 'refs/heads/main'
run: |
docker rmi -f ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}

View File

@@ -613,6 +613,8 @@ rl:
# Saves the desired chat template to the tokenizer_config.json for easier inferencing
# Currently supports chatml and inst (mistral/mixtral)
chat_template: chatml
# Changes the default system message
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
# Axolotl attempts to save the dataset as an arrow after packing the data together so
# subsequent training attempts load faster, relative path
dataset_prepared_path: data/last_run_prepared

View File

@@ -15,15 +15,6 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",

View File

@@ -19,15 +19,6 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",

View File

@@ -23,15 +23,6 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",

View File

@@ -23,15 +23,6 @@
"hysteresis": 2,
"min_loss_scale": 1
},
"optimizer": {
"type": "AdamW",
"params": {
"lr": "auto",
"betas": "auto",
"eps": "auto",
"weight_decay": "auto"
}
},
"gradient_accumulation_steps": "auto",
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",

View File

@@ -0,0 +1,197 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "AKjdG7tbTb-n"
},
"source": [
"# Example notebook for running Axolotl on google colab"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "RcbNpOgWRcii"
},
"outputs": [],
"source": [
"import torch\n",
"# Check so there is a gpu available, a T4(free tier) is enough to run this notebook\n",
"assert (torch.cuda.is_available()==True)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h3nLav8oTRA5"
},
"source": [
"## Install Axolotl and dependencies"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "3c3yGAwnOIdi",
"outputId": "e3777b5a-40ef-424f-e181-62dfecd1dd01"
},
"outputs": [],
"source": [
"!pip install -e git+https://github.com/OpenAccess-AI-Collective/axolotl#egg=axolotl\n",
"!pip install flash-attn==\"2.5.0\"\n",
"!pip install deepspeed==\"0.13.1\""
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BW2MFr7HTjub"
},
"source": [
"## Create an yaml config file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "9pkF2dSoQEUN"
},
"outputs": [],
"source": [
"import yaml\n",
"\n",
"# Your YAML string\n",
"yaml_string = \"\"\"\n",
"base_model: TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T\n",
"model_type: LlamaForCausalLM\n",
"tokenizer_type: LlamaTokenizer\n",
"is_llama_derived_model: true\n",
"\n",
"load_in_8bit: false\n",
"load_in_4bit: true\n",
"strict: false\n",
"\n",
"datasets:\n",
" - path: mhenrichsen/alpaca_2k_test\n",
" type: alpaca\n",
"dataset_prepared_path:\n",
"val_set_size: 0.05\n",
"output_dir: ./qlora-out\n",
"\n",
"adapter: qlora\n",
"lora_model_dir:\n",
"\n",
"sequence_len: 1096\n",
"sample_packing: true\n",
"pad_to_sequence_len: true\n",
"\n",
"lora_r: 32\n",
"lora_alpha: 16\n",
"lora_dropout: 0.05\n",
"lora_target_modules:\n",
"lora_target_linear: true\n",
"lora_fan_in_fan_out:\n",
"\n",
"wandb_project:\n",
"wandb_entity:\n",
"wandb_watch:\n",
"wandb_name:\n",
"wandb_log_model:\n",
"\n",
"mlflow_experiment_name: colab-example\n",
"\n",
"gradient_accumulation_steps: 1\n",
"micro_batch_size: 1\n",
"num_epochs: 4\n",
"max_steps: 20\n",
"optimizer: paged_adamw_32bit\n",
"lr_scheduler: cosine\n",
"learning_rate: 0.0002\n",
"\n",
"train_on_inputs: false\n",
"group_by_length: false\n",
"bf16: false\n",
"fp16: true\n",
"tf32: false\n",
"\n",
"gradient_checkpointing: true\n",
"early_stopping_patience:\n",
"resume_from_checkpoint:\n",
"local_rank:\n",
"logging_steps: 1\n",
"xformers_attention:\n",
"flash_attention: false\n",
"\n",
"warmup_steps: 10\n",
"evals_per_epoch:\n",
"saves_per_epoch:\n",
"debug:\n",
"deepspeed:\n",
"weight_decay: 0.0\n",
"fsdp:\n",
"fsdp_config:\n",
"special_tokens:\n",
"\n",
"\"\"\"\n",
"\n",
"# Convert the YAML string to a Python dictionary\n",
"yaml_dict = yaml.safe_load(yaml_string)\n",
"\n",
"# Specify your file path\n",
"file_path = 'test_axolotl.yaml'\n",
"\n",
"# Write the YAML file\n",
"with open(file_path, 'w') as file:\n",
" yaml.dump(yaml_dict, file)\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "bidoj8YLTusD"
},
"source": [
"## Launch the training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "ydTI2Jk2RStU",
"outputId": "d6d0df17-4b53-439c-c802-22c0456d301b"
},
"outputs": [],
"source": [
"# Buy using the ! the comand will be executed as a bash command\n",
"!accelerate launch -m axolotl.cli.train /content/test_axolotl.yaml"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -1,6 +1,6 @@
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
packaging==23.2
peft==0.7.0
peft==0.7.1
transformers==4.37.0
tokenizers==0.15.0
bitsandbytes>=0.41.1
@@ -15,16 +15,14 @@ sentencepiece
wandb
einops
xformers==0.0.22
optimum==1.13.2
optimum==1.16.2
hf_transfer
colorama
numba
numpy>=1.24.4
mlflow
# qlora things
bert-score==0.3.13
evaluate==0.4.0
rouge-score==0.1.2
scipy
scikit-learn==1.2.2
pynvml

View File

@@ -27,9 +27,9 @@ def parse_requirements():
try:
torch_version = version("torch")
if torch_version.startswith("2.1.1"):
if torch_version.startswith("2.1."):
_install_requires.pop(_install_requires.index("xformers==0.0.22"))
_install_requires.append("xformers==0.0.23")
_install_requires.append("xformers>=0.0.23")
except PackageNotFoundError:
pass

View File

@@ -18,6 +18,7 @@ from axolotl.cli import (
)
from axolotl.common.cli import PreprocessCliArgs
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.prompt_strategies.sharegpt import register_chatml_template
LOG = logging.getLogger("axolotl.cli.preprocess")
@@ -34,6 +35,14 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
return_remaining_strings=True
)
if parsed_cfg.chat_template == "chatml" and parsed_cfg.default_system_message:
LOG.info(
f"ChatML set. Adding default system message: {parsed_cfg.default_system_message}"
)
register_chatml_template(parsed_cfg.default_system_message)
else:
register_chatml_template()
if not parsed_cfg.dataset_prepared_path:
msg = (
Fore.RED

View File

@@ -18,6 +18,7 @@ from axolotl.cli import (
print_axolotl_text_art,
)
from axolotl.common.cli import TrainerCliArgs
from axolotl.prompt_strategies.sharegpt import register_chatml_template
from axolotl.train import train
LOG = logging.getLogger("axolotl.cli.train")
@@ -37,6 +38,14 @@ def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
print_axolotl_text_art()
check_accelerate_default_config()
check_user_token()
if cfg.chat_template == "chatml" and cfg.default_system_message:
LOG.info(
f"ChatML set. Adding default system message: {cfg.default_system_message}"
)
register_chatml_template(cfg.default_system_message)
else:
register_chatml_template()
if cfg.rl:
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
else:

View File

@@ -170,24 +170,30 @@ class AxolotlTrainer(Trainer):
num_training_steps (int): The number of training steps to do.
optimizer (torch.optim.Optimizer): The training optimizer
"""
use_cosine_quadratic = (
self.args.lr_scheduler_type == "cosine"
and self.args.lr_quadratic_warmup is True
)
use_cosine_min_lr = (
self.args.lr_scheduler_type == "cosine"
and self.args.cosine_min_lr_ratio is not None
)
# fmt: off
if self.lr_scheduler is None: # type: ignore # pylint: disable=access-member-before-definition
# fmt: on
if (
self.args.lr_scheduler_type == "cosine"
and self.args.lr_quadratic_warmup is True
):
if use_cosine_quadratic:
if use_cosine_min_lr:
LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
num_training_steps=num_training_steps,
)
elif self.args.lr_scheduler_type == "cosine" and self.args.cosine_min_lr_ratio is not None:
elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
if self.args.deepspeed:
LOG.warning("Using cosine scheduler with deepspeed. This may be ignored if a scheduler is set \
in the deepspeed JSON")
self.lr_scheduler = get_cosine_schedule_with_min_lr( # pylint: disable=attribute-defined-outside-init
optimizer,
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
@@ -196,6 +202,13 @@ class AxolotlTrainer(Trainer):
)
else:
return super().create_scheduler(num_training_steps, optimizer)
else:
if use_cosine_quadratic:
LOG.warning("axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")
if use_cosine_min_lr:
LOG.warning("axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")
return self.lr_scheduler
def _get_train_sampler(self) -> Optional[torch.utils.data.Sampler]:
@@ -638,7 +651,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
training_arguments_kwargs[
"gradient_checkpointing"
] = self.cfg.gradient_checkpointing
if self.cfg.gradient_checkpointing_kwargs:
if self.cfg.gradient_checkpointing_kwargs is not None:
training_arguments_kwargs[
"gradient_checkpointing_kwargs"
] = self.cfg.gradient_checkpointing_kwargs
@@ -1015,6 +1028,18 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
training_args_kwargs[
"dataloader_prefetch_factor"
] = self.cfg.dataloader_prefetch_factor
if self.cfg.gradient_checkpointing:
training_args_kwargs[
"gradient_checkpointing"
] = self.cfg.gradient_checkpointing
if self.cfg.gradient_checkpointing_kwargs is not None:
training_args_kwargs[
"gradient_checkpointing_kwargs"
] = self.cfg.gradient_checkpointing_kwargs
else:
training_args_kwargs["gradient_checkpointing_kwargs"] = {
"use_reentrant": False
}
training_args = TrainingArguments(
per_device_train_batch_size=self.cfg.micro_batch_size,
@@ -1025,9 +1050,6 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
save_steps=self.cfg.save_steps,
output_dir=self.cfg.output_dir,
warmup_steps=self.cfg.warmup_steps,
gradient_checkpointing=self.cfg.gradient_checkpointing,
gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
or {"use_reentrant": False},
logging_first_step=True,
logging_steps=1,
optim=self.cfg.optimizer,
@@ -1050,6 +1072,10 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
if self.cfg.adapter and self.peft_config:
dpo_trainer_kwargs["peft_config"] = self.peft_config
if self.cfg.precompute_ref_log_probs is not None:
dpo_trainer_kwargs[
"precompute_ref_log_probs"
] = self.cfg.precompute_ref_log_probs
dpo_trainer = DPOTrainer(
self.model,
self.model_ref,

View File

@@ -23,6 +23,31 @@ def argilla(
return transform_fn
def icr(
cfg,
): # pylint: disable=possibly-unused-variable,unused-argument
"""
chatml transforms for datasets with system, input, chosen, rejected
ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
"""
def transform_fn(sample):
if "system" in sample and sample["system"]:
sample["prompt"] = (
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
)
else:
sample[
"prompt"
] = f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
sample["chosen"] = f"{sample['chosen']}<|im_end|>"
sample["rejected"] = f"{sample['rejected']}<|im_end|>"
return sample
return transform_fn
def intel(cfg): # pylint: disable=possibly-unused-variable,unused-argument
"""
For Intel Orca DPO Pairs

View File

@@ -6,16 +6,19 @@ from fastchat.conversation import Conversation, SeparatorStyle, register_conv_te
from axolotl.prompt_tokenizers import ShareGPTPromptTokenizingStrategy
from axolotl.prompters import ShareGPTPrompterV2
register_conv_template(
Conversation(
name="chatml",
system_template="<|im_start|>system\n{system_message}",
system_message="You are a helpful assistant.",
roles=["<|im_start|>user", "<|im_start|>assistant"],
sep_style=SeparatorStyle.CHATML,
sep="<|im_end|>",
def register_chatml_template(system_message=None):
system_message = system_message or "You are a helpful assistant."
register_conv_template(
Conversation(
name="chatml",
system_template="<|im_start|>system\n{system_message}",
system_message=system_message,
roles=["<|im_start|>user", "<|im_start|>assistant"],
sep_style=SeparatorStyle.CHATML,
sep="<|im_end|>",
)
)
)
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):

View File

@@ -63,6 +63,8 @@ def train(
msg += " and peft_config..."
LOG.debug(msg)
model, peft_config = load_model(cfg, tokenizer, inference=cli_args.inference)
model.generation_config.do_sample = True
model_ref = None
if cfg.rl:
if cfg.adapter and not cfg.rl_adapter_ref_model:

View File

@@ -20,7 +20,7 @@ def chat_templates(user_choice: str):
templates = {
"inst": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}", # I don't know what this one is called. Used by Mistral/Mixtral.
"chatml": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
"chatml": "{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'You are a helpful assistant.' %}{% endif %}{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in loop_messages %}{% if loop.index0 == 0 %}{{'<|im_start|>system\n' + system_message + '<|im_end|>\n'}}{% endif %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
}
if user_choice in templates:

View File

@@ -95,7 +95,7 @@ def normalize_config(cfg):
save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
if save_steps < 1.0: # prevent saves on every step
cfg.save_steps = save_steps
if cfg.evals_per_epoch:
if (cfg.val_set_size or cfg.test_datasets) and cfg.evals_per_epoch:
eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
if eval_steps < 1.0: # prevent evals on every step
cfg.eval_steps = eval_steps
@@ -163,6 +163,7 @@ def normalize_config(cfg):
cfg.gradient_checkpointing
and cfg.unfrozen_parameters is None
and cfg.gradient_checkpointing_kwargs is None
and cfg.rl is None
):
cfg.gradient_checkpointing_kwargs = {"use_reentrant": True}
@@ -484,35 +485,43 @@ def validate_config(cfg):
"`use_reentrant` must be false when used with partially frozen model."
)
if cfg.flash_attention and cfg.deepspeed and Path(cfg.deepspeed).is_file():
if cfg.deepspeed and Path(cfg.deepspeed).is_file():
with open(cfg.deepspeed, encoding="utf-8") as file:
contents = file.read()
deepspeed_cfg: DictDefault = DictDefault(json.loads(contents))
if (
deepspeed_cfg.zero_optimization
and deepspeed_cfg.zero_optimization.stage == 3
):
if not (
(
deepspeed_cfg.bf16
and deepspeed_cfg.bf16.enabled # pylint: disable=no-member
is True
)
or (
deepspeed_cfg.fp16
and deepspeed_cfg.fp16.enabled # pylint: disable=no-member
is True
)
if cfg.flash_attention:
if (
deepspeed_cfg.zero_optimization
and deepspeed_cfg.zero_optimization.stage == 3
):
raise ValueError(
"bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
)
if not (
(
deepspeed_cfg.bf16
and deepspeed_cfg.bf16.enabled # pylint: disable=no-member
is True
)
or (
deepspeed_cfg.fp16
and deepspeed_cfg.fp16.enabled # pylint: disable=no-member
is True
)
):
raise ValueError(
"bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
)
if "8bit" in cfg.optimizer and deepspeed_cfg.optimizer:
LOG.warning(
f"conflicting optimizer: {cfg.optimizer} used alongside deepspeed optimizer."
)
if cfg.test_datasets and cfg.val_set_size:
raise ValueError(
"non-zero val_set_size should not be used with test_datasets configuration"
)
if cfg.fsdp and "bnb" in cfg.optimizer:
raise ValueError(f"FSDP not compatible with {cfg.optimizer}")
# TODO
# MPT 7b
# https://github.com/facebookresearch/bitsandbytes/issues/25

View File

@@ -16,6 +16,7 @@ from datasets import (
load_from_disk,
)
from huggingface_hub import hf_hub_download
from huggingface_hub.utils import HFValidationError
from torch.utils.data import RandomSampler
from transformers import PreTrainedTokenizerBase
@@ -213,7 +214,7 @@ def load_tokenized_prepared_datasets(
token=use_auth_token,
)
ds_from_hub = True
except (FileNotFoundError, ConnectionError):
except (FileNotFoundError, ConnectionError, HFValidationError):
pass
ds_from_cloud = False

View File

@@ -67,7 +67,7 @@ def check_model_config(cfg: DictDefault, model_config: Union[AutoConfig, DictDef
):
lora_modules_to_save = ", ".join(map(lambda x: f"`{x}`", lora_modules_to_save))
raise ValueError(
f"`lora_modules_to_save` not properly set when adding new tokens. Please include {lora_modules_to_save} in `lora_modules_to_save`."
f"`lora_modules_to_save` not properly set when adding new tokens. Please include [{lora_modules_to_save}] in `lora_modules_to_save`."
)
@@ -182,7 +182,7 @@ def load_tokenizer(cfg):
[f"`{x}`" for x in lora_modules_to_save]
)
raise ValueError(
f"Please set lora_modules_to_save to {lora_modules_to_save} when using an adapter and changing the special tokens."
f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
)
tokenizer.add_special_tokens(
@@ -219,7 +219,13 @@ def load_tokenizer(cfg):
LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")
if cfg.chat_template:
tokenizer.chat_template = chat_templates(cfg.chat_template)
chat_template_string = chat_templates(cfg.chat_template)
if cfg.default_system_message and cfg.chat_template == "chatml":
chat_template_string = chat_template_string.replace(
"You are a helpful assistant.", cfg.default_system_message
)
tokenizer.chat_template = chat_template_string
else:
LOG.info(
"No Chat template selected. Consider adding a chat template for easier inference."
@@ -636,15 +642,17 @@ def load_model(
# make sure these are fp32 per Ramesh et al. (2021)
embedding_modules = get_linear_embedding_layers(cfg.model_config_type)
for name, module in model.named_modules():
if any(m in name for m in ["norm", "gate"]):
module.to(torch.float32)
if model_config.model_type == "btlm":
# don't upcast lm_head for btlm
continue
if any(m in name for m in embedding_modules):
if hasattr(module, "weight"):
if not cfg.fsdp:
# FSDP doesn't like mixed Float and BFloat16
for name, module in model.named_modules():
if any(m in name for m in ["norm", "gate"]):
module.to(torch.float32)
if model_config.model_type == "btlm":
# don't upcast lm_head for btlm
continue
if any(m in name for m in embedding_modules):
if hasattr(module, "weight"):
module.to(torch.float32)
needs_fa2_dtype = cfg.adapter or cfg.fsdp
skip_prepare_model_for_kbit_training = False

View File

@@ -7,9 +7,14 @@ from tokenizers import AddedToken
from transformers import AutoTokenizer
from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.sharegpt import SimpleShareGPTPromptTokenizingStrategy
from axolotl.prompt_strategies.sharegpt import (
SimpleShareGPTPromptTokenizingStrategy,
register_chatml_template,
)
from axolotl.prompters import ShareGPTPrompterV2
register_chatml_template()
@pytest.fixture(name="sharegpt_dataset")
def fixture_sharegpt_dataset():

View File

@@ -39,6 +39,32 @@ class TestExpandMask(unittest.TestCase):
# Check that the output matches the expected output
self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
def test_output_multipack(self):
mask = torch.tensor([[1, 1, 1, 0], [2, 2, 3, 3]])
dtype = torch.float32
expected_output = torch.tensor(
[
[
[
[0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
[0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
[0.0000e00, 0.0000e00, 0.0000e00, -3.4028e38],
[-3.4028e38, -3.4028e38, -3.4028e38, -3.4028e38],
]
],
[
[
[0.0000e00, -3.4028e38, -3.4028e38, -3.4028e38],
[0.0000e00, 0.0000e00, -3.4028e38, -3.4028e38],
[-3.4028e38, -3.4028e38, 0.0000e00, -3.4028e38],
[-3.4028e38, -3.4028e38, 0.0000e00, 0.0000e00],
]
],
]
)
# Check that the output matches the expected output
self.assertTrue(torch.allclose(_expand_mask(mask, dtype), expected_output))
if __name__ == "__main__":
unittest.main()