Compare commits
20 Commits
20230920-b
...
llama-drop
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
7771498eae | ||
|
|
5e5296a77c | ||
|
|
f3d939016a | ||
|
|
cfbce020e9 | ||
|
|
4fecbfe5e1 | ||
|
|
67b9888630 | ||
|
|
923eb91304 | ||
|
|
a363604dcf | ||
|
|
501958bb6f | ||
|
|
c25ba7939b | ||
|
|
d5f8589021 | ||
|
|
03e59077a0 | ||
|
|
97d3776ce6 | ||
|
|
2844eb22b6 | ||
|
|
e85d2eb06b | ||
|
|
196ff1181e | ||
|
|
92512c390b | ||
|
|
2fe95cdcc1 | ||
|
|
c1382e79b6 | ||
|
|
5d931cc042 |
4
.github/workflows/tests.yml
vendored
4
.github/workflows/tests.yml
vendored
@@ -4,7 +4,11 @@ on:
|
||||
push:
|
||||
branches:
|
||||
- "main"
|
||||
paths:
|
||||
- '**.py'
|
||||
pull_request:
|
||||
paths:
|
||||
- '**.py'
|
||||
workflow_dispatch:
|
||||
|
||||
jobs:
|
||||
|
||||
4
.gitignore
vendored
4
.gitignore
vendored
@@ -161,3 +161,7 @@ cython_debug/
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
.idea/
|
||||
|
||||
# WandB
|
||||
# wandb creates a folder to store logs for training runs
|
||||
wandb
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
[settings]
|
||||
profile=black
|
||||
known_third_party=wandb
|
||||
|
||||
32
README.md
32
README.md
@@ -31,6 +31,7 @@ Features:
|
||||
- [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset)
|
||||
- [Config](#config)
|
||||
- [Train](#train)
|
||||
- [Training w/ Deepspeed](#training-with-deepspeed)
|
||||
- [Inference](#inference)
|
||||
- [Merge LORA to Base](#merge-lora-to-base)
|
||||
- [Common Errors](#common-errors-)
|
||||
@@ -86,7 +87,7 @@ git clone https://github.com/OpenAccess-AI-Collective/axolotl
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging
|
||||
pip3 install -e .[flash-attn]
|
||||
pip3 install -e '.[flash-attn,deepspeed]'
|
||||
pip3 install -U git+https://github.com/huggingface/peft.git
|
||||
|
||||
# finetune lora
|
||||
@@ -121,7 +122,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
|
||||
3. Install axolotl along with python dependencies
|
||||
```bash
|
||||
pip3 install packaging
|
||||
pip3 install -e .[flash-attn]
|
||||
pip3 install -e '.[flash-attn,deepspeed]'
|
||||
```
|
||||
|
||||
- LambdaLabs
|
||||
@@ -157,7 +158,7 @@ accelerate launch -m axolotl.cli.inference examples/openllama-3b/lora.yml \
|
||||
cd axolotl
|
||||
|
||||
pip3 install packaging
|
||||
pip3 install -e .[flash-attn]
|
||||
pip3 install -e '.[flash-attn,deepspeed]'
|
||||
pip3 install protobuf==3.20.3
|
||||
pip3 install -U --ignore-installed requests Pillow psutil scipy
|
||||
```
|
||||
@@ -492,6 +493,8 @@ pad_to_sequence_len:
|
||||
max_packed_sequence_len: 1024
|
||||
# use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
|
||||
sample_packing:
|
||||
# set to 'false' if getting errors during eval with sample_packing on.
|
||||
eval_sample_packing:
|
||||
# you can set these packing optimizations AFTER starting a training at least once.
|
||||
# The trainer will provide recommended values for these values.
|
||||
sample_packing_eff_est:
|
||||
@@ -715,11 +718,6 @@ fsdp_config:
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
- llama Deepspeed
|
||||
```yaml
|
||||
deepspeed: deepspeed/zero3.json
|
||||
```
|
||||
|
||||
##### Weights & Biases Logging
|
||||
|
||||
- wandb options
|
||||
@@ -732,6 +730,24 @@ wandb_run_id:
|
||||
wandb_log_model:
|
||||
```
|
||||
|
||||
### Training with Deepspeed
|
||||
|
||||
Deepspeed is an optimization suite for multi-gpu systems allowing you to train much larger models than you
|
||||
might typically be able to fit into your GPU's VRAM. More information about the various optimization types
|
||||
for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed#what-is-integrated
|
||||
|
||||
We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3.
|
||||
|
||||
```shell
|
||||
accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json
|
||||
```
|
||||
|
||||
or
|
||||
|
||||
```yaml
|
||||
deepspeed: deepspeed/zero1.json
|
||||
```
|
||||
|
||||
### Inference
|
||||
|
||||
Pass the appropriate flag to the train command:
|
||||
|
||||
@@ -1,39 +1,41 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 1,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"betas": "auto",
|
||||
"eps": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
"zero_optimization": {
|
||||
"stage": 1,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"betas": "auto",
|
||||
"eps": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"warmup_type": "linear",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
|
||||
@@ -1,43 +1,45 @@
|
||||
{
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu"
|
||||
},
|
||||
"contiguous_gradients": true,
|
||||
"overlap_comm": true
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu"
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"betas": "auto",
|
||||
"eps": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
"contiguous_gradients": true,
|
||||
"overlap_comm": true
|
||||
},
|
||||
"bf16": {
|
||||
"enabled": "auto"
|
||||
},
|
||||
"fp16": {
|
||||
"enabled": "auto",
|
||||
"auto_cast": false,
|
||||
"loss_scale": 0,
|
||||
"initial_scale_power": 32,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": "auto",
|
||||
"betas": "auto",
|
||||
"eps": "auto",
|
||||
"weight_decay": "auto"
|
||||
}
|
||||
},
|
||||
"scheduler": {
|
||||
"type": "WarmupDecayLR",
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto",
|
||||
"warmup_type": "linear",
|
||||
"total_num_steps": "auto"
|
||||
}
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
|
||||
@@ -45,9 +45,11 @@
|
||||
"params": {
|
||||
"warmup_min_lr": "auto",
|
||||
"warmup_max_lr": "auto",
|
||||
"warmup_num_steps": "auto"
|
||||
"warmup_num_steps": "auto",
|
||||
"warmup_type": "linear"
|
||||
}
|
||||
},
|
||||
"gradient_accumulation_steps": "auto",
|
||||
"train_batch_size": "auto",
|
||||
"train_micro_batch_size_per_gpu": "auto",
|
||||
"wall_clock_breakdown": false
|
||||
|
||||
@@ -13,16 +13,14 @@ ARG CUDA="118"
|
||||
|
||||
ENV PYTHON_VERSION=$PYTHON_VERSION
|
||||
|
||||
RUN apt-get update
|
||||
RUN apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/*
|
||||
|
||||
RUN wget \
|
||||
RUN apt-get update \
|
||||
&& apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev && rm -rf /var/lib/apt/lists/*
|
||||
&& wget \
|
||||
https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& mkdir /root/.conda \
|
||||
&& bash Miniconda3-latest-Linux-x86_64.sh -b \
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh
|
||||
|
||||
RUN conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
&& rm -f Miniconda3-latest-Linux-x86_64.sh \
|
||||
&& conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"
|
||||
|
||||
ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
|
||||
@@ -46,6 +44,8 @@ FROM base-builder AS bnb-builder
|
||||
WORKDIR /workspace
|
||||
ARG CUDA="118"
|
||||
ENV CUDA=$CUDA
|
||||
ARG MAX_JOBS="-1"
|
||||
ENV MAX_JOBS=$MAX_JOBS
|
||||
|
||||
RUN git clone https://github.com/TimDettmers/bitsandbytes.git && \
|
||||
cd bitsandbytes && \
|
||||
@@ -60,8 +60,7 @@ ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
|
||||
# recompile apex
|
||||
RUN python3 -m pip uninstall -y apex
|
||||
RUN git clone https://github.com/NVIDIA/apex
|
||||
# `MAX_JOBS=1` disables parallel building to avoid cpu memory OOM when building image on GitHub Action (standard) runners
|
||||
RUN cd apex && MAX_JOBS=1 python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
|
||||
RUN cd apex && python3 -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
|
||||
|
||||
RUN mkdir -p /workspace/builds
|
||||
COPY --from=bnb-builder /workspace/bitsandbytes /workspace/builds/bitsandbytes
|
||||
|
||||
45
docs/multi-node.md
Normal file
45
docs/multi-node.md
Normal file
@@ -0,0 +1,45 @@
|
||||
# Multi Node
|
||||
|
||||
You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:
|
||||
|
||||
~/.cache/huggingface/accelerate/default_config.yaml
|
||||
```yaml
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
machine_rank: 0 # Set to 0 for the main machine, increment by one for other machines
|
||||
main_process_ip: 10.0.0.4 # Set to main machine's IP
|
||||
main_process_port: 5000
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 2 # Change to the number of machines
|
||||
num_processes: 4 # That's the total number of GPUs, (for example: if you have 2 machines with 4 GPU, put 8)
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
||||
```
|
||||
|
||||
Configure your model to use FSDP with for example:
|
||||
```yaml
|
||||
fsdp:
|
||||
- full_shard
|
||||
- auto_wrap
|
||||
fsdp_config:
|
||||
fsdp_offload_params: true
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
|
||||
```
|
||||
|
||||
## Machine configuration
|
||||
|
||||
On each machine you need a copy of Axolotl, we suggest using the same commit to ensure compatibility.
|
||||
|
||||
You will also need to have the same configuration file for your model on each machine.
|
||||
|
||||
On the main machine only, make sure the port you set as `main_process_port` is open in TCP and reachable by other machines.
|
||||
|
||||
All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.
|
||||
@@ -1,5 +1,5 @@
|
||||
base_model: meta-llama/Llama-2-7b-hf
|
||||
base_model_config: meta-llama/Llama-2-7b-hf
|
||||
base_model: NousResearch/Llama-2-7b-hf
|
||||
base_model_config: NousResearch/Llama-2-7b-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
is_llama_derived_model: true
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
base_model: meta-llama/Llama-2-7b-hf
|
||||
base_model_config: meta-llama/Llama-2-7b-hf
|
||||
base_model: NousResearch/Llama-2-7b-hf
|
||||
base_model_config: NousResearch/Llama-2-7b-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
is_llama_derived_model: true
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
base_model: meta-llama/Llama-2-7b-hf
|
||||
base_model_config: meta-llama/Llama-2-7b-hf
|
||||
base_model: NousResearch/Llama-2-7b-hf
|
||||
base_model_config: NousResearch/Llama-2-7b-hf
|
||||
model_type: LlamaForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
is_llama_derived_model: true
|
||||
|
||||
@@ -7,6 +7,7 @@ peft @ git+https://github.com/huggingface/peft.git
|
||||
transformers @ git+https://github.com/huggingface/transformers.git
|
||||
bitsandbytes>=0.41.1
|
||||
accelerate @ git+https://github.com/huggingface/accelerate
|
||||
deepspeed
|
||||
addict
|
||||
evaluate
|
||||
fire
|
||||
|
||||
9
setup.py
9
setup.py
@@ -13,7 +13,12 @@ def parse_requirements():
|
||||
# Handle custom index URLs
|
||||
_, url = line.split()
|
||||
_dependency_links.append(url)
|
||||
elif "flash-attn" not in line and line and line[0] != "#":
|
||||
elif (
|
||||
"flash-attn" not in line
|
||||
and "deepspeed" not in line
|
||||
and line
|
||||
and line[0] != "#"
|
||||
):
|
||||
# Handle standard packages
|
||||
_install_requires.append(line)
|
||||
return _install_requires, _dependency_links
|
||||
@@ -35,7 +40,7 @@ setup(
|
||||
"flash-attn": [
|
||||
"flash-attn>=2.2.1",
|
||||
],
|
||||
"extras": [
|
||||
"deepspeed": [
|
||||
"deepspeed",
|
||||
],
|
||||
},
|
||||
|
||||
@@ -38,10 +38,15 @@ class TokenizedPromptDataset(Dataset):
|
||||
def process(self, dataset):
|
||||
features = dataset.features.keys()
|
||||
num_proc = min(64, os.cpu_count())
|
||||
map_kwargs = {}
|
||||
if self.prompt_tokenizer.supports_batched:
|
||||
map_kwargs["batched"] = True
|
||||
map_kwargs["batch_size"] = 100
|
||||
return dataset.map(
|
||||
self.prompt_tokenizer.tokenize_prompt,
|
||||
num_proc=num_proc,
|
||||
remove_columns=features,
|
||||
**map_kwargs,
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -12,6 +12,8 @@ import torch.nn.functional as F
|
||||
import transformers
|
||||
from einops import rearrange
|
||||
from flash_attn.bert_padding import pad_input, unpad_input
|
||||
from torch import nn
|
||||
from transformers import LlamaConfig
|
||||
from transformers.modeling_outputs import BaseModelOutputWithPast
|
||||
from transformers.models.llama.modeling_llama import (
|
||||
LlamaDecoderLayer as OriginalLlamaDecoderLayer,
|
||||
@@ -78,6 +80,19 @@ def replace_llama_attn_with_flash_attn(packed: Optional[bool] = False):
|
||||
)
|
||||
|
||||
|
||||
class GaussianDropout(nn.Module):
|
||||
def __init__(self, p=0.5):
|
||||
super(GaussianDropout, self).__init__()
|
||||
if p <= 0 or p >= 1:
|
||||
raise Exception("p value should accomplish 0 < p < 1")
|
||||
self.p = p
|
||||
|
||||
def forward(self, x):
|
||||
stddev = (self.p / (1.0 - self.p)) ** 0.5
|
||||
epsilon = torch.randn_like(x) * stddev
|
||||
return x * epsilon
|
||||
|
||||
|
||||
# Disable the transformation of the attention mask in LlamaModel as the flash attention
|
||||
# requires the attention mask to be the same as the key_padding_mask
|
||||
def _prepare_decoder_attention_mask(
|
||||
@@ -202,7 +217,7 @@ def flashattn_forward(
|
||||
qkv = rearrange(qkv, "b s ... -> (b s) ...")
|
||||
|
||||
output = flash_attn_varlen_qkvpacked_func(
|
||||
qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=None, causal=True
|
||||
qkv, cu_seqlens, max_seqlen, dropout_p=0.0, softmax_scale=None, causal=True
|
||||
)
|
||||
output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
|
||||
elif query_states.shape == key_states.shape:
|
||||
@@ -571,6 +586,15 @@ class LlamaDecoderLayer(OriginalLlamaDecoderLayer):
|
||||
patched version of LlamaDecoderLayer to pass through the precalculated cu_seqlens
|
||||
"""
|
||||
|
||||
def __init__(self, config: LlamaConfig):
|
||||
super(LlamaDecoderLayer, self).__init__(config)
|
||||
self.attn_dropout = None
|
||||
self.mlp_dropout = None
|
||||
if config.dropout_attn:
|
||||
self.attn_dropout = GaussianDropout(p=config.dropout_attn)
|
||||
if config.dropout_mlp:
|
||||
self.mlp_dropout = GaussianDropout(p=config.dropout_mlp)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
@@ -614,12 +638,16 @@ class LlamaDecoderLayer(OriginalLlamaDecoderLayer):
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seqlen=max_seqlen,
|
||||
)
|
||||
if self.training and self.attn_dropout:
|
||||
hidden_states = self.attn_dropout(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||
hidden_states = self.mlp(hidden_states)
|
||||
if self.training and self.mlp_dropout:
|
||||
hidden_states = self.mlp_dropout(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
outputs = (hidden_states,)
|
||||
|
||||
@@ -1,10 +1,81 @@
|
||||
"""
|
||||
Basic completion text
|
||||
"""
|
||||
from typing import Any, Dict, Optional
|
||||
from collections import defaultdict
|
||||
from typing import Any, Dict, Generator, Optional, Tuple
|
||||
|
||||
from axolotl.prompt_tokenizers import CompletionPromptTokenizingStrategy
|
||||
from axolotl.prompters import CompletionPrompter
|
||||
from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
|
||||
|
||||
|
||||
class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
|
||||
"""
|
||||
Tokenizing strategy for Completion prompts.
|
||||
"""
|
||||
|
||||
_field: str = "text"
|
||||
|
||||
def __init__(self, *args, max_length=None, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if max_length is not None:
|
||||
self.max_length = max_length
|
||||
|
||||
@property
|
||||
def supports_batched(self):
|
||||
return True
|
||||
|
||||
@property
|
||||
def field(self) -> str:
|
||||
return self._field
|
||||
|
||||
@field.setter
|
||||
def field(self, new_field: str):
|
||||
self._field = new_field
|
||||
|
||||
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
|
||||
return (
|
||||
prompt[self.field],
|
||||
"",
|
||||
"",
|
||||
)
|
||||
|
||||
def tokenize_prompt(self, prompt):
|
||||
res = defaultdict(lambda: [])
|
||||
feature_names = list(prompt.keys())
|
||||
for row in zip(*prompt.values()):
|
||||
prompt_row = dict(zip(feature_names, row))
|
||||
(
|
||||
instruction,
|
||||
_,
|
||||
_,
|
||||
) = self.parse_instruction_fields(prompt_row)
|
||||
|
||||
full_prompt = self._build_full_prompt(instruction, None, None)
|
||||
tokenized_full_prompt = self._tokenize(full_prompt)
|
||||
|
||||
for key, val in tokenized_full_prompt.items():
|
||||
for i in range(0, len(val), self.sequence_len):
|
||||
res[key].append(val[i : i + self.sequence_len])
|
||||
|
||||
return dict(res)
|
||||
|
||||
def _build_full_prompt(
|
||||
self, instruction, input, response
|
||||
): # pylint: disable=redefined-builtin
|
||||
return next(iter(self.prompter.build_prompt(instruction, input, response)))
|
||||
|
||||
|
||||
class CompletionPrompter:
|
||||
"""
|
||||
Prompter for completion
|
||||
"""
|
||||
|
||||
def build_prompt(
|
||||
self,
|
||||
instruction: str,
|
||||
input=None, # pylint: disable=redefined-builtin, unused-argument
|
||||
output=None, # pylint: disable=unused-argument
|
||||
) -> Generator[str, None, None]:
|
||||
yield instruction
|
||||
|
||||
|
||||
def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
|
||||
@@ -13,6 +84,7 @@ def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
|
||||
tokenizer,
|
||||
cfg.train_on_inputs,
|
||||
cfg.sequence_len,
|
||||
max_length=cfg.sequence_len * 64,
|
||||
)
|
||||
if ds_cfg and "field" in ds_cfg:
|
||||
strat.field = ds_cfg["field"]
|
||||
|
||||
@@ -41,11 +41,16 @@ class PromptTokenizingStrategy(abc.ABC):
|
||||
self.tokenizer: PreTrainedTokenizer = tokenizer
|
||||
self.train_on_inputs = train_on_inputs
|
||||
self.sequence_len = sequence_len
|
||||
self.max_length = sequence_len
|
||||
|
||||
@abc.abstractmethod
|
||||
def tokenize_prompt(self, prompt):
|
||||
pass
|
||||
|
||||
@property
|
||||
def supports_batched(self):
|
||||
return False
|
||||
|
||||
@functools.lru_cache(maxsize=128)
|
||||
def _get_user_token(self):
|
||||
try:
|
||||
@@ -77,7 +82,7 @@ class PromptTokenizingStrategy(abc.ABC):
|
||||
result = self.tokenizer(
|
||||
prompt,
|
||||
truncation=True,
|
||||
max_length=self.sequence_len,
|
||||
max_length=self.max_length,
|
||||
padding=False,
|
||||
return_tensors=None,
|
||||
)
|
||||
@@ -86,7 +91,7 @@ class PromptTokenizingStrategy(abc.ABC):
|
||||
if (
|
||||
len(result["input_ids"]) > 0
|
||||
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
||||
and len(result["input_ids"]) < self.sequence_len
|
||||
and len(result["input_ids"]) < self.max_length
|
||||
and add_eos_token
|
||||
):
|
||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
||||
@@ -247,46 +252,6 @@ class NomicGPT4AllPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
|
||||
)
|
||||
|
||||
|
||||
class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
|
||||
"""
|
||||
Tokenizing strategy for Completion prompts.
|
||||
"""
|
||||
|
||||
_field: str = "text"
|
||||
|
||||
@property
|
||||
def field(self) -> str:
|
||||
return self._field
|
||||
|
||||
@field.setter
|
||||
def field(self, new_field: str):
|
||||
self._field = new_field
|
||||
|
||||
def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
|
||||
return (
|
||||
prompt[self.field],
|
||||
"",
|
||||
"",
|
||||
)
|
||||
|
||||
def tokenize_prompt(self, prompt):
|
||||
(
|
||||
instruction,
|
||||
_,
|
||||
_,
|
||||
) = self.parse_instruction_fields(prompt)
|
||||
|
||||
full_prompt = self._build_full_prompt(instruction, None, None)
|
||||
tokenized_full_prompt = self._tokenize(full_prompt)
|
||||
|
||||
return tokenized_full_prompt
|
||||
|
||||
def _build_full_prompt(
|
||||
self, instruction, input, response
|
||||
): # pylint: disable=redefined-builtin
|
||||
return next(iter(self.prompter.build_prompt(instruction, input, response)))
|
||||
|
||||
|
||||
class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
"""
|
||||
Tokenizing strategy for Reflection prompts.
|
||||
@@ -393,10 +358,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
):
|
||||
if isinstance(part, tuple):
|
||||
if part[0] == "USER:":
|
||||
part = part[0] + part[1] if not user_token else part[1]
|
||||
turn = part[0] + part[1] if not user_token else part[1]
|
||||
# this is still the user query, we should
|
||||
if not part[1].strip():
|
||||
LOG.warning(f"user turn has empty text: {prompt}")
|
||||
res = self._tokenize(
|
||||
part.strip(),
|
||||
turn.strip(),
|
||||
add_eos_token=False,
|
||||
strip_bos_token=True,
|
||||
)
|
||||
@@ -406,10 +373,12 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
|
||||
elif part[0] == "ASSISTANT:":
|
||||
# TODO label assistant token/tokens w/ IGNORE_TOKEN_ID
|
||||
part = part[0] + part[1] if not assistant_token else part[1]
|
||||
# this should be the assistent response, should end with an eos token
|
||||
turn = part[0] + part[1] if not assistant_token else part[1]
|
||||
# this should be the assistant response, should end with an eos token
|
||||
if not part[1].strip():
|
||||
LOG.warning(f"assistant turn has empty text: {prompt}")
|
||||
res = self._tokenize(
|
||||
part.strip(),
|
||||
turn.strip(),
|
||||
add_eos_token=True,
|
||||
strip_bos_token=True,
|
||||
)
|
||||
@@ -444,22 +413,31 @@ class ShareGPTPromptTokenizingStrategy(PromptTokenizingStrategy):
|
||||
raise InvalidDataException(str(err)) from err
|
||||
|
||||
def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
|
||||
result = self.tokenizer(
|
||||
prompt,
|
||||
truncation=True,
|
||||
max_length=self.sequence_len,
|
||||
padding=False,
|
||||
return_tensors=None,
|
||||
)
|
||||
if not prompt.strip():
|
||||
LOG.warning("Empty text requested for tokenization.")
|
||||
result = BatchEncoding(data={"input_ids": [], "attention_mask": []})
|
||||
else:
|
||||
result = self.tokenizer(
|
||||
prompt,
|
||||
truncation=True,
|
||||
max_length=self.sequence_len,
|
||||
padding=False,
|
||||
return_tensors=None,
|
||||
)
|
||||
if (
|
||||
result["input_ids"][-1] != self.tokenizer.eos_token_id
|
||||
len(result["input_ids"]) > 0
|
||||
and result["input_ids"][-1] != self.tokenizer.eos_token_id
|
||||
and len(result["input_ids"]) < self.sequence_len
|
||||
and add_eos_token
|
||||
):
|
||||
result["input_ids"].append(self.tokenizer.eos_token_id)
|
||||
result["attention_mask"].append(1)
|
||||
|
||||
if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
|
||||
if (
|
||||
len(result["input_ids"]) > 0
|
||||
and result["input_ids"][0] == self.tokenizer.bos_token_id
|
||||
and strip_bos_token
|
||||
):
|
||||
result["input_ids"] = result["input_ids"][1:]
|
||||
result["attention_mask"] = result["attention_mask"][1:]
|
||||
|
||||
|
||||
@@ -135,20 +135,6 @@ class SummarizeTLDRPrompter(AlpacaPrompter):
|
||||
self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"
|
||||
|
||||
|
||||
class CompletionPrompter:
|
||||
"""
|
||||
Prompter for completion
|
||||
"""
|
||||
|
||||
def build_prompt(
|
||||
self,
|
||||
instruction: str,
|
||||
input=None, # pylint: disable=redefined-builtin, unused-argument
|
||||
output=None, # pylint: disable=unused-argument
|
||||
) -> Generator[str, None, None]:
|
||||
yield instruction
|
||||
|
||||
|
||||
class GPTeacherPrompter(AlpacaPrompter):
|
||||
"""
|
||||
Prompter for GPTeacher
|
||||
|
||||
@@ -9,8 +9,7 @@ from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import torch
|
||||
|
||||
# add src to the pythonpath so we don't need to pip install this
|
||||
import transformers.modelcard
|
||||
from datasets import Dataset
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
|
||||
@@ -103,6 +102,9 @@ def train(
|
||||
signal.SIGINT, lambda signum, frame: terminate_handler(signum, frame, model)
|
||||
)
|
||||
|
||||
badge_markdown = """[<img src="https://raw.githubusercontent.com/OpenAccess-AI-Collective/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/OpenAccess-AI-Collective/axolotl)"""
|
||||
transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"
|
||||
|
||||
LOG.info("Starting trainer...")
|
||||
if cfg.group_by_length:
|
||||
LOG.info("hang tight... sorting dataset for group_by_length")
|
||||
@@ -138,4 +140,7 @@ def train(
|
||||
|
||||
model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)
|
||||
|
||||
if not cfg.hub_model_id:
|
||||
trainer.create_model_card(model_name=cfg.output_dir.lstrip("./"))
|
||||
|
||||
return model, tokenizer
|
||||
|
||||
@@ -1,14 +1,44 @@
|
||||
"""Benchmarking and measurement utilities"""
|
||||
import functools
|
||||
|
||||
import pynvml
|
||||
import torch
|
||||
from pynvml.nvml import NVMLError
|
||||
|
||||
|
||||
def check_cuda_device(default_value):
|
||||
"""
|
||||
wraps a function and returns the default value instead of running the
|
||||
wrapped function if cuda isn't available or the device is auto
|
||||
:param default_value:
|
||||
:return:
|
||||
"""
|
||||
|
||||
def deco(func):
|
||||
@functools.wraps(func)
|
||||
def wrapper(*args, **kwargs):
|
||||
device = kwargs.get("device", args[0] if args else None)
|
||||
|
||||
if (
|
||||
not torch.cuda.is_available()
|
||||
or device == "auto"
|
||||
or torch.device(device).type == "cpu"
|
||||
):
|
||||
return default_value
|
||||
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return wrapper
|
||||
|
||||
return deco
|
||||
|
||||
|
||||
@check_cuda_device(0.0)
|
||||
def gpu_memory_usage(device=0):
|
||||
return torch.cuda.memory_allocated(device) / 1024.0**3
|
||||
|
||||
|
||||
@check_cuda_device((0.0, 0.0, 0.0))
|
||||
def gpu_memory_usage_all(device=0):
|
||||
usage = torch.cuda.memory_allocated(device) / 1024.0**3
|
||||
reserved = torch.cuda.memory_reserved(device) / 1024.0**3
|
||||
@@ -16,6 +46,7 @@ def gpu_memory_usage_all(device=0):
|
||||
return usage, reserved - usage, max(0, smi - reserved)
|
||||
|
||||
|
||||
@check_cuda_device(0.0)
|
||||
def gpu_memory_usage_smi(device=0):
|
||||
if isinstance(device, torch.device):
|
||||
device = device.index
|
||||
@@ -31,9 +62,6 @@ def gpu_memory_usage_smi(device=0):
|
||||
|
||||
|
||||
def log_gpu_memory_usage(log, msg, device):
|
||||
if not torch.cuda.is_available() or device == "auto":
|
||||
return (0, 0, 0)
|
||||
|
||||
usage, cache, misc = gpu_memory_usage_all(device)
|
||||
extras = []
|
||||
if cache > 0:
|
||||
|
||||
@@ -43,26 +43,26 @@ LOG = logging.getLogger("axolotl.callbacks")
|
||||
IGNORE_INDEX = -100
|
||||
|
||||
|
||||
class SavePeftModelCallback(TrainerCallback): # pylint: disable=too-few-public-methods
|
||||
"""Callback to save the PEFT adapter"""
|
||||
class EvalFirstStepCallback(
|
||||
TrainerCallback
|
||||
): # pylint: disable=too-few-public-methods disable=unused-argument
|
||||
"""
|
||||
Callback to trigger evals on the first step
|
||||
"""
|
||||
|
||||
def on_save(
|
||||
def on_step_end(
|
||||
self,
|
||||
args: TrainingArguments,
|
||||
state: TrainerState,
|
||||
control: TrainerControl,
|
||||
**kwargs,
|
||||
):
|
||||
checkpoint_folder = os.path.join(
|
||||
args.output_dir,
|
||||
f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
|
||||
)
|
||||
|
||||
peft_model_path = os.path.join(checkpoint_folder, "adapter_model")
|
||||
kwargs["model"].save_pretrained(
|
||||
peft_model_path, save_safetensors=args.save_safetensors
|
||||
)
|
||||
|
||||
if (
|
||||
args.evaluation_strategy == IntervalStrategy.STEPS
|
||||
and args.eval_steps < 1.0
|
||||
and state.global_step == 1
|
||||
):
|
||||
control.should_evaluate = True
|
||||
return control
|
||||
|
||||
|
||||
|
||||
@@ -94,7 +94,7 @@ def validate_config(cfg):
|
||||
if not cfg.bf16 and not cfg.bfloat16:
|
||||
LOG.info("bf16 support detected, but not enabled for this configuration.")
|
||||
else:
|
||||
if cfg.bf16 or cfg.bfloat16:
|
||||
if not cfg.merge_lora and (cfg.bf16 or cfg.bfloat16):
|
||||
raise ValueError(
|
||||
"bf16 requested, but AMP is not supported on this GPU. Requires Ampere series or above."
|
||||
)
|
||||
|
||||
@@ -77,7 +77,9 @@ def gather_scalar_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-n
|
||||
value_scalar = fn()
|
||||
if not is_distributed():
|
||||
return [value_scalar]
|
||||
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
||||
value_tensor = torch.tensor(
|
||||
value_scalar, device=torch.cuda.current_device()
|
||||
).float()
|
||||
|
||||
if not is_main_process():
|
||||
dist.gather(value_tensor, dst=0)
|
||||
@@ -137,9 +139,13 @@ def compute_and_broadcast(fn): # pylint: disable=invalid-name
|
||||
"""
|
||||
if is_main_process():
|
||||
value_scalar = fn()
|
||||
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
||||
value_tensor = torch.tensor(
|
||||
value_scalar, device=torch.cuda.current_device()
|
||||
).float()
|
||||
else:
|
||||
value_tensor = torch.tensor(0.0, device=dist.get_rank()) # Placeholder tensor
|
||||
value_tensor = torch.tensor(
|
||||
0.0, device=torch.cuda.current_device()
|
||||
) # Placeholder tensor
|
||||
|
||||
# Broadcast the tensor to all processes.
|
||||
barrier()
|
||||
@@ -164,7 +170,9 @@ def gather_from_all_ranks(fn, world_size=1): # pylint: disable=invalid-name
|
||||
- A list of computed values from all ranks if on the gathering rank, otherwise None.
|
||||
"""
|
||||
value_scalar = fn()
|
||||
value_tensor = torch.tensor(value_scalar, device=dist.get_rank()).float()
|
||||
value_tensor = torch.tensor(
|
||||
value_scalar, device=torch.cuda.current_device()
|
||||
).float()
|
||||
|
||||
# Placeholder tensor for gathering results
|
||||
if is_main_process():
|
||||
|
||||
@@ -10,6 +10,7 @@ import torch
|
||||
import transformers
|
||||
from optimum.bettertransformer import BetterTransformer
|
||||
from peft import PeftConfig, prepare_model_for_kbit_training
|
||||
from peft.tuners.lora import QuantLinear
|
||||
from transformers import ( # noqa: F401
|
||||
AutoConfig,
|
||||
AutoModelForCausalLM,
|
||||
@@ -309,16 +310,26 @@ def load_model(
|
||||
):
|
||||
config.max_sequence_length = cfg.sequence_len
|
||||
LOG.warning(f"increasing context length to {cfg.sequence_len}")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
config=config,
|
||||
device_map=cfg.device_map,
|
||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||
torch_dtype=cfg.torch_dtype,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
**model_kwargs,
|
||||
)
|
||||
if cfg.gptq:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
config=config,
|
||||
device_map=cfg.device_map,
|
||||
torch_dtype=cfg.torch_dtype,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
**model_kwargs,
|
||||
)
|
||||
else:
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
config=config,
|
||||
device_map=cfg.device_map,
|
||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||
torch_dtype=cfg.torch_dtype,
|
||||
trust_remote_code=cfg.trust_remote_code or False,
|
||||
**model_kwargs,
|
||||
)
|
||||
except Exception as err: # pylint: disable=broad-exception-caught
|
||||
LOG.error(
|
||||
"Exception raised attempting to load model, retrying with AutoModelForCausalLM"
|
||||
@@ -466,10 +477,10 @@ def load_llama_adapter(model, cfg):
|
||||
|
||||
|
||||
def find_all_linear_names(model):
|
||||
cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
|
||||
cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear, QuantLinear)
|
||||
lora_module_names = set()
|
||||
for name, module in model.named_modules():
|
||||
if isinstance(module, cls):
|
||||
if isinstance(module, cls) or "Linear" in module.__class__.__name__:
|
||||
names = name.split(".")
|
||||
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
|
||||
|
||||
|
||||
@@ -28,9 +28,9 @@ from transformers.trainer_pt_utils import SequentialDistributedSampler
|
||||
|
||||
from axolotl.monkeypatch.relora import ReLoRACallback, ReLoRAScheduler
|
||||
from axolotl.utils.callbacks import (
|
||||
EvalFirstStepCallback,
|
||||
GPUStatsCallback,
|
||||
SaveBetterTransformerModelCallback,
|
||||
SavePeftModelCallback,
|
||||
bench_eval_callback_factory,
|
||||
log_prediction_callback_factory,
|
||||
)
|
||||
@@ -675,6 +675,7 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
||||
(cfg.load_best_model_at_end is not False or cfg.early_stopping_patience)
|
||||
and cfg.val_set_size > 0
|
||||
and cfg.save_steps
|
||||
and cfg.eval_steps
|
||||
and cfg.save_steps % cfg.eval_steps == 0
|
||||
)
|
||||
or False,
|
||||
@@ -704,16 +705,11 @@ def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_
|
||||
|
||||
callbacks = []
|
||||
callbacks.append(GPUStatsCallback(cfg))
|
||||
callbacks.append(EvalFirstStepCallback)
|
||||
|
||||
if cfg.relora_steps:
|
||||
callbacks.append(ReLoRACallback(cfg))
|
||||
|
||||
if cfg.local_rank == 0 and cfg.adapter in [
|
||||
"lora",
|
||||
"qlora",
|
||||
]: # only save in rank 0
|
||||
callbacks.append(SavePeftModelCallback)
|
||||
|
||||
if hasattr(model, "use_bettertransformer") and model.use_bettertransformer is True:
|
||||
callbacks.append(SaveBetterTransformerModelCallback)
|
||||
|
||||
|
||||
@@ -6,6 +6,7 @@ import logging
|
||||
import os
|
||||
import tempfile
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
@@ -24,6 +25,7 @@ class TestLoraLlama(unittest.TestCase):
|
||||
|
||||
def test_lora(self):
|
||||
# pylint: disable=duplicate-code
|
||||
output_dir = tempfile.mkdtemp()
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
@@ -51,7 +53,7 @@ class TestLoraLlama(unittest.TestCase):
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 8,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": tempfile.mkdtemp(),
|
||||
"output_dir": output_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
@@ -62,9 +64,11 @@ class TestLoraLlama(unittest.TestCase):
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(output_dir) / "adapter_model.bin").exists()
|
||||
|
||||
def test_lora_packing(self):
|
||||
# pylint: disable=duplicate-code
|
||||
output_dir = tempfile.mkdtemp()
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
@@ -94,7 +98,7 @@ class TestLoraLlama(unittest.TestCase):
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 8,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": tempfile.mkdtemp(),
|
||||
"output_dir": output_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
@@ -105,3 +109,53 @@ class TestLoraLlama(unittest.TestCase):
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(output_dir) / "adapter_model.bin").exists()
|
||||
|
||||
def test_lora_gptq(self):
|
||||
# pylint: disable=duplicate-code
|
||||
output_dir = tempfile.mkdtemp()
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "TheBlokeAI/jackfram_llama-68m-GPTQ",
|
||||
"base_model_config": "TheBlokeAI/jackfram_llama-68m-GPTQ",
|
||||
"model_type": "AutoModelForCausalLM",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"sample_packing": True,
|
||||
"flash_attention": True,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"gptq": True,
|
||||
"gptq_disable_exllama": True,
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 64,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"unk_token": "<unk>",
|
||||
"bos_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"save_steps": 0.5,
|
||||
"micro_batch_size": 8,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": output_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(output_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@@ -31,9 +31,9 @@ class TestPhi(unittest.TestCase):
|
||||
"trust_remote_code": True,
|
||||
"model_type": "MixFormerSequentialForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"sequence_len": 2048,
|
||||
"sequence_len": 512,
|
||||
"sample_packing": False,
|
||||
"load_in_8bit": True,
|
||||
"load_in_8bit": False,
|
||||
"adapter": None,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
@@ -55,8 +55,9 @@ class TestPhi(unittest.TestCase):
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": tempfile.mkdtemp(),
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"bf16": True,
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
@@ -74,9 +75,9 @@ class TestPhi(unittest.TestCase):
|
||||
"trust_remote_code": True,
|
||||
"model_type": "MixFormerSequentialForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"sequence_len": 2048,
|
||||
"sequence_len": 512,
|
||||
"sample_packing": True,
|
||||
"load_in_8bit": True,
|
||||
"load_in_8bit": False,
|
||||
"adapter": None,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
@@ -98,8 +99,9 @@ class TestPhi(unittest.TestCase):
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": tempfile.mkdtemp(),
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"bf16": True,
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
|
||||
52
tests/fixtures/conversation.missingturns.json
vendored
Normal file
52
tests/fixtures/conversation.missingturns.json
vendored
Normal file
@@ -0,0 +1,52 @@
|
||||
{
|
||||
"conversations": [
|
||||
{
|
||||
"from": "human",
|
||||
"value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?"
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": ""
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "could you give me the gist of how it could though?"
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer."
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "give me a specific example"
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1 1 1 1 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 ? ? ? 1\n 1 1 1 1 1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1 1 1 1 1\n 1 M ? ? 1\n 1 ? ? ? 1\n 1 ? ? M 1\n 1 1 1 1 1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper."
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?"
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper."
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "how would i do it with persistent homology or any of the other topological invariants?"
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper."
|
||||
},
|
||||
{
|
||||
"from": "human",
|
||||
"value": "how could i implement this in python?"
|
||||
},
|
||||
{
|
||||
"from": "gpt",
|
||||
"value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project."
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -3,7 +3,9 @@ import json
|
||||
import logging
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer, LlamaTokenizer
|
||||
|
||||
from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
|
||||
@@ -29,6 +31,12 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
|
||||
Test class for prompt tokenization strategies.
|
||||
"""
|
||||
|
||||
_caplog: Optional[pytest.LogCaptureFixture] = None
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def inject_fixtures(self, caplog):
|
||||
self._caplog = caplog
|
||||
|
||||
def setUp(self) -> None:
|
||||
# pylint: disable=duplicate-code
|
||||
self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
|
||||
@@ -64,6 +72,24 @@ class TestPromptTokenizationStrategies(unittest.TestCase):
|
||||
self.assertEqual(len(example[fields]), len(tokenized_conversation[fields]))
|
||||
self.assertEqual(example[fields], tokenized_conversation[fields])
|
||||
|
||||
def test_sharegpt_warnings_integration(self):
|
||||
with open(
|
||||
Path(__file__).parent / "fixtures/conversation.missingturns.json",
|
||||
encoding="utf-8",
|
||||
) as fin:
|
||||
data = fin.read()
|
||||
conversation = json.loads(data)
|
||||
prompter = ShareGPTPrompter("chat")
|
||||
strat = ShareGPTPromptTokenizingStrategy(
|
||||
prompter,
|
||||
self.tokenizer,
|
||||
False,
|
||||
2048,
|
||||
)
|
||||
with self._caplog.at_level(logging.WARNING):
|
||||
strat.tokenize_prompt(conversation)
|
||||
assert "assistant turn has empty text" in self._caplog.records[1].message
|
||||
|
||||
def test_no_sys_prompt(self):
|
||||
"""
|
||||
tests the interface between the user and assistant parts
|
||||
|
||||
@@ -351,3 +351,26 @@ class ValidationTest(unittest.TestCase):
|
||||
regex_exp = r".*set only one of max_packed_sequence_len \(deprecated soon\) or sample_packing.*"
|
||||
with pytest.raises(ValueError, match=regex_exp):
|
||||
validate_config(cfg)
|
||||
|
||||
def test_merge_lora_no_bf16_fail(self):
|
||||
"""
|
||||
This is assumed to be run on a CPU machine, so bf16 is not supported.
|
||||
"""
|
||||
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"bf16": True,
|
||||
}
|
||||
)
|
||||
|
||||
with pytest.raises(ValueError, match=r".*AMP is not supported on this GPU*"):
|
||||
validate_config(cfg)
|
||||
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"bf16": True,
|
||||
"merge_lora": True,
|
||||
}
|
||||
)
|
||||
|
||||
validate_config(cfg)
|
||||
|
||||
Reference in New Issue
Block a user