Compare commits
38 Commits
deepspeed-
...
v0.4.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
1427d5b502 | ||
|
|
54d2ac155b | ||
|
|
af0243021c | ||
|
|
8a49309489 | ||
|
|
5bce45f800 | ||
|
|
d85d4942cf | ||
|
|
02f2c720fc | ||
|
|
71141deb18 | ||
|
|
dc051b861d | ||
|
|
59a31fe613 | ||
|
|
814aee6603 | ||
|
|
b715cd549a | ||
|
|
fb7f9b9516 | ||
|
|
cc250391a0 | ||
|
|
9135b9e2aa | ||
|
|
7523d1f557 | ||
|
|
5439707489 | ||
|
|
684038111e | ||
|
|
cda52dc32b | ||
|
|
e799e08d3c | ||
|
|
0f77b8d798 | ||
|
|
32580c1ca7 | ||
|
|
802f9667a2 | ||
|
|
b8e5603467 | ||
|
|
782b6a4216 | ||
|
|
eaaeefce55 | ||
|
|
f5a828aa20 | ||
|
|
fccb542b47 | ||
|
|
2ce5c0d68a | ||
|
|
3db5f2fd17 | ||
|
|
cbecf3e62a | ||
|
|
729740df81 | ||
|
|
08b8ba09a5 | ||
|
|
6910e6a8ca | ||
|
|
1d70f24b50 | ||
|
|
317fa2555a | ||
|
|
1e56b88cde | ||
|
|
7570446596 |
4
.github/workflows/main.yml
vendored
4
.github/workflows/main.yml
vendored
@@ -7,7 +7,7 @@ on:
|
||||
|
||||
jobs:
|
||||
build-axolotl:
|
||||
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
fail-fast: false
|
||||
@@ -79,7 +79,7 @@ jobs:
|
||||
|
||||
build-axolotl-runpod:
|
||||
needs: build-axolotl
|
||||
if: github.repository_owner == 'OpenAccess-AI-Collective'
|
||||
if: ${{ ! contains(github.event.commits[0].message, '[skip docker]]') && github.repository_owner == 'OpenAccess-AI-Collective' }}
|
||||
# this job needs to be run on self-hosted GPU runners...
|
||||
strategy:
|
||||
matrix:
|
||||
|
||||
27
README.md
27
README.md
@@ -105,6 +105,9 @@ pip3 install -e '.[flash-attn,deepspeed]'
|
||||
|
||||
### Usage
|
||||
```bash
|
||||
# preprocess datasets - optional but recommended
|
||||
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/openllama-3b/lora.yml
|
||||
|
||||
# finetune lora
|
||||
accelerate launch -m axolotl.cli.train examples/openllama-3b/lora.yml
|
||||
|
||||
@@ -464,8 +467,8 @@ See [examples](examples) for quick start. It is recommended to duplicate and mod
|
||||
```yaml
|
||||
load_in_4bit: true
|
||||
load_in_8bit: true
|
||||
bf16: true # require >=ampere
|
||||
fp16: true
|
||||
bf16: auto # require >=ampere, auto will detect if your GPU supports this and choose automatically.
|
||||
fp16: # leave empty to use fp16 when bf16 is 'auto'. set to false if you want to fallback to fp32
|
||||
tf32: true # require >=ampere
|
||||
bfloat16: true # require >=ampere, use instead of bf16 when you don't want AMP (automatic mixed precision)
|
||||
float16: true # use instead of fp16 when you don't want AMP
|
||||
@@ -618,6 +621,9 @@ push_dataset_to_hub: # repo path
|
||||
# The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
|
||||
# if not set.
|
||||
dataset_processes: # defaults to os.cpu_count() if not set
|
||||
# Keep dataset in memory while preprocessing
|
||||
# Only needed if cached dataset is taking too much storage
|
||||
dataset_keep_in_memory:
|
||||
# push checkpoints to hub
|
||||
hub_model_id: # repo path to push finetuned model
|
||||
# how to push checkpoints to hub
|
||||
@@ -639,10 +645,6 @@ sequence_len: 2048
|
||||
# Pad inputs so each step uses constant sized buffers
|
||||
# This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
|
||||
pad_to_sequence_len:
|
||||
# Max sequence length to concatenate training samples together up to
|
||||
# Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
|
||||
# FutureWarning: This will soon be DEPRECATED
|
||||
max_packed_sequence_len: 1024
|
||||
# Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
|
||||
sample_packing:
|
||||
# Set to 'false' if getting errors during eval with sample_packing on.
|
||||
@@ -834,7 +836,8 @@ flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
|
||||
# Whether to use scaled-dot-product attention
|
||||
# https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
|
||||
sdp_attention:
|
||||
|
||||
# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
|
||||
s2_attention:
|
||||
# Resume from a specific checkpoint dir
|
||||
resume_from_checkpoint:
|
||||
# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
|
||||
@@ -858,7 +861,7 @@ tokens:
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
# Deepspeed config path. e.g., deepspeed/zero3.json
|
||||
# Deepspeed config path. e.g., deepspeed_configs/zero3.json
|
||||
deepspeed:
|
||||
|
||||
# Advanced DDP Arguments
|
||||
@@ -979,11 +982,11 @@ for deepspeed is available at https://huggingface.co/docs/accelerate/main/en/usa
|
||||
We provide several default deepspeed JSON configurations for ZeRO stage 1, 2, and 3.
|
||||
|
||||
```yaml
|
||||
deepspeed: deepspeed/zero1.json
|
||||
deepspeed: deepspeed_configs/zero1.json
|
||||
```
|
||||
|
||||
```shell
|
||||
accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed/zero1.json
|
||||
accelerate launch -m axolotl.cli.train examples/llama-2/config.py --deepspeed deepspeed_configs/zero1.json
|
||||
```
|
||||
|
||||
##### FSDP
|
||||
@@ -1122,7 +1125,7 @@ If you decode a prompt constructed by axolotl, you might see spaces between toke
|
||||
1. Materialize some data using `python -m axolotl.cli.preprocess your_config.yml --debug`, and then decode the first few rows with your model's tokenizer.
|
||||
2. During inference, right before you pass a tensor of token ids to your model, decode these tokens back into a string.
|
||||
3. Make sure the inference string from #2 looks **exactly** like the data you fine tuned on from #1, including spaces and new lines. If they aren't the same adjust your inference server accordingly.
|
||||
4. As an additional troubleshooting step, you can look look at the token ids between 1 and 2 to make sure they are identical.
|
||||
4. As an additional troubleshooting step, you can look at the token ids between 1 and 2 to make sure they are identical.
|
||||
|
||||
Having misalignment between your prompts during training and inference can cause models to perform very poorly, so it is worth checking this. See [this blog post](https://hamel.dev/notes/llm/05_tokenizer_gotchas.html) for a concrete example.
|
||||
|
||||
@@ -1149,7 +1152,7 @@ Building something cool with Axolotl? Consider adding a badge to your model card
|
||||
Check out some of the projects and models that have been built using Axolotl! Have a model you'd like to add to our Community Showcase? Open a PR with your model.
|
||||
|
||||
Open Access AI Collective
|
||||
- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b)
|
||||
- [Minotaur 13b](https://huggingface.co/openaccess-ai-collective/minotaur-13b-fixed)
|
||||
- [Manticore 13b](https://huggingface.co/openaccess-ai-collective/manticore-13b)
|
||||
- [Hippogriff 30b](https://huggingface.co/openaccess-ai-collective/hippogriff-30b-chat)
|
||||
|
||||
|
||||
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
||||
WORKDIR /workspace
|
||||
|
||||
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} deepspeed-kernels --extra-index-url https://download.pytorch.org/whl/cu$CUDA
|
||||
python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA
|
||||
|
||||
RUN git lfs install --skip-repo && \
|
||||
pip3 install awscli && \
|
||||
|
||||
@@ -7,9 +7,12 @@ ENV TRANSFORMERS_CACHE="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
|
||||
ENV HF_HUB_ENABLE_HF_TRANSFER="1"
|
||||
|
||||
EXPOSE 8888
|
||||
EXPOSE 22
|
||||
|
||||
COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
|
||||
|
||||
RUN pip install jupyterlab notebook && \
|
||||
RUN pip install jupyterlab notebook ipywidgets && \
|
||||
jupyter lab clean
|
||||
RUN apt install --yes --no-install-recommends openssh-server tmux && \
|
||||
mkdir -p ~/.ssh && \
|
||||
|
||||
16
docs/rlhf.md
16
docs/rlhf.md
@@ -19,14 +19,14 @@ The various RL training methods are implemented in trl and wrapped via axolotl.
|
||||
|
||||
#### DPO
|
||||
```yaml
|
||||
rl: true
|
||||
rl: dpo
|
||||
datasets:
|
||||
- path: Intel/orca_dpo_pairs
|
||||
split: train
|
||||
type: intel_apply_chatml
|
||||
type: chatml.intel
|
||||
- path: argilla/ultrafeedback-binarized-preferences
|
||||
split: train
|
||||
type: argilla_apply_chatml
|
||||
type: chatml.argilla
|
||||
```
|
||||
|
||||
#### IPO
|
||||
@@ -34,6 +34,16 @@ datasets:
|
||||
rl: ipo
|
||||
```
|
||||
|
||||
#### Using local dataset files
|
||||
```yaml
|
||||
datasets:
|
||||
- ds_type: json
|
||||
data_files:
|
||||
- orca_rlhf.jsonl
|
||||
split: train
|
||||
type: chatml.intel
|
||||
```
|
||||
|
||||
#### Trl autounwrap for peft
|
||||
|
||||
Trl supports autounwrapping peft models, so that a ref model does not need to be additionally loaded, leading to less VRAM needed. This is on by default. To turn it off, pass the following config.
|
||||
|
||||
@@ -53,8 +53,8 @@ lr_quadratic_warmup: true
|
||||
learning_rate: 0.000085
|
||||
train_on_inputs: true
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: false
|
||||
|
||||
@@ -36,8 +36,8 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
|
||||
@@ -41,8 +41,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -52,6 +52,7 @@ local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -41,8 +41,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -52,6 +52,7 @@ local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -41,8 +41,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -52,6 +52,7 @@ local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -38,8 +38,8 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.00003
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
@@ -60,5 +60,5 @@ fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
bos_token: ">>ABSTRACT<<"
|
||||
bos_token: "<|endoftext|>"
|
||||
eos_token: "<|endoftext|>"
|
||||
|
||||
@@ -64,8 +64,8 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
# stop training after this many evaluation losses have increased in a row
|
||||
@@ -89,5 +89,5 @@ fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
bos_token: ">>ABSTRACT<<"
|
||||
bos_token: "<|endoftext|>"
|
||||
eos_token: "<|endoftext|>"
|
||||
|
||||
@@ -38,8 +38,8 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.00003
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
@@ -60,5 +60,5 @@ fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
pad_token: "<|endoftext|>"
|
||||
bos_token: ">>ABSTRACT<<"
|
||||
bos_token: "<|endoftext|>"
|
||||
eos_token: "<|endoftext|>"
|
||||
|
||||
@@ -33,8 +33,8 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.0001
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
|
||||
@@ -31,7 +31,7 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.00003
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
bf16: auto
|
||||
tf32: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
|
||||
@@ -41,8 +41,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -62,7 +62,7 @@ evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed: #deepspeed/zero2.json # multi-gpu only
|
||||
deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
|
||||
weight_decay: 0.1
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
@@ -41,8 +41,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -52,6 +52,7 @@ local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
s2_attention:
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -47,8 +47,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -34,8 +34,8 @@ learning_rate: 5e-5
|
||||
train_on_inputs: false
|
||||
group_by_length: true
|
||||
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: false
|
||||
|
||||
12
examples/mistral/Mistral-7b-example/README.md
Normal file
12
examples/mistral/Mistral-7b-example/README.md
Normal file
@@ -0,0 +1,12 @@
|
||||
# Description
|
||||
This repository presents an in-depth guide for fine-tuning Mistral-7b or any other compatible model using Axolotl, tailored specifically for chatbot development. It streamlines the process of fine-tuning and uploading the enhanced model to HuggingFace 🤗, thereby serving as an invaluable tool for developers in the AI and chatbot domain.
|
||||
|
||||
**What’s Inside:**
|
||||
|
||||
Beginner-Friendly Instructions: Comprehensive steps to guide you through fine-tuning your chosen model, including details on the data structure (jsonl), configuration, and the code itself.
|
||||
|
||||
Hardware Utilized: For reference, the fine-tuning in this guide was performed using 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel).
|
||||
|
||||
**Uploading to HuggingFace 🤗:**
|
||||
To upload your fine-tuned model to Hugging Face, include the following files:
|
||||

|
||||
970
examples/mistral/Mistral-7b-example/code.ipynb
Normal file
970
examples/mistral/Mistral-7b-example/code.ipynb
Normal file
@@ -0,0 +1,970 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "3fe31229-8f6b-48bc-a86d-af8e5466d11c",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"GPU available? True\n",
|
||||
"BF16 is supported? True\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# Check if GPU is available I used 4x NVIDIA GeForce RTX 3090 (rented 2.1.2-cuda12.1-cudnn8-devel)\n",
|
||||
"import torch\n",
|
||||
"print('GPU available?', torch.cuda.is_available())\n",
|
||||
"print('BF16 is supported?', torch.cuda.is_bf16_supported())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "1dee845b-f3cb-4b1e-bdd9-1a918eac140b",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Collecting huggingface_hub\n",
|
||||
" Downloading huggingface_hub-0.20.1-py3-none-any.whl.metadata (12 kB)\n",
|
||||
"Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (3.9.0)\n",
|
||||
"Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2023.10.0)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (2.31.0)\n",
|
||||
"Requirement already satisfied: tqdm>=4.42.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.65.0)\n",
|
||||
"Requirement already satisfied: pyyaml>=5.1 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (6.0.1)\n",
|
||||
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (4.7.1)\n",
|
||||
"Requirement already satisfied: packaging>=20.9 in /opt/conda/lib/python3.10/site-packages (from huggingface_hub) (23.1)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2.0.4)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (1.26.18)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface_hub) (2023.7.22)\n",
|
||||
"Downloading huggingface_hub-0.20.1-py3-none-any.whl (330 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m330.1/330.1 kB\u001b[0m \u001b[31m8.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
|
||||
"\u001b[?25hInstalling collected packages: huggingface_hub\n",
|
||||
"Successfully installed huggingface_hub-0.20.1\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!pip install huggingface_hub"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "88731672-9050-4034-8266-11aaace2a44e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"from huggingface_hub import notebook_login"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "6b5aa7d7-3b18-4c14-afd4-043c2c545259",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"application/vnd.jupyter.widget-view+json": {
|
||||
"model_id": "60df98d7b0294289aad8b6c8cd023c3b",
|
||||
"version_major": 2,
|
||||
"version_minor": 0
|
||||
},
|
||||
"text/plain": [
|
||||
"VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#Login to huggingface so you can push the model to hub later\n",
|
||||
"import sys\n",
|
||||
"stdout = sys.stdout\n",
|
||||
"notebook_login()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "b74d0635-5033-4494-b7bd-ff6822103d93",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"#I noticed that when you use notebook_login() nothing gets printed after so we use sys \n",
|
||||
"sys.stdout = stdout"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "e3c3b088-45e7-484b-ae39-66beabc48da8",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Cloning into 'axolotl'...\n",
|
||||
"remote: Enumerating objects: 235, done.\u001b[K\n",
|
||||
"remote: Counting objects: 100% (235/235), done.\u001b[K\n",
|
||||
"remote: Compressing objects: 100% (207/207), done.\u001b[K\n",
|
||||
"remote: Total 235 (delta 48), reused 123 (delta 13), pack-reused 0\u001b[K\n",
|
||||
"Receiving objects: 100% (235/235), 1.46 MiB | 11.65 MiB/s, done.\n",
|
||||
"Resolving deltas: 100% (48/48), done.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#axolotl\n",
|
||||
"!git clone -b main --depth 1 https://github.com/OpenAccess-AI-Collective/axolotl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "66927751-4fd6-4477-97fc-6ab08c9d9a74",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/axolotl\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cd axolotl"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "fcccf8da-353b-4d70-8f55-5cfe08c7e6b9",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0mObtaining file:///axolotl\n",
|
||||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting auto-gptq==0.5.1\n",
|
||||
" Downloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)\n",
|
||||
"Requirement already satisfied: packaging in /opt/conda/lib/python3.10/site-packages (23.1)\n",
|
||||
"Collecting peft==0.6.0\n",
|
||||
" Downloading peft-0.6.0-py3-none-any.whl.metadata (23 kB)\n",
|
||||
"Collecting transformers==4.36.2\n",
|
||||
" Downloading transformers-4.36.2-py3-none-any.whl.metadata (126 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m126.8/126.8 kB\u001b[0m \u001b[31m9.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting tokenizers==0.15.0\n",
|
||||
" Downloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
|
||||
"Collecting bitsandbytes>=0.41.1\n",
|
||||
" Downloading bitsandbytes-0.41.3.post2-py3-none-any.whl.metadata (9.8 kB)\n",
|
||||
"Collecting accelerate==0.24.1\n",
|
||||
" Downloading accelerate-0.24.1-py3-none-any.whl.metadata (18 kB)\n",
|
||||
"Collecting addict\n",
|
||||
" Downloading addict-2.4.0-py3-none-any.whl (3.8 kB)\n",
|
||||
"Collecting fire\n",
|
||||
" Downloading fire-0.5.0.tar.gz (88 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m88.3/88.3 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: PyYAML>=6.0 in /opt/conda/lib/python3.10/site-packages (6.0.1)\n",
|
||||
"Collecting datasets>=2.15.0\n",
|
||||
" Downloading datasets-2.16.0-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Collecting sentencepiece\n",
|
||||
" Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m47.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting wandb\n",
|
||||
" Downloading wandb-0.16.1-py3-none-any.whl.metadata (9.8 kB)\n",
|
||||
"Collecting einops\n",
|
||||
" Downloading einops-0.7.0-py3-none-any.whl.metadata (13 kB)\n",
|
||||
"Collecting optimum==1.13.2\n",
|
||||
" Downloading optimum-1.13.2.tar.gz (300 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.0/301.0 kB\u001b[0m \u001b[31m72.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Installing build dependencies ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting hf_transfer\n",
|
||||
" Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.5 kB)\n",
|
||||
"Collecting colorama\n",
|
||||
" Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n",
|
||||
"Collecting numba\n",
|
||||
" Downloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)\n",
|
||||
"Requirement already satisfied: numpy>=1.24.4 in /opt/conda/lib/python3.10/site-packages (1.26.0)\n",
|
||||
"Collecting bert-score==0.3.13\n",
|
||||
" Downloading bert_score-0.3.13-py3-none-any.whl (61 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m61.1/61.1 kB\u001b[0m \u001b[31m20.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting evaluate==0.4.0\n",
|
||||
" Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m81.4/81.4 kB\u001b[0m \u001b[31m26.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting rouge-score==0.1.2\n",
|
||||
" Downloading rouge_score-0.1.2.tar.gz (17 kB)\n",
|
||||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting scipy\n",
|
||||
" Downloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.4/60.4 kB\u001b[0m \u001b[31m17.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting scikit-learn==1.2.2\n",
|
||||
" Downloading scikit_learn-1.2.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.6/9.6 MB\u001b[0m \u001b[31m83.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0mm\n",
|
||||
"\u001b[?25hCollecting pynvml\n",
|
||||
" Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.1/53.1 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting art\n",
|
||||
" Downloading art-6.1-py3-none-any.whl.metadata (69 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m69.9/69.9 kB\u001b[0m \u001b[31m21.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting fschat==0.2.34\n",
|
||||
" Downloading fschat-0.2.34-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Collecting gradio==3.50.2\n",
|
||||
" Downloading gradio-3.50.2-py3-none-any.whl.metadata (17 kB)\n",
|
||||
"Collecting tensorboard\n",
|
||||
" Downloading tensorboard-2.15.1-py3-none-any.whl.metadata (1.7 kB)\n",
|
||||
"Collecting s3fs\n",
|
||||
" Downloading s3fs-2023.12.2-py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting gcsfs\n",
|
||||
" Downloading gcsfs-2023.12.2.post1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting xformers==0.0.23\n",
|
||||
" Downloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.0 kB)\n",
|
||||
"Collecting deepspeed\n",
|
||||
" Downloading deepspeed-0.12.6.tar.gz (1.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m109.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting flash-attn==2.3.3\n",
|
||||
" Downloading flash_attn-2.3.3.tar.gz (2.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.3/2.3 MB\u001b[0m \u001b[31m111.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (5.9.0)\n",
|
||||
"Requirement already satisfied: torch>=1.10.0 in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (2.1.1)\n",
|
||||
"Requirement already satisfied: huggingface-hub in /opt/conda/lib/python3.10/site-packages (from accelerate==0.24.1) (0.20.1)\n",
|
||||
"Collecting rouge (from auto-gptq==0.5.1)\n",
|
||||
" Downloading rouge-1.0.1-py3-none-any.whl (13 kB)\n",
|
||||
"Collecting gekko (from auto-gptq==0.5.1)\n",
|
||||
" Downloading gekko-1.0.6-py3-none-any.whl (12.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.2/12.2 MB\u001b[0m \u001b[31m77.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hCollecting safetensors (from auto-gptq==0.5.1)\n",
|
||||
" Downloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from auto-gptq==0.5.1) (4.65.0)\n",
|
||||
"Collecting pandas>=1.0.1 (from bert-score==0.3.13)\n",
|
||||
" Downloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from bert-score==0.3.13) (2.31.0)\n",
|
||||
"Collecting matplotlib (from bert-score==0.3.13)\n",
|
||||
" Downloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
|
||||
"Collecting dill (from evaluate==0.4.0)\n",
|
||||
" Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)\n",
|
||||
"Collecting xxhash (from evaluate==0.4.0)\n",
|
||||
" Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
|
||||
"Collecting multiprocess (from evaluate==0.4.0)\n",
|
||||
" Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)\n",
|
||||
"Requirement already satisfied: fsspec>=2021.05.0 in /opt/conda/lib/python3.10/site-packages (from fsspec[http]>=2021.05.0->evaluate==0.4.0) (2023.10.0)\n",
|
||||
"Collecting responses<0.19 (from evaluate==0.4.0)\n",
|
||||
" Downloading responses-0.18.0-py3-none-any.whl (38 kB)\n",
|
||||
"Collecting ninja (from flash-attn==2.3.3)\n",
|
||||
" Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl.metadata (5.3 kB)\n",
|
||||
"Collecting aiohttp (from fschat==0.2.34)\n",
|
||||
" Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)\n",
|
||||
"Collecting fastapi (from fschat==0.2.34)\n",
|
||||
" Downloading fastapi-0.108.0-py3-none-any.whl.metadata (24 kB)\n",
|
||||
"Collecting httpx (from fschat==0.2.34)\n",
|
||||
" Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)\n",
|
||||
"Collecting markdown2[all] (from fschat==0.2.34)\n",
|
||||
" Downloading markdown2-2.4.12-py2.py3-none-any.whl.metadata (2.0 kB)\n",
|
||||
"Collecting nh3 (from fschat==0.2.34)\n",
|
||||
" Downloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.7 kB)\n",
|
||||
"Requirement already satisfied: prompt-toolkit>=3.0.0 in /opt/conda/lib/python3.10/site-packages (from fschat==0.2.34) (3.0.36)\n",
|
||||
"Collecting pydantic<2,>=1 (from fschat==0.2.34)\n",
|
||||
" Downloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (149 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m149.6/149.6 kB\u001b[0m \u001b[31m42.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting rich>=10.0.0 (from fschat==0.2.34)\n",
|
||||
" Downloading rich-13.7.0-py3-none-any.whl.metadata (18 kB)\n",
|
||||
"Collecting shortuuid (from fschat==0.2.34)\n",
|
||||
" Downloading shortuuid-1.0.11-py3-none-any.whl (10 kB)\n",
|
||||
"Collecting tiktoken (from fschat==0.2.34)\n",
|
||||
" Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
|
||||
"Collecting uvicorn (from fschat==0.2.34)\n",
|
||||
" Downloading uvicorn-0.25.0-py3-none-any.whl.metadata (6.4 kB)\n",
|
||||
"Collecting aiofiles<24.0,>=22.0 (from gradio==3.50.2)\n",
|
||||
" Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)\n",
|
||||
"Collecting altair<6.0,>=4.2.0 (from gradio==3.50.2)\n",
|
||||
" Downloading altair-5.2.0-py3-none-any.whl.metadata (8.7 kB)\n",
|
||||
"Collecting ffmpy (from gradio==3.50.2)\n",
|
||||
" Downloading ffmpy-0.3.1.tar.gz (5.5 kB)\n",
|
||||
" Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hCollecting gradio-client==0.6.1 (from gradio==3.50.2)\n",
|
||||
" Downloading gradio_client-0.6.1-py3-none-any.whl.metadata (7.1 kB)\n",
|
||||
"Collecting importlib-resources<7.0,>=1.3 (from gradio==3.50.2)\n",
|
||||
" Downloading importlib_resources-6.1.1-py3-none-any.whl.metadata (4.1 kB)\n",
|
||||
"Requirement already satisfied: jinja2<4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (3.1.2)\n",
|
||||
"Requirement already satisfied: markupsafe~=2.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (2.1.1)\n",
|
||||
"Collecting orjson~=3.0 (from gradio==3.50.2)\n",
|
||||
" Downloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (49 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m49.3/49.3 kB\u001b[0m \u001b[31m14.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: pillow<11.0,>=8.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (10.0.1)\n",
|
||||
"Collecting pydub (from gradio==3.50.2)\n",
|
||||
" Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)\n",
|
||||
"Collecting python-multipart (from gradio==3.50.2)\n",
|
||||
" Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m45.7/45.7 kB\u001b[0m \u001b[31m13.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting semantic-version~=2.0 (from gradio==3.50.2)\n",
|
||||
" Downloading semantic_version-2.10.0-py2.py3-none-any.whl (15 kB)\n",
|
||||
"Requirement already satisfied: typing-extensions~=4.0 in /opt/conda/lib/python3.10/site-packages (from gradio==3.50.2) (4.7.1)\n",
|
||||
"Collecting websockets<12.0,>=10.0 (from gradio==3.50.2)\n",
|
||||
" Downloading websockets-11.0.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (129 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m129.9/129.9 kB\u001b[0m \u001b[31m30.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting coloredlogs (from optimum==1.13.2)\n",
|
||||
" Downloading coloredlogs-15.0.1-py2.py3-none-any.whl (46 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.0/46.0 kB\u001b[0m \u001b[31m11.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from optimum==1.13.2) (1.11.1)\n",
|
||||
"Collecting absl-py (from rouge-score==0.1.2)\n",
|
||||
" Downloading absl_py-2.0.0-py3-none-any.whl.metadata (2.3 kB)\n",
|
||||
"Collecting nltk (from rouge-score==0.1.2)\n",
|
||||
" Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.5/1.5 MB\u001b[0m \u001b[31m90.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: six>=1.14.0 in /opt/conda/lib/python3.10/site-packages (from rouge-score==0.1.2) (1.16.0)\n",
|
||||
"Collecting joblib>=1.1.1 (from scikit-learn==1.2.2)\n",
|
||||
" Downloading joblib-1.3.2-py3-none-any.whl.metadata (5.4 kB)\n",
|
||||
"Collecting threadpoolctl>=2.0.0 (from scikit-learn==1.2.2)\n",
|
||||
" Downloading threadpoolctl-3.2.0-py3-none-any.whl.metadata (10.0 kB)\n",
|
||||
"Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from transformers==4.36.2) (3.9.0)\n",
|
||||
"Collecting regex!=2019.12.17 (from transformers==4.36.2)\n",
|
||||
" Downloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m12.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.10.0->accelerate==0.24.1) (3.1)\n",
|
||||
"Collecting pyarrow>=8.0.0 (from datasets>=2.15.0)\n",
|
||||
" Downloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.0 kB)\n",
|
||||
"Collecting pyarrow-hotfix (from datasets>=2.15.0)\n",
|
||||
" Downloading pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)\n",
|
||||
"Collecting hjson (from deepspeed)\n",
|
||||
" Downloading hjson-3.1.0-py3-none-any.whl (54 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m54.0/54.0 kB\u001b[0m \u001b[31m19.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting py-cpuinfo (from deepspeed)\n",
|
||||
" Downloading py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)\n",
|
||||
"Collecting termcolor (from fire)\n",
|
||||
" Downloading termcolor-2.4.0-py3-none-any.whl.metadata (6.1 kB)\n",
|
||||
"Requirement already satisfied: decorator>4.1.2 in /opt/conda/lib/python3.10/site-packages (from gcsfs) (5.1.1)\n",
|
||||
"INFO: pip is looking at multiple versions of gcsfs to determine which version is compatible with other requirements. This could take a while.\n",
|
||||
"Collecting gcsfs\n",
|
||||
" Downloading gcsfs-2023.12.1-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
" Downloading gcsfs-2023.12.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
" Downloading gcsfs-2023.10.0-py2.py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting google-auth>=1.2 (from gcsfs)\n",
|
||||
" Downloading google_auth-2.25.2-py2.py3-none-any.whl.metadata (4.7 kB)\n",
|
||||
"Collecting google-auth-oauthlib (from gcsfs)\n",
|
||||
" Downloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl.metadata (2.7 kB)\n",
|
||||
"Collecting google-cloud-storage (from gcsfs)\n",
|
||||
" Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl.metadata (6.1 kB)\n",
|
||||
"Collecting llvmlite<0.42,>=0.41.0dev0 (from numba)\n",
|
||||
" Downloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)\n",
|
||||
"Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)\n",
|
||||
" Downloading aiobotocore-2.9.0-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"INFO: pip is looking at multiple versions of s3fs to determine which version is compatible with other requirements. This could take a while.\n",
|
||||
"Collecting s3fs\n",
|
||||
" Downloading s3fs-2023.12.1-py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
" Downloading s3fs-2023.10.0-py3-none-any.whl.metadata (1.6 kB)\n",
|
||||
"Collecting aiobotocore~=2.7.0 (from s3fs)\n",
|
||||
" Downloading aiobotocore-2.7.0-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Collecting grpcio>=1.48.2 (from tensorboard)\n",
|
||||
" Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)\n",
|
||||
"Collecting markdown>=2.6.8 (from tensorboard)\n",
|
||||
" Downloading Markdown-3.5.1-py3-none-any.whl.metadata (7.1 kB)\n",
|
||||
"Collecting protobuf<4.24,>=3.19.6 (from tensorboard)\n",
|
||||
" Downloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl.metadata (540 bytes)\n",
|
||||
"Requirement already satisfied: setuptools>=41.0.0 in /opt/conda/lib/python3.10/site-packages (from tensorboard) (68.0.0)\n",
|
||||
"Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)\n",
|
||||
" Downloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl.metadata (1.1 kB)\n",
|
||||
"Collecting werkzeug>=1.0.1 (from tensorboard)\n",
|
||||
" Downloading werkzeug-3.0.1-py3-none-any.whl.metadata (4.1 kB)\n",
|
||||
"Requirement already satisfied: Click!=8.0.0,>=7.1 in /opt/conda/lib/python3.10/site-packages (from wandb) (8.1.7)\n",
|
||||
"Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)\n",
|
||||
" Downloading GitPython-3.1.40-py3-none-any.whl.metadata (12 kB)\n",
|
||||
"Collecting sentry-sdk>=1.0.0 (from wandb)\n",
|
||||
" Downloading sentry_sdk-1.39.1-py2.py3-none-any.whl.metadata (9.7 kB)\n",
|
||||
"Collecting docker-pycreds>=0.4.0 (from wandb)\n",
|
||||
" Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)\n",
|
||||
"Collecting setproctitle (from wandb)\n",
|
||||
" Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.9 kB)\n",
|
||||
"Collecting appdirs>=1.4.3 (from wandb)\n",
|
||||
" Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)\n",
|
||||
"Collecting botocore<1.31.65,>=1.31.16 (from aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading botocore-1.31.64-py3-none-any.whl.metadata (6.1 kB)\n",
|
||||
"Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n",
|
||||
"Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading aioitertools-0.11.0-py3-none-any.whl (23 kB)\n",
|
||||
"Requirement already satisfied: attrs>=17.3.0 in /opt/conda/lib/python3.10/site-packages (from aiohttp->fschat==0.2.34) (23.1.0)\n",
|
||||
"Collecting multidict<7.0,>=4.5 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m114.5/114.5 kB\u001b[0m \u001b[31m37.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting yarl<2.0,>=1.0 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (31 kB)\n",
|
||||
"Collecting frozenlist>=1.1.1 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)\n",
|
||||
"Collecting aiosignal>=1.1.2 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)\n",
|
||||
"Collecting async-timeout<5.0,>=4.0 (from aiohttp->fschat==0.2.34)\n",
|
||||
" Downloading async_timeout-4.0.3-py3-none-any.whl.metadata (4.2 kB)\n",
|
||||
"Requirement already satisfied: jsonschema>=3.0 in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (4.20.0)\n",
|
||||
"Requirement already satisfied: toolz in /opt/conda/lib/python3.10/site-packages (from altair<6.0,>=4.2.0->gradio==3.50.2) (0.12.0)\n",
|
||||
"Collecting gitdb<5,>=4.0.1 (from GitPython!=3.1.29,>=1.0.0->wandb)\n",
|
||||
" Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)\n",
|
||||
"Collecting cachetools<6.0,>=2.0.0 (from google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading cachetools-5.3.2-py3-none-any.whl.metadata (5.2 kB)\n",
|
||||
"Collecting pyasn1-modules>=0.2.1 (from google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading pyasn1_modules-0.3.0-py2.py3-none-any.whl (181 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m181.3/181.3 kB\u001b[0m \u001b[31m59.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting rsa<5,>=3.1.4 (from google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading rsa-4.9-py3-none-any.whl (34 kB)\n",
|
||||
"Collecting requests-oauthlib>=0.7.0 (from google-auth-oauthlib->gcsfs)\n",
|
||||
" Downloading requests_oauthlib-1.3.1-py2.py3-none-any.whl (23 kB)\n",
|
||||
"Collecting contourpy>=1.0.1 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.8 kB)\n",
|
||||
"Collecting cycler>=0.10 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)\n",
|
||||
"Collecting fonttools>=4.22.0 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (157 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m157.2/157.2 kB\u001b[0m \u001b[31m41.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting kiwisolver>=1.3.1 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (6.4 kB)\n",
|
||||
"Collecting pyparsing>=2.3.1 (from matplotlib->bert-score==0.3.13)\n",
|
||||
" Downloading pyparsing-3.1.1-py3-none-any.whl.metadata (5.1 kB)\n",
|
||||
"Requirement already satisfied: python-dateutil>=2.7 in /opt/conda/lib/python3.10/site-packages (from matplotlib->bert-score==0.3.13) (2.8.2)\n",
|
||||
"Requirement already satisfied: pytz>=2020.1 in /opt/conda/lib/python3.10/site-packages (from pandas>=1.0.1->bert-score==0.3.13) (2023.3.post1)\n",
|
||||
"Collecting tzdata>=2022.1 (from pandas>=1.0.1->bert-score==0.3.13)\n",
|
||||
" Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m341.8/341.8 kB\u001b[0m \u001b[31m72.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: wcwidth in /opt/conda/lib/python3.10/site-packages (from prompt-toolkit>=3.0.0->fschat==0.2.34) (0.2.5)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2.0.4)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (1.26.18)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->bert-score==0.3.13) (2023.7.22)\n",
|
||||
"Collecting markdown-it-py>=2.2.0 (from rich>=10.0.0->fschat==0.2.34)\n",
|
||||
" Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n",
|
||||
"Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /opt/conda/lib/python3.10/site-packages (from rich>=10.0.0->fschat==0.2.34) (2.15.1)\n",
|
||||
"Collecting h11>=0.8 (from uvicorn->fschat==0.2.34)\n",
|
||||
" Downloading h11-0.14.0-py3-none-any.whl (58 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m21.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting humanfriendly>=9.1 (from coloredlogs->optimum==1.13.2)\n",
|
||||
" Downloading humanfriendly-10.0-py2.py3-none-any.whl (86 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m86.8/86.8 kB\u001b[0m \u001b[31m27.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hCollecting starlette<0.33.0,>=0.29.0 (from fastapi->fschat==0.2.34)\n",
|
||||
" Downloading starlette-0.32.0.post1-py3-none-any.whl.metadata (5.8 kB)\n",
|
||||
"Collecting typing-extensions~=4.0 (from gradio==3.50.2)\n",
|
||||
" Downloading typing_extensions-4.9.0-py3-none-any.whl.metadata (3.0 kB)\n",
|
||||
"Collecting google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_api_core-2.15.0-py3-none-any.whl.metadata (2.7 kB)\n",
|
||||
"Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)\n",
|
||||
"Collecting google-resumable-media>=2.6.0 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl.metadata (2.2 kB)\n",
|
||||
"Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage->gcsfs)\n",
|
||||
" Downloading google_crc32c-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (32 kB)\n",
|
||||
"Requirement already satisfied: anyio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (4.2.0)\n",
|
||||
"Collecting httpcore==1.* (from httpx->fschat==0.2.34)\n",
|
||||
" Downloading httpcore-1.0.2-py3-none-any.whl.metadata (20 kB)\n",
|
||||
"Requirement already satisfied: sniffio in /opt/conda/lib/python3.10/site-packages (from httpx->fschat==0.2.34) (1.3.0)\n",
|
||||
"Collecting wavedrom (from markdown2[all]->fschat==0.2.34)\n",
|
||||
" Downloading wavedrom-2.0.3.post3.tar.gz (137 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m137.7/137.7 kB\u001b[0m \u001b[31m47.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->optimum==1.13.2) (1.3.0)\n",
|
||||
"Collecting jmespath<2.0.0,>=0.7.1 (from botocore<1.31.65,>=1.31.16->aiobotocore~=2.7.0->s3fs)\n",
|
||||
" Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n",
|
||||
"Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->GitPython!=3.1.29,>=1.0.0->wandb)\n",
|
||||
" Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)\n",
|
||||
"Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.0,<3.0.0dev,>=1.31.5->google-cloud-storage->gcsfs)\n",
|
||||
" Downloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl.metadata (1.5 kB)\n",
|
||||
"Requirement already satisfied: jsonschema-specifications>=2023.03.6 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (2023.12.1)\n",
|
||||
"Requirement already satisfied: referencing>=0.28.4 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.32.0)\n",
|
||||
"Requirement already satisfied: rpds-py>=0.7.1 in /opt/conda/lib/python3.10/site-packages (from jsonschema>=3.0->altair<6.0,>=4.2.0->gradio==3.50.2) (0.15.2)\n",
|
||||
"Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich>=10.0.0->fschat==0.2.34)\n",
|
||||
" Downloading mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n",
|
||||
"Collecting pyasn1<0.6.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth>=1.2->gcsfs)\n",
|
||||
" Downloading pyasn1-0.5.1-py2.py3-none-any.whl.metadata (8.6 kB)\n",
|
||||
"Collecting oauthlib>=3.0.0 (from requests-oauthlib>=0.7.0->google-auth-oauthlib->gcsfs)\n",
|
||||
" Downloading oauthlib-3.2.2-py3-none-any.whl (151 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m151.7/151.7 kB\u001b[0m \u001b[31m50.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hRequirement already satisfied: exceptiongroup>=1.0.2 in /opt/conda/lib/python3.10/site-packages (from anyio->httpx->fschat==0.2.34) (1.0.4)\n",
|
||||
"Collecting svgwrite (from wavedrom->markdown2[all]->fschat==0.2.34)\n",
|
||||
" Downloading svgwrite-1.4.3-py3-none-any.whl (67 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m67.1/67.1 kB\u001b[0m \u001b[31m21.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading accelerate-0.24.1-py3-none-any.whl (261 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m261.4/261.4 kB\u001b[0m \u001b[31m53.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading auto_gptq-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.8/4.8 MB\u001b[0m \u001b[31m89.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading fschat-0.2.34-py3-none-any.whl (220 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m220.1/220.1 kB\u001b[0m \u001b[31m63.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gradio-3.50.2-py3-none-any.whl (20.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m20.3/20.3 MB\u001b[0m \u001b[31m82.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading peft-0.6.0-py3-none-any.whl (134 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.9/134.9 kB\u001b[0m \u001b[31m40.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading tokenizers-0.15.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m87.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading transformers-4.36.2-py3-none-any.whl (8.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.2/8.2 MB\u001b[0m \u001b[31m90.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading xformers-0.0.23-cp310-cp310-manylinux2014_x86_64.whl (213.0 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m213.0/213.0 MB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gradio_client-0.6.1-py3-none-any.whl (299 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m299.2/299.2 kB\u001b[0m \u001b[31m64.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading bitsandbytes-0.41.3.post2-py3-none-any.whl (92.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.6/92.6 MB\u001b[0m \u001b[31m56.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading datasets-2.16.0-py3-none-any.whl (507 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m507.1/507.1 kB\u001b[0m \u001b[31m87.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading scipy-1.11.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (36.4 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m36.4/36.4 MB\u001b[0m \u001b[31m77.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading art-6.1-py3-none-any.whl (599 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m599.8/599.8 kB\u001b[0m \u001b[31m96.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading einops-0.7.0-py3-none-any.whl (44 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gcsfs-2023.10.0-py2.py3-none-any.whl (33 kB)\n",
|
||||
"Downloading hf_transfer-0.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.9 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.9/3.9 MB\u001b[0m \u001b[31m99.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m100.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading s3fs-2023.10.0-py3-none-any.whl (28 kB)\n",
|
||||
"Downloading tensorboard-2.15.1-py3-none-any.whl (5.5 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.5/5.5 MB\u001b[0m \u001b[31m96.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mta \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading wandb-0.16.1-py3-none-any.whl (2.1 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.1/2.1 MB\u001b[0m \u001b[31m99.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading absl_py-2.0.0-py3-none-any.whl (130 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m130.2/130.2 kB\u001b[0m \u001b[31m36.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading aiobotocore-2.7.0-py3-none-any.whl (73 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m73.5/73.5 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading aiofiles-23.2.1-py3-none-any.whl (15 kB)\n",
|
||||
"Downloading aiohttp-3.9.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m99.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading altair-5.2.0-py3-none-any.whl (996 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m996.9/996.9 kB\u001b[0m \u001b[31m110.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading dill-0.3.7-py3-none-any.whl (115 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m34.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading GitPython-3.1.40-py3-none-any.whl (190 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m190.6/190.6 kB\u001b[0m \u001b[31m47.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_auth-2.25.2-py2.py3-none-any.whl (184 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m184.2/184.2 kB\u001b[0m \u001b[31m44.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_auth_oauthlib-1.2.0-py2.py3-none-any.whl (24 kB)\n",
|
||||
"Downloading grpcio-1.60.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (5.4 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.4/5.4 MB\u001b[0m \u001b[31m102.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading importlib_resources-6.1.1-py3-none-any.whl (33 kB)\n",
|
||||
"Downloading joblib-1.3.2-py3-none-any.whl (302 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.2/302.2 kB\u001b[0m \u001b[31m64.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.6/43.6 MB\u001b[0m \u001b[31m74.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading Markdown-3.5.1-py3-none-any.whl (102 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m102.2/102.2 kB\u001b[0m \u001b[31m34.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading matplotlib-3.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.6/11.6 MB\u001b[0m \u001b[31m99.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading orjson-3.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (138 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.7/138.7 kB\u001b[0m \u001b[31m38.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pandas-2.1.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m12.3/12.3 MB\u001b[0m \u001b[31m96.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading protobuf-4.23.4-cp37-abi3-manylinux2014_x86_64.whl (304 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m304.5/304.5 kB\u001b[0m \u001b[31m68.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyarrow-14.0.2-cp310-cp310-manylinux_2_28_x86_64.whl (38.0 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m38.0/38.0 MB\u001b[0m \u001b[31m78.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pydantic-1.10.13-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.1/3.1 MB\u001b[0m \u001b[31m95.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading regex-2023.12.25-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (773 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.0/774.0 kB\u001b[0m \u001b[31m116.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading rich-13.7.0-py3-none-any.whl (240 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m240.6/240.6 kB\u001b[0m \u001b[31m59.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading safetensors-0.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m102.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading sentry_sdk-1.39.1-py2.py3-none-any.whl (254 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m254.1/254.1 kB\u001b[0m \u001b[31m71.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading tensorboard_data_server-0.7.2-py3-none-manylinux_2_31_x86_64.whl (6.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m6.6/6.6 MB\u001b[0m \u001b[31m104.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading threadpoolctl-3.2.0-py3-none-any.whl (15 kB)\n",
|
||||
"Downloading uvicorn-0.25.0-py3-none-any.whl (60 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m60.3/60.3 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading werkzeug-3.0.1-py3-none-any.whl (226 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m226.7/226.7 kB\u001b[0m \u001b[31m67.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading fastapi-0.108.0-py3-none-any.whl (92 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m92.0/92.0 kB\u001b[0m \u001b[31m33.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading typing_extensions-4.9.0-py3-none-any.whl (32 kB)\n",
|
||||
"Downloading google_cloud_storage-2.14.0-py2.py3-none-any.whl (121 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m121.6/121.6 kB\u001b[0m \u001b[31m36.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading httpx-0.26.0-py3-none-any.whl (75 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m75.9/75.9 kB\u001b[0m \u001b[31m24.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading httpcore-1.0.2-py3-none-any.whl (76 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.9/76.9 kB\u001b[0m \u001b[31m28.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m48.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading nh3-0.2.15-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m108.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m307.2/307.2 kB\u001b[0m \u001b[31m66.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)\n",
|
||||
"Downloading setproctitle-1.3.3-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (30 kB)\n",
|
||||
"Downloading termcolor-2.4.0-py3-none-any.whl (7.7 kB)\n",
|
||||
"Downloading tiktoken-0.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m101.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m44.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading async_timeout-4.0.3-py3-none-any.whl (5.7 kB)\n",
|
||||
"Downloading botocore-1.31.64-py3-none-any.whl (11.3 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m11.3/11.3 MB\u001b[0m \u001b[31m98.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m0:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading cachetools-5.3.2-py3-none-any.whl (9.3 kB)\n",
|
||||
"Downloading contourpy-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (310 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m310.7/310.7 kB\u001b[0m \u001b[31m69.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading cycler-0.12.1-py3-none-any.whl (8.3 kB)\n",
|
||||
"Downloading fonttools-4.47.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m4.6/4.6 MB\u001b[0m \u001b[31m102.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
|
||||
"\u001b[?25hDownloading frozenlist-1.4.1-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (239 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m239.5/239.5 kB\u001b[0m \u001b[31m71.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m62.7/62.7 kB\u001b[0m \u001b[31m23.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_api_core-2.15.0-py3-none-any.whl (121 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m122.0/122.0 kB\u001b[0m \u001b[31m32.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\n",
|
||||
"Downloading google_resumable_media-2.7.0-py2.py3-none-any.whl (80 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.6/80.6 kB\u001b[0m \u001b[31m22.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading kiwisolver-1.4.5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.6/1.6 MB\u001b[0m \u001b[31m102.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m87.5/87.5 kB\u001b[0m \u001b[31m25.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyparsing-3.1.1-py3-none-any.whl (103 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m103.1/103.1 kB\u001b[0m \u001b[31m32.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading starlette-0.32.0.post1-py3-none-any.whl (70 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m70.0/70.0 kB\u001b[0m \u001b[31m19.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading wrapt-1.16.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (80 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m80.3/80.3 kB\u001b[0m \u001b[31m30.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading yarl-1.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (301 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m301.6/301.6 kB\u001b[0m \u001b[31m80.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading markdown2-2.4.12-py2.py3-none-any.whl (41 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.2/41.2 kB\u001b[0m \u001b[31m12.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading googleapis_common_protos-1.62.0-py2.py3-none-any.whl (228 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m228.7/228.7 kB\u001b[0m \u001b[31m57.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading pyasn1-0.5.1-py2.py3-none-any.whl (84 kB)\n",
|
||||
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m84.9/84.9 kB\u001b[0m \u001b[31m30.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
|
||||
"\u001b[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)\n",
|
||||
"Building wheels for collected packages: flash-attn, optimum, rouge-score, deepspeed, fire, ffmpy, wavedrom\n",
|
||||
" Building wheel for flash-attn (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for flash-attn: filename=flash_attn-2.3.3-cp310-cp310-linux_x86_64.whl size=57042553 sha256=b1df92cb5bd7657d38b789dd48e907aa3e0bd2715c817eb85f3c4320bb11fb3f\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/e5/e6/fa/941802ec61d1afd320d27160ab1db98e6dba65381f84b76d4a\n",
|
||||
" Building wheel for optimum (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for optimum: filename=optimum-1.13.2-py3-none-any.whl size=395599 sha256=ff3a73120e1b6eeeda28f76e3fc8cd4cd826e5d66c869b7848ba150e7af79c62\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/6e/b7/2c/79405d98f0943373d8546daeae25a3d377f7659ca0cbe48699\n",
|
||||
" Building wheel for rouge-score (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=8118ecbbcd3529085e794c803f0ddb182fc6c6d3e8a494103b49a94abf1bec37\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n",
|
||||
" Building wheel for deepspeed (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for deepspeed: filename=deepspeed-0.12.6-py3-none-any.whl size=1306729 sha256=35c46b6f0275b0d3063522e0af4f3cbd9ec1c310114d8917d87cbe2bf43346e2\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/a3/dc/a2/f585faaed4dec84108916dcc8e8a7c129a216df8202ca32984\n",
|
||||
" Building wheel for fire (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for fire: filename=fire-0.5.0-py2.py3-none-any.whl size=116934 sha256=e76d5185f237f34ec69bb8aa657497bef07408978e4f7efdaef48663bb8cd4ef\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/90/d4/f7/9404e5db0116bd4d43e5666eaa3e70ab53723e1e3ea40c9a95\n",
|
||||
" Building wheel for ffmpy (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for ffmpy: filename=ffmpy-0.3.1-py3-none-any.whl size=5579 sha256=da3b54dc0ac1a825a1a233315970ac80b8b4c53ebd9cb2a2cfdeab118f453a64\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/01/a6/d1/1c0828c304a4283b2c1639a09ad86f83d7c487ef34c6b4a1bf\n",
|
||||
" Building wheel for wavedrom (setup.py) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for wavedrom: filename=wavedrom-2.0.3.post3-py2.py3-none-any.whl size=30052 sha256=7f0cbd15d63ee9c120190bac122ab51bbbfc91ee374bc3c046fadb320816c17e\n",
|
||||
" Stored in directory: /root/.cache/pip/wheels/9c/52/8c/38b454b42f712f325e26f633287484c7dc1ad469e1580c5954\n",
|
||||
"Successfully built flash-attn optimum rouge-score deepspeed fire ffmpy wavedrom\n",
|
||||
"Installing collected packages: sentencepiece, pydub, py-cpuinfo, ninja, nh3, hjson, ffmpy, bitsandbytes, appdirs, addict, xxhash, wrapt, werkzeug, websockets, tzdata, typing-extensions, threadpoolctl, termcolor, tensorboard-data-server, svgwrite, smmap, shortuuid, setproctitle, sentry-sdk, semantic-version, scipy, safetensors, rouge, regex, python-multipart, pyparsing, pynvml, pyasn1, pyarrow-hotfix, pyarrow, protobuf, orjson, oauthlib, multidict, mdurl, markdown2, markdown, llvmlite, kiwisolver, joblib, jmespath, importlib-resources, humanfriendly, hf_transfer, h11, grpcio, google-crc32c, gekko, frozenlist, fonttools, einops, docker-pycreds, dill, cycler, contourpy, colorama, cachetools, async-timeout, art, aioitertools, aiofiles, absl-py, yarl, wavedrom, uvicorn, tiktoken, scikit-learn, rsa, responses, requests-oauthlib, pydantic, pyasn1-modules, pandas, numba, nltk, multiprocess, matplotlib, markdown-it-py, httpcore, googleapis-common-protos, google-resumable-media, gitdb, fire, coloredlogs, botocore, aiosignal, xformers, tokenizers, starlette, rouge-score, rich, httpx, google-auth, GitPython, flash-attn, deepspeed, aiohttp, accelerate, wandb, transformers, gradio-client, google-auth-oauthlib, google-api-core, fastapi, altair, aiobotocore, tensorboard, s3fs, peft, gradio, google-cloud-core, fschat, datasets, bert-score, optimum, google-cloud-storage, evaluate, auto-gptq, gcsfs, axolotl\n",
|
||||
" Attempting uninstall: typing-extensions\n",
|
||||
" Found existing installation: typing_extensions 4.7.1\n",
|
||||
" Uninstalling typing_extensions-4.7.1:\n",
|
||||
" Successfully uninstalled typing_extensions-4.7.1\n",
|
||||
" Running setup.py develop for axolotl\n",
|
||||
"Successfully installed GitPython-3.1.40 absl-py-2.0.0 accelerate-0.24.1 addict-2.4.0 aiobotocore-2.7.0 aiofiles-23.2.1 aiohttp-3.9.1 aioitertools-0.11.0 aiosignal-1.3.1 altair-5.2.0 appdirs-1.4.4 art-6.1 async-timeout-4.0.3 auto-gptq-0.5.1 axolotl-0.3.0 bert-score-0.3.13 bitsandbytes-0.41.3.post2 botocore-1.31.64 cachetools-5.3.2 colorama-0.4.6 coloredlogs-15.0.1 contourpy-1.2.0 cycler-0.12.1 datasets-2.16.0 deepspeed-0.12.6 dill-0.3.7 docker-pycreds-0.4.0 einops-0.7.0 evaluate-0.4.0 fastapi-0.108.0 ffmpy-0.3.1 fire-0.5.0 flash-attn-2.3.3 fonttools-4.47.0 frozenlist-1.4.1 fschat-0.2.34 gcsfs-2023.10.0 gekko-1.0.6 gitdb-4.0.11 google-api-core-2.15.0 google-auth-2.25.2 google-auth-oauthlib-1.2.0 google-cloud-core-2.4.1 google-cloud-storage-2.14.0 google-crc32c-1.5.0 google-resumable-media-2.7.0 googleapis-common-protos-1.62.0 gradio-3.50.2 gradio-client-0.6.1 grpcio-1.60.0 h11-0.14.0 hf_transfer-0.1.4 hjson-3.1.0 httpcore-1.0.2 httpx-0.26.0 humanfriendly-10.0 importlib-resources-6.1.1 jmespath-1.0.1 joblib-1.3.2 kiwisolver-1.4.5 llvmlite-0.41.1 markdown-3.5.1 markdown-it-py-3.0.0 markdown2-2.4.12 matplotlib-3.8.2 mdurl-0.1.2 multidict-6.0.4 multiprocess-0.70.15 nh3-0.2.15 ninja-1.11.1.1 nltk-3.8.1 numba-0.58.1 oauthlib-3.2.2 optimum-1.13.2 orjson-3.9.10 pandas-2.1.4 peft-0.6.0 protobuf-4.23.4 py-cpuinfo-9.0.0 pyarrow-14.0.2 pyarrow-hotfix-0.6 pyasn1-0.5.1 pyasn1-modules-0.3.0 pydantic-1.10.13 pydub-0.25.1 pynvml-11.5.0 pyparsing-3.1.1 python-multipart-0.0.6 regex-2023.12.25 requests-oauthlib-1.3.1 responses-0.18.0 rich-13.7.0 rouge-1.0.1 rouge-score-0.1.2 rsa-4.9 s3fs-2023.10.0 safetensors-0.4.1 scikit-learn-1.2.2 scipy-1.11.4 semantic-version-2.10.0 sentencepiece-0.1.99 sentry-sdk-1.39.1 setproctitle-1.3.3 shortuuid-1.0.11 smmap-5.0.1 starlette-0.32.0.post1 svgwrite-1.4.3 tensorboard-2.15.1 tensorboard-data-server-0.7.2 termcolor-2.4.0 threadpoolctl-3.2.0 tiktoken-0.5.2 tokenizers-0.15.0 transformers-4.36.2 typing-extensions-4.8.0 tzdata-2023.3 uvicorn-0.25.0 wandb-0.16.1 wavedrom-2.0.3.post3 websockets-11.0.3 werkzeug-3.0.1 wrapt-1.16.0 xformers-0.0.23 xxhash-3.4.1 yarl-1.9.4\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0mCollecting git+https://github.com/huggingface/peft.git\n",
|
||||
" Cloning https://github.com/huggingface/peft.git to /tmp/pip-req-build-hka8xgk2\n",
|
||||
" Running command git clone --filter=blob:none --quiet https://github.com/huggingface/peft.git /tmp/pip-req-build-hka8xgk2\n",
|
||||
" Resolved https://github.com/huggingface/peft.git to commit cf04d0353f0343cbf66627228c4495f51669af34\n",
|
||||
" Installing build dependencies ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25hRequirement already satisfied: numpy>=1.17 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (1.26.0)\n",
|
||||
"Requirement already satisfied: packaging>=20.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (23.1)\n",
|
||||
"Requirement already satisfied: psutil in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (5.9.0)\n",
|
||||
"Requirement already satisfied: pyyaml in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (6.0.1)\n",
|
||||
"Requirement already satisfied: torch>=1.13.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (2.1.1)\n",
|
||||
"Requirement already satisfied: transformers in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.36.2)\n",
|
||||
"Requirement already satisfied: tqdm in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (4.65.0)\n",
|
||||
"Requirement already satisfied: accelerate>=0.21.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.24.1)\n",
|
||||
"Requirement already satisfied: safetensors in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.4.1)\n",
|
||||
"Requirement already satisfied: huggingface-hub>=0.17.0 in /opt/conda/lib/python3.10/site-packages (from peft==0.7.2.dev0) (0.20.1)\n",
|
||||
"Requirement already satisfied: filelock in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.9.0)\n",
|
||||
"Requirement already satisfied: fsspec>=2023.5.0 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.10.0)\n",
|
||||
"Requirement already satisfied: requests in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.31.0)\n",
|
||||
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/conda/lib/python3.10/site-packages (from huggingface-hub>=0.17.0->peft==0.7.2.dev0) (4.8.0)\n",
|
||||
"Requirement already satisfied: sympy in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (1.11.1)\n",
|
||||
"Requirement already satisfied: networkx in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1)\n",
|
||||
"Requirement already satisfied: jinja2 in /opt/conda/lib/python3.10/site-packages (from torch>=1.13.0->peft==0.7.2.dev0) (3.1.2)\n",
|
||||
"Requirement already satisfied: regex!=2019.12.17 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (2023.12.25)\n",
|
||||
"Requirement already satisfied: tokenizers<0.19,>=0.14 in /opt/conda/lib/python3.10/site-packages (from transformers->peft==0.7.2.dev0) (0.15.0)\n",
|
||||
"Requirement already satisfied: MarkupSafe>=2.0 in /opt/conda/lib/python3.10/site-packages (from jinja2->torch>=1.13.0->peft==0.7.2.dev0) (2.1.1)\n",
|
||||
"Requirement already satisfied: charset-normalizer<4,>=2 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2.0.4)\n",
|
||||
"Requirement already satisfied: idna<4,>=2.5 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (3.4)\n",
|
||||
"Requirement already satisfied: urllib3<3,>=1.21.1 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (1.26.18)\n",
|
||||
"Requirement already satisfied: certifi>=2017.4.17 in /opt/conda/lib/python3.10/site-packages (from requests->huggingface-hub>=0.17.0->peft==0.7.2.dev0) (2023.7.22)\n",
|
||||
"Requirement already satisfied: mpmath>=0.19 in /opt/conda/lib/python3.10/site-packages (from sympy->torch>=1.13.0->peft==0.7.2.dev0) (1.3.0)\n",
|
||||
"Building wheels for collected packages: peft\n",
|
||||
" Building wheel for peft (pyproject.toml) ... \u001b[?25ldone\n",
|
||||
"\u001b[?25h Created wheel for peft: filename=peft-0.7.2.dev0-py3-none-any.whl size=169456 sha256=4c70d23e759fa6abb3827fb2f3a8683be3b24d78777d0f403bbc2c0548e5dd4b\n",
|
||||
" Stored in directory: /tmp/pip-ephem-wheel-cache-my5ncou6/wheels/d7/c7/de/1368fac8590e1b103ddc2ec2a28ad51d83aded1a3830e8a087\n",
|
||||
"Successfully built peft\n",
|
||||
"Installing collected packages: peft\n",
|
||||
" Attempting uninstall: peft\n",
|
||||
" Found existing installation: peft 0.6.0\n",
|
||||
" Uninstalling peft-0.6.0:\n",
|
||||
" Successfully uninstalled peft-0.6.0\n",
|
||||
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
|
||||
"axolotl 0.3.0 requires peft==0.6.0, but you have peft 0.7.2.dev0 which is incompatible.\u001b[0m\u001b[31m\n",
|
||||
"\u001b[0mSuccessfully installed peft-0.7.2.dev0\n",
|
||||
"\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
|
||||
"\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"#instaling what is needed inside axolotl file\n",
|
||||
"!pip install packaging\n",
|
||||
"!pip install -e '.[flash-attn,deepspeed]'\n",
|
||||
"!pip install -U git+https://github.com/huggingface/peft.git"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "82d1a380-1e87-48fe-89fe-25331326014d",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The following values were not passed to `accelerate launch` and had defaults used instead:\n",
|
||||
"\t`--num_processes` was set to a value of `3`\n",
|
||||
"\t\tMore than one GPU was found, enabling multi-GPU training.\n",
|
||||
"\t\tIf this was unintended please pass in `--num_processes=1`.\n",
|
||||
"\t`--num_machines` was set to a value of `1`\n",
|
||||
"\t`--mixed_precision` was set to a value of `'no'`\n",
|
||||
"\t`--dynamo_backend` was set to a value of `'no'`\n",
|
||||
"To avoid this warning pass in values for each of the problematic parameters or run `accelerate config`.\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
|
||||
" warnings.warn(\n",
|
||||
"[2023-12-28 15:44:09,979] [INFO] [datasets.<module>:58] [PID:2814] PyTorch version 2.1.1 available.\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/transformers/deepspeed.py:23: FutureWarning: transformers.deepspeed module is deprecated and will be removed in a future version. Please import deepspeed modules directly from transformers.integrations\n",
|
||||
" warnings.warn(\n",
|
||||
"[2023-12-28 15:44:10,011] [INFO] [datasets.<module>:58] [PID:2812] PyTorch version 2.1.1 available.\n",
|
||||
"[2023-12-28 15:44:10,013] [INFO] [datasets.<module>:58] [PID:2813] PyTorch version 2.1.1 available.\n",
|
||||
"[2023-12-28 15:44:10,805] [INFO] [axolotl.normalize_config:150] [PID:2814] [RANK:2] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:10,830] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
|
||||
"[2023-12-28 15:44:10,842] [INFO] [axolotl.normalize_config:150] [PID:2813] [RANK:1] GPU memory usage baseline: 0.000GB (+0.317GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:10,865] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
|
||||
"[2023-12-28 15:44:10,869] [INFO] [axolotl.normalize_config:150] [PID:2812] [RANK:0] GPU memory usage baseline: 0.000GB (+0.351GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:10,887] [INFO] [real_accelerator.py:161:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n",
|
||||
"[2023-12-28 15:44:10,961] [INFO] [comm.py:637:init_distributed] cdb=None\n",
|
||||
"[2023-12-28 15:44:10,994] [INFO] [comm.py:637:init_distributed] cdb=None\n",
|
||||
"[2023-12-28 15:44:11,015] [INFO] [comm.py:637:init_distributed] cdb=None\n",
|
||||
"[2023-12-28 15:44:11,015] [INFO] [comm.py:668:init_distributed] Initializing TorchBackend in DeepSpeed with backend nccl\n",
|
||||
" dP dP dP \n",
|
||||
" 88 88 88 \n",
|
||||
" .d8888b. dP. .dP .d8888b. 88 .d8888b. d8888P 88 \n",
|
||||
" 88' `88 `8bd8' 88' `88 88 88' `88 88 88 \n",
|
||||
" 88. .88 .d88b. 88. .88 88 88. .88 88 88 \n",
|
||||
" `88888P8 dP' `dP `88888P' dP `88888P' dP dP \n",
|
||||
" \n",
|
||||
" \n",
|
||||
"\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,412] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,413] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2812] [RANK:0] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,415] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2812] [RANK:0] Prepared dataset loaded from disk...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,432] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,530] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:11,531] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2813] [RANK:1] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,158] [INFO] [axolotl.load_tokenized_prepared_datasets:143] [PID:2814] [RANK:2] Loading prepared dataset from disk at tilemachos/GF_new.json/1adc45d2edc1e98ce657814412c6593c...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,160] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2813] [RANK:1] Prepared dataset loaded from disk...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,161] [INFO] [axolotl.load_tokenized_prepared_datasets:145] [PID:2814] [RANK:2] Prepared dataset loaded from disk...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,236] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_tokens: 28120\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] `total_supervised_tokens: 7990`\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,238] [DEBUG] [axolotl.log:60] [PID:2812] [RANK:0] total_num_steps: 6\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,242] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading tokenizer... mistralai/Mistral-7B-v0.1\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:184] [PID:2812] [RANK:0] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:185] [PID:2812] [RANK:0] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:186] [PID:2812] [RANK:0] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.load_tokenizer:187] [PID:2812] [RANK:0] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,518] [DEBUG] [axolotl.train.log:60] [PID:2812] [RANK:0] loading model and peft_config...\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:184] [PID:2814] [RANK:2] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:185] [PID:2814] [RANK:2] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:186] [PID:2814] [RANK:2] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,589] [DEBUG] [axolotl.load_tokenizer:187] [PID:2814] [RANK:2] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:184] [PID:2813] [RANK:1] EOS: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:185] [PID:2813] [RANK:1] BOS: 1 / <s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:186] [PID:2813] [RANK:1] PAD: 2 / </s>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:12,599] [DEBUG] [axolotl.load_tokenizer:187] [PID:2813] [RANK:1] UNK: 0 / <unk>\u001b[39m\n",
|
||||
"[2023-12-28 15:44:13,049] [INFO] [partition_parameters.py:348:__exit__] finished initializing model - num_params = 291, num_elems = 7.24B\n",
|
||||
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00, 5.81s/it]\n",
|
||||
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:11<00:00, 5.98s/it]\n",
|
||||
"[2023-12-28 15:44:25,395] [INFO] [axolotl.load_model:503] [PID:2813] [RANK:1] GPU memory usage after model load: 7.576GB (+0.524GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,399] [INFO] [axolotl.load_model:526] [PID:2813] [RANK:1] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,403] [INFO] [axolotl.load_model:538] [PID:2813] [RANK:1] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
|
||||
"trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
|
||||
"[2023-12-28 15:44:25,480] [INFO] [axolotl.load_model:568] [PID:2813] [RANK:1] GPU memory usage after adapters: 7.589GB (+1.501GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,572] [INFO] [axolotl.load_model:503] [PID:2814] [RANK:2] GPU memory usage after model load: 7.576GB (+0.410GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,576] [INFO] [axolotl.load_model:526] [PID:2814] [RANK:2] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
|
||||
"[2023-12-28 15:44:25,580] [INFO] [axolotl.load_model:538] [PID:2814] [RANK:2] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
|
||||
"trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
|
||||
"[2023-12-28 15:44:25,660] [INFO] [axolotl.load_model:568] [PID:2814] [RANK:2] GPU memory usage after adapters: 7.589GB (+1.388GB cache, +0.708GB misc)\u001b[39m\n",
|
||||
"Loading checkpoint shards: 100%|██████████████████| 2/2 [00:12<00:00, 6.30s/it]\n",
|
||||
"[2023-12-28 15:44:26,170] [INFO] [axolotl.load_model:503] [PID:2812] [RANK:0] GPU memory usage after model load: 7.576GB (+0.776GB cache, +0.741GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,177] [INFO] [axolotl.load_model:526] [PID:2812] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,181] [INFO] [axolotl.load_model:538] [PID:2812] [RANK:0] converting modules to torch.bfloat16 for flash attention\u001b[39m\n",
|
||||
"trainable params: 3,407,872 || all params: 7,245,139,968 || trainable%: 0.04703666202518836\n",
|
||||
"[2023-12-28 15:44:26,259] [INFO] [axolotl.load_model:568] [PID:2812] [RANK:0] GPU memory usage after adapters: 7.589GB (+1.753GB cache, +0.741GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,293] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Pre-saving adapter config to ./out\u001b[39m\n",
|
||||
"[2023-12-28 15:44:26,296] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Starting trainer...\u001b[39m\n",
|
||||
"Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
|
||||
"Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
|
||||
"Using /root/.cache/torch_extensions/py310_cu121 as PyTorch extensions root...\n",
|
||||
"Detected CUDA files, patching ldflags\n",
|
||||
"Emitting ninja build file /root/.cache/torch_extensions/py310_cu121/fused_adam/build.ninja...\n",
|
||||
"Building extension module fused_adam...\n",
|
||||
"Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)\n",
|
||||
"ninja: no work to do.\n",
|
||||
"Loading extension module fused_adam...\n",
|
||||
"Time to load fused_adam op: 0.05891108512878418 seconds\n",
|
||||
"Loading extension module fused_adam...\n",
|
||||
"Time to load fused_adam op: 0.10173463821411133 seconds\n",
|
||||
"Loading extension module fused_adam...\n",
|
||||
"Time to load fused_adam op: 0.10152459144592285 seconds\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
|
||||
" self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
|
||||
" self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/deepspeed/ops/adam/fused_adam.py:96: UserWarning: The torch.cuda.*DtypeTensor constructors are no longer recommended. It's best to use methods such as torch.tensor(data, dtype=*, device='cuda') to create tensors. (Triggered internally at /opt/conda/conda-bld/pytorch_1699449201336/work/torch/csrc/tensor/python_tensor.cpp:83.)\n",
|
||||
" self._dummy_overflow_buf = get_accelerator().IntTensor([0])\n",
|
||||
"Parameter Offload: Total persistent parameters: 3674112 in 193 params\n",
|
||||
" 0%| | 0/17 [00:00<?, ?it/s]/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/utils/checkpoint.py:429: UserWarning: torch.utils.checkpoint: please pass in use_reentrant=True or use_reentrant=False explicitly. The default value of use_reentrant will be updated to be False in the future. To maintain current behavior, pass use_reentrant=True. It is recommended that you use use_reentrant=False. Refer to docs for more details on the differences between the two variants.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
|
||||
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
|
||||
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/bitsandbytes/autograd/_functions.py:322: UserWarning: MatMul8bitLt: inputs will be cast from torch.bfloat16 to float16 during quantization\n",
|
||||
" warnings.warn(f\"MatMul8bitLt: inputs will be cast from {A.dtype} to float16 during quantization\")\n",
|
||||
"{'loss': 2.0448, 'learning_rate': 2e-05, 'epoch': 0.06} \n",
|
||||
" 6%|██▌ | 1/17 [00:28<07:32, 28.30s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.85s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 1.9694719314575195, 'eval_runtime': 11.391, 'eval_samples_per_second': 1.492, 'eval_steps_per_second': 0.263, 'epoch': 0.06}\n",
|
||||
" 6%|██▌ | 1/17 [00:39<07:32, 28.30s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.65s/it]\u001b[A\n",
|
||||
" \u001b[A[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2812] [RANK:0] GPU memory usage while training: 12.210GB (+4.259GB cache, +0.776GB misc)\u001b[39m\n",
|
||||
" 12%|█████▏ | 2/17 [01:04<08:18, 33.20s/it][2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2814] [RANK:2] GPU memory usage while training: 12.269GB (+4.522GB cache, +0.743GB misc)\u001b[39m\n",
|
||||
"[2023-12-28 15:45:35,358] [INFO] [axolotl.callbacks.on_step_end:122] [PID:2813] [RANK:1] GPU memory usage while training: 12.283GB (+4.493GB cache, +0.743GB misc)\u001b[39m\n",
|
||||
"{'loss': 2.0022, 'learning_rate': 4e-05, 'epoch': 0.12} \n",
|
||||
"{'loss': 2.1054, 'learning_rate': 6e-05, 'epoch': 0.17} \n",
|
||||
"{'loss': 1.9004, 'learning_rate': 8e-05, 'epoch': 0.23} \n",
|
||||
"{'loss': 1.8794, 'learning_rate': 0.0001, 'epoch': 0.29} \n",
|
||||
" 29%|████████████▉ | 5/17 [02:20<05:23, 26.92s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.88s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 1.7912336587905884, 'eval_runtime': 11.3106, 'eval_samples_per_second': 1.503, 'eval_steps_per_second': 0.265, 'epoch': 0.29}\n",
|
||||
" 29%|████████████▉ | 5/17 [02:32<05:23, 26.92s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.67s/it]\u001b[A\n",
|
||||
"{'loss': 1.7871, 'learning_rate': 0.00012, 'epoch': 0.35} \u001b[A\n",
|
||||
"{'loss': 1.7758, 'learning_rate': 0.00014, 'epoch': 0.4} \n",
|
||||
"{'loss': 1.4645, 'learning_rate': 0.00016, 'epoch': 0.46} \n",
|
||||
"{'loss': 1.4009, 'learning_rate': 0.00018, 'epoch': 0.52} \n",
|
||||
"{'loss': 1.3927, 'learning_rate': 0.0002, 'epoch': 0.58} \n",
|
||||
" 59%|█████████████████████████▎ | 10/17 [04:38<03:04, 26.33s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.89s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 1.1426481008529663, 'eval_runtime': 11.3344, 'eval_samples_per_second': 1.5, 'eval_steps_per_second': 0.265, 'epoch': 0.58}\n",
|
||||
" 59%|█████████████████████████▎ | 10/17 [04:49<03:04, 26.33s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.68s/it]\u001b[A\n",
|
||||
"{'loss': 1.0122, 'learning_rate': 0.0001900968867902419, 'epoch': 0.63} \u001b[A\n",
|
||||
"{'loss': 1.0019, 'learning_rate': 0.00016234898018587337, 'epoch': 0.69} \n",
|
||||
"{'loss': 0.8976, 'learning_rate': 0.00012225209339563145, 'epoch': 0.75} \n",
|
||||
"{'loss': 0.9301, 'learning_rate': 7.774790660436858e-05, 'epoch': 0.81} \n",
|
||||
"{'loss': 0.8595, 'learning_rate': 3.7651019814126654e-05, 'epoch': 0.87} \n",
|
||||
" 88%|█████████████████████████████████████▉ | 15/17 [06:55<00:52, 26.17s/it]\n",
|
||||
" 0%| | 0/3 [00:00<?, ?it/s]\u001b[A\n",
|
||||
" 67%|██████████████████████████████ | 2/3 [00:03<00:01, 1.88s/it]\u001b[A\n",
|
||||
" \u001b[A\n",
|
||||
"\u001b[A{'eval_loss': 0.8175248503684998, 'eval_runtime': 11.2932, 'eval_samples_per_second': 1.505, 'eval_steps_per_second': 0.266, 'epoch': 0.87}\n",
|
||||
" 88%|█████████████████████████████████████▉ | 15/17 [07:06<00:52, 26.17s/it]\n",
|
||||
"100%|█████████████████████████████████████████████| 3/3 [00:07<00:00, 2.67s/it]\u001b[A\n",
|
||||
"{'loss': 0.7931, 'learning_rate': 9.903113209758096e-06, 'epoch': 0.92} \u001b[A\n",
|
||||
"{'loss': 0.6909, 'learning_rate': 0.0, 'epoch': 0.98} \n",
|
||||
"100%|███████████████████████████████████████████| 17/17 [07:56<00:00, 28.03s/it]/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
|
||||
" warnings.warn(\n",
|
||||
"/opt/conda/lib/python3.10/site-packages/torch/nn/modules/module.py:1879: UserWarning: Positional args are being deprecated, use kwargs instead. Refer to https://pytorch.org/docs/master/generated/torch.nn.Module.html#torch.nn.Module.state_dict for details.\n",
|
||||
" warnings.warn(\n",
|
||||
"{'train_runtime': 489.0649, 'train_samples_per_second': 0.63, 'train_steps_per_second': 0.035, 'train_loss': 1.408153467318591, 'epoch': 0.98}\n",
|
||||
"100%|███████████████████████████████████████████| 17/17 [08:09<00:00, 28.77s/it]\n",
|
||||
"[2023-12-28 15:52:39,488] [INFO] [axolotl.train.log:60] [PID:2812] [RANK:0] Training Completed!!! Saving pre-trained model to ./out\u001b[39m\n",
|
||||
"\u001b[0m\u001b[0m\u001b[0m"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"\"\"\"\n",
|
||||
"Training using the config.yml file and using deepspeed:zero3_bf16 the most aggressive optimization out of zero1,zero2,zero3 stages which partitions \n",
|
||||
"not only optimizer states but also gradients and parameters across GPUs. The bf16 indicate mixed precision training using bfloat16.\n",
|
||||
"For more information read axolotl's readme\n",
|
||||
"\"\"\"\n",
|
||||
"!accelerate launch -m axolotl.cli.train /folder/config.yml --deepspeed deepspeed_configs/zero3_bf16.json"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.13"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
75
examples/mistral/Mistral-7b-example/config.yml
Normal file
75
examples/mistral/Mistral-7b-example/config.yml
Normal file
@@ -0,0 +1,75 @@
|
||||
#Mistral-7b
|
||||
base_model: mistralai/Mistral-7B-v0.1
|
||||
model_type: MistralForCausalLM
|
||||
tokenizer_type: LlamaTokenizer
|
||||
is_mistral_derived_model: true
|
||||
|
||||
load_in_8bit: true
|
||||
load_in_4bit: false
|
||||
strict: false
|
||||
|
||||
datasets:
|
||||
- path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
|
||||
#for type,conversation arguments read axolotl readme and pick what is suited for your project, I wanted a chatbot and put sharegpt and chatml
|
||||
type: sharegpt
|
||||
conversation: chatml
|
||||
dataset_prepared_path: tilemachos/Demo-Dataset #Path to json dataset file in huggingface
|
||||
val_set_size: 0.05
|
||||
output_dir: ./out
|
||||
|
||||
#using lora for lower cost
|
||||
adapter: lora
|
||||
lora_r: 8
|
||||
lora_alpha: 16
|
||||
lora_dropout: 0.05
|
||||
lora_target_modules:
|
||||
- q_proj
|
||||
- v_proj
|
||||
|
||||
sequence_len: 512
|
||||
sample_packing: false
|
||||
pad_to_sequence_len: true
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
wandb_watch:
|
||||
wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
#only 2 epochs because of small dataset
|
||||
gradient_accumulation_steps: 3
|
||||
micro_batch_size: 2
|
||||
num_epochs: 2
|
||||
optimizer: adamw_bnb_8bit
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 10
|
||||
evals_per_epoch: 4
|
||||
eval_table_size:
|
||||
eval_table_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
#default deepspeed, can use more aggresive if needed like zero2, zero3
|
||||
deepspeed: deepspeed_configs/zero1.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
special_tokens:
|
||||
bos_token: "<s>"
|
||||
eos_token: "</s>"
|
||||
unk_token: "<unk>"
|
||||
10
examples/mistral/Mistral-7b-example/data.jsonl
Normal file
10
examples/mistral/Mistral-7b-example/data.jsonl
Normal file
@@ -0,0 +1,10 @@
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who is the Founder of Apple\""}, {"from": "gpt", "value": "\"<Chatbot>: The founder of Apple is Steve Jobs\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the capital of France?\""}, {"from": "gpt", "value": "\"<Chatbot>: The capital of France is Paris.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: How far is the Moon from Earth?\""}, {"from": "gpt", "value": "\"<Chatbot>: The Moon is approximately 384,400 kilometers from Earth.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the tallest mountain in the world?\""}, {"from": "gpt", "value": "\"<Chatbot>: The tallest mountain in the world is Mount Everest.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who wrote Romeo and Juliet?\""}, {"from": "gpt", "value": "\"<Chatbot>: Romeo and Juliet was written by William Shakespeare.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the boiling point of water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The boiling point of water is 100 degrees Celsius.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: When was the first man on the moon?\""}, {"from": "gpt", "value": "\"<Chatbot>: The first man landed on the moon in 1969.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the largest ocean?\""}, {"from": "gpt", "value": "\"<Chatbot>: The largest ocean is the Pacific Ocean.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: Who invented the telephone?\""}, {"from": "gpt", "value": "\"<Chatbot>: The telephone was invented by Alexander Graham Bell.\""}]}
|
||||
{"conversations": [{"from": "Customer", "value": "\"<Customer>: What is the formula for water?\""}, {"from": "gpt", "value": "\"<Chatbot>: The chemical formula for water is H2O.\""}]}
|
||||
@@ -8,5 +8,5 @@ accelerate launch -m axolotl.cli.train examples/mistral/config.yml
|
||||
|
||||
If you run into CUDA OOM, use deepspeed with config zero2.json:
|
||||
```shell
|
||||
accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed/zero2.json
|
||||
accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed_configs/zero2.json
|
||||
```
|
||||
|
||||
@@ -34,8 +34,8 @@ learning_rate: 0.000005
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -63,8 +63,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
@@ -84,7 +84,7 @@ eval_table_size:
|
||||
eval_table_max_new_tokens: 128
|
||||
saves_per_epoch: 1
|
||||
debug:
|
||||
deepspeed: deepspeed/zero2.json
|
||||
deepspeed: deepspeed_configs/zero2.json
|
||||
weight_decay: 0.0
|
||||
fsdp:
|
||||
fsdp_config:
|
||||
|
||||
@@ -50,8 +50,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -33,7 +33,7 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.0000002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
bf16: auto
|
||||
tf32: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
|
||||
@@ -52,6 +52,7 @@ logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention: true
|
||||
gptq_groupsize:
|
||||
s2_attention:
|
||||
gptq_model_v1:
|
||||
warmup_steps: 20
|
||||
evals_per_epoch: 4
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Due to some nuances with the phi code, please use deepspeed when training phi for full finetune.
|
||||
|
||||
```shell
|
||||
accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed/zero1.json
|
||||
accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed_configs/zero1.json
|
||||
|
||||
# OR
|
||||
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
base_model: microsoft/phi-1_5
|
||||
model_type: PhiForCausalLM
|
||||
model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
is_llama_derived_model: false
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
@@ -18,7 +16,7 @@ output_dir: ./phi-sft-out
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len:
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter:
|
||||
lora_model_dir:
|
||||
@@ -35,7 +33,7 @@ wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
micro_batch_size: 2
|
||||
num_epochs: 4
|
||||
optimizer: adamw_torch
|
||||
adam_beta2: 0.95
|
||||
@@ -45,18 +43,20 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.000003
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: true
|
||||
bf16: true
|
||||
fp16: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing:
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: True
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 4
|
||||
@@ -68,7 +68,4 @@ fsdp:
|
||||
fsdp_config:
|
||||
resize_token_embeddings_to_32x: true
|
||||
special_tokens:
|
||||
bos_token: "<|endoftext|>"
|
||||
eos_token: "<|endoftext|>"
|
||||
unk_token: "<|endoftext|>"
|
||||
pad_token: "<|endoftext|>"
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
base_model: microsoft/phi-1_5
|
||||
model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
is_llama_derived_model: false
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
@@ -16,9 +14,9 @@ dataset_prepared_path:
|
||||
val_set_size: 0.05
|
||||
output_dir: ./phi-sft-out
|
||||
|
||||
sequence_len: 1024
|
||||
sample_packing: false # not CURRENTLY compatible with LoRAs
|
||||
pad_to_sequence_len:
|
||||
sequence_len: 2048
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter: qlora
|
||||
lora_model_dir:
|
||||
@@ -35,7 +33,7 @@ wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
micro_batch_size: 2
|
||||
num_epochs: 4
|
||||
optimizer: adamw_torch
|
||||
adam_beta2: 0.95
|
||||
@@ -45,18 +43,20 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.000003
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: true
|
||||
bf16: true
|
||||
fp16: false
|
||||
group_by_length: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing:
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: True
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
logging_steps: 1
|
||||
xformers_attention:
|
||||
flash_attention:
|
||||
flash_attention: true
|
||||
|
||||
warmup_steps: 100
|
||||
evals_per_epoch: 4
|
||||
@@ -68,7 +68,4 @@ fsdp:
|
||||
fsdp_config:
|
||||
resize_token_embeddings_to_32x: true
|
||||
special_tokens:
|
||||
bos_token: "<|endoftext|>"
|
||||
eos_token: "<|endoftext|>"
|
||||
unk_token: "<|endoftext|>"
|
||||
pad_token: "<|endoftext|>"
|
||||
|
||||
@@ -1,8 +1,6 @@
|
||||
base_model: microsoft/phi-2
|
||||
model_revision: 834565c # pin model repo to the previous architecture
|
||||
model_type: AutoModelForCausalLM
|
||||
tokenizer_type: AutoTokenizer
|
||||
trust_remote_code: true
|
||||
|
||||
load_in_8bit: false
|
||||
load_in_4bit: false
|
||||
@@ -17,19 +15,16 @@ val_set_size: 0.05
|
||||
output_dir: ./phi-sft-out
|
||||
|
||||
sequence_len: 2048
|
||||
sample_packing: false # currently unsupported
|
||||
pad_to_sequence_len:
|
||||
sample_packing: true
|
||||
pad_to_sequence_len: true
|
||||
|
||||
adapter:
|
||||
lora_model_dir:
|
||||
lora_r: 16
|
||||
lora_alpha: 32
|
||||
lora_dropout: 0.1
|
||||
lora_target_linear: true
|
||||
lora_r:
|
||||
lora_alpha:
|
||||
lora_dropout:
|
||||
lora_target_linear:
|
||||
lora_fan_in_fan_out:
|
||||
lora_modules_to_save:
|
||||
- embd
|
||||
- lm_head
|
||||
|
||||
wandb_project:
|
||||
wandb_entity:
|
||||
@@ -38,22 +33,24 @@ wandb_name:
|
||||
wandb_log_model:
|
||||
|
||||
gradient_accumulation_steps: 1
|
||||
micro_batch_size: 1
|
||||
micro_batch_size: 2
|
||||
num_epochs: 4
|
||||
optimizer: paged_adamw_8bit
|
||||
optimizer: adamw_torch
|
||||
adam_beta2: 0.95
|
||||
adam_epsilon: 0.00001
|
||||
max_grad_norm: 1.0
|
||||
lr_scheduler: cosine
|
||||
learning_rate: 1e-5
|
||||
learning_rate: 0.000003
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: true
|
||||
|
||||
gradient_checkpointing: true
|
||||
gradient_checkpointing_kwargs:
|
||||
use_reentrant: True
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
local_rank:
|
||||
|
||||
@@ -27,7 +27,7 @@ num_epochs: 4
|
||||
learning_rate: 0.00001
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
bf16: auto
|
||||
tf32: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: false
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: false
|
||||
|
||||
@@ -34,7 +34,7 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.0000002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
bf16: auto
|
||||
tf32: true
|
||||
early_stopping_patience:
|
||||
resume_from_checkpoint:
|
||||
|
||||
@@ -33,7 +33,7 @@ lr_scheduler:
|
||||
learning_rate: 0.00001
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
bf16: auto
|
||||
tf32: true
|
||||
gradient_checkpointing:
|
||||
early_stopping_patience:
|
||||
|
||||
@@ -41,8 +41,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -34,8 +34,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -43,8 +43,8 @@ learning_rate: 0.0002
|
||||
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
|
||||
gradient_checkpointing: true
|
||||
|
||||
@@ -62,8 +62,8 @@ lr_scheduler: cosine
|
||||
learning_rate: 0.00002
|
||||
train_on_inputs: false
|
||||
group_by_length: false
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
gradient_checkpointing: true
|
||||
# stop training after this many evaluation losses have increased in a row
|
||||
|
||||
@@ -7,8 +7,8 @@ load_in_8bit: false
|
||||
load_in_4bit: true
|
||||
strict: false
|
||||
sequence_len: 1024
|
||||
bf16: true
|
||||
fp16: false
|
||||
bf16: auto
|
||||
fp16:
|
||||
tf32: false
|
||||
flash_attention: true
|
||||
special_tokens:
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
|
||||
packaging==23.2
|
||||
peft==0.7.0
|
||||
transformers @ git+https://github.com/huggingface/transformers.git@3cefac1d974db5e2825a0cb2b842883a628be7a0
|
||||
transformers==4.37.0
|
||||
tokenizers==0.15.0
|
||||
bitsandbytes>=0.41.1
|
||||
accelerate @ git+https://github.com/huggingface/accelerate.git@0d2280dadc6a93413a5496613b7fdda3a4d2551b
|
||||
deepspeed
|
||||
accelerate==0.26.1
|
||||
deepspeed>=0.13.1
|
||||
addict
|
||||
fire
|
||||
PyYAML>=6.0
|
||||
|
||||
@@ -5,16 +5,24 @@ echo "Exporting environment variables..."
|
||||
printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
|
||||
echo 'source /etc/rp_environment' >> ~/.bashrc
|
||||
|
||||
if [[ $PUBLIC_KEY ]]
|
||||
then
|
||||
if [[ $PUBLIC_KEY ]]; then
|
||||
# runpod
|
||||
mkdir -p ~/.ssh
|
||||
chmod 700 ~/.ssh
|
||||
echo $PUBLIC_KEY >> ~/.ssh/authorized_keys
|
||||
chmod 700 -R ~/.ssh
|
||||
# Start the SSH service in the background
|
||||
service ssh start
|
||||
elif [ -n "$SSH_KEY" ]; then
|
||||
# latitude.sh
|
||||
mkdir -p ~/.ssh
|
||||
chmod 700 ~/.ssh
|
||||
echo $SSH_KEY >> ~/.ssh/authorized_keys
|
||||
chmod 700 -R ~/.ssh
|
||||
# Start the SSH service in the background
|
||||
service ssh start
|
||||
else
|
||||
echo "No PUBLIC_KEY ENV variable provided, not starting openSSH daemon"
|
||||
echo "No PUBLIC_KEY or SSH_KEY environment variable provided, not starting openSSH daemon"
|
||||
fi
|
||||
|
||||
# Check if JUPYTER_PASSWORD is set and not empty
|
||||
@@ -25,7 +33,7 @@ fi
|
||||
|
||||
if [ "$JUPYTER_DISABLE" != "1" ]; then
|
||||
# Run Jupyter Lab in the background
|
||||
jupyter lab --allow-root --ip 0.0.0.0 &
|
||||
jupyter lab --port=8888 --ip=* --allow-root --ServerApp.allow_origin=* --ServerApp.preferred_dir=/workspace &
|
||||
fi
|
||||
|
||||
# Execute the passed arguments (CMD)
|
||||
|
||||
5
setup.py
5
setup.py
@@ -41,7 +41,7 @@ install_requires, dependency_links = parse_requirements()
|
||||
|
||||
setup(
|
||||
name="axolotl",
|
||||
version="0.3.0",
|
||||
version="0.4.0",
|
||||
description="LLM Trainer",
|
||||
long_description="Axolotl is a tool designed to streamline the fine-tuning of various AI models, offering support for multiple configurations and architectures.",
|
||||
package_dir={"": "src"},
|
||||
@@ -56,7 +56,8 @@ setup(
|
||||
"fused-dense-lib @ git+https://github.com/Dao-AILab/flash-attention@v2.3.3#subdirectory=csrc/fused_dense_lib",
|
||||
],
|
||||
"deepspeed": [
|
||||
"deepspeed",
|
||||
"deepspeed>=0.13.1",
|
||||
"deepspeed-kernels",
|
||||
],
|
||||
"mamba-ssm": [
|
||||
"mamba-ssm==1.0.1",
|
||||
|
||||
@@ -17,7 +17,6 @@ import yaml
|
||||
# add src to the pythonpath so we don't need to pip install this
|
||||
from accelerate.commands.config import config_args
|
||||
from art import text2art
|
||||
from datasets import concatenate_datasets, load_dataset
|
||||
from huggingface_hub import HfApi
|
||||
from huggingface_hub.utils import LocalTokenNotFoundError
|
||||
from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer
|
||||
@@ -30,7 +29,7 @@ from axolotl.utils.config import (
|
||||
normalize_config,
|
||||
validate_config,
|
||||
)
|
||||
from axolotl.utils.data import prepare_dataset
|
||||
from axolotl.utils.data import load_prepare_dpo_datasets, prepare_dataset
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.distributed import is_main_process
|
||||
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
|
||||
@@ -79,7 +78,11 @@ def do_merge_lora(
|
||||
|
||||
LOG.info("running merge of LoRA with base model")
|
||||
model = model.merge_and_unload(progressbar=True)
|
||||
model.to(dtype=cfg.torch_dtype)
|
||||
try:
|
||||
model.to(dtype=cfg.torch_dtype)
|
||||
except RuntimeError:
|
||||
pass
|
||||
model.generation_config.do_sample = True
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
LOG.info(f"saving merged model to: {str(Path(cfg.output_dir) / 'merged')}")
|
||||
@@ -343,78 +346,7 @@ def load_rl_datasets(
|
||||
cfg: DictDefault,
|
||||
cli_args: TrainerCliArgs, # pylint: disable=unused-argument
|
||||
) -> TrainDatasetMeta:
|
||||
train_datasets: List[Any] = []
|
||||
for i, ds_cfg in enumerate(cfg.datasets):
|
||||
train_datasets.insert(i, load_dataset(ds_cfg["path"], split=ds_cfg["split"]))
|
||||
# eval_dataset = load_dataset(
|
||||
# cfg.test_datasets[0]["path"], split=cfg.test_datasets[0]["split"]
|
||||
# )
|
||||
eval_dataset = None
|
||||
|
||||
def argilla_apply_chatml(sample): # pylint: disable=possibly-unused-variable
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen_response']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected_response']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
def intel_apply_chatml(sample): # pylint: disable=possibly-unused-variable
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
def apply_chatml(sample): # pylint: disable=possibly-unused-variable
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
def ultra_apply_chatml(sample): # pylint: disable=possibly-unused-variable
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen'][1]['content']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected'][1]['content']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
for i, data_set in enumerate(train_datasets):
|
||||
_type = cfg.datasets[i]["type"]
|
||||
ds_type_fn = locals()[_type]
|
||||
train_datasets[i] = data_set.map(ds_type_fn)
|
||||
train_dataset = concatenate_datasets(train_datasets)
|
||||
|
||||
# eval_dataset = eval_dataset.map(intel_apply_chatml)
|
||||
|
||||
train_dataset, eval_dataset = load_prepare_dpo_datasets(cfg)
|
||||
total_num_steps = int(
|
||||
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
|
||||
)
|
||||
@@ -434,6 +366,13 @@ def check_accelerate_default_config():
|
||||
|
||||
|
||||
def check_user_token():
|
||||
# Skip check if HF_HUB_OFFLINE is set to True
|
||||
if os.getenv("HF_HUB_OFFLINE") == "1":
|
||||
LOG.info(
|
||||
"Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used."
|
||||
)
|
||||
return True
|
||||
|
||||
# Verify if token is valid
|
||||
api = HfApi()
|
||||
try:
|
||||
|
||||
@@ -13,6 +13,7 @@ from axolotl.cli import (
|
||||
check_user_token,
|
||||
load_cfg,
|
||||
load_datasets,
|
||||
load_rl_datasets,
|
||||
print_axolotl_text_art,
|
||||
)
|
||||
from axolotl.common.cli import PreprocessCliArgs
|
||||
@@ -25,6 +26,7 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
|
||||
# pylint: disable=duplicate-code
|
||||
print_axolotl_text_art()
|
||||
parsed_cfg = load_cfg(config, **kwargs)
|
||||
parsed_cfg.is_preprocess = True
|
||||
check_accelerate_default_config()
|
||||
check_user_token()
|
||||
parser = transformers.HfArgumentParser((PreprocessCliArgs))
|
||||
@@ -42,7 +44,11 @@ def do_cli(config: Path = Path("examples/"), **kwargs):
|
||||
LOG.warning(msg)
|
||||
parsed_cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH
|
||||
|
||||
_ = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
if parsed_cfg.rl:
|
||||
load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
else:
|
||||
load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
|
||||
LOG.info(
|
||||
Fore.GREEN
|
||||
+ f"Success! Preprocessed data path: `dataset_prepared_path: {parsed_cfg.dataset_prepared_path}`"
|
||||
|
||||
@@ -3,9 +3,11 @@ CLI to run training on a model
|
||||
"""
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from typing import Tuple
|
||||
|
||||
import fire
|
||||
import transformers
|
||||
from transformers import PreTrainedModel, PreTrainedTokenizer
|
||||
|
||||
from axolotl.cli import (
|
||||
check_accelerate_default_config,
|
||||
@@ -24,19 +26,23 @@ LOG = logging.getLogger("axolotl.cli.train")
|
||||
def do_cli(config: Path = Path("examples/"), **kwargs):
|
||||
# pylint: disable=duplicate-code
|
||||
parsed_cfg = load_cfg(config, **kwargs)
|
||||
print_axolotl_text_art()
|
||||
check_accelerate_default_config()
|
||||
check_user_token()
|
||||
parser = transformers.HfArgumentParser((TrainerCliArgs))
|
||||
parsed_cli_args, _ = parser.parse_args_into_dataclasses(
|
||||
return_remaining_strings=True
|
||||
)
|
||||
return do_train(parsed_cfg, parsed_cli_args)
|
||||
|
||||
if parsed_cfg.rl:
|
||||
dataset_meta = load_rl_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
|
||||
def do_train(cfg, cli_args) -> Tuple[PreTrainedModel, PreTrainedTokenizer]:
|
||||
print_axolotl_text_art()
|
||||
check_accelerate_default_config()
|
||||
check_user_token()
|
||||
if cfg.rl:
|
||||
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
|
||||
else:
|
||||
dataset_meta = load_datasets(cfg=parsed_cfg, cli_args=parsed_cli_args)
|
||||
train(cfg=parsed_cfg, cli_args=parsed_cli_args, dataset_meta=dataset_meta)
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
return train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -12,14 +12,19 @@ from abc import abstractmethod
|
||||
from dataclasses import dataclass, field
|
||||
from functools import wraps
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
from typing import List, Optional, Type, Union
|
||||
|
||||
import torch
|
||||
import transformers
|
||||
from datasets import Dataset
|
||||
from torch.optim.lr_scheduler import OneCycleLR
|
||||
from torch.utils.data import BatchSampler, DataLoader, RandomSampler, SequentialSampler
|
||||
from transformers import EarlyStoppingCallback, Trainer, TrainingArguments
|
||||
from transformers import (
|
||||
EarlyStoppingCallback,
|
||||
Trainer,
|
||||
TrainerCallback,
|
||||
TrainingArguments,
|
||||
)
|
||||
from transformers.trainer_utils import seed_worker
|
||||
from trl import DPOTrainer
|
||||
|
||||
@@ -28,6 +33,7 @@ from axolotl.utils.callbacks import (
|
||||
EvalFirstStepCallback,
|
||||
GPUStatsCallback,
|
||||
LossWatchDogCallback,
|
||||
SaveAxolotlConfigtoMlflowCallback,
|
||||
SaveAxolotlConfigtoWandBCallback,
|
||||
SaveBetterTransformerModelCallback,
|
||||
bench_eval_callback_factory,
|
||||
@@ -37,6 +43,7 @@ from axolotl.utils.collators import (
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
MambaDataCollator,
|
||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||
)
|
||||
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
||||
from axolotl.utils.schedulers import (
|
||||
@@ -220,7 +227,8 @@ class AxolotlTrainer(Trainer):
|
||||
def get_train_dataloader(self) -> DataLoader:
|
||||
if self.args.sample_packing and not self.args.pretraining:
|
||||
train_dataset = self.train_dataset
|
||||
train_dataset = train_dataset.remove_columns(["length"])
|
||||
if "length" in train_dataset.features.keys():
|
||||
train_dataset = train_dataset.remove_columns(["length"])
|
||||
data_collator = self.data_collator
|
||||
dataloader_params = {
|
||||
"batch_size": self._train_batch_size,
|
||||
@@ -458,6 +466,7 @@ class TrainerBuilderBase(abc.ABC):
|
||||
_train_dataset = None
|
||||
_eval_dataset = None
|
||||
_model_ref = None
|
||||
_peft_config = None
|
||||
|
||||
def __init__(self, cfg, model, tokenizer):
|
||||
self.cfg = cfg
|
||||
@@ -488,13 +497,26 @@ class TrainerBuilderBase(abc.ABC):
|
||||
def eval_dataset(self, dataset):
|
||||
self._eval_dataset = dataset
|
||||
|
||||
@property
|
||||
def peft_config(self):
|
||||
return self._peft_config
|
||||
|
||||
@peft_config.setter
|
||||
def peft_config(self, peft_config):
|
||||
self._peft_config = peft_config
|
||||
|
||||
@abstractmethod
|
||||
def build(self, total_num_steps):
|
||||
pass
|
||||
|
||||
@abstractmethod
|
||||
def get_callbacks(self):
|
||||
pass
|
||||
def get_callbacks(self) -> List[TrainerCallback]:
|
||||
callbacks = []
|
||||
if self.cfg.use_wandb:
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
|
||||
return callbacks
|
||||
|
||||
@abstractmethod
|
||||
def get_post_trainer_create_callbacks(self, trainer):
|
||||
@@ -502,12 +524,6 @@ class TrainerBuilderBase(abc.ABC):
|
||||
Callbacks added after the trainer is created, usually b/c these need access to the trainer
|
||||
"""
|
||||
|
||||
|
||||
class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
"""
|
||||
Build the HuggingFace training args/trainer for Causal models
|
||||
"""
|
||||
|
||||
def hook_pre_create_training_args(self, training_arguments_kwargs):
|
||||
# TODO
|
||||
return training_arguments_kwargs
|
||||
@@ -524,10 +540,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
# TODO
|
||||
return trainer
|
||||
|
||||
|
||||
class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
"""
|
||||
Build the HuggingFace training args/trainer for Causal models
|
||||
"""
|
||||
|
||||
def get_callbacks(self):
|
||||
callbacks = []
|
||||
callbacks = super().get_callbacks()
|
||||
callbacks.append(GPUStatsCallback(self.cfg))
|
||||
callbacks.append(EvalFirstStepCallback)
|
||||
callbacks.append(EvalFirstStepCallback())
|
||||
|
||||
if self.cfg.relora_steps:
|
||||
callbacks.append(ReLoRACallback(self.cfg))
|
||||
@@ -536,12 +558,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
hasattr(self.model, "use_bettertransformer")
|
||||
and self.model.use_bettertransformer is True
|
||||
):
|
||||
callbacks.append(SaveBetterTransformerModelCallback)
|
||||
callbacks.append(SaveBetterTransformerModelCallback())
|
||||
|
||||
if self.cfg.use_wandb:
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
if self.cfg.use_mlflow:
|
||||
callbacks.append(
|
||||
SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path)
|
||||
)
|
||||
|
||||
if self.cfg.loss_watchdog_threshold is not None:
|
||||
callbacks.append(LossWatchDogCallback(self.cfg))
|
||||
@@ -745,9 +771,10 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
training_arguments_kwargs[
|
||||
"per_device_train_batch_size"
|
||||
] = self.cfg.micro_batch_size
|
||||
training_arguments_kwargs[
|
||||
"per_device_eval_batch_size"
|
||||
] = self.cfg.eval_batch_size
|
||||
if self.cfg.eval_batch_size:
|
||||
training_arguments_kwargs[
|
||||
"per_device_eval_batch_size"
|
||||
] = self.cfg.eval_batch_size
|
||||
training_arguments_kwargs[
|
||||
"gradient_accumulation_steps"
|
||||
] = self.cfg.gradient_accumulation_steps
|
||||
@@ -896,14 +923,22 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
if is_eval and training_args.eval_sample_packing:
|
||||
use_batch_sampler_collator = True
|
||||
|
||||
collator: Type[
|
||||
Union[
|
||||
V2BatchSamplerDataCollatorForSeq2Seq,
|
||||
BatchSamplerDataCollatorForSeq2Seq,
|
||||
DataCollatorForSeq2Seq,
|
||||
]
|
||||
]
|
||||
if use_batch_sampler_collator:
|
||||
return BatchSamplerDataCollatorForSeq2Seq(
|
||||
self.tokenizer,
|
||||
return_tensors="pt",
|
||||
**kwargs,
|
||||
)
|
||||
if self.cfg.model_config_type in ["mixtral", "qwen2", "falcon", "phi"]:
|
||||
collator = V2BatchSamplerDataCollatorForSeq2Seq
|
||||
else:
|
||||
collator = BatchSamplerDataCollatorForSeq2Seq
|
||||
else:
|
||||
collator = DataCollatorForSeq2Seq
|
||||
|
||||
return DataCollatorForSeq2Seq(
|
||||
return collator(
|
||||
self.tokenizer,
|
||||
return_tensors="pt",
|
||||
**kwargs,
|
||||
@@ -916,7 +951,7 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
|
||||
"""
|
||||
|
||||
def get_callbacks(self):
|
||||
callbacks = []
|
||||
callbacks = super().get_callbacks()
|
||||
return callbacks
|
||||
|
||||
def get_post_trainer_create_callbacks(self, trainer):
|
||||
@@ -934,21 +969,65 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
|
||||
]:
|
||||
if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None:
|
||||
training_args_kwargs[arg] = getattr(self.cfg, arg)
|
||||
|
||||
if self.cfg.hub_model_id:
|
||||
training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id
|
||||
training_args_kwargs["push_to_hub"] = True
|
||||
training_args_kwargs["hub_private_repo"] = True
|
||||
training_args_kwargs["hub_always_push"] = True
|
||||
|
||||
if self.cfg.hub_strategy:
|
||||
training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy
|
||||
|
||||
if self.cfg.save_safetensors is not None:
|
||||
training_args_kwargs["save_safetensors"] = self.cfg.save_safetensors
|
||||
|
||||
if self.eval_dataset:
|
||||
training_args_kwargs["evaluation_strategy"] = "steps"
|
||||
training_args_kwargs["eval_steps"] = self.cfg.eval_steps
|
||||
else:
|
||||
training_args_kwargs["evaluation_strategy"] = "no"
|
||||
if self.cfg.bf16 or self.cfg.bfloat16:
|
||||
training_args_kwargs["bf16"] = True
|
||||
|
||||
training_args_kwargs["lr_scheduler_type"] = (
|
||||
self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine"
|
||||
)
|
||||
training_args_kwargs["lr_scheduler_kwargs"] = (
|
||||
self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {}
|
||||
)
|
||||
if self.cfg.remove_unused_columns is not None:
|
||||
training_args_kwargs[
|
||||
"remove_unused_columns"
|
||||
] = self.cfg.remove_unused_columns
|
||||
else:
|
||||
training_args_kwargs["remove_unused_columns"] = False
|
||||
|
||||
if self.cfg.dataloader_pin_memory is not None:
|
||||
training_args_kwargs[
|
||||
"dataloader_pin_memory"
|
||||
] = self.cfg.dataloader_pin_memory
|
||||
if self.cfg.dataloader_num_workers is not None:
|
||||
training_args_kwargs[
|
||||
"dataloader_num_workers"
|
||||
] = self.cfg.dataloader_num_workers
|
||||
if self.cfg.dataloader_prefetch_factor is not None:
|
||||
training_args_kwargs[
|
||||
"dataloader_prefetch_factor"
|
||||
] = self.cfg.dataloader_prefetch_factor
|
||||
|
||||
training_args = TrainingArguments(
|
||||
per_device_train_batch_size=self.cfg.micro_batch_size,
|
||||
max_steps=total_num_steps,
|
||||
remove_unused_columns=False,
|
||||
max_steps=self.cfg.max_steps or total_num_steps,
|
||||
gradient_accumulation_steps=self.cfg.gradient_accumulation_steps,
|
||||
learning_rate=self.cfg.learning_rate,
|
||||
evaluation_strategy="no",
|
||||
# eval_steps=self.cfg.eval_steps,
|
||||
save_strategy="steps",
|
||||
save_steps=self.cfg.save_steps,
|
||||
output_dir=self.cfg.output_dir,
|
||||
warmup_steps=self.cfg.warmup_steps,
|
||||
bf16=True,
|
||||
gradient_checkpointing=self.cfg.gradient_checkpointing,
|
||||
gradient_checkpointing_kwargs={"use_reentrant": False},
|
||||
gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
|
||||
or {"use_reentrant": False},
|
||||
logging_first_step=True,
|
||||
logging_steps=1,
|
||||
optim=self.cfg.optimizer,
|
||||
@@ -967,22 +1046,27 @@ class HFDPOTrainerBuilder(TrainerBuilderBase):
|
||||
dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
|
||||
elif self.cfg.rl == "kto_pair":
|
||||
dpo_trainer_kwargs["loss_type"] = "kto_pair"
|
||||
|
||||
if self.eval_dataset:
|
||||
dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
|
||||
if self.cfg.adapter and self.peft_config:
|
||||
dpo_trainer_kwargs["peft_config"] = self.peft_config
|
||||
dpo_trainer = DPOTrainer(
|
||||
self.model,
|
||||
self.model_ref,
|
||||
args=training_args,
|
||||
beta=self.cfg.dpo_beta or 0.1,
|
||||
train_dataset=self.train_dataset,
|
||||
# eval_dataset=self.eval_dataset,
|
||||
eval_dataset=None,
|
||||
tokenizer=self.tokenizer,
|
||||
max_length=self.cfg.sequence_len,
|
||||
max_target_length=None,
|
||||
max_prompt_length=self.cfg.sequence_len,
|
||||
generate_during_eval=True,
|
||||
callbacks=self.get_callbacks(),
|
||||
**dpo_trainer_kwargs,
|
||||
)
|
||||
dpo_trainer = self.hook_post_create_trainer(dpo_trainer)
|
||||
for callback in self.get_post_trainer_create_callbacks(dpo_trainer):
|
||||
dpo_trainer.add_callback(callback)
|
||||
|
||||
return dpo_trainer
|
||||
|
||||
|
||||
@@ -24,6 +24,8 @@ class TokenizedPromptDataset(Dataset):
|
||||
Args:
|
||||
prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
|
||||
dataset (dataset.Dataset): Dataset with text files.
|
||||
process_count (int): Number of processes to use for tokenizing.
|
||||
keep_in_memory (bool): Whether to keep the tokenized dataset in memory.
|
||||
"""
|
||||
|
||||
def __init__( # pylint: disable=super-init-not-called
|
||||
@@ -31,19 +33,21 @@ class TokenizedPromptDataset(Dataset):
|
||||
prompt_tokenizer: PromptTokenizingStrategy,
|
||||
dataset: IterableDataset,
|
||||
process_count: Optional[int] = None,
|
||||
keep_in_memory: Optional[bool] = False,
|
||||
**kwargs,
|
||||
):
|
||||
self.prompt_tokenizer = prompt_tokenizer
|
||||
self.process_count = process_count
|
||||
super().__init__(self.process(dataset).data, **kwargs)
|
||||
self.keep_in_memory = keep_in_memory
|
||||
super().__init__(
|
||||
self.process(dataset).data,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
def process(self, dataset):
|
||||
features = dataset.features.keys()
|
||||
num_proc = (
|
||||
min(64, self.process_count)
|
||||
if self.process_count
|
||||
else min(64, os.cpu_count())
|
||||
)
|
||||
num_proc = min(64, self.process_count if self.process_count else os.cpu_count())
|
||||
|
||||
map_kwargs = {}
|
||||
if self.prompt_tokenizer.supports_batched:
|
||||
map_kwargs["batched"] = True
|
||||
@@ -52,6 +56,8 @@ class TokenizedPromptDataset(Dataset):
|
||||
self.prompt_tokenizer.tokenize_prompt,
|
||||
num_proc=num_proc,
|
||||
remove_columns=features,
|
||||
keep_in_memory=self.keep_in_memory,
|
||||
desc="Tokenizing Prompts",
|
||||
**map_kwargs,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
"""
|
||||
MixFormers model architecture used for phi models
|
||||
"""
|
||||
|
||||
from .configuration_mixformer_sequential import MixFormerSequentialConfig # noqa
|
||||
from .configuration_phi import PhiConfig # noqa
|
||||
from .modeling_mixformer_sequential import MixFormerSequentialForCausalLM # noqa
|
||||
from .modeling_phi import PhiForCausalLM # noqa
|
||||
@@ -1,63 +0,0 @@
|
||||
# pylint: skip-file
|
||||
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
import math
|
||||
from typing import Any, Dict, List, Optional, Union
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class MixFormerSequentialConfig(PretrainedConfig):
|
||||
"""MixFormer (sequential for DeepSpeed) configuration."""
|
||||
|
||||
model_type = "mixformer-sequential"
|
||||
|
||||
attribute_map = {
|
||||
"max_position_embeddings": "n_positions",
|
||||
"hidden_size": "n_embd",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
"input_emb_layer": "embd_layer", # `input_emb_layer` key is for backward compatibility
|
||||
"blocks": "architecture", # `blocks` key is for backward compatibility
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: Optional[int] = 50304,
|
||||
n_positions: Optional[int] = 2048,
|
||||
n_embd: Optional[int] = 1024,
|
||||
n_layer: Optional[int] = 20,
|
||||
n_inner: Optional[int] = None,
|
||||
n_head: Optional[int] = 16,
|
||||
rotary_dim: Optional[int] = 32,
|
||||
activation_function: Optional[str] = "gelu_new",
|
||||
embd_layer: Optional[str] = "default",
|
||||
architecture: Union[Dict[str, Any], List[Dict[str, Any]]] = None,
|
||||
embd_pdrop: Optional[float] = 0.0,
|
||||
resid_pdrop: Optional[float] = 0.0,
|
||||
layer_norm_epsilon: Optional[float] = 1e-5,
|
||||
initializer_range: Optional[float] = 0.02,
|
||||
tie_word_embeddings: Optional[bool] = False,
|
||||
pad_vocab_size_multiple: Optional[int] = 64,
|
||||
**kwargs
|
||||
) -> None:
|
||||
self.vocab_size = int(
|
||||
math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
|
||||
)
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_inner = n_inner
|
||||
self.n_head = n_head
|
||||
self.rotary_dim = min(rotary_dim, n_embd // n_head)
|
||||
self.activation_function = activation_function
|
||||
self.embd_layer = embd_layer
|
||||
self.architecture = architecture
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
@@ -1,65 +0,0 @@
|
||||
# pylint: skip-file
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
import math
|
||||
from typing import Optional
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
|
||||
class PhiConfig(PretrainedConfig):
|
||||
"""Phi configuration."""
|
||||
|
||||
model_type = "phi"
|
||||
attribute_map = {
|
||||
"max_position_embeddings": "n_positions",
|
||||
"hidden_size": "n_embd",
|
||||
"num_attention_heads": "n_head",
|
||||
"num_hidden_layers": "n_layer",
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
vocab_size: int = 50304,
|
||||
n_positions: int = 2048,
|
||||
n_embd: int = 1024,
|
||||
n_layer: int = 20,
|
||||
n_inner: Optional[int] = None,
|
||||
n_head: int = 16,
|
||||
n_head_kv: Optional[int] = None,
|
||||
rotary_dim: Optional[int] = 32,
|
||||
activation_function: Optional[str] = "gelu_new",
|
||||
flash_attn: bool = False,
|
||||
flash_rotary: bool = False,
|
||||
fused_dense: bool = False,
|
||||
attn_pdrop: float = 0.0,
|
||||
embd_pdrop: float = 0.0,
|
||||
resid_pdrop: float = 0.0,
|
||||
layer_norm_epsilon: float = 1e-5,
|
||||
initializer_range: float = 0.02,
|
||||
tie_word_embeddings: bool = False,
|
||||
pad_vocab_size_multiple: int = 64,
|
||||
**kwargs
|
||||
) -> None:
|
||||
self.vocab_size = int(
|
||||
math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple
|
||||
)
|
||||
self.n_positions = n_positions
|
||||
self.n_embd = n_embd
|
||||
self.n_layer = n_layer
|
||||
self.n_inner = n_inner
|
||||
self.n_head = n_head
|
||||
self.n_head_kv = n_head_kv
|
||||
self.rotary_dim = min(rotary_dim, n_embd // n_head)
|
||||
self.activation_function = activation_function
|
||||
self.flash_attn = flash_attn
|
||||
self.flash_rotary = flash_rotary
|
||||
self.fused_dense = fused_dense
|
||||
self.attn_pdrop = attn_pdrop
|
||||
self.embd_pdrop = embd_pdrop
|
||||
self.resid_pdrop = resid_pdrop
|
||||
self.layer_norm_epsilon = layer_norm_epsilon
|
||||
self.initializer_range = initializer_range
|
||||
|
||||
super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
|
||||
@@ -1,930 +0,0 @@
|
||||
# pylint: skip-file
|
||||
|
||||
# Copyright (c) Microsoft Corporation.
|
||||
# Licensed under the MIT license.
|
||||
|
||||
# BSD 3-Clause License
|
||||
#
|
||||
# Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
|
||||
# All rights reserved.
|
||||
#
|
||||
# Redistribution and use in source and binary forms, with or without
|
||||
# modification, are permitted provided that the following conditions are met:
|
||||
#
|
||||
# * Redistributions of source code must retain the above copyright notice, this
|
||||
# list of conditions and the following disclaimer.
|
||||
#
|
||||
# * Redistributions in binary form must reproduce the above copyright notice,
|
||||
# this list of conditions and the following disclaimer in the documentation
|
||||
# and/or other materials provided with the distribution.
|
||||
#
|
||||
# * Neither the name of the copyright holder nor the names of its
|
||||
# contributors may be used to endorse or promote products derived from
|
||||
# this software without specific prior written permission.
|
||||
#
|
||||
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
||||
# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
||||
# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||||
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import copy
|
||||
import inspect
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from einops import rearrange
|
||||
from flash_attn.flash_attn_interface import (
|
||||
flash_attn_kvpacked_func,
|
||||
flash_attn_qkvpacked_func,
|
||||
flash_attn_varlen_qkvpacked_func,
|
||||
)
|
||||
from transformers import PretrainedConfig, PreTrainedModel
|
||||
from transformers.activations import ACT2FN
|
||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
||||
|
||||
from ...monkeypatch.utils import get_cu_seqlens_from_pos_ids
|
||||
from .configuration_mixformer_sequential import MixFormerSequentialConfig
|
||||
|
||||
|
||||
@dataclass
|
||||
class InferenceParams:
|
||||
"""Inference parameters that are passed to the main model in order
|
||||
to efficienly calculate and store the context during inference.
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention."""
|
||||
|
||||
max_sequence_len: int
|
||||
max_batch_size: int
|
||||
sequence_len_offset: int = 0
|
||||
batch_size_offset: int = 0
|
||||
key_value_memory_dict: dict = field(default_factory=dict)
|
||||
fused_ft_kernel: bool = False
|
||||
lengths_per_sample: Optional[torch.Tensor] = None
|
||||
|
||||
|
||||
class Embedding(nn.Module):
|
||||
"""Token embedding with dropout."""
|
||||
|
||||
def __init__(self, config: PretrainedConfig) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.wte = nn.Embedding(config.vocab_size, config.n_embd)
|
||||
self.drop = nn.Dropout(config.embd_pdrop)
|
||||
|
||||
def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
|
||||
input_shape = input_ids.size()
|
||||
input_ids = input_ids.view(-1, input_shape[-1])
|
||||
|
||||
hidden_states = self.wte(input_ids)
|
||||
hidden_states = self.drop(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class RotaryEmbedding(nn.Module):
|
||||
"""PyTorch implementation of `flash-attn` RotaryEmbedding layer.
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
dim: int,
|
||||
base: Optional[int] = 10000,
|
||||
scale_base: Optional[float] = None,
|
||||
device: Optional[str] = None,
|
||||
**kwargs,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
if scale_base is not None:
|
||||
raise NotImplementedError
|
||||
|
||||
# Generate and save the inverse frequency buffer (non-trainable)
|
||||
self.dim = dim
|
||||
self.base = base
|
||||
self.scale_base = scale_base
|
||||
self.device = device
|
||||
|
||||
inv_freq = 1.0 / (
|
||||
base ** (torch.arange(0, dim, 2, device=device, dtype=torch.float32) / dim)
|
||||
)
|
||||
self.register_buffer("inv_freq", inv_freq)
|
||||
|
||||
scale = (
|
||||
(torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim)
|
||||
/ (1.4 * dim)
|
||||
if scale_base is not None
|
||||
else None
|
||||
)
|
||||
self.register_buffer("scale", scale)
|
||||
|
||||
self._seq_len_cached = 0
|
||||
self._cos_cached = None
|
||||
self._sin_cached = None
|
||||
self._cos_k_cached = None
|
||||
self._sin_k_cached = None
|
||||
|
||||
def _update_cos_sin_cache(
|
||||
self, x: torch.FloatTensor, seqlen_offset: Optional[int] = 0
|
||||
) -> None:
|
||||
# Reset the tables if the sequence length has changed,
|
||||
# or if we're on a new device (possibly due to tracing for instance)
|
||||
seqlen = x.shape[1] + seqlen_offset
|
||||
|
||||
# Re-generate the inverse frequency buffer if it's not fp32
|
||||
# (for instance if model.half() was called)
|
||||
if self.inv_freq.dtype != "torch.float32":
|
||||
self.inv_freq = 1.0 / (
|
||||
self.base
|
||||
** (
|
||||
torch.arange(
|
||||
0, self.dim, 2, device=self.device, dtype=torch.float32
|
||||
)
|
||||
/ self.dim
|
||||
)
|
||||
)
|
||||
|
||||
if (
|
||||
seqlen > self._seq_len_cached
|
||||
or self._cos_cached.device != x.device
|
||||
or self._cos_cached.dtype != x.dtype
|
||||
):
|
||||
self._seq_len_cached = seqlen
|
||||
t = torch.arange(seqlen, device=x.device, dtype=torch.float32)
|
||||
|
||||
# Don't do einsum, it converts fp32 to fp16
|
||||
# freqs = torch.einsum("i,j->ij", t, self.inv_freq)
|
||||
freqs = torch.outer(
|
||||
t, self.inv_freq.to(device=t.device, dtype=torch.float32)
|
||||
)
|
||||
if self.scale is None:
|
||||
self._cos_cached = torch.cos(freqs).to(x.dtype)
|
||||
self._sin_cached = torch.sin(freqs).to(x.dtype)
|
||||
else:
|
||||
power = (
|
||||
torch.arange(
|
||||
seqlen, dtype=self.scale.dtype, device=self.scale.device
|
||||
)
|
||||
- seqlen // 2
|
||||
) / self.scale_base
|
||||
scale = self.scale.to(device=power.device) ** rearrange(
|
||||
power, "s -> s 1"
|
||||
)
|
||||
|
||||
# We want the multiplication by scale to happen in fp32
|
||||
self._cos_cached = (torch.cos(freqs) * scale).to(x.dtype)
|
||||
self._sin_cached = (torch.sin(freqs) * scale).to(x.dtype)
|
||||
self._cos_k_cached = (torch.cos(freqs) / scale).to(x.dtype)
|
||||
self._sin_k_cached = (torch.sin(freqs) / scale).to(x.dtype)
|
||||
|
||||
def apply_rotary_emb_qkv(
|
||||
self,
|
||||
qkv: torch.FloatTensor,
|
||||
sin: torch.FloatTensor,
|
||||
cos: torch.FloatTensor,
|
||||
sin_k: Optional[torch.FloatTensor] = None,
|
||||
cos_k: Optional[torch.FloatTensor] = None,
|
||||
) -> torch.FloatTensor:
|
||||
_, seqlen, three, _, headdim = qkv.shape
|
||||
assert three == 3
|
||||
|
||||
rotary_seqlen, rotary_dim = cos.shape
|
||||
rotary_dim *= 2
|
||||
assert rotary_dim <= headdim
|
||||
assert seqlen <= rotary_seqlen
|
||||
|
||||
cos_k = cos if cos_k is None else cos_k
|
||||
sin_k = sin if sin_k is None else sin_k
|
||||
assert (
|
||||
sin.shape == cos_k.shape == sin_k.shape == (rotary_seqlen, rotary_dim // 2)
|
||||
)
|
||||
|
||||
q_rot = qkv[:, :, 0, :, :rotary_dim]
|
||||
q_pass = qkv[:, :, 0, :, rotary_dim:]
|
||||
|
||||
k_rot = qkv[:, :, 1, :, :rotary_dim]
|
||||
k_pass = qkv[:, :, 1, :, rotary_dim:]
|
||||
|
||||
# Splits the queries and keys in half
|
||||
q1, q2 = q_rot.chunk(2, dim=-1)
|
||||
k1, k2 = k_rot.chunk(2, dim=-1)
|
||||
c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(
|
||||
sin[:seqlen], "s d -> s 1 d"
|
||||
)
|
||||
|
||||
# Casts to fp32 are necessary to prevent fp16 overflow issues
|
||||
q1, q2, k1, k2, c, s = [
|
||||
t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]
|
||||
]
|
||||
|
||||
# Computes the new keys and queries, recasting to original dtype
|
||||
q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
|
||||
|
||||
k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
|
||||
|
||||
return torch.cat(
|
||||
[
|
||||
torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
|
||||
torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
|
||||
qkv[:, :, 2:3, :, :],
|
||||
],
|
||||
axis=2,
|
||||
)
|
||||
|
||||
def forward(
|
||||
self, qkv: torch.Tensor, seqlen_offset: int = 0
|
||||
) -> Tuple[torch.Tensor, torch.Tensor]:
|
||||
"""Perform the forward pass.
|
||||
|
||||
Args:
|
||||
qkv: Query, key and value tensors of shape (batch, seqlen, nheads, headdim) or (batch, seqlen, 3, nheads, headdim).
|
||||
seqlen_offset: Used in generation where the passed `qkv` is only the last token in the batch.
|
||||
|
||||
Returns:
|
||||
New `qkv` and the cached sinusoids.
|
||||
|
||||
"""
|
||||
|
||||
self._update_cos_sin_cache(qkv, seqlen_offset)
|
||||
|
||||
return self.apply_rotary_emb_qkv(
|
||||
qkv, self._sin_cached[seqlen_offset:], self._cos_cached[seqlen_offset:]
|
||||
)
|
||||
|
||||
|
||||
def _update_kv_cache(kv, inference_params, layer_idx):
|
||||
"""kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention."""
|
||||
# Pre-allocate memory for key-values for inference.
|
||||
num_heads, head_dim = kv.shape[-2:]
|
||||
if layer_idx not in inference_params.key_value_memory_dict:
|
||||
kv_cache = torch.empty(
|
||||
inference_params.max_batch_size,
|
||||
inference_params.max_sequence_len,
|
||||
2,
|
||||
num_heads,
|
||||
head_dim,
|
||||
dtype=kv.dtype,
|
||||
device=kv.device,
|
||||
)
|
||||
inference_params.key_value_memory_dict[layer_idx] = kv_cache
|
||||
else:
|
||||
kv_cache = inference_params.key_value_memory_dict[layer_idx]
|
||||
|
||||
# Adjust key and value for inference
|
||||
batch_start = inference_params.batch_size_offset
|
||||
batch_end = batch_start + kv.shape[0]
|
||||
sequence_start = inference_params.sequence_len_offset
|
||||
sequence_end = sequence_start + kv.shape[1]
|
||||
assert batch_end <= (
|
||||
kv_cache.shape[0] if kv_cache is not None else v_cache.shape[0] # noqa
|
||||
)
|
||||
assert sequence_end <= (
|
||||
kv_cache.shape[1] if kv_cache is not None else v_cache.shape[2] # noqa
|
||||
)
|
||||
|
||||
assert kv_cache is not None
|
||||
kv_cache[batch_start:batch_end, sequence_start:sequence_end, ...] = kv
|
||||
kv = kv_cache[batch_start:batch_end, :sequence_end, ...]
|
||||
return kv
|
||||
|
||||
|
||||
class MLP(nn.Module):
|
||||
"""Multi-Layer Perceptron.
|
||||
|
||||
Reference:
|
||||
Attention Is All You Need.
|
||||
https://arxiv.org/pdf/1706.03762.pdf.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
n_inner: Optional[int] = None,
|
||||
act_fn: Optional[str] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
act_fn = config.activation_function if act_fn is None else act_fn
|
||||
assert act_fn in ACT2FN.keys(), f"`act_fn` must be one of: {ACT2FN.keys()}."
|
||||
|
||||
n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
|
||||
n_inner = n_inner if n_inner is not None else 4 * config.n_embd
|
||||
|
||||
self.fc1 = nn.Linear(config.n_embd, n_inner)
|
||||
self.fc2 = nn.Linear(n_inner, config.n_embd)
|
||||
self.act = ACT2FN[act_fn]
|
||||
|
||||
def _load_from_state_dict(
|
||||
self,
|
||||
state_dict,
|
||||
prefix,
|
||||
local_metadata,
|
||||
strict,
|
||||
missing_keys,
|
||||
unexpected_keys,
|
||||
error_msgs,
|
||||
):
|
||||
old_keys = [
|
||||
prefix + "fc_in.weight",
|
||||
prefix + "fc_out.weight",
|
||||
prefix + "fc_in.bias",
|
||||
prefix + "fc_out.bias",
|
||||
]
|
||||
new_keys = [
|
||||
prefix + "fc1.weight",
|
||||
prefix + "fc2.weight",
|
||||
prefix + "fc1.bias",
|
||||
prefix + "fc2.bias",
|
||||
]
|
||||
|
||||
if all(k in state_dict for k in old_keys) and not all(
|
||||
k in state_dict for k in new_keys
|
||||
):
|
||||
# Older version of `MLP` saved with different key names.
|
||||
for old_key, new_key in zip(old_keys, new_keys):
|
||||
state_dict[new_key] = state_dict.pop(old_key)
|
||||
|
||||
return super()._load_from_state_dict(
|
||||
state_dict,
|
||||
prefix,
|
||||
local_metadata,
|
||||
strict,
|
||||
missing_keys,
|
||||
unexpected_keys,
|
||||
error_msgs,
|
||||
)
|
||||
|
||||
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
|
||||
hidden_states = self.fc1(hidden_states)
|
||||
hidden_states = self.act(hidden_states)
|
||||
hidden_states = self.fc2(hidden_states)
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class FusedMLP(nn.Module):
|
||||
"""Fused Multi-Layer Perceptron from `flash-attn`.
|
||||
|
||||
Reference:
|
||||
https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/ops/fused_dense.py.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
n_inner: Optional[int] = None,
|
||||
act_fn: Optional[str] = None,
|
||||
raise_on_missing: bool = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
act_fn = config.activation_function if act_fn is None else act_fn
|
||||
assert act_fn in ACT2FN.keys(), f"`act_fn` must be one of: {ACT2FN.keys()}."
|
||||
|
||||
n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
|
||||
n_inner = n_inner if n_inner is not None else 4 * config.n_embd
|
||||
|
||||
gelu_activations = ["gelu_new", "gelu_fast", "gelu_approx"] # noqa
|
||||
activation = "gelu_approx" if act_fn in gelu_activations else "relu" # noqa
|
||||
|
||||
self.mlp = MLP(config, n_inner=n_inner, act_fn=act_fn)
|
||||
|
||||
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
|
||||
return self.mlp(hidden_states)
|
||||
|
||||
|
||||
class SelfAttention(nn.Module):
|
||||
"""Implement the scaled dot product attention with softmax.
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention.
|
||||
Arguments
|
||||
---------
|
||||
softmax_scale: The temperature to use for the softmax attention.
|
||||
(default: 1/sqrt(d_keys) where d_keys is computed at
|
||||
runtime)
|
||||
attention_dropout: The dropout rate to apply to the attention
|
||||
(default: 0.0)
|
||||
"""
|
||||
|
||||
def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
|
||||
super().__init__()
|
||||
self.causal = causal
|
||||
self.softmax_scale = softmax_scale
|
||||
self.drop = nn.Dropout(attention_dropout)
|
||||
|
||||
def forward(
|
||||
self, qkv, causal=None, key_padding_mask=None, cu_seqlens=None, max_seqlen=None
|
||||
):
|
||||
"""Implements the multihead softmax attention.
|
||||
Arguments
|
||||
---------
|
||||
qkv: The tensor containing the query, key, and value. (B, S, 3, H, D)
|
||||
causal: if passed, will override self.causal
|
||||
key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
|
||||
False means to mask out. (B, S)
|
||||
"""
|
||||
causal = self.causal if causal is None else causal
|
||||
if cu_seqlens is not None:
|
||||
return flash_attn_varlen_qkvpacked_func(
|
||||
qkv.squeeze(0),
|
||||
cu_seqlens,
|
||||
max_seqlen,
|
||||
dropout_p=self.drop.p,
|
||||
softmax_scale=self.softmax_scale,
|
||||
causal=causal,
|
||||
)
|
||||
else:
|
||||
return flash_attn_qkvpacked_func(
|
||||
qkv,
|
||||
dropout_p=self.drop.p,
|
||||
softmax_scale=self.softmax_scale,
|
||||
causal=causal,
|
||||
)
|
||||
|
||||
|
||||
class CrossAttention(nn.Module):
|
||||
"""Implement the scaled dot product attention with softmax.
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention.
|
||||
Arguments
|
||||
---------
|
||||
softmax_scale: The temperature to use for the softmax attention.
|
||||
(default: 1/sqrt(d_keys) where d_keys is computed at
|
||||
runtime)
|
||||
attention_dropout: The dropout rate to apply to the attention
|
||||
(default: 0.0)
|
||||
"""
|
||||
|
||||
def __init__(self, causal=False, softmax_scale=None, attention_dropout=0.0):
|
||||
super().__init__()
|
||||
self.causal = causal
|
||||
self.softmax_scale = softmax_scale
|
||||
self.drop = nn.Dropout(attention_dropout)
|
||||
|
||||
def forward(self, q, kv, causal=None, key_padding_mask=None):
|
||||
"""Implements the multihead softmax attention.
|
||||
Arguments
|
||||
---------
|
||||
q: The tensor containing the query. (B, Sq, H, D)
|
||||
kv: The tensor containing the key and value. (B, Sk, 2, H, D)
|
||||
causal: if passed, will override self.causal
|
||||
key_padding_mask: boolean mask to apply to the attention weights. True means to keep,
|
||||
False means to mask out. (B, Sk)
|
||||
"""
|
||||
causal = self.causal if causal is None else causal
|
||||
return flash_attn_kvpacked_func(
|
||||
q,
|
||||
kv,
|
||||
dropout_p=self.drop.p,
|
||||
softmax_scale=self.softmax_scale,
|
||||
causal=causal,
|
||||
)
|
||||
|
||||
|
||||
def find_mha_dims(
|
||||
config: PretrainedConfig,
|
||||
n_head: Optional[int] = None,
|
||||
head_dim: Optional[int] = None,
|
||||
) -> Tuple[int, int]:
|
||||
"""Validate and return the number of heads and head dimension for multi-head attention.
|
||||
|
||||
Args:
|
||||
config: Model configuration.
|
||||
n_head: Number of heads.
|
||||
head_dim: Head dimension.
|
||||
|
||||
Returns:
|
||||
Number of heads and head dimension.
|
||||
|
||||
"""
|
||||
|
||||
assert all(
|
||||
hasattr(config, attr) for attr in ["n_embd", "n_head"]
|
||||
), "`config` must have `n_embd` and `n_head` attributes."
|
||||
|
||||
if head_dim is None:
|
||||
assert (
|
||||
config.n_embd % config.n_head == 0
|
||||
), f"Hidden size ({config.n_embd}) must be divisible by the number of heads ({config.n_head})."
|
||||
|
||||
if n_head is None and head_dim is None:
|
||||
head_dim = config.n_embd // config.n_head
|
||||
n_head = config.n_head
|
||||
elif n_head is None or head_dim is None:
|
||||
raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")
|
||||
|
||||
return n_head, head_dim
|
||||
|
||||
|
||||
class MHA(nn.Module):
|
||||
"""Multi-head attention layer.
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
rotary_dim: Optional[int] = None,
|
||||
n_head: Optional[int] = None,
|
||||
head_dim: Optional[int] = None,
|
||||
bias: Optional[bool] = True,
|
||||
dropout: Optional[float] = 0.0,
|
||||
softmax_scale: Optional[float] = None,
|
||||
causal: Optional[bool] = True,
|
||||
layer_idx: Optional[int] = None,
|
||||
rotary_emb_scale_base: Optional[float] = None,
|
||||
return_residual: Optional[bool] = False,
|
||||
checkpointing: Optional[bool] = False,
|
||||
device: Optional[str] = None,
|
||||
dtype: Optional[torch.dtype] = None,
|
||||
fused_dense: Optional[bool] = True,
|
||||
flash_attn: Optional[bool] = True,
|
||||
cutlass_attn: Optional[bool] = False,
|
||||
flash_rotary: Optional[bool] = True,
|
||||
raise_on_missing: Optional[bool] = False,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
factory_kwargs = {"device": device, "dtype": dtype}
|
||||
n_head, head_dim = find_mha_dims(config, n_head, head_dim)
|
||||
|
||||
self.hidden_size = config.n_embd
|
||||
self.n_head = n_head
|
||||
self.head_dim = head_dim
|
||||
self.op_size = n_head * head_dim
|
||||
|
||||
self.causal = causal
|
||||
self.layer_idx = layer_idx
|
||||
self.rotary_emb_dim = (
|
||||
rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
|
||||
)
|
||||
self.fused_dense = fused_dense
|
||||
self.flash_attn = flash_attn
|
||||
self.cutlass_attn = cutlass_attn
|
||||
self.flash_rotary = flash_rotary
|
||||
self.return_residual = return_residual
|
||||
self.checkpointing = checkpointing
|
||||
|
||||
if self.rotary_emb_dim > 0:
|
||||
rotary_kwargs = {"device": device}
|
||||
if rotary_emb_scale_base is not None and rotary_emb_scale_base > 0.0:
|
||||
rotary_kwargs["scale_base"] = rotary_emb_scale_base
|
||||
|
||||
self.rotary_emb = RotaryEmbedding(self.rotary_emb_dim, **rotary_kwargs)
|
||||
else:
|
||||
pass
|
||||
|
||||
self.Wqkv = nn.Linear(
|
||||
self.hidden_size, 3 * self.op_size, bias=bias, **factory_kwargs
|
||||
)
|
||||
self.out_proj = nn.Linear(
|
||||
self.op_size, self.hidden_size, bias=bias, **factory_kwargs
|
||||
)
|
||||
|
||||
self.inner_attn = SelfAttention(
|
||||
causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
|
||||
)
|
||||
self.inner_cross_attn = CrossAttention(
|
||||
causal=causal, softmax_scale=softmax_scale, attention_dropout=dropout
|
||||
)
|
||||
|
||||
def _update_kv_cache(
|
||||
self, kv: torch.FloatTensor, inference_params: InferenceParams
|
||||
) -> None:
|
||||
"""kv: (batch_size, seqlen, 2, nheads, head_dim) or (batch_size, 1, 2, nheads, head_dim)
|
||||
Adapted from https://github.com/Dao-AILab/flash-attention."""
|
||||
|
||||
assert (
|
||||
self.layer_idx is not None
|
||||
), "Generation requires layer_idx in the constructor"
|
||||
|
||||
return _update_kv_cache(kv, inference_params, self.layer_idx)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
x: torch.FloatTensor,
|
||||
x_kv: Optional[torch.FloatTensor] = None,
|
||||
key_padding_mask: Optional[torch.BoolTensor] = None,
|
||||
cu_seqlens: Optional[torch.LongTensor] = None,
|
||||
max_seqlen: Optional[int] = None,
|
||||
mixer_subset: Optional[torch.LongTensor] = None,
|
||||
past_cache: Optional[InferenceParams] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
|
||||
"""Perform the forward pass.
|
||||
|
||||
Args:
|
||||
x: (batch, seqlen, hidden_dim) (where hidden_dim = num heads * head dim) if
|
||||
cu_seqlens is None and max_seqlen is None, else (total, hidden_dim) where total
|
||||
is the is the sum of the sequence lengths in the batch.
|
||||
x_kv: (batch, seqlen, hidden_dim), only applicable for cross-attention. If None, use x.
|
||||
key_padding_mask: boolean mask, True means to keep, False means to mask out.
|
||||
(batch, seqlen). Only applicable when not using FlashAttention.
|
||||
cu_seqlens: (batch_size + 1,), dtype torch.int32. The cumulative sequence lengths
|
||||
of the sequences in the batch, used to index into x. Only applicable when using
|
||||
FlashAttention.
|
||||
max_seqlen: int. Maximum sequence length in the batch.
|
||||
mixer_subset: for cross-attention only. If not None, will take a subset of x
|
||||
before applying the query projection. Useful for e.g., ViT where we only care
|
||||
about the CLS token in the last layer.
|
||||
past_cache: For generation only.
|
||||
|
||||
Returns:
|
||||
(batch, seqlen, hidden_dim) if cu_seqlens is None and max_seqlen is None,
|
||||
else (total, hidden_dim) where total is the is the sum of the sequence lengths
|
||||
in the batch.
|
||||
|
||||
"""
|
||||
|
||||
if cu_seqlens is not None:
|
||||
assert max_seqlen is not None
|
||||
assert key_padding_mask is None
|
||||
assert self.flash_attn
|
||||
# assert self.rotary_emb_dim == 0
|
||||
|
||||
if key_padding_mask is not None:
|
||||
assert cu_seqlens is None
|
||||
assert max_seqlen is None
|
||||
assert not self.flash_attn
|
||||
|
||||
if past_cache is not None:
|
||||
assert key_padding_mask is None
|
||||
assert cu_seqlens is None and max_seqlen is None
|
||||
|
||||
attn_kwargs = {"key_padding_mask": key_padding_mask}
|
||||
|
||||
assert x_kv is None and mixer_subset is None
|
||||
|
||||
qkv = self.Wqkv(x)
|
||||
qkv = rearrange(
|
||||
qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim
|
||||
)
|
||||
|
||||
if past_cache is None:
|
||||
if self.rotary_emb_dim > 0:
|
||||
qkv = self.rotary_emb(qkv)
|
||||
context = self.inner_attn(
|
||||
qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen, **attn_kwargs
|
||||
)
|
||||
|
||||
else:
|
||||
if self.rotary_emb_dim > 0:
|
||||
qkv = self.rotary_emb(qkv, seqlen_offset=past_cache.sequence_len_offset)
|
||||
q = qkv[:, :, 0]
|
||||
kv = self._update_kv_cache(qkv[:, :, 1:], past_cache)
|
||||
# If we're processing the prompt, causal=None (use self.causal).
|
||||
# If we're decoding, then causal=False.
|
||||
causal = None if past_cache.sequence_len_offset == 0 else False
|
||||
context = self.inner_cross_attn(q, kv, causal=causal)
|
||||
|
||||
out = rearrange(context, "... h d -> ... (h d)")
|
||||
out = self.out_proj(out)
|
||||
|
||||
return out if not self.return_residual else (out, x)
|
||||
|
||||
|
||||
class ParallelBlock(nn.Module):
|
||||
"""Parallel block.
|
||||
|
||||
This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen).
|
||||
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
config: PretrainedConfig,
|
||||
mixer: Optional[Dict[str, Any]] = None,
|
||||
mlp: Optional[Dict[str, Any]] = None,
|
||||
block_idx: Optional[int] = None,
|
||||
) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||
self.resid_dropout = nn.Dropout(config.resid_pdrop)
|
||||
self.block_idx = block_idx
|
||||
|
||||
self.mixer = MHA(config, layer_idx=block_idx)
|
||||
self.mlp = MLP(config)
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.FloatTensor,
|
||||
past_cache: Optional[torch.FloatTensor] = None,
|
||||
cu_seqlens: Optional[torch.LongTensor] = None,
|
||||
max_seqlen: Optional[int] = None,
|
||||
) -> torch.FloatTensor:
|
||||
residual = hidden_states
|
||||
hidden_states = self.ln(hidden_states)
|
||||
|
||||
attn_outputs = self.mixer(
|
||||
hidden_states,
|
||||
past_cache=past_cache,
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seqlen=max_seqlen,
|
||||
)
|
||||
if isinstance(attn_outputs, tuple):
|
||||
attn_outputs = attn_outputs[0]
|
||||
|
||||
attn_outputs = self.resid_dropout(attn_outputs)
|
||||
feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
|
||||
|
||||
hidden_states = attn_outputs + feed_forward_hidden_states + residual
|
||||
|
||||
return hidden_states
|
||||
|
||||
|
||||
class CausalLMHead(nn.Module):
|
||||
"""Causal Language Modeling head.
|
||||
|
||||
Reference:
|
||||
Improving Language Understanding by Generative Pre-Training.
|
||||
https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, config: PretrainedConfig) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
|
||||
self.linear = nn.Linear(config.n_embd, config.vocab_size)
|
||||
|
||||
def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
|
||||
hidden_states = self.ln(hidden_states)
|
||||
logits = self.linear(hidden_states).to(torch.float32)
|
||||
|
||||
return logits
|
||||
|
||||
|
||||
class CausalLMLoss(nn.Module):
|
||||
"""Causal Language Modeling loss.
|
||||
|
||||
Reference:
|
||||
Improving Language Understanding by Generative Pre-Training.
|
||||
https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
|
||||
|
||||
"""
|
||||
|
||||
def __init__(self, shift_labels: Optional[bool] = True) -> None:
|
||||
super().__init__()
|
||||
|
||||
self.shift_labels = shift_labels
|
||||
self.loss_fct = nn.CrossEntropyLoss()
|
||||
|
||||
def forward(
|
||||
self, logits: torch.FloatTensor, labels: torch.LongTensor
|
||||
) -> torch.FloatTensor:
|
||||
if self.shift_labels:
|
||||
logits = logits[..., :-1, :].contiguous()
|
||||
labels = labels[..., 1:].contiguous()
|
||||
|
||||
loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
|
||||
|
||||
return loss
|
||||
|
||||
|
||||
class MixFormerSequentialPreTrainedModel(PreTrainedModel):
|
||||
"""MixFormer (sequential for DeepSpeed) pre-trained model."""
|
||||
|
||||
config_class = MixFormerSequentialConfig
|
||||
base_model_prefix = "transformer"
|
||||
supports_gradient_checkpointing = True
|
||||
|
||||
def __init__(self, *inputs, **kwargs) -> None:
|
||||
super().__init__(*inputs, **kwargs)
|
||||
|
||||
def prepare_inputs_for_generation(
|
||||
self, input_ids, past_key_values=None, **kwargs
|
||||
) -> Dict[str, Any]:
|
||||
if "use_cache" in kwargs and not kwargs["use_cache"]:
|
||||
return {"input_ids": input_ids}
|
||||
|
||||
if past_key_values is None or not (
|
||||
isinstance(past_key_values, InferenceParams)
|
||||
):
|
||||
past_key_values = InferenceParams(
|
||||
max_batch_size=input_ids.shape[0],
|
||||
max_sequence_len=self.config.n_positions,
|
||||
sequence_len_offset=0,
|
||||
batch_size_offset=0,
|
||||
fused_ft_kernel=False,
|
||||
key_value_memory_dict={},
|
||||
)
|
||||
else:
|
||||
# assume past_key_values has cached all but last token in input_ids
|
||||
past_key_values.sequence_len_offset = len(input_ids[0]) - 1
|
||||
input_ids = input_ids[:, -1].unsqueeze(-1)
|
||||
|
||||
return {"input_ids": input_ids, "past_key_values": past_key_values, **kwargs}
|
||||
|
||||
|
||||
class PackedSequential(nn.Sequential):
|
||||
def forward(
|
||||
self,
|
||||
input,
|
||||
cu_seqlens: Optional[torch.LongTensor] = None,
|
||||
max_seqlen: Optional[int] = None,
|
||||
):
|
||||
for module in self:
|
||||
sig = inspect.signature(module.forward)
|
||||
if "cu_seqlens" in sig.parameters:
|
||||
input = module(input, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen)
|
||||
else:
|
||||
input = module(input)
|
||||
return input
|
||||
|
||||
|
||||
class MixFormerSequentialForCausalLM(MixFormerSequentialPreTrainedModel):
|
||||
"""MixFormer (sequential for DeepSpeed) for Causal Language Modeling."""
|
||||
|
||||
_keys_to_ignore_on_load_missing = [""]
|
||||
_keys_to_ignore_on_load_unexpected = [
|
||||
r"layers\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"
|
||||
]
|
||||
_no_split_modules = ["ParallelBlock"]
|
||||
|
||||
def __init__(self, config: MixFormerSequentialConfig) -> None:
|
||||
super().__init__(config)
|
||||
|
||||
modules = [Embedding(config)]
|
||||
block_config = config.architecture
|
||||
|
||||
if not isinstance(block_config, list):
|
||||
block_config = [block_config for _ in range(config.n_layer)]
|
||||
|
||||
if config.n_layer != len(block_config):
|
||||
config.n_layer = len(block_config)
|
||||
|
||||
for block_idx, block in enumerate(block_config):
|
||||
# `block_cls` with `legacy` value is for backward compatibility
|
||||
# `path` key is for backward compatibility
|
||||
block = copy.deepcopy(block) or {"block_cls": "parallel"}
|
||||
block.pop("path", None) or block.pop("block_cls", None)
|
||||
|
||||
block["block_idx"] = block_idx
|
||||
modules.append(ParallelBlock(config, **block))
|
||||
|
||||
modules.append(CausalLMHead(config))
|
||||
|
||||
self.layers = PackedSequential(*modules)
|
||||
self.loss = CausalLMLoss()
|
||||
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self) -> nn.Embedding:
|
||||
return self.layers[0].wte
|
||||
|
||||
def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
|
||||
self.layers[0].wte = new_embeddings
|
||||
|
||||
def get_output_embeddings(self) -> nn.Linear:
|
||||
return self.layers[-1].linear
|
||||
|
||||
def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
|
||||
self.layers[-1].linear = new_embeddings
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[torch.FloatTensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
**kwargs,
|
||||
) -> CausalLMOutputWithPast:
|
||||
cu_seqlens: Optional[torch.LongTensor] = None
|
||||
max_seqlen: Optional[int] = None
|
||||
if position_ids is not None:
|
||||
batch_size, seq_length = input_ids.shape
|
||||
position_ids = position_ids.view(-1, seq_length).long()
|
||||
cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
|
||||
cu_seqlens = cu_seqlens.squeeze()
|
||||
|
||||
if not past_key_values:
|
||||
lm_logits = self.layers(
|
||||
input_ids, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
|
||||
)
|
||||
else:
|
||||
hidden_layer = self.layers[0](input_ids)
|
||||
for module in self.layers[1:-1]:
|
||||
hidden_layer = module(
|
||||
hidden_layer,
|
||||
past_cache=past_key_values,
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seqlen=max_seqlen,
|
||||
)
|
||||
lm_logits = self.layers[-1](hidden_layer)
|
||||
|
||||
loss = None
|
||||
if labels is not None:
|
||||
loss = self.loss(lm_logits, labels)
|
||||
|
||||
return CausalLMOutputWithPast(
|
||||
loss=loss, logits=lm_logits, past_key_values=past_key_values
|
||||
)
|
||||
File diff suppressed because it is too large
Load Diff
12
src/axolotl/monkeypatch/falcon/__init__.py
Normal file
12
src/axolotl/monkeypatch/falcon/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""
|
||||
Patches to support multipack for falcon
|
||||
"""
|
||||
import transformers
|
||||
|
||||
from axolotl.monkeypatch.utils import get_unpad_data
|
||||
|
||||
|
||||
def replace_falcon_attn_with_multipack_flash_attn():
|
||||
transformers.models.falcon.modeling_falcon._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
@@ -70,11 +70,20 @@ def replace_llama_attn_with_flash_attn(
|
||||
packed: Optional[bool] = False,
|
||||
cross_entropy: Optional[bool] = False,
|
||||
rms_norm: Optional[bool] = False,
|
||||
use_shifted_sparse_attn: Optional[bool] = False,
|
||||
):
|
||||
transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = ( # pylint: disable=protected-access
|
||||
_prepare_decoder_attention_mask
|
||||
)
|
||||
transformers.models.llama.modeling_llama.LlamaAttention.forward = flashattn_forward
|
||||
if use_shifted_sparse_attn:
|
||||
transformers.models.llama.modeling_llama.LlamaAttention.forward = (
|
||||
flashattn_forward_with_s2attn
|
||||
)
|
||||
else:
|
||||
transformers.models.llama.modeling_llama.LlamaAttention.forward = (
|
||||
flashattn_forward
|
||||
)
|
||||
|
||||
if packed:
|
||||
transformers.models.llama.modeling_llama.LlamaDecoderLayer = LlamaDecoderLayer
|
||||
transformers.models.llama.modeling_llama.LlamaModel.forward = (
|
||||
@@ -213,6 +222,136 @@ def _prepare_decoder_attention_mask(
|
||||
return attention_mask
|
||||
|
||||
|
||||
GROUP_SIZE_RATIO = 1 / 4
|
||||
|
||||
|
||||
def flashattn_forward_with_s2attn(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.Tensor] = None,
|
||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False,
|
||||
padding_mask: Optional[torch.LongTensor] = None, # pylint: disable=unused-argument
|
||||
cu_seqlens: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
|
||||
max_seqlen: Optional[torch.Tensor] = None, # pylint: disable=unused-argument
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
"""Input shape: Batch x Time x Channel
|
||||
|
||||
From: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py
|
||||
|
||||
attention_mask: [bsz, q_len]
|
||||
|
||||
`cu_seqlens` will be ignored if provided
|
||||
`max_seqlen` will be ignored if provided
|
||||
"""
|
||||
if output_attentions:
|
||||
warnings.warn(
|
||||
"Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
|
||||
)
|
||||
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = (
|
||||
self.q_proj(hidden_states)
|
||||
.view(bsz, q_len, self.num_heads, self.head_dim)
|
||||
.transpose(1, 2)
|
||||
)
|
||||
key_states = (
|
||||
self.k_proj(hidden_states)
|
||||
.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
||||
.transpose(1, 2)
|
||||
)
|
||||
value_states = (
|
||||
self.v_proj(hidden_states)
|
||||
.view(bsz, q_len, self.num_key_value_heads, self.head_dim)
|
||||
.transpose(1, 2)
|
||||
)
|
||||
# [bsz, q_len, nh, hd]
|
||||
# [bsz, nh, q_len, hd]
|
||||
# pylint: disable=duplicate-code
|
||||
|
||||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
kv_seq_len += past_key_value[0].shape[-2]
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||
query_states, key_states = apply_rotary_pos_emb(
|
||||
query_states, key_states, cos, sin, position_ids
|
||||
)
|
||||
|
||||
# Past Key value support
|
||||
if past_key_value is not None:
|
||||
# reuse k, v, self_attention
|
||||
key_states = torch.cat([past_key_value[0], key_states], dim=2)
|
||||
value_states = torch.cat([past_key_value[1], value_states], dim=2)
|
||||
|
||||
past_key_value = (key_states, value_states) if use_cache else None
|
||||
|
||||
# repeat k/v heads if n_kv_heads < n_heads
|
||||
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
||||
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
||||
|
||||
# Flash attention codes from
|
||||
# https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
|
||||
|
||||
# transform the data into the format required by flash attention
|
||||
qkv = torch.stack(
|
||||
[query_states, key_states, value_states], dim=2
|
||||
) # [bsz, nh, 3, q_len, hd]
|
||||
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
|
||||
|
||||
# We have disabled _prepare_decoder_attention_mask in LlamaModel
|
||||
# the attention_mask should be the same as the key_padding_mask
|
||||
|
||||
key_padding_mask = attention_mask.repeat(2, 1)
|
||||
nheads = qkv.shape[-2]
|
||||
# shift
|
||||
|
||||
group_size = int(q_len * GROUP_SIZE_RATIO)
|
||||
if q_len % group_size > 0:
|
||||
raise ValueError(
|
||||
f"q_len {q_len} should be divisible by group size {group_size}."
|
||||
)
|
||||
|
||||
qkv = (
|
||||
qkv.reshape(bsz, q_len, 3, 2, self.num_heads // 2, self.head_dim)
|
||||
.permute(0, 3, 1, 2, 4, 5)
|
||||
.reshape(bsz * 2, q_len, 3, self.num_heads // 2, self.head_dim)
|
||||
)
|
||||
x = rearrange( # pylint: disable=invalid-name
|
||||
qkv, "b s three h d -> b s (three h d)"
|
||||
)
|
||||
x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
|
||||
cu_q_len_tmp = torch.arange(
|
||||
0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype
|
||||
)
|
||||
cu_q_len_tmp = torch.stack([cu_q_len_tmp, cu_q_len_tmp + group_size // 2]).repeat(
|
||||
bsz, 1
|
||||
) + cu_q_lens[:-1].unsqueeze(-1)
|
||||
cu_q_lens = torch.cat([cu_q_len_tmp, cu_q_lens[1:].unsqueeze(-1)], dim=-1).view(-1)
|
||||
|
||||
x_unpad = rearrange(
|
||||
x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads // 2
|
||||
)
|
||||
output_unpad = flash_attn_varlen_qkvpacked_func(
|
||||
x_unpad, cu_q_lens, group_size, 0.0, softmax_scale=None, causal=True
|
||||
)
|
||||
output = rearrange(
|
||||
pad_input(
|
||||
rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz * 2, q_len
|
||||
),
|
||||
"b s (h d) -> b s h d",
|
||||
h=nheads // 2,
|
||||
)
|
||||
output = (
|
||||
output.reshape(bsz, 2, q_len, nheads // 2, self.head_dim)
|
||||
.transpose(1, 2)
|
||||
.reshape(bsz, q_len, nheads, self.head_dim)
|
||||
)
|
||||
return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value
|
||||
|
||||
|
||||
def flashattn_forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
|
||||
@@ -1,22 +1,61 @@
|
||||
"""
|
||||
Patches to support multipack for mixtral
|
||||
"""
|
||||
import torch
|
||||
import transformers
|
||||
|
||||
from axolotl.monkeypatch.utils import get_unpad_data
|
||||
|
||||
def replace_mixtral_attn_with_multipack_flash_attn():
|
||||
from .modeling_mixtral import (
|
||||
MixtralMultipackFlashAttention2,
|
||||
mixtral_decoder_layer_forward,
|
||||
mixtral_model_forward,
|
||||
|
||||
def patch_mixtral_moe_forward_zero3() -> None:
|
||||
import torch.nn.functional as F
|
||||
|
||||
def mlp_forward(self, hidden_states):
|
||||
current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(
|
||||
hidden_states
|
||||
)
|
||||
current_hidden_states = self.w2(current_hidden_states)
|
||||
return current_hidden_states
|
||||
|
||||
# Ref. https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
|
||||
def moe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
|
||||
batch_size, sequence_length, hidden_dim = hidden_states.shape
|
||||
hidden_states = hidden_states.view(-1, hidden_dim)
|
||||
# router_logits: (batch * sequence_length, n_experts)
|
||||
router_logits = self.gate(hidden_states)
|
||||
|
||||
routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
|
||||
topk_weight, topk_idx = torch.topk(
|
||||
routing_weights, self.top_k, dim=-1, sorted=False
|
||||
)
|
||||
topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
|
||||
# we cast back to the input dtype
|
||||
topk_weight = topk_weight.to(hidden_states.dtype)
|
||||
|
||||
hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
|
||||
y = torch.empty_like(hidden_states) # pylint: disable=invalid-name
|
||||
flat_topk_idx = topk_idx.view(-1)
|
||||
for i in range(self.num_experts):
|
||||
expert = self.experts[i]
|
||||
y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
|
||||
y = ( # pylint: disable=invalid-name
|
||||
y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)
|
||||
).sum(dim=1)
|
||||
final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
|
||||
return final_hidden_states, router_logits
|
||||
|
||||
from transformers.models.mixtral.modeling_mixtral import (
|
||||
MixtralBLockSparseTop2MLP,
|
||||
MixtralSparseMoeBlock,
|
||||
)
|
||||
|
||||
transformers.models.mixtral.modeling_mixtral.MixtralDecoderLayer.forward = (
|
||||
mixtral_decoder_layer_forward
|
||||
MixtralBLockSparseTop2MLP.forward = mlp_forward
|
||||
MixtralSparseMoeBlock.forward = moe_forward
|
||||
|
||||
|
||||
def replace_mixtral_attn_with_multipack_flash_attn(for_zero3=False):
|
||||
transformers.models.mixtral.modeling_mixtral._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
transformers.models.mixtral.modeling_mixtral.MixtralModel.forward = (
|
||||
mixtral_model_forward
|
||||
)
|
||||
transformers.models.mixtral.modeling_mixtral.MIXTRAL_ATTENTION_CLASSES[
|
||||
"flash_attention_2"
|
||||
] = MixtralMultipackFlashAttention2
|
||||
if for_zero3:
|
||||
patch_mixtral_moe_forward_zero3()
|
||||
|
||||
@@ -1,383 +0,0 @@
|
||||
"""
|
||||
Mixtral modeling for multipack
|
||||
"""
|
||||
# pylint: disable=missing-module-docstring,unused-argument,protected-access,pointless-string-statement,duplicate-code
|
||||
import logging
|
||||
import warnings
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
from einops import rearrange
|
||||
from flash_attn import flash_attn_varlen_qkvpacked_func
|
||||
from transformers import Cache, DynamicCache
|
||||
from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
|
||||
from transformers.modeling_outputs import MoeModelOutputWithPast
|
||||
from transformers.models.mixtral.modeling_mixtral import (
|
||||
MixtralFlashAttention2,
|
||||
apply_rotary_pos_emb,
|
||||
repeat_kv,
|
||||
)
|
||||
|
||||
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
|
||||
|
||||
LOG = logging.getLogger("axolotl.monkeypatch.mixtral")
|
||||
|
||||
|
||||
class MixtralMultipackFlashAttention2(MixtralFlashAttention2):
|
||||
"""
|
||||
Custom multipack implementation w flash attention 2
|
||||
"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self._flash_attn_uses_top_left_mask = True
|
||||
|
||||
def forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_value: Optional[Cache] = None,
|
||||
output_attentions: bool = False,
|
||||
use_cache: bool = False,
|
||||
cu_seqlens: Optional[torch.Tensor] = None,
|
||||
max_seqlen: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
|
||||
if "padding_mask" in kwargs:
|
||||
warnings.warn(
|
||||
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
|
||||
)
|
||||
bsz, q_len, _ = hidden_states.size()
|
||||
|
||||
query_states = self.q_proj(hidden_states)
|
||||
key_states = self.k_proj(hidden_states)
|
||||
value_states = self.v_proj(hidden_states)
|
||||
|
||||
query_states = query_states.view(
|
||||
bsz, q_len, self.num_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
key_states = key_states.view(
|
||||
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
value_states = value_states.view(
|
||||
bsz, q_len, self.num_key_value_heads, self.head_dim
|
||||
).transpose(1, 2)
|
||||
|
||||
kv_seq_len = key_states.shape[-2]
|
||||
if past_key_value is not None:
|
||||
if self.layer_idx is None:
|
||||
raise ValueError(
|
||||
f"The cache structure has changed since version v4.36. If you are using {self.__class__.__name__} "
|
||||
"for auto-regressive decoding with k/v caching, please make sure to initialize the attention class "
|
||||
"with a layer index."
|
||||
)
|
||||
kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
|
||||
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
|
||||
query_states, key_states = apply_rotary_pos_emb(
|
||||
query_states, key_states, cos, sin, position_ids
|
||||
)
|
||||
|
||||
if past_key_value is not None:
|
||||
cache_kwargs = {"sin": sin, "cos": cos} # Specific to RoPE models
|
||||
key_states, value_states = past_key_value.update(
|
||||
key_states, value_states, self.layer_idx, cache_kwargs
|
||||
)
|
||||
|
||||
# repeat k/v heads if n_kv_heads < n_heads
|
||||
key_states = repeat_kv(key_states, self.num_key_value_groups)
|
||||
value_states = repeat_kv(value_states, self.num_key_value_groups)
|
||||
|
||||
if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
|
||||
# special handling using sample packing
|
||||
qkv = torch.stack(
|
||||
[query_states, key_states, value_states], dim=2
|
||||
) # [bsz, nh, 3, q_len, hd]
|
||||
qkv = qkv.transpose(1, 3) # [bsz, q_len, 3, nh, hd]
|
||||
qkv = rearrange(qkv, "b s ... -> (b s) ...")
|
||||
|
||||
attn_output = flash_attn_varlen_qkvpacked_func(
|
||||
qkv,
|
||||
cu_seqlens,
|
||||
max_seqlen,
|
||||
dropout_p=self.attention_dropout,
|
||||
softmax_scale=None,
|
||||
causal=True,
|
||||
)
|
||||
attn_output = rearrange(attn_output, "(b s) ... -> b s ...", b=bsz)
|
||||
|
||||
attn_output = attn_output.reshape(bsz, q_len, self.hidden_size).contiguous()
|
||||
attn_output = self.o_proj(attn_output)
|
||||
|
||||
if not output_attentions:
|
||||
attn_weights = None
|
||||
|
||||
return attn_output, attn_weights, past_key_value
|
||||
|
||||
|
||||
def mixtral_decoder_layer_forward(
|
||||
self,
|
||||
hidden_states: torch.Tensor,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_value: Optional[Tuple[torch.Tensor]] = None,
|
||||
output_attentions: Optional[bool] = False,
|
||||
output_router_logits: Optional[bool] = False,
|
||||
use_cache: Optional[bool] = False,
|
||||
cu_seqlens: Optional[torch.Tensor] = None,
|
||||
max_seqlen: Optional[torch.Tensor] = None,
|
||||
**kwargs,
|
||||
) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]:
|
||||
if "padding_mask" in kwargs:
|
||||
warnings.warn(
|
||||
"Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
|
||||
)
|
||||
"""
|
||||
Args:
|
||||
hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
|
||||
attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
|
||||
`(batch, sequence_length)` where padding elements are indicated by 0.
|
||||
past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under
|
||||
returned tensors for more detail.
|
||||
output_router_logits (`bool`, *optional*):
|
||||
Whether or not to return the logits of all the routers. They are useful for computing the router loss, and
|
||||
should not be returned during inference.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
|
||||
(see `past_key_values`).
|
||||
"""
|
||||
|
||||
residual = hidden_states
|
||||
|
||||
hidden_states = self.input_layernorm(hidden_states)
|
||||
|
||||
# Self Attention
|
||||
hidden_states, self_attn_weights, present_key_value = self.self_attn(
|
||||
hidden_states=hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_value=past_key_value,
|
||||
output_attentions=output_attentions,
|
||||
use_cache=use_cache,
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seqlen=max_seqlen,
|
||||
)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
# Fully Connected
|
||||
residual = hidden_states
|
||||
hidden_states = self.post_attention_layernorm(hidden_states)
|
||||
hidden_states, router_logits = self.block_sparse_moe(hidden_states)
|
||||
hidden_states = residual + hidden_states
|
||||
|
||||
outputs = (hidden_states,)
|
||||
|
||||
if output_attentions:
|
||||
outputs += (self_attn_weights,)
|
||||
|
||||
if use_cache:
|
||||
outputs += (present_key_value,)
|
||||
|
||||
if output_router_logits:
|
||||
outputs += (router_logits,)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
def mixtral_model_forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[List[torch.FloatTensor]] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
output_router_logits: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
) -> Union[Tuple, MoeModelOutputWithPast]:
|
||||
output_attentions = (
|
||||
output_attentions
|
||||
if output_attentions is not None
|
||||
else self.config.output_attentions
|
||||
)
|
||||
output_router_logits = (
|
||||
output_router_logits
|
||||
if output_router_logits is not None
|
||||
else self.config.output_router_logits
|
||||
)
|
||||
output_hidden_states = (
|
||||
output_hidden_states
|
||||
if output_hidden_states is not None
|
||||
else self.config.output_hidden_states
|
||||
)
|
||||
use_cache = use_cache if use_cache is not None else self.config.use_cache
|
||||
|
||||
return_dict = (
|
||||
return_dict if return_dict is not None else self.config.use_return_dict
|
||||
)
|
||||
|
||||
# retrieve input_ids and inputs_embeds
|
||||
if input_ids is not None and inputs_embeds is not None:
|
||||
raise ValueError(
|
||||
"You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
|
||||
)
|
||||
if input_ids is not None:
|
||||
batch_size, seq_length = input_ids.shape
|
||||
elif inputs_embeds is not None:
|
||||
batch_size, seq_length, _ = inputs_embeds.shape
|
||||
else:
|
||||
raise ValueError(
|
||||
"You have to specify either decoder_input_ids or decoder_inputs_embeds"
|
||||
)
|
||||
|
||||
past_key_values_length = 0
|
||||
|
||||
if use_cache:
|
||||
use_legacy_cache = not isinstance(past_key_values, Cache)
|
||||
if use_legacy_cache:
|
||||
past_key_values = DynamicCache.from_legacy_cache(past_key_values)
|
||||
past_key_values_length = past_key_values.get_usable_length(seq_length)
|
||||
|
||||
cu_seqlens = None
|
||||
max_seqlen = None
|
||||
if position_ids is None:
|
||||
device = input_ids.device if input_ids is not None else inputs_embeds.device
|
||||
position_ids = torch.arange(
|
||||
past_key_values_length,
|
||||
seq_length + past_key_values_length,
|
||||
dtype=torch.long,
|
||||
device=device,
|
||||
)
|
||||
position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
|
||||
else:
|
||||
position_ids = position_ids.view(-1, seq_length).long()
|
||||
cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
|
||||
cu_seqlens = cu_seqlens.squeeze()
|
||||
|
||||
if inputs_embeds is None:
|
||||
inputs_embeds = self.embed_tokens(input_ids)
|
||||
|
||||
if (
|
||||
attention_mask is not None
|
||||
and self._attn_implementation == "flash_attention_2"
|
||||
and use_cache
|
||||
):
|
||||
is_padding_right = attention_mask[:, -1].sum().item() != batch_size
|
||||
if is_padding_right:
|
||||
raise ValueError(
|
||||
"You are attempting to perform batched generation with padding_side='right'"
|
||||
" this may lead to unexpected behaviour for Flash Attention version of Mixtral. Make sure to "
|
||||
" call `tokenizer.padding_side = 'left'` before tokenizing the input. "
|
||||
)
|
||||
|
||||
if self._attn_implementation == "flash_attention_2":
|
||||
# 2d mask is passed through the layers
|
||||
attention_mask = (
|
||||
attention_mask
|
||||
if (attention_mask is not None and 0 in attention_mask)
|
||||
else None
|
||||
)
|
||||
else:
|
||||
# 4d mask is passed through the layers
|
||||
attention_mask = _prepare_4d_causal_attention_mask(
|
||||
attention_mask,
|
||||
(batch_size, seq_length),
|
||||
inputs_embeds,
|
||||
past_key_values_length,
|
||||
sliding_window=self.config.sliding_window,
|
||||
)
|
||||
|
||||
hidden_states = inputs_embeds
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
if use_cache:
|
||||
LOG.warning_once(
|
||||
"`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
|
||||
)
|
||||
use_cache = False
|
||||
|
||||
# decoder layers
|
||||
all_hidden_states = () if output_hidden_states else None
|
||||
all_self_attns = () if output_attentions else None
|
||||
all_router_logits = () if output_router_logits else None
|
||||
next_decoder_cache = None
|
||||
|
||||
for decoder_layer in self.layers:
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
if self.gradient_checkpointing and self.training:
|
||||
layer_outputs = self._gradient_checkpointing_func(
|
||||
decoder_layer.__call__,
|
||||
hidden_states,
|
||||
attention_mask,
|
||||
position_ids,
|
||||
past_key_values,
|
||||
output_attentions,
|
||||
output_router_logits,
|
||||
use_cache,
|
||||
cu_seqlens,
|
||||
max_seqlen,
|
||||
)
|
||||
else:
|
||||
layer_outputs = decoder_layer(
|
||||
hidden_states,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_value=past_key_values,
|
||||
output_attentions=output_attentions,
|
||||
output_router_logits=output_router_logits,
|
||||
use_cache=use_cache,
|
||||
cu_seqlens=cu_seqlens,
|
||||
max_seqlen=max_seqlen,
|
||||
)
|
||||
|
||||
hidden_states = layer_outputs[0]
|
||||
|
||||
if use_cache:
|
||||
next_decoder_cache = layer_outputs[2 if output_attentions else 1]
|
||||
|
||||
if output_attentions:
|
||||
all_self_attns += (layer_outputs[1],)
|
||||
|
||||
if output_router_logits:
|
||||
all_router_logits += (layer_outputs[-1],)
|
||||
|
||||
hidden_states = self.norm(hidden_states)
|
||||
|
||||
# add hidden states from the last decoder layer
|
||||
if output_hidden_states:
|
||||
all_hidden_states += (hidden_states,)
|
||||
|
||||
next_cache = None
|
||||
if use_cache:
|
||||
next_cache = (
|
||||
next_decoder_cache.to_legacy_cache()
|
||||
if use_legacy_cache
|
||||
else next_decoder_cache
|
||||
)
|
||||
|
||||
if not return_dict:
|
||||
return tuple(
|
||||
v
|
||||
for v in [
|
||||
hidden_states,
|
||||
next_cache,
|
||||
all_hidden_states,
|
||||
all_self_attns,
|
||||
all_router_logits,
|
||||
]
|
||||
if v is not None
|
||||
)
|
||||
|
||||
return MoeModelOutputWithPast(
|
||||
last_hidden_state=hidden_states,
|
||||
past_key_values=next_cache,
|
||||
hidden_states=all_hidden_states,
|
||||
attentions=all_self_attns,
|
||||
router_logits=all_router_logits,
|
||||
)
|
||||
12
src/axolotl/monkeypatch/phi/__init__.py
Normal file
12
src/axolotl/monkeypatch/phi/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""
|
||||
Patches to support multipack for phi2
|
||||
"""
|
||||
import transformers
|
||||
|
||||
from axolotl.monkeypatch.utils import get_unpad_data
|
||||
|
||||
|
||||
def replace_phi_attn_with_multipack_flash_attn():
|
||||
transformers.models.phi.modeling_phi._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
12
src/axolotl/monkeypatch/qwen2/__init__.py
Normal file
12
src/axolotl/monkeypatch/qwen2/__init__.py
Normal file
@@ -0,0 +1,12 @@
|
||||
"""
|
||||
Patches to support multipack for qwen2
|
||||
"""
|
||||
import transformers
|
||||
|
||||
from axolotl.monkeypatch.utils import get_unpad_data
|
||||
|
||||
|
||||
def replace_qwen2_attn_with_multipack_flash_attn():
|
||||
transformers.models.qwen2.modeling_qwen2._get_unpad_data = ( # pylint: disable=protected-access
|
||||
get_unpad_data
|
||||
)
|
||||
@@ -2,6 +2,40 @@
|
||||
Shared utils for the monkeypatches
|
||||
"""
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def get_max_seqlen_in_batch(attention_mask: torch.Tensor) -> torch.Tensor:
|
||||
max_num = int(torch.max(attention_mask).item())
|
||||
batch_size, _ = attention_mask.shape
|
||||
counts = torch.zeros((batch_size, max_num), dtype=torch.int32)
|
||||
|
||||
for i in range(1, max_num + 1):
|
||||
mask = attention_mask == i
|
||||
counts[:, i - 1] = torch.sum(mask, dim=-1).to(dtype=torch.int32)
|
||||
|
||||
result = counts.flatten()
|
||||
nonzero_indices = torch.nonzero(result).squeeze(-1)
|
||||
return result[nonzero_indices]
|
||||
|
||||
|
||||
@torch.jit.script
|
||||
def get_unpad_data(attention_mask: torch.Tensor):
|
||||
device = attention_mask.device
|
||||
seqlens_in_batch = get_max_seqlen_in_batch(attention_mask)
|
||||
indices = torch.nonzero(attention_mask.flatten()).flatten()
|
||||
max_seqlen_in_batch = seqlens_in_batch.max().item()
|
||||
cu_seqlens = (
|
||||
F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
|
||||
.to(device=device)
|
||||
.detach()
|
||||
)
|
||||
return (
|
||||
indices,
|
||||
cu_seqlens,
|
||||
max_seqlen_in_batch,
|
||||
)
|
||||
|
||||
|
||||
def get_cu_seqlens(attn_mask):
|
||||
|
||||
21
src/axolotl/prompt_strategies/dpo/__init__.py
Normal file
21
src/axolotl/prompt_strategies/dpo/__init__.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
module for DPO style dataset transform strategies
|
||||
"""
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
|
||||
def load(strategy, cfg):
|
||||
try:
|
||||
load_fn = strategy.split(".")[-1]
|
||||
strategy = ".".join(strategy.split(".")[:-1])
|
||||
mod = importlib.import_module(f".{strategy}", "axolotl.prompt_strategies.dpo")
|
||||
func = getattr(mod, load_fn)
|
||||
load_kwargs = {}
|
||||
return func(cfg, **load_kwargs)
|
||||
except Exception: # pylint: disable=broad-exception-caught
|
||||
LOG.warning(f"unable to load strategy {strategy}")
|
||||
return None
|
||||
85
src/axolotl/prompt_strategies/dpo/chatml.py
Normal file
85
src/axolotl/prompt_strategies/dpo/chatml.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""
|
||||
DPO strategies for chatml
|
||||
"""
|
||||
|
||||
|
||||
def argilla(
|
||||
cfg,
|
||||
): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
def transform_fn(sample):
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen_response']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected_response']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
return transform_fn
|
||||
|
||||
|
||||
def intel(cfg): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
"""
|
||||
For Intel Orca DPO Pairs
|
||||
"""
|
||||
|
||||
def transform_fn(sample):
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
return transform_fn
|
||||
|
||||
|
||||
def prompt_pairs(cfg): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
def transform_fn(sample):
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
return transform_fn
|
||||
|
||||
|
||||
def ultra(cfg): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
"""
|
||||
for ultrafeedback binarized conversations
|
||||
"""
|
||||
|
||||
def transform_fn(sample):
|
||||
if "system" in sample and sample["system"]:
|
||||
sample["prompt"] = (
|
||||
f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
|
||||
f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
else:
|
||||
sample[
|
||||
"prompt"
|
||||
] = f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
|
||||
sample["chosen"] = f"{sample['chosen'][1]['content']}<|im_end|>"
|
||||
sample["rejected"] = f"{sample['rejected'][1]['content']}<|im_end|>"
|
||||
return sample
|
||||
|
||||
return transform_fn
|
||||
21
src/axolotl/prompt_strategies/dpo/zephyr.py
Normal file
21
src/axolotl/prompt_strategies/dpo/zephyr.py
Normal file
@@ -0,0 +1,21 @@
|
||||
"""
|
||||
DPO strategies for zephyr
|
||||
"""
|
||||
|
||||
|
||||
def nectar(cfg): # pylint: disable=possibly-unused-variable,unused-argument
|
||||
def transform_fn(sample):
|
||||
data = {}
|
||||
data["prompt"] = (
|
||||
"<|system|>\n</s>\n"
|
||||
"<|user|>\n"
|
||||
f"{sample['prompt']}</s>\n"
|
||||
"<|assistant|>\n"
|
||||
)
|
||||
answers = sorted(sample["answers"], key=lambda x: x["rank"])
|
||||
data["chosen"] = answers[-1]["answer"]
|
||||
data["rejected"] = answers[-2]["answer"]
|
||||
|
||||
return data
|
||||
|
||||
return transform_fn
|
||||
@@ -51,7 +51,7 @@ class AlpacaPrompter(Prompter):
|
||||
self.turn_no_input_format = (
|
||||
"### Instruction:\n{instruction}\n\n### Response:\n"
|
||||
)
|
||||
self.system_format = "### System:\n{system}\n\n"
|
||||
self.system_format = "{system}\n\n"
|
||||
if self.prompt_style == PromptStyle.CHAT.value:
|
||||
self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
|
||||
self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
|
||||
|
||||
@@ -15,7 +15,7 @@ from optimum.bettertransformer import BetterTransformer
|
||||
from peft import PeftModel
|
||||
from pkg_resources import get_distribution # type: ignore
|
||||
from transformers import PreTrainedModel, PreTrainedTokenizer
|
||||
from transformers.deepspeed import is_deepspeed_zero3_enabled
|
||||
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
|
||||
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.logging_config import configure_logging
|
||||
@@ -96,7 +96,12 @@ def train(
|
||||
freeze_parameters_except(model, cfg.unfrozen_parameters)
|
||||
|
||||
trainer = setup_trainer(
|
||||
cfg, train_dataset, eval_dataset, (model, model_ref), tokenizer, total_num_steps
|
||||
cfg,
|
||||
train_dataset,
|
||||
eval_dataset,
|
||||
(model, model_ref, peft_config),
|
||||
tokenizer,
|
||||
total_num_steps,
|
||||
)
|
||||
|
||||
if hasattr(model, "config"):
|
||||
|
||||
@@ -20,7 +20,8 @@ def check_cuda_device(default_value):
|
||||
device = kwargs.get("device", args[0] if args else None)
|
||||
|
||||
if (
|
||||
not torch.cuda.is_available()
|
||||
device is None
|
||||
or not torch.cuda.is_available()
|
||||
or device == "auto"
|
||||
or torch.device(device).type == "cpu"
|
||||
):
|
||||
|
||||
@@ -9,6 +9,7 @@ from tempfile import NamedTemporaryFile
|
||||
from typing import TYPE_CHECKING, Dict, List
|
||||
|
||||
import evaluate
|
||||
import mlflow
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import torch
|
||||
@@ -575,3 +576,31 @@ class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
|
||||
except (FileNotFoundError, ConnectionError) as err:
|
||||
LOG.warning(f"Error while saving Axolotl config to WandB: {err}")
|
||||
return control
|
||||
|
||||
|
||||
class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
|
||||
"""Callback to save axolotl config to mlflow"""
|
||||
|
||||
def __init__(self, axolotl_config_path):
|
||||
self.axolotl_config_path = axolotl_config_path
|
||||
|
||||
def on_train_begin(
|
||||
self,
|
||||
args: AxolotlTrainingArguments, # pylint: disable=unused-argument
|
||||
state: TrainerState, # pylint: disable=unused-argument
|
||||
control: TrainerControl,
|
||||
**kwargs, # pylint: disable=unused-argument
|
||||
):
|
||||
if is_main_process():
|
||||
try:
|
||||
with NamedTemporaryFile(
|
||||
mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
|
||||
) as temp_file:
|
||||
copyfile(self.axolotl_config_path, temp_file.name)
|
||||
mlflow.log_artifact(temp_file.name, artifact_path="")
|
||||
LOG.info(
|
||||
"The Axolotl config has been saved to the MLflow artifacts."
|
||||
)
|
||||
except (FileNotFoundError, ConnectionError) as err:
|
||||
LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
|
||||
return control
|
||||
|
||||
@@ -152,6 +152,33 @@ class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
return super().__call__(features, return_tensors=return_tensors)
|
||||
|
||||
|
||||
@dataclass
|
||||
class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
||||
"""
|
||||
Collator for multipack specific to the using the BatchSampler
|
||||
"""
|
||||
|
||||
def __call__(self, features, return_tensors=None):
|
||||
chunked_data = {}
|
||||
for feature in features[0].keys():
|
||||
if feature == "length":
|
||||
continue
|
||||
if feature == "attention_mask":
|
||||
arrays = [
|
||||
(i + 1) * np.array(item[feature])
|
||||
for i, item in enumerate(features)
|
||||
if feature in item
|
||||
]
|
||||
chunked_data[feature] = np.concatenate(arrays)
|
||||
else:
|
||||
arrays = [
|
||||
np.array(item[feature]) for item in features if feature in item
|
||||
]
|
||||
chunked_data[feature] = np.concatenate(arrays)
|
||||
features = [chunked_data]
|
||||
return super().__call__(features, return_tensors=return_tensors)
|
||||
|
||||
|
||||
@dataclass
|
||||
class MambaDataCollator:
|
||||
"""
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
"""Module for working with config dicts"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import torch
|
||||
from transformers.utils import is_torch_bf16_gpu_available
|
||||
|
||||
from axolotl.utils.bench import log_gpu_memory_usage
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_model_config
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
@@ -68,6 +70,8 @@ def normalize_config(cfg):
|
||||
else:
|
||||
LOG.debug("bf16 support not detected, disabling for this configuration.")
|
||||
cfg.bf16 = False
|
||||
if cfg.fp16 is None:
|
||||
cfg.fp16 = True
|
||||
|
||||
if cfg.device == "mps":
|
||||
cfg.load_in_8bit = False
|
||||
@@ -77,6 +81,8 @@ def normalize_config(cfg):
|
||||
cfg.bf16 = False
|
||||
else:
|
||||
torch.backends.cuda.matmul.allow_tf32 = cfg.tf32 or False
|
||||
if cfg.bf16:
|
||||
cfg.fp16 = False
|
||||
|
||||
if cfg.bf16 or cfg.bfloat16:
|
||||
cfg.torch_dtype = torch.bfloat16
|
||||
@@ -135,26 +141,31 @@ def normalize_config(cfg):
|
||||
]
|
||||
)
|
||||
or cfg.is_mistral_derived_model
|
||||
or "mistral" in cfg.base_model.lower()
|
||||
or "mistral" in cfg.base_model.lower().split("/")[-1]
|
||||
or (cfg.model_type and "mistral" in cfg.model_type.lower())
|
||||
)
|
||||
|
||||
cfg.is_qwen_derived_model = (
|
||||
(
|
||||
hasattr(model_config, "model_type")
|
||||
and model_config.model_type
|
||||
in [
|
||||
"qwen",
|
||||
]
|
||||
)
|
||||
or cfg.is_qwen_derived_model
|
||||
or "qwen" in cfg.base_model.lower()
|
||||
or (cfg.model_type and "qwen" in cfg.model_type.lower())
|
||||
)
|
||||
hasattr(model_config, "model_type")
|
||||
and model_config.model_type
|
||||
in [
|
||||
"qwen",
|
||||
]
|
||||
) or cfg.is_qwen_derived_model
|
||||
|
||||
if isinstance(cfg.learning_rate, str):
|
||||
cfg.learning_rate = float(cfg.learning_rate)
|
||||
|
||||
if isinstance(cfg.pretraining_dataset, dict):
|
||||
cfg.pretraining_dataset = [cfg.pretraining_dataset]
|
||||
|
||||
if (
|
||||
cfg.gradient_checkpointing
|
||||
and cfg.unfrozen_parameters is None
|
||||
and cfg.gradient_checkpointing_kwargs is None
|
||||
):
|
||||
cfg.gradient_checkpointing_kwargs = {"use_reentrant": True}
|
||||
|
||||
log_gpu_memory_usage(LOG, "baseline", cfg.device)
|
||||
|
||||
|
||||
@@ -182,22 +193,19 @@ def validate_config(cfg):
|
||||
if not cfg.bf16 and not cfg.bfloat16:
|
||||
LOG.info("bf16 support detected, but not enabled for this configuration.")
|
||||
else:
|
||||
if not cfg.merge_lora and (cfg.bf16 or cfg.bfloat16):
|
||||
if (
|
||||
not cfg.merge_lora
|
||||
and not cfg.is_preprocess
|
||||
and (cfg.bf16 is True or cfg.bfloat16 is True)
|
||||
):
|
||||
raise ValueError(
|
||||
"bf16 requested, but AMP is not supported on this GPU. Requires Ampere series or above."
|
||||
)
|
||||
if cfg.max_packed_sequence_len and cfg.sample_packing:
|
||||
raise ValueError(
|
||||
"please set only one of max_packed_sequence_len (deprecated soon) or sample_packing"
|
||||
)
|
||||
if cfg.max_packed_sequence_len:
|
||||
LOG.warning(
|
||||
str(
|
||||
PendingDeprecationWarning(
|
||||
"max_packed_sequence_len will be deprecated in favor of sample_packing"
|
||||
)
|
||||
)
|
||||
)
|
||||
raise DeprecationWarning("`max_packed_sequence_len` is no longer supported")
|
||||
|
||||
if cfg.sample_packing and cfg.rl:
|
||||
raise ValueError("`sample_packing: true` does not work with RLHF training")
|
||||
|
||||
if cfg.sample_packing and not cfg.pad_to_sequence_len:
|
||||
LOG.warning(
|
||||
@@ -359,20 +367,6 @@ def validate_config(cfg):
|
||||
"`early_stopping_patience` requires that eval_steps should evenly divide save_steps."
|
||||
)
|
||||
|
||||
if cfg.model_type == "MixFormerSequentialForCausalLM" and cfg.adapter is not None:
|
||||
LOG.warning("Use AutoModelForCausalLM for phi/MixFormer models with qLoRA")
|
||||
|
||||
if cfg.model_config_type == "mixformer-sequential":
|
||||
if cfg.sample_packing:
|
||||
if cfg.adapter is not None:
|
||||
LOG.warning(
|
||||
"phi/MixFormer models are not currently compatible with LoRA and sample_packing"
|
||||
)
|
||||
if cfg.model_type == "AutoModelForCausalLM":
|
||||
raise ValueError(
|
||||
"`model_type: MixFormerSequentialForCausalLM` required for sample_packing"
|
||||
)
|
||||
|
||||
if cfg.datasets:
|
||||
for idx, ds_cfg in enumerate(cfg.datasets):
|
||||
if not ds_cfg.type:
|
||||
@@ -480,6 +474,45 @@ def validate_config(cfg):
|
||||
"max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
|
||||
)
|
||||
|
||||
if (
|
||||
cfg.unfrozen_parameters
|
||||
and cfg.gradient_checkpointing_kwargs
|
||||
and cfg.gradient_checkpointing_kwargs.use_reentrant is True
|
||||
):
|
||||
# https://github.com/huggingface/transformers/issues/21381
|
||||
raise ValueError(
|
||||
"`use_reentrant` must be false when used with partially frozen model."
|
||||
)
|
||||
|
||||
if cfg.flash_attention and cfg.deepspeed and Path(cfg.deepspeed).is_file():
|
||||
with open(cfg.deepspeed, encoding="utf-8") as file:
|
||||
contents = file.read()
|
||||
deepspeed_cfg: DictDefault = DictDefault(json.loads(contents))
|
||||
if (
|
||||
deepspeed_cfg.zero_optimization
|
||||
and deepspeed_cfg.zero_optimization.stage == 3
|
||||
):
|
||||
if not (
|
||||
(
|
||||
deepspeed_cfg.bf16
|
||||
and deepspeed_cfg.bf16.enabled # pylint: disable=no-member
|
||||
is True
|
||||
)
|
||||
or (
|
||||
deepspeed_cfg.fp16
|
||||
and deepspeed_cfg.fp16.enabled # pylint: disable=no-member
|
||||
is True
|
||||
)
|
||||
):
|
||||
raise ValueError(
|
||||
"bf16.enabled or fp16.enabled must be set to true when using ZeRO-3 with flash-attention"
|
||||
)
|
||||
|
||||
if cfg.test_datasets and cfg.val_set_size:
|
||||
raise ValueError(
|
||||
"non-zero val_set_size should not be used with test_datasets configuration"
|
||||
)
|
||||
|
||||
# TODO
|
||||
# MPT 7b
|
||||
# https://github.com/facebookresearch/bitsandbytes/issues/25
|
||||
|
||||
@@ -4,9 +4,10 @@ import hashlib
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Tuple, Union
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import yaml
|
||||
from datasets import (
|
||||
Dataset,
|
||||
DatasetDict,
|
||||
@@ -19,8 +20,9 @@ from torch.utils.data import RandomSampler
|
||||
from transformers import PreTrainedTokenizerBase
|
||||
|
||||
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
||||
from axolotl.datasets import ConstantLengthDataset, TokenizedPromptDataset
|
||||
from axolotl.datasets import TokenizedPromptDataset
|
||||
from axolotl.prompt_strategies import load
|
||||
from axolotl.prompt_strategies.dpo import load as load_dpo
|
||||
from axolotl.prompt_tokenizers import (
|
||||
AlpacaMultipleChoicePromptTokenizingStrategy,
|
||||
AlpacaPromptTokenizingStrategy,
|
||||
@@ -65,15 +67,25 @@ def prepare_dataset(cfg, tokenizer):
|
||||
prompters = []
|
||||
if not cfg.pretraining_dataset:
|
||||
with zero_first(is_main_process()):
|
||||
train_dataset, eval_dataset, prompters = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||
)
|
||||
if cfg.test_datasets:
|
||||
train_dataset, _, prompters = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH, split="train"
|
||||
)
|
||||
_, eval_dataset, _ = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH, split="test"
|
||||
)
|
||||
else:
|
||||
train_dataset, eval_dataset, prompters = load_prepare_datasets(
|
||||
tokenizer, cfg, DEFAULT_DATASET_PREPARED_PATH
|
||||
)
|
||||
else:
|
||||
path = cfg.pretraining_dataset
|
||||
name = None
|
||||
if isinstance(cfg.pretraining_dataset, dict):
|
||||
path = cfg.pretraining_dataset["path"]
|
||||
name = cfg.pretraining_dataset["name"]
|
||||
if isinstance(cfg.pretraining_dataset, list) and isinstance(
|
||||
cfg.pretraining_dataset[0], dict
|
||||
):
|
||||
path = cfg.pretraining_dataset[0]["path"]
|
||||
name = cfg.pretraining_dataset[0]["name"]
|
||||
|
||||
train_dataset = load_pretraining_dataset(
|
||||
path,
|
||||
@@ -88,11 +100,6 @@ def prepare_dataset(cfg, tokenizer):
|
||||
eval_dataset = None
|
||||
return train_dataset, eval_dataset, cfg.max_steps, prompters
|
||||
|
||||
with zero_first(is_main_process()):
|
||||
train_dataset, eval_dataset = process_datasets_for_packing(
|
||||
cfg, train_dataset, eval_dataset, tokenizer
|
||||
)
|
||||
|
||||
if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
|
||||
total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
|
||||
if total_eval_steps == 0:
|
||||
@@ -111,19 +118,29 @@ def prepare_dataset(cfg, tokenizer):
|
||||
|
||||
|
||||
def load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
tokenizer,
|
||||
cfg,
|
||||
default_dataset_prepared_path,
|
||||
split="train",
|
||||
) -> Tuple[DatasetDict, List[Prompter]]:
|
||||
cfg_datasets = cfg.test_datasets if split == "test" else cfg.datasets
|
||||
tokenizer_name = tokenizer.__class__.__name__
|
||||
ds_hash = str(
|
||||
md5(
|
||||
(
|
||||
str(cfg.sequence_len)
|
||||
+ "@"
|
||||
+ str(cfg.sample_packing)
|
||||
+ "@"
|
||||
+ str(cfg.eval_sample_packing)
|
||||
+ "@"
|
||||
+ str(cfg.group_by_length)
|
||||
+ "@"
|
||||
+ "|".join(
|
||||
sorted(
|
||||
[
|
||||
f"{d.path}:{d.type}:{d.shards}:{d.conversation}"
|
||||
for d in cfg.datasets
|
||||
for d in cfg_datasets
|
||||
]
|
||||
)
|
||||
)
|
||||
@@ -146,19 +163,27 @@ def load_tokenized_prepared_datasets(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}",
|
||||
token=use_auth_token,
|
||||
)
|
||||
dataset = dataset["train"]
|
||||
dataset = dataset[split]
|
||||
except Exception: # pylint: disable=broad-except # nosec
|
||||
pass
|
||||
|
||||
if dataset:
|
||||
...
|
||||
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
||||
elif (
|
||||
cfg.dataset_prepared_path
|
||||
and any(prepared_ds_path.glob("*"))
|
||||
and not cfg.is_preprocess
|
||||
):
|
||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||
dataset = load_from_disk(str(prepared_ds_path))
|
||||
LOG.info("Prepared dataset loaded from disk...")
|
||||
else:
|
||||
LOG.info(f"Unable to find prepared dataset in {prepared_ds_path}")
|
||||
LOG.info("Loading raw datasets...")
|
||||
if not cfg.is_preprocess:
|
||||
LOG.warning(
|
||||
"Processing datasets during training can lead to VRAM instability. Please pre-process your dataset."
|
||||
)
|
||||
|
||||
if cfg.seed:
|
||||
seed = cfg.seed
|
||||
@@ -177,8 +202,8 @@ def load_tokenized_prepared_datasets(
|
||||
yield dataset
|
||||
|
||||
# pylint: disable=invalid-name
|
||||
for config_dataset in for_d_in_datasets(cfg.datasets):
|
||||
ds: Union[Dataset, DatasetDict] = None
|
||||
for config_dataset in for_d_in_datasets(cfg_datasets):
|
||||
ds: Optional[Union[Dataset, DatasetDict]] = None
|
||||
ds_from_hub = False
|
||||
try:
|
||||
load_dataset(
|
||||
@@ -331,16 +356,6 @@ def load_tokenized_prepared_datasets(
|
||||
)
|
||||
if not ds:
|
||||
raise ValueError("unhandled dataset load")
|
||||
# support for using a subset of the data
|
||||
if config_dataset.shards:
|
||||
if "train" in ds:
|
||||
ds = ds.shuffle(seed=seed)["train"].shard(
|
||||
num_shards=config_dataset.shards, index=0
|
||||
)
|
||||
else:
|
||||
ds = ds.shuffle(seed=seed).shard(
|
||||
num_shards=config_dataset.shards, index=0
|
||||
)
|
||||
|
||||
d_base_type = d_prompt_style = None
|
||||
d_type = config_dataset.type
|
||||
@@ -348,17 +363,21 @@ def load_tokenized_prepared_datasets(
|
||||
d_type_split = d_type.split(":")
|
||||
d_base_type = d_type_split[0]
|
||||
d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None
|
||||
if "train" in ds:
|
||||
ds = ds["train"]
|
||||
elif (
|
||||
isinstance(ds, DatasetDict)
|
||||
and config_dataset.train_on_split
|
||||
and config_dataset.train_on_split in ds
|
||||
):
|
||||
ds = ds[config_dataset.train_on_split]
|
||||
|
||||
if config_dataset.split and config_dataset.split in ds:
|
||||
ds = ds[config_dataset.split]
|
||||
elif split in ds:
|
||||
ds = ds[split]
|
||||
elif isinstance(ds, DatasetDict):
|
||||
raise ValueError(
|
||||
f"no train split found for dataset {config_dataset.path}, you may specify a split with 'train_on_split: `"
|
||||
f"no {split} split found for dataset {config_dataset.path}, you may specify a split with 'split: `"
|
||||
)
|
||||
|
||||
# support for using a subset of the data
|
||||
if config_dataset.shards:
|
||||
shards_idx = config_dataset.get("shards_idx", 0)
|
||||
ds = ds.shuffle(seed=seed).shard(
|
||||
num_shards=config_dataset.shards, index=shards_idx
|
||||
)
|
||||
|
||||
dataset_wrapper, dataset_prompter = get_dataset_wrapper(
|
||||
@@ -378,6 +397,9 @@ def load_tokenized_prepared_datasets(
|
||||
if len(datasets) > 1:
|
||||
LOG.info("shuffle merged datasets")
|
||||
dataset = dataset.shuffle(seed=seed)
|
||||
|
||||
dataset, _ = process_datasets_for_packing(cfg, dataset, None)
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
LOG.info(f"Saving merged prepared dataset to disk... {prepared_ds_path}")
|
||||
dataset.save_to_disk(prepared_ds_path)
|
||||
@@ -414,116 +436,11 @@ def load_prepare_datasets(
|
||||
tokenizer: PreTrainedTokenizerBase,
|
||||
cfg,
|
||||
default_dataset_prepared_path,
|
||||
split="train",
|
||||
) -> Tuple[Dataset, Dataset, List[Prompter]]:
|
||||
max_packed_sequence_len = (
|
||||
cfg.max_packed_sequence_len if cfg.max_packed_sequence_len else cfg.sequence_len
|
||||
dataset, prompters = load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
max_packed_sequence_len = min(
|
||||
max_packed_sequence_len, cfg.sequence_len
|
||||
) # make sure we don't accidentally set it larger than sequence_len
|
||||
|
||||
tokenizer_name = tokenizer.__class__.__name__
|
||||
prompters: List[Prompter] = []
|
||||
if cfg.max_packed_sequence_len is not None:
|
||||
# see if we can go ahead and load the stacked dataset
|
||||
seed = f"@{str(cfg.seed)}" if cfg.seed else ""
|
||||
ds_hash = str(
|
||||
md5(
|
||||
(
|
||||
str(cfg.sequence_len)
|
||||
+ "@"
|
||||
+ str(max_packed_sequence_len)
|
||||
+ seed
|
||||
+ "|".join(
|
||||
sorted([f"{d.path}:{d.type}:{d.shards}" for d in cfg.datasets])
|
||||
)
|
||||
+ "|"
|
||||
+ tokenizer_name
|
||||
)
|
||||
)
|
||||
)
|
||||
prepared_ds_path = (
|
||||
Path(cfg.dataset_prepared_path) / ds_hash
|
||||
if cfg.dataset_prepared_path
|
||||
else Path(default_dataset_prepared_path) / ds_hash
|
||||
)
|
||||
|
||||
dataset = None
|
||||
use_auth_token = cfg.hf_use_auth_token
|
||||
try:
|
||||
if cfg.push_dataset_to_hub:
|
||||
LOG.info(
|
||||
f"Checking for packed prepared dataset from hub... {cfg.push_dataset_to_hub}/{ds_hash}"
|
||||
)
|
||||
dataset = load_dataset(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}",
|
||||
token=use_auth_token,
|
||||
)
|
||||
dataset = dataset["train"]
|
||||
except Exception: # pylint: disable=broad-except # nosec
|
||||
pass
|
||||
|
||||
if dataset:
|
||||
...
|
||||
elif cfg.dataset_prepared_path and any(prepared_ds_path.glob("*")):
|
||||
LOG.info(
|
||||
f"Loading prepared packed dataset from disk at {prepared_ds_path}..."
|
||||
)
|
||||
dataset = load_from_disk(str(prepared_ds_path))
|
||||
LOG.info("Prepared packed dataset loaded from disk...")
|
||||
if cfg.push_dataset_to_hub:
|
||||
LOG.info(
|
||||
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
|
||||
)
|
||||
dataset.push_to_hub(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}", private=True
|
||||
)
|
||||
else:
|
||||
dataset, prompters = load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
|
||||
if cfg.seed:
|
||||
dataset = dataset.shuffle(seed=cfg.seed)
|
||||
|
||||
constant_len_dataset = ConstantLengthDataset(
|
||||
tokenizer,
|
||||
[dataset],
|
||||
seq_length=max_packed_sequence_len,
|
||||
)
|
||||
LOG.info(f"packing master dataset to len: {cfg.max_packed_sequence_len}")
|
||||
dataset = Dataset.from_list(list(constant_len_dataset))
|
||||
|
||||
# filter out bad data
|
||||
# TODO convert to dataset.filter(...)
|
||||
dataset = Dataset.from_list(
|
||||
[
|
||||
d
|
||||
for d in dataset
|
||||
if len(d["input_ids"]) <= cfg.sequence_len
|
||||
and len(d["input_ids"]) > 0
|
||||
and len(d["input_ids"]) == len(d["attention_mask"])
|
||||
and len(d["input_ids"]) == len(d["labels"])
|
||||
]
|
||||
)
|
||||
|
||||
if cfg.local_rank == 0:
|
||||
LOG.info(
|
||||
f"Saving packed prepared dataset to disk... {prepared_ds_path}"
|
||||
)
|
||||
dataset.save_to_disk(prepared_ds_path)
|
||||
if cfg.push_dataset_to_hub:
|
||||
LOG.info(
|
||||
f"Saving packed prepared dataset with push_to_hub... {cfg.push_dataset_to_hub}/{ds_hash}"
|
||||
)
|
||||
dataset.push_to_hub(
|
||||
f"{cfg.push_dataset_to_hub}/{ds_hash}",
|
||||
private=True,
|
||||
)
|
||||
else:
|
||||
dataset, prompters = load_tokenized_prepared_datasets(
|
||||
tokenizer, cfg, default_dataset_prepared_path
|
||||
)
|
||||
|
||||
if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
|
||||
LOG.info(
|
||||
@@ -534,7 +451,7 @@ def load_prepare_datasets(
|
||||
index=cfg.dataset_shard_idx,
|
||||
)
|
||||
|
||||
if cfg.val_set_size:
|
||||
if split == "train" and cfg.val_set_size:
|
||||
# ensure we end up with the same fingerprint by doing rank0 first and being able to cache
|
||||
to_hash_train = (
|
||||
dataset._fingerprint # pylint: disable=protected-access
|
||||
@@ -567,6 +484,9 @@ def load_prepare_datasets(
|
||||
|
||||
train_dataset = dataset["train"]
|
||||
eval_dataset = dataset["test"]
|
||||
elif split == "test":
|
||||
train_dataset = None
|
||||
eval_dataset = dataset
|
||||
else:
|
||||
train_dataset = dataset
|
||||
eval_dataset = None
|
||||
@@ -580,6 +500,11 @@ def get_dataset_wrapper(
|
||||
dataset_wrapper = None
|
||||
dataset_prompter = None
|
||||
|
||||
ds_kwargs = {
|
||||
"process_count": cfg.dataset_processes,
|
||||
"keep_in_memory": cfg.dataset_keep_in_memory is True,
|
||||
}
|
||||
|
||||
if (
|
||||
"input_ids" in dataset.features
|
||||
and "attention_mask" in dataset.features
|
||||
@@ -594,12 +519,16 @@ def get_dataset_wrapper(
|
||||
)
|
||||
dataset_prompter = UnsupportedPrompter()
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
elif ds_strategy := load(config_dataset.type, tokenizer, cfg, config_dataset):
|
||||
dataset_prompter = UnsupportedPrompter()
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
elif d_base_type == "alpaca":
|
||||
dataset_prompter = AlpacaPrompter(d_prompt_style)
|
||||
@@ -610,7 +539,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "explainchoice":
|
||||
@@ -622,7 +553,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "concisechoice":
|
||||
@@ -634,7 +567,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "summarizetldr":
|
||||
@@ -646,7 +581,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "jeopardy":
|
||||
@@ -658,7 +595,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "oasst":
|
||||
@@ -670,7 +609,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "gpteacher":
|
||||
@@ -682,7 +623,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
elif d_base_type == "reflection":
|
||||
@@ -694,7 +637,9 @@ def get_dataset_wrapper(
|
||||
cfg.sequence_len,
|
||||
)
|
||||
ds_wrapper = TokenizedPromptDataset(
|
||||
ds_strategy, dataset, process_count=cfg.dataset_processes
|
||||
ds_strategy,
|
||||
dataset,
|
||||
**ds_kwargs,
|
||||
)
|
||||
dataset_wrapper = ds_wrapper
|
||||
else:
|
||||
@@ -844,10 +789,12 @@ def load_pretraining_dataset(path, tokenizer, cfg, name=None, max_tokens=2048, s
|
||||
dataset = dataset.map(
|
||||
encode,
|
||||
batched=True,
|
||||
batch_size=10_000,
|
||||
input_columns="text",
|
||||
# remove all the existing columns after mapping since they end up having
|
||||
# a different length than the encoded/tokenized column
|
||||
remove_columns=dataset.features.keys(),
|
||||
desc="Encoding Pretraining",
|
||||
)
|
||||
return dataset
|
||||
|
||||
@@ -905,3 +852,98 @@ def encode_packed_pretraining(
|
||||
chunked_data[feature].append(collated_features[feature].squeeze(0))
|
||||
|
||||
return chunked_data
|
||||
|
||||
|
||||
def _get_path(ds_hash, cfg):
|
||||
prepared_ds_path = (
|
||||
Path(cfg.dataset_prepared_path) / ds_hash
|
||||
if cfg.dataset_prepared_path
|
||||
else Path(DEFAULT_DATASET_PREPARED_PATH) / ds_hash
|
||||
)
|
||||
|
||||
return prepared_ds_path
|
||||
|
||||
|
||||
def _load_preprocessed_ds(cfg, sub_cfg):
|
||||
ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
|
||||
prepared_ds_path = _get_path(ds_hash, cfg)
|
||||
dataset = None
|
||||
|
||||
if (
|
||||
cfg.dataset_prepared_path
|
||||
and any(prepared_ds_path.glob("*"))
|
||||
and not cfg.is_preprocess
|
||||
):
|
||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||
dataset = load_from_disk(str(prepared_ds_path))
|
||||
|
||||
return dataset
|
||||
|
||||
|
||||
def _save_preprocessed_ds(cfg, sub_cfg, dataset):
|
||||
ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
|
||||
prepared_ds_path = _get_path(ds_hash, cfg)
|
||||
|
||||
if cfg.is_preprocess and is_main_process():
|
||||
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||
dataset.save_to_disk(str(prepared_ds_path))
|
||||
|
||||
|
||||
def load_prepare_dpo_datasets(cfg):
|
||||
def load_split(dataset_cfgs, _cfg):
|
||||
split_datasets: List[Any] = []
|
||||
for i, ds_cfg in enumerate(dataset_cfgs):
|
||||
if ds_cfg["ds_type"] == "json":
|
||||
for data_file in ds_cfg["data_files"]:
|
||||
data_files = {ds_cfg["split"]: data_file}
|
||||
ds = load_dataset( # pylint: disable=invalid-name
|
||||
"json",
|
||||
data_files=data_files,
|
||||
split=ds_cfg["split"],
|
||||
)
|
||||
split_datasets.insert(i, ds)
|
||||
else:
|
||||
ds = load_dataset( # pylint: disable=invalid-name
|
||||
ds_cfg["path"],
|
||||
split=ds_cfg["split"],
|
||||
)
|
||||
split_datasets.insert(i, ds)
|
||||
|
||||
for i, data_set in enumerate(split_datasets):
|
||||
_type = dataset_cfgs[i]["type"]
|
||||
if _type:
|
||||
ds_transform_fn = load_dpo(_type, _cfg)
|
||||
split_datasets[i] = data_set.map(
|
||||
ds_transform_fn,
|
||||
desc="Mapping RL Dataset",
|
||||
)
|
||||
else:
|
||||
# If no `type` is provided, assume the dataset is already in the expected format with
|
||||
# "prompt", "chosen" and "rejected" already preprocessed
|
||||
split_datasets[i] = data_set
|
||||
|
||||
return concatenate_datasets(split_datasets)
|
||||
|
||||
with zero_first(is_main_process()):
|
||||
train_is_preprocessed = False
|
||||
eval_is_preprocessed = False
|
||||
if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
|
||||
train_is_preprocessed = True
|
||||
else:
|
||||
train_dataset = load_split(cfg.datasets, cfg)
|
||||
|
||||
eval_dataset = None
|
||||
if cfg.test_datasets:
|
||||
if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
|
||||
eval_is_preprocessed = True
|
||||
else:
|
||||
eval_dataset = load_split(cfg.test_datasets, cfg)
|
||||
if not eval_dataset:
|
||||
eval_dataset = None
|
||||
|
||||
if not train_is_preprocessed:
|
||||
_save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
|
||||
if eval_dataset and not eval_is_preprocessed:
|
||||
_save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
|
||||
|
||||
return train_dataset, eval_dataset
|
||||
|
||||
@@ -7,8 +7,8 @@ def get_linear_embedding_layers(model_type):
|
||||
"""
|
||||
returns the linear embedding layers needed for loras, dependent on the model arch
|
||||
"""
|
||||
if model_type == "phi-msft":
|
||||
return ["embd.wte", "lm_head.linear"]
|
||||
if model_type == "gpt_neox":
|
||||
return ["embed_in", "embed_out"]
|
||||
if model_type == "falcon":
|
||||
return ["word_embeddings", "lm_head"]
|
||||
return ["embed_tokens", "lm_head"]
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
import logging
|
||||
import math
|
||||
import os
|
||||
from typing import Any, Optional, Tuple, Union # noqa: F401
|
||||
from typing import Any, Dict, Optional, Tuple, Union # noqa: F401
|
||||
|
||||
import addict
|
||||
import bitsandbytes as bnb
|
||||
@@ -21,7 +21,7 @@ from transformers import ( # noqa: F401
|
||||
PreTrainedModel,
|
||||
PreTrainedTokenizerBase,
|
||||
)
|
||||
from transformers.deepspeed import is_deepspeed_zero3_enabled
|
||||
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
|
||||
|
||||
from axolotl.models.mamba import fix_mamba_attn_for_loss
|
||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
||||
@@ -76,10 +76,15 @@ def load_model_config(cfg):
|
||||
if not model_config_name and cfg.tokenizer_config:
|
||||
model_config_name = cfg.tokenizer_config
|
||||
trust_remote_code = cfg.trust_remote_code is True
|
||||
config_kwargs = {}
|
||||
if cfg.model_revision:
|
||||
config_kwargs["revision"] = cfg.model_revision
|
||||
|
||||
try:
|
||||
model_config = AutoConfig.from_pretrained(
|
||||
model_config_name, trust_remote_code=trust_remote_code
|
||||
model_config_name,
|
||||
trust_remote_code=trust_remote_code,
|
||||
**config_kwargs,
|
||||
)
|
||||
except ValueError as err:
|
||||
if "mamba" in model_config_name:
|
||||
@@ -164,6 +169,7 @@ def load_tokenizer(cfg):
|
||||
# pylint: disable=too-many-boolean-expressions
|
||||
if (
|
||||
(getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
|
||||
and (len(tokenizer.encode(val)) > 1)
|
||||
and cfg.adapter
|
||||
and (
|
||||
not cfg.lora_modules_to_save
|
||||
@@ -256,37 +262,65 @@ def load_model(
|
||||
|
||||
replace_stablelm_attn_with_flash_attn(cfg.base_model)
|
||||
|
||||
if cfg.is_llama_derived_model and cfg.flash_attention and cfg.sample_packing:
|
||||
if cfg.device not in ["mps", "cpu"] and not inference:
|
||||
if cfg.sample_packing and cfg.s2_attention:
|
||||
raise ValueError(
|
||||
"Received `sample_packing=true` and `s2_attention=true`; however, \
|
||||
shifted-sparse attention does not currently support sample packing."
|
||||
)
|
||||
|
||||
# Modify all llama derived models in one block
|
||||
if cfg.is_llama_derived_model:
|
||||
if cfg.flash_attention:
|
||||
from axolotl.monkeypatch.llama_attn_hijack_flash import (
|
||||
replace_llama_attn_with_flash_attn,
|
||||
)
|
||||
|
||||
LOG.info("patching with flash attention for sample packing")
|
||||
replace_llama_attn_with_flash_attn(
|
||||
packed=cfg.sample_packing,
|
||||
cross_entropy=cfg.flash_attn_cross_entropy,
|
||||
rms_norm=cfg.flash_attn_rms_norm,
|
||||
if cfg.sample_packing:
|
||||
if cfg.device not in ["mps", "cpu"] and not inference:
|
||||
LOG.info("patching with flash attention for sample packing")
|
||||
replace_llama_attn_with_flash_attn(
|
||||
packed=True,
|
||||
cross_entropy=cfg.flash_attn_cross_entropy,
|
||||
rms_norm=cfg.flash_attn_rms_norm,
|
||||
)
|
||||
elif cfg.s2_attention:
|
||||
LOG.info("patching w/ flash-enabled, shifted-sparse attention")
|
||||
replace_llama_attn_with_flash_attn(
|
||||
packed=False,
|
||||
cross_entropy=cfg.flash_attn_cross_entropy,
|
||||
rms_norm=cfg.flash_attn_rms_norm,
|
||||
use_shifted_sparse_attn=True,
|
||||
)
|
||||
elif cfg.xformers_attention:
|
||||
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
|
||||
hijack_llama_attention,
|
||||
)
|
||||
elif cfg.is_llama_derived_model and cfg.xformers_attention:
|
||||
from axolotl.monkeypatch.llama_attn_hijack_xformers import (
|
||||
hijack_llama_attention,
|
||||
)
|
||||
|
||||
LOG.info("patching with xformers attention")
|
||||
hijack_llama_attention()
|
||||
elif cfg.is_llama_derived_model and cfg.sdp_attention:
|
||||
from axolotl.monkeypatch.llama_attn_hijack_sdp import hijack_llama_sdp_attention
|
||||
LOG.info("patching with xformers attention")
|
||||
hijack_llama_attention()
|
||||
elif cfg.sdp_attention:
|
||||
from axolotl.monkeypatch.llama_attn_hijack_sdp import (
|
||||
hijack_llama_sdp_attention,
|
||||
)
|
||||
|
||||
LOG.info("patching with sdp attention")
|
||||
hijack_llama_sdp_attention()
|
||||
LOG.info("patching with sdp attention")
|
||||
hijack_llama_sdp_attention()
|
||||
elif cfg.s2_attention:
|
||||
raise NotImplementedError(
|
||||
"Shifted-sparse attention not currently implemented without flash attention."
|
||||
)
|
||||
|
||||
if cfg.is_mistral_derived_model and cfg.flash_attention and cfg.sample_packing:
|
||||
# Modify mistral derived models
|
||||
if (
|
||||
cfg.model_config_type == "mistral"
|
||||
and cfg.flash_attention
|
||||
and cfg.sample_packing
|
||||
):
|
||||
from axolotl.monkeypatch.mistral_attn_hijack_flash import (
|
||||
replace_mistral_attn_with_flash_attn,
|
||||
)
|
||||
|
||||
LOG.info("patching with flash attention")
|
||||
LOG.info("patching mistral with flash attention")
|
||||
replace_mistral_attn_with_flash_attn(packed=cfg.sample_packing)
|
||||
|
||||
if (
|
||||
@@ -298,20 +332,45 @@ def load_model(
|
||||
replace_mixtral_attn_with_multipack_flash_attn,
|
||||
)
|
||||
|
||||
LOG.info("patching with flash attention")
|
||||
replace_mixtral_attn_with_multipack_flash_attn()
|
||||
LOG.info("patching mixtral with flash attention")
|
||||
mixtral_patch_kwargs = {}
|
||||
if is_deepspeed_zero3_enabled():
|
||||
mixtral_patch_kwargs["for_zero3"] = True
|
||||
replace_mixtral_attn_with_multipack_flash_attn(**mixtral_patch_kwargs)
|
||||
|
||||
if (
|
||||
cfg.is_llama_derived_model
|
||||
and (cfg.max_packed_sequence_len or cfg.sample_packing)
|
||||
and not inference
|
||||
):
|
||||
if cfg.model_config_type == "falcon" and cfg.flash_attention and cfg.sample_packing:
|
||||
from axolotl.monkeypatch.falcon import (
|
||||
replace_falcon_attn_with_multipack_flash_attn,
|
||||
)
|
||||
|
||||
LOG.info("patching falcon with flash attention")
|
||||
replace_falcon_attn_with_multipack_flash_attn()
|
||||
|
||||
if cfg.model_config_type == "phi" and cfg.flash_attention and cfg.sample_packing:
|
||||
from axolotl.monkeypatch.phi import replace_phi_attn_with_multipack_flash_attn
|
||||
|
||||
LOG.info("patching phi with flash attention")
|
||||
replace_phi_attn_with_multipack_flash_attn()
|
||||
|
||||
if cfg.model_config_type == "qwen2" and cfg.flash_attention and cfg.sample_packing:
|
||||
from axolotl.monkeypatch.qwen2 import (
|
||||
replace_qwen2_attn_with_multipack_flash_attn,
|
||||
)
|
||||
|
||||
LOG.info("patching qwen2 with flash attention")
|
||||
replace_qwen2_attn_with_multipack_flash_attn()
|
||||
|
||||
if cfg.is_llama_derived_model and cfg.sample_packing and not inference:
|
||||
from axolotl.monkeypatch.llama_expand_mask import hijack_expand_mask
|
||||
|
||||
LOG.info("patching _expand_mask")
|
||||
hijack_expand_mask()
|
||||
|
||||
model_kwargs = {}
|
||||
model_kwargs: Dict[str, Any] = {}
|
||||
|
||||
if cfg.model_kwargs:
|
||||
for key, val in model_kwargs.items():
|
||||
model_kwargs[key] = val
|
||||
|
||||
max_memory = cfg.max_memory
|
||||
device_map = cfg.device_map
|
||||
@@ -387,21 +446,19 @@ def load_model(
|
||||
model_kwargs["quantization_config"] = BitsAndBytesConfig(
|
||||
**bnb_config,
|
||||
)
|
||||
|
||||
# sample packing uses custom FA2 patch
|
||||
if cfg.flash_attention:
|
||||
if not cfg.sample_packing:
|
||||
if (
|
||||
cfg.is_llama_derived_model
|
||||
or cfg.is_falcon_derived_model
|
||||
or cfg.is_mistral_derived_model
|
||||
or model_config.model_type == "mixtral"
|
||||
):
|
||||
model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||
"flash_attention_2"
|
||||
)
|
||||
if cfg.s2_attention:
|
||||
pass
|
||||
# most other models support flash attention, we can define exceptions as they come up
|
||||
model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||
"flash_attention_2"
|
||||
)
|
||||
else:
|
||||
if model_config.model_type == "mixtral":
|
||||
if model_config.model_type in ["mixtral", "qwen2", "falcon", "phi"]:
|
||||
model_kwargs["attn_implementation"] = "flash_attention_2"
|
||||
model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||
"flash_attention_2"
|
||||
@@ -411,13 +468,13 @@ def load_model(
|
||||
model_config._attn_implementation = ( # pylint: disable=protected-access
|
||||
"eager"
|
||||
)
|
||||
if model_config.model_type == "phi-msft":
|
||||
model_config.flash_attn = True
|
||||
model_config.flash_rotary = True
|
||||
model_config.fused_dense = True
|
||||
|
||||
try:
|
||||
if cfg.is_llama_derived_model and not cfg.trust_remote_code and not cfg.gptq:
|
||||
if (
|
||||
model_config.model_type == "llama"
|
||||
and not cfg.trust_remote_code
|
||||
and not cfg.gptq
|
||||
):
|
||||
from transformers import LlamaForCausalLM
|
||||
|
||||
model = LlamaForCausalLM.from_pretrained(
|
||||
@@ -467,16 +524,6 @@ def load_model(
|
||||
# device=cfg.device,
|
||||
# )
|
||||
# model.train() # sets to train instead of eval mode
|
||||
elif model_type == "PhiForCausalLM" or model_config.model_type == "phi-msft":
|
||||
from axolotl.models.phi import PhiForCausalLM
|
||||
|
||||
model = PhiForCausalLM.from_pretrained(
|
||||
base_model,
|
||||
config=model_config,
|
||||
load_in_8bit=cfg.load_in_8bit and cfg.adapter is not None,
|
||||
load_in_4bit=cfg.load_in_4bit and cfg.adapter is not None,
|
||||
**model_kwargs,
|
||||
)
|
||||
elif model_type == "MambaLMHeadModel":
|
||||
# FIXME this is janky at best and hacked together to make it work
|
||||
MambaLMHeadModel = fix_mamba_attn_for_loss() # pylint: disable=invalid-name
|
||||
@@ -602,6 +649,12 @@ def load_model(
|
||||
needs_fa2_dtype = cfg.adapter or cfg.fsdp
|
||||
skip_prepare_model_for_kbit_training = False
|
||||
|
||||
if cfg.model_config_type == "mixtral" and is_deepspeed_zero3_enabled():
|
||||
from deepspeed.utils import set_z3_leaf_modules
|
||||
from transformers.models.mixtral.modeling_mixtral import MixtralSparseMoeBlock
|
||||
|
||||
set_z3_leaf_modules(model, [MixtralSparseMoeBlock])
|
||||
|
||||
if cfg.model_config_type == "qwen" and cfg.adapter == "lora":
|
||||
# Qwen doesn't play nicely with LoRA if this is enabled
|
||||
skip_prepare_model_for_kbit_training = True
|
||||
@@ -631,7 +684,12 @@ def load_model(
|
||||
|
||||
lora_config = None
|
||||
if not reference_model or cfg.lora_model_dir:
|
||||
model, lora_config = load_adapter(model, cfg, cfg.adapter)
|
||||
# if we're not loading the reference model, then we're loading the model for training
|
||||
# then the dpo trainer doesn't want the peft model loaded over it, it just wants the lora/peft config
|
||||
if cfg.adapter and cfg.rl in ["dpo", "ipo", "kto_pair"] and not cfg.merge_lora:
|
||||
_, lora_config = load_lora(model, cfg, inference=False, config_only=True)
|
||||
else:
|
||||
model, lora_config = load_adapter(model, cfg, cfg.adapter)
|
||||
|
||||
if cfg.ddp and not load_in_8bit and not (cfg.rl and cfg.load_in_4bit):
|
||||
model.to(f"cuda:{cfg.local_rank}")
|
||||
@@ -711,14 +769,16 @@ def find_all_linear_names(model):
|
||||
names = name.split(".")
|
||||
lora_module_names.add(names[0] if len(names) == 1 else names[-1])
|
||||
|
||||
if "lm_head" in lora_module_names: # needed for 16-bit
|
||||
lora_module_names.remove("lm_head")
|
||||
embedding_modules = get_linear_embedding_layers(model.config.model_type)
|
||||
output_embedding = embedding_modules[1]
|
||||
if output_embedding in lora_module_names: # needed for 16-bit
|
||||
lora_module_names.remove(output_embedding)
|
||||
|
||||
return list(lora_module_names)
|
||||
|
||||
|
||||
def load_lora(model, cfg, inference=False):
|
||||
# type: (PreTrainedModel, DictDefault, bool) -> Tuple[PreTrainedModel, Optional[PeftConfig]]
|
||||
def load_lora(model, cfg, inference=False, config_only=False):
|
||||
# type: (PreTrainedModel, DictDefault, bool, bool) -> Tuple[Optional[PreTrainedModel], Optional[PeftConfig]]
|
||||
|
||||
from peft import LoraConfig, PeftModel, get_peft_model
|
||||
|
||||
@@ -741,6 +801,9 @@ def load_lora(model, cfg, inference=False):
|
||||
task_type="CAUSAL_LM",
|
||||
)
|
||||
|
||||
if config_only:
|
||||
return None, lora_config
|
||||
|
||||
if cfg.lora_model_dir:
|
||||
LOG.debug("Loading pretained PEFT - LoRA")
|
||||
model_kwargs: Any = {}
|
||||
|
||||
@@ -7,11 +7,11 @@ import numpy as np
|
||||
def get_dataset_lengths(dataset):
|
||||
if "length" in dataset.data.column_names:
|
||||
lengths = np.array(dataset.data.column("length"))
|
||||
elif "position_ids" in dataset.data.column_names:
|
||||
position_ids = dataset.data.column("position_ids")
|
||||
lengths = np.array([x[-1] + 1 for x in position_ids])
|
||||
else:
|
||||
lengths = (
|
||||
dataset.data.column("position_ids")
|
||||
.to_pandas()
|
||||
.apply(lambda x: x[-1] + 1)
|
||||
.values
|
||||
)
|
||||
input_ids = dataset.data.column("input_ids")
|
||||
lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
|
||||
return lengths
|
||||
return lengths
|
||||
|
||||
@@ -81,6 +81,15 @@ def trainer_weighted_loss(model_output, labels, shift_labels=True):
|
||||
return weighted_cross_entropy(logits, labels, weights)
|
||||
|
||||
|
||||
@contextmanager
|
||||
def disable_datasets_caching():
|
||||
try:
|
||||
set_caching_enabled(False)
|
||||
yield
|
||||
finally:
|
||||
set_caching_enabled(True)
|
||||
|
||||
|
||||
def add_position_ids(sample):
|
||||
sample_len = len(sample["input_ids"])
|
||||
sample["position_ids"] = torch.arange(len(sample["input_ids"]))
|
||||
@@ -97,62 +106,80 @@ def drop_long_seq(sample, sequence_len=2048):
|
||||
return len(sample["input_ids"]) <= sequence_len and len(sample["input_ids"]) > 0
|
||||
|
||||
|
||||
@contextmanager
|
||||
def disable_datasets_caching():
|
||||
try:
|
||||
set_caching_enabled(False)
|
||||
yield
|
||||
finally:
|
||||
set_caching_enabled(True)
|
||||
|
||||
|
||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset, tokenizer):
|
||||
def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
|
||||
drop_long = partial(drop_long_seq, sequence_len=cfg.sequence_len)
|
||||
with zero_first(is_main_process()):
|
||||
if cfg.is_preprocess:
|
||||
min_input_len = np.min(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"min_input_len: {min_input_len}", main_process_only=True)
|
||||
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
||||
|
||||
if (
|
||||
cfg.is_mistral_derived_model and cfg.flash_attention
|
||||
) or cfg.model_config_type == "mamba":
|
||||
LOG.info("dropping attention_mask column")
|
||||
train_dataset = train_dataset.remove_columns("attention_mask")
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
||||
|
||||
if cfg.model_config_type == "falcon":
|
||||
LOG.info("dropping token_type_ids column")
|
||||
train_dataset = train_dataset.remove_columns("token_type_ids")
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.remove_columns("token_type_ids")
|
||||
|
||||
train_dataset = train_dataset.filter(
|
||||
drop_long,
|
||||
num_proc=cfg.dataset_processes,
|
||||
load_from_cache_file=not cfg.is_preprocess,
|
||||
desc="Dropping Long Sequences",
|
||||
)
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.filter(
|
||||
drop_long,
|
||||
num_proc=cfg.dataset_processes,
|
||||
load_from_cache_file=not cfg.is_preprocess,
|
||||
desc="Dropping Long Sequences",
|
||||
)
|
||||
|
||||
if cfg.group_by_length:
|
||||
train_dataset = train_dataset.map(
|
||||
add_length, num_proc=cfg.dataset_processes
|
||||
add_length,
|
||||
num_proc=cfg.dataset_processes,
|
||||
load_from_cache_file=not cfg.is_preprocess,
|
||||
desc="Group By Length",
|
||||
)
|
||||
|
||||
if cfg.sample_packing:
|
||||
train_dataset = train_dataset.map(
|
||||
add_position_ids, num_proc=cfg.dataset_processes
|
||||
add_position_ids,
|
||||
num_proc=cfg.dataset_processes,
|
||||
load_from_cache_file=not cfg.is_preprocess,
|
||||
desc="Add position_id column (Sample Packing)",
|
||||
)
|
||||
if cfg.eval_sample_packing is not False:
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.map(
|
||||
add_position_ids, num_proc=cfg.dataset_processes
|
||||
add_position_ids,
|
||||
num_proc=cfg.dataset_processes,
|
||||
load_from_cache_file=not cfg.is_preprocess,
|
||||
desc="Add position_id column (Sample Packing)",
|
||||
)
|
||||
|
||||
if cfg.group_by_length or cfg.sample_packing:
|
||||
max_input_len = np.max(get_dataset_lengths(train_dataset))
|
||||
LOG.debug(f"max_input_len: {max_input_len}", main_process_only=True)
|
||||
|
||||
train_dataset = train_dataset.filter(drop_long, num_proc=cfg.dataset_processes)
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.filter(
|
||||
drop_long, num_proc=cfg.dataset_processes
|
||||
)
|
||||
|
||||
# Phi doesn't want the attention_mask feature when training
|
||||
if (
|
||||
"CodeGenTokenizer" in tokenizer.__class__.__name__
|
||||
or (cfg.is_mistral_derived_model and cfg.flash_attention)
|
||||
or cfg.model_config_type == "mamba"
|
||||
):
|
||||
train_dataset = train_dataset.remove_columns("attention_mask")
|
||||
if eval_dataset:
|
||||
eval_dataset = eval_dataset.remove_columns("attention_mask")
|
||||
|
||||
return train_dataset, eval_dataset
|
||||
|
||||
|
||||
def process_pretraining_datasets_for_packing(train_dataset, sequence_len):
|
||||
drop_long = partial(drop_long_seq, sequence_len=sequence_len)
|
||||
|
||||
train_dataset = train_dataset.filter(drop_long)
|
||||
train_dataset = train_dataset.filter(
|
||||
drop_long,
|
||||
desc="Dropping Long Sequences",
|
||||
)
|
||||
train_dataset = train_dataset.map(
|
||||
add_position_ids,
|
||||
desc="Add position_id column (Pretraining Sample Packing)",
|
||||
)
|
||||
return train_dataset
|
||||
|
||||
@@ -214,8 +241,7 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
|
||||
sampler=RandomSampler(train_dataset),
|
||||
batch_size=cfg.micro_batch_size,
|
||||
drop_last=True,
|
||||
batch_max_len=cfg.micro_batch_size
|
||||
* (cfg.max_packed_sequence_len or cfg.sequence_len),
|
||||
batch_max_len=cfg.micro_batch_size * cfg.sequence_len,
|
||||
lengths=get_dataset_lengths(train_dataset),
|
||||
)
|
||||
|
||||
@@ -289,9 +315,10 @@ def prepare_optim_env(cfg):
|
||||
|
||||
|
||||
def setup_trainer(cfg, train_dataset, eval_dataset, model, tokenizer, total_num_steps):
|
||||
if cfg.rl:
|
||||
if cfg.rl in ["dpo", "ipo", "kto_pair"]:
|
||||
trainer_builder = HFDPOTrainerBuilder(cfg, model[0], tokenizer)
|
||||
trainer_builder.model_ref = model[1]
|
||||
trainer_builder.peft_config = model[2]
|
||||
else:
|
||||
trainer_builder = HFCausalTrainerBuilder(cfg, model[0], tokenizer)
|
||||
|
||||
|
||||
112
tests/e2e/patched/test_falcon_samplepack.py
Normal file
112
tests/e2e/patched/test_falcon_samplepack.py
Normal file
@@ -0,0 +1,112 @@
|
||||
"""
|
||||
E2E tests for falcon
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from ..utils import with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
class TestFalconPatched(unittest.TestCase):
|
||||
"""
|
||||
Test case for Falcon models
|
||||
"""
|
||||
|
||||
@with_temp_dir
|
||||
def test_qlora(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
||||
"flash_attention": True,
|
||||
"sample_packing": True,
|
||||
"sequence_len": 2048,
|
||||
"load_in_4bit": True,
|
||||
"adapter": "qlora",
|
||||
"lora_r": 16,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.1,
|
||||
"lora_target_linear": True,
|
||||
"lora_modules_to_save": ["word_embeddings", "lm_head"],
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"bos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_ft(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
||||
"flash_attention": True,
|
||||
"sample_packing": True,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"bos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
111
tests/e2e/patched/test_llama_s2_attention.py
Normal file
111
tests/e2e/patched/test_llama_s2_attention.py
Normal file
@@ -0,0 +1,111 @@
|
||||
"""
|
||||
E2E tests for llama w/ S2 attn
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from ..utils import with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
class TestLlamaShiftedSparseAttention(unittest.TestCase):
|
||||
"""
|
||||
Test case for Llama models using S2 Attn
|
||||
"""
|
||||
|
||||
@with_temp_dir
|
||||
def test_lora_s2_attn(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 16384,
|
||||
"sample_packing": False,
|
||||
"flash_attention": True,
|
||||
"s2_attention": True,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "Yukang/LongAlpaca-12k",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 10,
|
||||
"save_steps": 5,
|
||||
"eval_steps": 5,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_fft_s2_attn(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 16384,
|
||||
"sample_packing": False,
|
||||
"flash_attention": True,
|
||||
"s2_attention": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "Yukang/LongAlpaca-12k",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 10,
|
||||
"save_steps": 5,
|
||||
"eval_steps": 5,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
@@ -7,8 +7,6 @@ import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from transformers.utils import is_torch_bf16_gpu_available
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
@@ -34,6 +32,7 @@ class TestMixtral(unittest.TestCase):
|
||||
"base_model": "hf-internal-testing/Mixtral-tiny",
|
||||
"tokenizer_config": "mistralai/Mixtral-8x7B-v0.1",
|
||||
"flash_attention": True,
|
||||
"sample_packing": True,
|
||||
"sequence_len": 2048,
|
||||
"load_in_4bit": True,
|
||||
"adapter": "qlora",
|
||||
@@ -59,13 +58,9 @@ class TestMixtral(unittest.TestCase):
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"sample_packing": True,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
if is_torch_bf16_gpu_available():
|
||||
cfg.bf16 = True
|
||||
else:
|
||||
cfg.fp16 = True
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
@@ -81,6 +76,7 @@ class TestMixtral(unittest.TestCase):
|
||||
"base_model": "hf-internal-testing/Mixtral-tiny",
|
||||
"tokenizer_config": "mistralai/Mixtral-8x7B-v0.1",
|
||||
"flash_attention": True,
|
||||
"sample_packing": True,
|
||||
"sequence_len": 2048,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {},
|
||||
@@ -100,24 +96,16 @@ class TestMixtral(unittest.TestCase):
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"sample_packing": True,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
if is_torch_bf16_gpu_available():
|
||||
cfg.bf16 = True
|
||||
else:
|
||||
cfg.fp16 = True
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
model, _ = train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (
|
||||
"axolotl.monkeypatch.mixtral.modeling_mixtral"
|
||||
in model.model.layers[0].self_attn.__class__.__module__
|
||||
)
|
||||
assert (
|
||||
"MixtralMultipackFlashAttention2"
|
||||
"MixtralFlashAttention2"
|
||||
in model.model.layers[0].self_attn.__class__.__name__
|
||||
)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
|
||||
@@ -52,11 +52,7 @@ class TestModelPatches(unittest.TestCase):
|
||||
model, _ = load_model(cfg, tokenizer, inference=cli_args.inference)
|
||||
|
||||
assert (
|
||||
"axolotl.monkeypatch.mixtral.modeling_mixtral"
|
||||
in model.model.layers[0].self_attn.__class__.__module__
|
||||
)
|
||||
assert (
|
||||
"MixtralMultipackFlashAttention2"
|
||||
"MixtralFlashAttention2"
|
||||
in model.model.layers[0].self_attn.__class__.__name__
|
||||
)
|
||||
|
||||
|
||||
123
tests/e2e/patched/test_phi_multipack.py
Normal file
123
tests/e2e/patched/test_phi_multipack.py
Normal file
@@ -0,0 +1,123 @@
|
||||
"""
|
||||
E2E tests for lora llama
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from ..utils import with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
class TestPhiMultipack(unittest.TestCase):
|
||||
"""
|
||||
Test case for Phi2 models
|
||||
"""
|
||||
|
||||
@with_temp_dir
|
||||
def test_ft_packed(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "microsoft/phi-1_5",
|
||||
"model_type": "PhiForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"sample_packing": True,
|
||||
"flash_attention": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"load_in_8bit": False,
|
||||
"adapter": None,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"dataset_shard_num": 10,
|
||||
"dataset_shard_idx": 0,
|
||||
"num_epochs": 1,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"eval_steps": 10,
|
||||
"save_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_qlora_packed(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "microsoft/phi-1_5",
|
||||
"model_type": "PhiForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"sample_packing": True,
|
||||
"flash_attention": True,
|
||||
"pad_to_sequence_len": True,
|
||||
"load_in_8bit": False,
|
||||
"adapter": "qlora",
|
||||
"lora_r": 64,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"dataset_shard_num": 10,
|
||||
"dataset_shard_idx": 0,
|
||||
"num_epochs": 1,
|
||||
"micro_batch_size": 1,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"eval_steps": 10,
|
||||
"save_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
157
tests/e2e/test_dpo.py
Normal file
157
tests/e2e/test_dpo.py
Normal file
@@ -0,0 +1,157 @@
|
||||
"""
|
||||
E2E tests for lora llama
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from axolotl.cli import load_rl_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from .utils import with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
class TestDPOLlamaLora(unittest.TestCase):
|
||||
"""
|
||||
Test case for DPO Llama models using LoRA
|
||||
"""
|
||||
|
||||
@with_temp_dir
|
||||
def test_dpo_lora(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 64,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.1,
|
||||
"lora_target_linear": True,
|
||||
"special_tokens": {},
|
||||
"rl": "dpo",
|
||||
"datasets": [
|
||||
{
|
||||
"path": "Intel/orca_dpo_pairs",
|
||||
"type": "chatml.intel",
|
||||
"split": "train",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "paged_adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"warmup_steps": 5,
|
||||
"gradient_checkpointing": True,
|
||||
"gradient_checkpointing_kwargs": {"use_reentrant": True},
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "checkpoint-20/adapter_model.safetensors").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_kto_pair_lora(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 64,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.1,
|
||||
"lora_target_linear": True,
|
||||
"special_tokens": {},
|
||||
"rl": "kto_pair",
|
||||
"datasets": [
|
||||
{
|
||||
"path": "Intel/orca_dpo_pairs",
|
||||
"type": "chatml.intel",
|
||||
"split": "train",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "paged_adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"warmup_steps": 5,
|
||||
"gradient_checkpointing": True,
|
||||
"gradient_checkpointing_kwargs": {"use_reentrant": True},
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "checkpoint-20/adapter_model.safetensors").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_ipo_lora(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 64,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.1,
|
||||
"lora_target_linear": True,
|
||||
"special_tokens": {},
|
||||
"rl": "ipo",
|
||||
"datasets": [
|
||||
{
|
||||
"path": "Intel/orca_dpo_pairs",
|
||||
"type": "chatml.intel",
|
||||
"split": "train",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"micro_batch_size": 4,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "paged_adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"warmup_steps": 5,
|
||||
"gradient_checkpointing": True,
|
||||
"gradient_checkpointing_kwargs": {"use_reentrant": True},
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_rl_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "checkpoint-20/adapter_model.safetensors").exists()
|
||||
166
tests/e2e/test_falcon.py
Normal file
166
tests/e2e/test_falcon.py
Normal file
@@ -0,0 +1,166 @@
|
||||
"""
|
||||
E2E tests for falcon
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from .utils import with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
class TestFalcon(unittest.TestCase):
|
||||
"""
|
||||
Test case for falcon
|
||||
"""
|
||||
|
||||
@with_temp_dir
|
||||
def test_lora(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
||||
"flash_attention": True,
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 64,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"lora_modules_to_save": [
|
||||
"word_embeddings",
|
||||
"lm_head",
|
||||
],
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"bos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_lora_added_vocab(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
||||
"flash_attention": True,
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 32,
|
||||
"lora_alpha": 64,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"lora_modules_to_save": [
|
||||
"word_embeddings",
|
||||
"lm_head",
|
||||
],
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"bos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"tokens": [
|
||||
"<|im_start|>",
|
||||
"<|im_end|>",
|
||||
],
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@with_temp_dir
|
||||
def test_ft(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "illuin/tiny-random-FalconForCausalLM",
|
||||
"flash_attention": True,
|
||||
"sequence_len": 1024,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"bos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 2,
|
||||
"micro_batch_size": 2,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_torch",
|
||||
"lr_scheduler": "cosine",
|
||||
"max_steps": 20,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
@@ -7,9 +7,6 @@ import os
|
||||
import unittest
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from transformers.utils import is_torch_bf16_gpu_available
|
||||
|
||||
from axolotl.cli import load_datasets
|
||||
from axolotl.common.cli import TrainerCliArgs
|
||||
from axolotl.train import train
|
||||
@@ -27,17 +24,15 @@ class TestPhi(unittest.TestCase):
|
||||
Test case for Phi2 models
|
||||
"""
|
||||
|
||||
@pytest.mark.skip(reason="fixme later")
|
||||
@with_temp_dir
|
||||
def test_phi2_ft(self, temp_dir):
|
||||
def test_phi_ft(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "microsoft/phi-2",
|
||||
"trust_remote_code": True,
|
||||
"base_model": "microsoft/phi-1_5",
|
||||
"model_type": "AutoModelForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"sequence_len": 512,
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": False,
|
||||
"load_in_8bit": False,
|
||||
"adapter": None,
|
||||
@@ -64,13 +59,9 @@ class TestPhi(unittest.TestCase):
|
||||
"max_steps": 10,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"save_safetensors": True,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
if is_torch_bf16_gpu_available():
|
||||
cfg.bf16 = True
|
||||
else:
|
||||
cfg.fp16 = True
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
@@ -78,25 +69,24 @@ class TestPhi(unittest.TestCase):
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
|
||||
@pytest.mark.skip(reason="multipack no longer supported atm")
|
||||
@with_temp_dir
|
||||
def test_ft_packed(self, temp_dir):
|
||||
def test_phi_qlora(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "microsoft/phi-2",
|
||||
"trust_remote_code": True,
|
||||
"model_type": "PhiForCausalLM",
|
||||
"base_model": "microsoft/phi-1_5",
|
||||
"model_type": "AutoModelForCausalLM",
|
||||
"tokenizer_type": "AutoTokenizer",
|
||||
"sequence_len": 512,
|
||||
"sample_packing": True,
|
||||
"sequence_len": 2048,
|
||||
"sample_packing": False,
|
||||
"load_in_8bit": False,
|
||||
"adapter": None,
|
||||
"adapter": "qlora",
|
||||
"lora_r": 64,
|
||||
"lora_alpha": 32,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"unk_token": "<|endoftext|>",
|
||||
"bos_token": "<|endoftext|>",
|
||||
"eos_token": "<|endoftext|>",
|
||||
"pad_token": "<|endoftext|>",
|
||||
},
|
||||
"datasets": [
|
||||
@@ -112,18 +102,18 @@ class TestPhi(unittest.TestCase):
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_bnb_8bit",
|
||||
"optimizer": "paged_adamw_8bit",
|
||||
"lr_scheduler": "cosine",
|
||||
"flash_attention": True,
|
||||
"max_steps": 10,
|
||||
"save_steps": 10,
|
||||
"eval_steps": 10,
|
||||
"bf16": "auto",
|
||||
}
|
||||
)
|
||||
if is_torch_bf16_gpu_available():
|
||||
cfg.bf16 = True
|
||||
else:
|
||||
cfg.fp16 = True
|
||||
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, cli_args=cli_args, dataset_meta=dataset_meta)
|
||||
assert (Path(temp_dir) / "pytorch_model.bin").exists()
|
||||
assert (Path(temp_dir) / "adapter_model.bin").exists()
|
||||
|
||||
@@ -5,7 +5,12 @@ import unittest
|
||||
|
||||
import torch
|
||||
|
||||
from axolotl.monkeypatch.utils import get_cu_seqlens, get_cu_seqlens_from_pos_ids
|
||||
from axolotl.monkeypatch.utils import (
|
||||
get_cu_seqlens,
|
||||
get_cu_seqlens_from_pos_ids,
|
||||
get_max_seqlen_in_batch,
|
||||
get_unpad_data,
|
||||
)
|
||||
|
||||
|
||||
class TestMonkeyPatchUtils(unittest.TestCase):
|
||||
@@ -25,6 +30,70 @@ class TestMonkeyPatchUtils(unittest.TestCase):
|
||||
torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res)
|
||||
)
|
||||
|
||||
def test_get_max_seqlen_in_batch(self):
|
||||
attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
|
||||
target_res = torch.tensor([4, 3, 5, 2], dtype=torch.int32)
|
||||
self.assertTrue(torch.allclose(get_max_seqlen_in_batch(attn_mask), target_res))
|
||||
|
||||
def test_get_unpad_data(self):
|
||||
attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
|
||||
target_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
|
||||
target_cu_seqlen = torch.tensor([0, 4, 7, 12, 14], dtype=torch.int32)
|
||||
target_max_seqlen_in_batch = 5
|
||||
indices, cu_seqlen, max_seqlen_in_batch = get_unpad_data(attn_mask)
|
||||
self.assertTrue(torch.allclose(target_indices, indices))
|
||||
self.assertTrue(torch.allclose(target_cu_seqlen, cu_seqlen))
|
||||
self.assertEqual(target_max_seqlen_in_batch, max_seqlen_in_batch)
|
||||
|
||||
attn_mask = torch.tensor(
|
||||
[
|
||||
[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0],
|
||||
[1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5],
|
||||
]
|
||||
)
|
||||
target_indices = torch.tensor(
|
||||
[
|
||||
0,
|
||||
1,
|
||||
2,
|
||||
3,
|
||||
4,
|
||||
5,
|
||||
6,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
11,
|
||||
12,
|
||||
13,
|
||||
16,
|
||||
17,
|
||||
18,
|
||||
19,
|
||||
20,
|
||||
21,
|
||||
22,
|
||||
23,
|
||||
24,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
30,
|
||||
31,
|
||||
]
|
||||
)
|
||||
target_cu_seqlen = torch.tensor(
|
||||
[0, 4, 7, 12, 14, 17, 22, 24, 27, 30], dtype=torch.int32
|
||||
)
|
||||
target_max_seqlen_in_batch = 5
|
||||
indices, cu_seqlen, max_seqlen_in_batch = get_unpad_data(attn_mask)
|
||||
self.assertTrue(torch.allclose(target_indices, indices))
|
||||
self.assertTrue(torch.allclose(target_cu_seqlen, cu_seqlen))
|
||||
self.assertEqual(target_max_seqlen_in_batch, max_seqlen_in_batch)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
|
||||
116
tests/prompt_strategies/test_alpaca.py
Normal file
116
tests/prompt_strategies/test_alpaca.py
Normal file
@@ -0,0 +1,116 @@
|
||||
"""
|
||||
Test module for alpaca integration w chatml
|
||||
"""
|
||||
import pytest
|
||||
from datasets import Dataset
|
||||
from tokenizers import AddedToken
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from axolotl.datasets import TokenizedPromptDataset
|
||||
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
|
||||
from axolotl.prompters import AlpacaPrompter, PromptStyle
|
||||
|
||||
|
||||
@pytest.fixture(name="alpaca_dataset")
|
||||
def fixture_alpaca_dataset():
|
||||
return Dataset.from_list(
|
||||
[
|
||||
{
|
||||
"instruction": "Evaluate this sentence for spelling and grammar mistakes",
|
||||
"input": "He finnished his meal and left the resturant",
|
||||
"output": "He finished his meal and left the restaurant.",
|
||||
}
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
@pytest.fixture(name="tokenizer")
|
||||
def fixture_tokenizer():
|
||||
# pylint: disable=all
|
||||
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
|
||||
tokenizer.add_special_tokens(
|
||||
{
|
||||
"eos_token": AddedToken(
|
||||
"<|im_end|>", rstrip=False, lstrip=False, normalized=False
|
||||
)
|
||||
}
|
||||
)
|
||||
tokenizer.add_tokens(
|
||||
[
|
||||
AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
|
||||
]
|
||||
)
|
||||
|
||||
return tokenizer
|
||||
|
||||
|
||||
class TestAlpacaChatml:
|
||||
"""
|
||||
Test class for alpaca prompter
|
||||
"""
|
||||
|
||||
def test_no_double_im_end(self, alpaca_dataset, tokenizer):
|
||||
strategy = AlpacaPromptTokenizingStrategy(
|
||||
AlpacaPrompter(prompt_style=PromptStyle.CHATML.value),
|
||||
tokenizer,
|
||||
False, # train_on_inputs
|
||||
2048, # sequence_len
|
||||
)
|
||||
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
strategy, alpaca_dataset, process_count=1
|
||||
)
|
||||
|
||||
input_ids = dataset_wrapper[0]["input_ids"]
|
||||
# fmt: off
|
||||
assert input_ids == [
|
||||
1, # Bos
|
||||
32001, 1587, 13, 20548, 336, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 32000, 28705, 13, # instruction
|
||||
32001, 2188, 13, 16627, 11931, 456, 12271, 354, 668, 3572, 304, 18756, 3479, 17179, 13, 2428, 854, 28711, 1497, 516, 11314, 304, 1749, 272, 1846, 324, 440, 32000, 28705, 13, # input
|
||||
32001, 13892, 13, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000, # output
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
def test_no_train_on_input(self, alpaca_dataset, tokenizer):
|
||||
strategy = AlpacaPromptTokenizingStrategy(
|
||||
AlpacaPrompter(prompt_style=PromptStyle.CHATML.value),
|
||||
tokenizer,
|
||||
False, # train_on_inputs
|
||||
2048, # sequence_len
|
||||
)
|
||||
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
strategy, alpaca_dataset, process_count=1
|
||||
)
|
||||
|
||||
labels = dataset_wrapper[0]["labels"]
|
||||
# fmt: off
|
||||
assert labels == [
|
||||
-100, # bos
|
||||
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # instruction
|
||||
-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, # input
|
||||
-100, -100, -100, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000, # Output
|
||||
]
|
||||
# fmt: on
|
||||
|
||||
def test_w_train_on_input(self, alpaca_dataset, tokenizer):
|
||||
strategy = AlpacaPromptTokenizingStrategy(
|
||||
AlpacaPrompter(prompt_style=PromptStyle.CHATML.value),
|
||||
tokenizer,
|
||||
True, # train_on_inputs
|
||||
2048, # sequence_len
|
||||
)
|
||||
|
||||
dataset_wrapper = TokenizedPromptDataset(
|
||||
strategy, alpaca_dataset, process_count=1
|
||||
)
|
||||
|
||||
labels = dataset_wrapper[0]["labels"]
|
||||
# fmt: off
|
||||
assert labels == [
|
||||
1, # Bos
|
||||
32001, 1587, 13, 20548, 336, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 32000, 28705, 13, # instruction
|
||||
32001, 2188, 13, 16627, 11931, 456, 12271, 354, 668, 3572, 304, 18756, 3479, 17179, 13, 2428, 854, 28711, 1497, 516, 11314, 304, 1749, 272, 1846, 324, 440, 32000, 28705, 13, # input
|
||||
32001, 13892, 13, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000, # output
|
||||
]
|
||||
# fmt: on
|
||||
@@ -78,13 +78,28 @@ class NormalizeConfigTestCase(unittest.TestCase):
|
||||
normalize_config(cfg)
|
||||
|
||||
self.assertTrue(cfg.bf16)
|
||||
self.assertFalse(cfg.fp16)
|
||||
|
||||
@patch("axolotl.utils.config.is_torch_bf16_gpu_available")
|
||||
def test_bf16_auto_setter_not_available(self, mock_bf16_avail):
|
||||
cfg = self._get_base_cfg()
|
||||
cfg.bf16 = "auto"
|
||||
cfg.fp16 = None
|
||||
mock_bf16_avail.return_value = False
|
||||
|
||||
normalize_config(cfg)
|
||||
|
||||
self.assertFalse(cfg.bf16)
|
||||
self.assertTrue(cfg.fp16)
|
||||
|
||||
@patch("axolotl.utils.config.is_torch_bf16_gpu_available")
|
||||
def test_bf16_disables_fp16(self, mock_bf16_avail):
|
||||
cfg = self._get_base_cfg()
|
||||
cfg.bf16 = True
|
||||
cfg.fp16 = False
|
||||
mock_bf16_avail.return_value = True
|
||||
|
||||
normalize_config(cfg)
|
||||
|
||||
self.assertTrue(cfg.bf16)
|
||||
self.assertFalse(cfg.fp16)
|
||||
|
||||
@@ -324,20 +324,19 @@ class ValidationTest(BaseValidation):
|
||||
|
||||
validate_config(cfg)
|
||||
|
||||
def test_packing(self):
|
||||
def test_deprecated_packing(self):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"max_packed_sequence_len": 2048,
|
||||
"max_packed_sequence_len": 1024,
|
||||
}
|
||||
)
|
||||
with self._caplog.at_level(logging.WARNING):
|
||||
with pytest.raises(
|
||||
DeprecationWarning,
|
||||
match=r"`max_packed_sequence_len` is no longer supported",
|
||||
):
|
||||
validate_config(cfg)
|
||||
assert any(
|
||||
"max_packed_sequence_len will be deprecated in favor of sample_packing"
|
||||
in record.message
|
||||
for record in self._caplog.records
|
||||
)
|
||||
|
||||
def test_packing(self):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"sample_packing": True,
|
||||
@@ -352,16 +351,6 @@ class ValidationTest(BaseValidation):
|
||||
for record in self._caplog.records
|
||||
)
|
||||
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"max_packed_sequence_len": 2048,
|
||||
"sample_packing": True,
|
||||
}
|
||||
)
|
||||
regex_exp = r".*set only one of max_packed_sequence_len \(deprecated soon\) or sample_packing.*"
|
||||
with pytest.raises(ValueError, match=regex_exp):
|
||||
validate_config(cfg)
|
||||
|
||||
@pytest.mark.skipif(
|
||||
is_torch_bf16_gpu_available(),
|
||||
reason="test should only run on gpus w/o bf16 support",
|
||||
@@ -753,11 +742,11 @@ class ValidationCheckModelConfig(BaseValidation):
|
||||
|
||||
check_model_config(cfg, model_config)
|
||||
|
||||
def test_phi2_add_tokens_adapter(self):
|
||||
def test_phi_add_tokens_adapter(self):
|
||||
cfg = DictDefault(
|
||||
{"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]}
|
||||
)
|
||||
model_config = DictDefault({"model_type": "phi-msft"})
|
||||
model_config = DictDefault({"model_type": "phi"})
|
||||
|
||||
with pytest.raises(
|
||||
ValueError,
|
||||
@@ -770,7 +759,7 @@ class ValidationCheckModelConfig(BaseValidation):
|
||||
"adapter": "qlora",
|
||||
"load_in_4bit": True,
|
||||
"tokens": ["<|imstart|>"],
|
||||
"lora_modules_to_save": ["embed_tokens", "lm_head"],
|
||||
"lora_modules_to_save": ["embd.wte", "lm_head.linear"],
|
||||
}
|
||||
)
|
||||
|
||||
@@ -785,7 +774,7 @@ class ValidationCheckModelConfig(BaseValidation):
|
||||
"adapter": "qlora",
|
||||
"load_in_4bit": True,
|
||||
"tokens": ["<|imstart|>"],
|
||||
"lora_modules_to_save": ["embd.wte", "lm_head.linear"],
|
||||
"lora_modules_to_save": ["embed_tokens", "lm_head"],
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
37
tests/utils/test_models.py
Normal file
37
tests/utils/test_models.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""Module for testing models utils file."""
|
||||
|
||||
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
|
||||
from axolotl.utils.dict import DictDefault
|
||||
from axolotl.utils.models import load_model
|
||||
|
||||
|
||||
class ModelsUtilsTest(unittest.TestCase):
|
||||
"""Testing module for models utils."""
|
||||
|
||||
def test_cfg_throws_error_with_s2_attention_and_sample_packing(self):
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"s2_attention": True,
|
||||
"sample_packing": True,
|
||||
"base_model": "",
|
||||
"model_type": "LlamaForCausalLM",
|
||||
}
|
||||
)
|
||||
|
||||
# Mock out call to HF hub
|
||||
with patch(
|
||||
"axolotl.utils.models.load_model_config"
|
||||
) as mocked_load_model_config:
|
||||
mocked_load_model_config.return_value = {}
|
||||
with pytest.raises(ValueError) as exc:
|
||||
# Should error before hitting tokenizer, so we pass in an empty str
|
||||
load_model(cfg, tokenizer="")
|
||||
assert (
|
||||
"shifted-sparse attention does not currently support sample packing"
|
||||
in str(exc.value)
|
||||
)
|
||||
Reference in New Issue
Block a user