diff --git a/README.md b/README.md index b16cb4d2f..7229d0a70 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,7 @@ Axolotl is a tool designed to streamline the fine-tuning of various AI models, o - [LambdaLabs Installation](#lambdalabs) - [Dataset](#dataset) - [How to Add Custom Prompts](#how-to-add-custom-prompts) + - [How to Use Custom Pretokenized Dataset](#how-to-use-your-custom-pretokenized-dataset) - [Config](#config) - [Train](#train) - [Inference](#inference) @@ -99,7 +100,7 @@ accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml \ ``` - Conda/Pip venv - 1. Install python **3.9** + 1. Install python >=**3.9** 2. Install pytorch stable https://pytorch.org/get-started/locally/ @@ -273,11 +274,29 @@ Have dataset(s) in one of the following format (JSONL recommended): #### How to add custom prompts - 1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example. - 2. Use your custom file name as the dataset type `.load_`. +Using yaml. Example: +```yaml +datasets: + - path: repo + type: + system_prompt: "" + no_input_format: |- + User: {instruction}<|end_of_turn|> + Assistant: + format: |- + User: {instruction} + {input}<|end_of_turn|> + Assistant: +``` -Optionally, download some datasets, see [data/README.md](data/README.md) +Using file: +1. Add your method to a file in [prompt_strategies](src/axolotl/prompt_strategies). Please see other files as example. +2. Use your custom file name as the dataset type `.load_`. +#### How to use your custom pretokenized dataset + +- Do not pass a `type:` +- Dataset must contain `input_ids`, `attention_mask`, `labels` in columns ### Config @@ -307,9 +326,9 @@ See [examples](examples) for quick start. 
It is recommended to duplicate and mod # local datasets: - - path: json - data_files: data.jsonl # or json - type: alpaca # format from earlier + - path: data.jsonl # or json + ds_type: json # see other options below + type: alpaca ``` - loading @@ -395,6 +414,24 @@ datasets: shards: # number of shards to split data into name: # name of dataset configuration to load + # custom user prompt + - path: repo + type: + # the below are defaults. only set what's needed. + system_prompt: "" + field_system: system + field_instruction: instruction + field_output: output + + # customizable to be single line or multi-line + system_format: "{system}" + # 'format' can include {input} + format: |- + User: {instruction} {input} + Assistant: + # 'no_input_format' cannot include {input} + no_input_format: "{instruction} " + # axolotl attempts to save the dataset as an arrow after packing the data together so # subsequent training attempts load faster, relative path dataset_prepared_path: data/last_run_prepared @@ -667,7 +704,9 @@ Please reduce any below - `gradient_accumulation_steps` - `sequence_len` -> `failed (exitcode: -9)` usually means your system has run out of system memory. +> `failed (exitcode: -9)` + +Usually means your system has run out of system memory. Similarly, you should consider reducing the same settings as when you run out of VRAM. Additionally, look into upgrading your system RAM which should be simpler than GPU upgrades. 
diff --git a/data/README.md b/data/README.md deleted file mode 100644 index c452ece7c..000000000 --- a/data/README.md +++ /dev/null @@ -1,24 +0,0 @@ - -## Download some datasets -```shell -curl https://raw.githubusercontent.com/tloen/alpaca-lora/main/alpaca_data_gpt4.json -o data/raw/alpaca_data_gpt4.json -curl https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -L -o data/raw/vicuna_cleaned.json -curl https://github.com/teknium1/GPTeacher/blob/main/Instruct/gpt4-instruct-similarity-0.6-dataset.json?raw=true -L -o data/raw/gpt4-instruct-similarity-0.6-dataset.json -curl https://github.com/teknium1/GPTeacher/blob/main/Roleplay/roleplay-similarity_0.6-instruct-dataset.json?raw=true -L -o data/raw/roleplay-similarity_0.6-instruct-dataset.json -``` - -## Convert the JSON data files to JSONL. - -```shell -python3 ./scripts/alpaca_json_to_jsonl.py --file data/alpaca_data_gpt4.json --output data/alpaca_data_gpt4.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/vicuna_cleaned.json --output data/vicuna_cleaned.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/roleplay-similarity_0.6-instruct-dataset.json --output data/roleplay-similarity_0.6-instruct-dataset.jsonl -python3 ./scripts/alpaca_json_to_jsonl.py --file data/raw/gpt4-instruct-similarity-0.6-dataset.json --output data/gpt4-instruct-similarity-0.6-dataset.jsonl -``` ---- - -Using JSONL makes it easier to subset the data if you want a smaller training set, i.e get 2000 random examples. 
- -```shell -shuf -n2000 data/vicuna_cleaned.jsonl > data/vicuna_cleaned.subset0.jsonl -``` diff --git a/data/raw/.gitignore b/data/raw/.gitignore deleted file mode 100644 index 1d085cacc..000000000 --- a/data/raw/.gitignore +++ /dev/null @@ -1 +0,0 @@ -** diff --git a/scripts/alpaca_json_to_jsonl.py b/scripts/alpaca_json_to_jsonl.py deleted file mode 100644 index 8ea1983fe..000000000 --- a/scripts/alpaca_json_to_jsonl.py +++ /dev/null @@ -1,52 +0,0 @@ -"""Module to convert json file to jsonl""" - -import os -import sys -from pathlib import Path -from typing import Optional, Union - -import fire - -from axolotl.convert import ( - FileReader, - FileWriter, - JsonlSerializer, - JsonParser, - JsonToJsonlConverter, - StdoutWriter, -) -from axolotl.logging_config import configure_logging - -configure_logging() - -# add src to the pythonpath so we don't need to pip install this -project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) -src_dir = os.path.join(project_root, "src") -sys.path.insert(0, src_dir) - - -def main( - file: Path, - output: Optional[Path] = None, - to_stdout: Optional[bool] = False, -): - """ - Convert a json file to jsonl - """ - - file_reader = FileReader() - writer: Union[StdoutWriter, FileWriter] - if to_stdout or output is None: - writer = StdoutWriter() - else: - writer = FileWriter(output) - json_parser = JsonParser() - jsonl_serializer = JsonlSerializer() - - converter = JsonToJsonlConverter(file_reader, writer, json_parser, jsonl_serializer) - - converter.convert(file, output) - - -if __name__ == "__main__": - fire.Fire(main)