diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 7fb97b9d9..4a7112041 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -88,6 +88,11 @@ jobs:
             pytorch: 2.5.1
             axolotl_extras:
             is_latest: true
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
diff --git a/.github/workflows/nightlies.yml b/.github/workflows/nightlies.yml
index c501064d2..bc3c1a191 100644
--- a/.github/workflows/nightlies.yml
+++ b/.github/workflows/nightlies.yml
@@ -80,6 +80,11 @@ jobs:
             python_version: "3.11"
             pytorch: 2.5.1
             axolotl_extras:
+          - cuda: 124
+            cuda_version: 12.4.1
+            python_version: "3.11"
+            pytorch: 2.6.0
+            axolotl_extras:
     runs-on: axolotl-gpu-runner
     steps:
       - name: Checkout
diff --git a/README.md b/README.md
index 5d9a29a83..953bc0be5 100644
--- a/README.md
+++ b/README.md
@@ -55,6 +55,7 @@ Features:
 ### Installation
 
 ```bash
+pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 
 # Download example axolotl configs, deepspeed configs
diff --git a/_quarto.yml b/_quarto.yml
index c0536e730..943ed5293 100644
--- a/_quarto.yml
+++ b/_quarto.yml
@@ -32,8 +32,9 @@ website:
         contents:
           - docs/getting-started.qmd
           - docs/installation.qmd
-          - docs/cli.qmd
           - docs/inference.qmd
+          - docs/cli.qmd
+          - docs/config.qmd
       - section: "Dataset Formats"
         contents: docs/dataset-formats/*
@@ -74,10 +75,6 @@ website:
           - docs/debugging.qmd
           - docs/nccl.qmd
 
-      - section: "Reference"
-        contents:
-          - docs/config.qmd
-
 format:
   html:
     theme: darkly
diff --git a/docs/config.qmd b/docs/config.qmd
index fb0c4b59b..38ec368a1 100644
--- a/docs/config.qmd
+++ b/docs/config.qmd
@@ -1,5 +1,5 @@
 ---
-title: Config options
+title: Config Reference
 description: A complete list of all configuration options.
 ---
 
@@ -30,6 +30,8 @@ tokenizer_legacy:
 # Resize the model embeddings when new tokens are added to multiples of 32
 # This is reported to improve training speed on some models
 resize_token_embeddings_to_32x:
+# Optional[bool] Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink.
+shrink_embeddings:
 
 # (Internal use only)
 # Used to identify which the model is based on
@@ -205,10 +207,46 @@ test_datasets:
     data_files:
       - /workspace/data/eval.jsonl
 
-# use RL training: 'dpo', 'ipo', 'kto'
+# use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'
 rl:
-# whether to perform weighting if doing DPO training. Boolean.
-dpo_use_weighting:
+rl_beta: # Optional[float]. The beta parameter for the RL training.
+
+# dpo
+dpo_use_weighting: # Optional[bool]. Whether to perform weighting.
+rpo_alpha: # Optional[float]. Weighting of NLL term in loss from RPO paper.
+
+# orpo
+orpo_alpha: 0.1 # Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping.
+
+# kto
+kto_desirable_weight: # Optional[float]. Factor for desirable loss term in KTO loss.
+kto_undesirable_weight: # Optional[float]. Factor for undesirable loss term in KTO loss.
+
+# simpo
+cpo_alpha: 1.0 # Weight of the BC regularizer
+simpo_gamma: 0.5 # Target reward margin for the SimPO loss
+
+# grpo
+trl:
+  use_vllm: # Optional[bool]. Whether to use vLLM for RL training.
+  vllm_device: # Optional[str]. Device to use for vLLM.
+  vllm_gpu_memory_utilization: # Optional[float]. GPU memory utilization for vLLM.
+  vllm_max_model_len: # Optional[int]. Maximum length of the model for vLLM.
+  vllm_dtype: # Optional[str]. Data type for vLLM.
+
+  beta: # Optional[float]. Beta parameter for the RL training. Same as `rl_beta`. Use only one of the two.
+  max_completion_length: # Optional[int]. Maximum length of the completion for RL training.
+
+  reward_funcs: # Optional[list[str]]. List of reward functions to load. Paths must be importable from current dir.
+  reward_weights: # Optional[list[float]]. List of reward weights for the reward functions.
+
+  num_generations: # Optional[int]. Number of generations to sample.
+  log_completions: # Optional[bool]. Whether to log completions.
+
+  sync_ref_model: # Optional[bool]. Whether to sync the reference model.
+  ref_model_mixup_alpha: # Optional[float]. Mixup alpha for the reference model.
+  ref_model_sync_steps: # Optional[int]. Sync steps for the reference model.
+
 # reward modelling: `True` or `False`
 reward_model:
@@ -232,7 +270,7 @@ default_system_message: You are a helpful assistant. Please give a long and deta
 # subsequent training attempts load faster, relative path
 dataset_prepared_path: data/last_run_prepared
 # Push prepared dataset to hub
-push_dataset_to_hub: # repo path
+push_dataset_to_hub: # Optional[str] repo_org/repo_name
 # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
 # if not set.
 dataset_processes: # defaults to os.cpu_count() if not set
diff --git a/docs/faq.qmd b/docs/faq.qmd
index ba7ac1265..acec1886e 100644
--- a/docs/faq.qmd
+++ b/docs/faq.qmd
@@ -27,6 +27,16 @@ description: Frequently asked questions
 
 > A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.
 
+**Q: I received a `torch.Size` mismatch error between the checkpoint and the model when merging or loading adapters.**
+
+> A: This is likely due to a vocab size mismatch. By default, Axolotl expands the model's embeddings if the tokenizer has more tokens than the model. Please use the `axolotl merge-lora` command to merge the adapters instead of using your own scripts.
+
+> On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model's embeddings unless `shrink_embeddings: true` is set in the config.
+
+**Q: Can I call Axolotl from custom Python scripts?**
+
+> A: Yes. Since Axolotl is just Python, please see `src/axolotl/cli/main.py` for how each command is called.
+
 ### Chat templates
 
 **Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**
diff --git a/docs/getting-started.qmd b/docs/getting-started.qmd
index 8e826b959..a0501ad21 100644
--- a/docs/getting-started.qmd
+++ b/docs/getting-started.qmd
@@ -36,7 +36,9 @@ The YAML configuration file controls everything about your training. Here's what
 
 ```yaml
 base_model: NousResearch/Llama-3.2-1B
-# hub_model_id: username/custom_model_name
+
+load_in_8bit: true
+adapter: lora
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned
@@ -44,11 +46,15 @@ datasets:
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.1
 output_dir: ./outputs/lora-out
-
-adapter: lora
-lora_model_dir:
 ```
 
+::: {.callout-tip}
+`load_in_8bit: true` and `adapter: lora` enable LoRA adapter finetuning.
+
+- To perform full finetuning, remove these two lines.
+- To perform QLoRA finetuning, replace them with `load_in_4bit: true` and `adapter: qlora`.
+:::
+
 See our [Config options](config.qmd) for more details.
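+
+As a concrete illustration of the QLoRA option in the tip above, here is a minimal sketch. Only the quantization and adapter lines change from the LoRA example; the dataset `type` (`alpaca`) and the `qlora-out` output path are illustrative assumptions:
+
+```yaml
+base_model: NousResearch/Llama-3.2-1B
+
+load_in_4bit: true
+adapter: qlora
+
+datasets:
+  - path: teknium/GPT4-LLM-Cleaned
+    type: alpaca
+dataset_prepared_path: last_run_prepared
+val_set_size: 0.1
+output_dir: ./outputs/qlora-out
+```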
 
 ### Training {#sec-training}
@@ -56,7 +62,7 @@ See our [Config options](config.qmd) for more details.
 When you run `axolotl train`, Axolotl:
 
 1. Downloads the base model
-2. (If specified) applies LoRA adapter layers
+2. (If specified) applies QLoRA/LoRA adapter layers
 3. Loads and processes the dataset
 4. Runs the training loop
 5. Saves the trained model and / or LoRA weights
@@ -69,6 +75,8 @@ Let's modify the example for your own data:
 
 ```yaml
 base_model: NousResearch/Nous-Hermes-llama-1b-v1
+
+load_in_8bit: true
 adapter: lora
 
 # Training settings
@@ -104,8 +112,6 @@ format):
 {"instruction": "Classify this text", "input": "Not good at all", "output": "negative"}
 ```
 
-Please consult the supported [Dataset Formats](dataset-formats/) for more details.
-
 3. Run the training:
 
 ```bash
diff --git a/docs/inference.qmd b/docs/inference.qmd
index aded400d0..6917d3c33 100644
--- a/docs/inference.qmd
+++ b/docs/inference.qmd
@@ -1,5 +1,5 @@
 ---
-title: "Inference"
+title: "Inference and Merging"
 format:
   html:
     toc: true
@@ -9,10 +9,14 @@ execute:
   enabled: false
 ---
 
-This guide covers how to use your trained models for inference, including model loading, interactive testing, and common troubleshooting steps.
+This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.
 
 ## Quick Start {#sec-quickstart}
 
+::: {.callout-tip}
+Use the same config that you used for training when running inference or merging.
+:::
+
 ### Basic Inference {#sec-basic}
 
 ::: {.panel-tabset}
diff --git a/docs/installation.qmd b/docs/installation.qmd
index 95f15e78e..3cff0bd32 100644
--- a/docs/installation.qmd
+++ b/docs/installation.qmd
@@ -22,6 +22,7 @@ This guide covers all the ways you can install and set up Axolotl for your envir
 ### PyPI Installation (Recommended) {#sec-pypi}
 
 ```{.bash}
+pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
 ```
 
@@ -37,7 +38,7 @@ For the latest features between releases:
 ```{.bash}
 git clone https://github.com/axolotl-ai-cloud/axolotl.git
 cd axolotl
-pip3 install packaging ninja
+pip3 install -U packaging setuptools wheel ninja
 pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
 ```
 
@@ -107,7 +108,7 @@ We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
 2. Install PyTorch: https://pytorch.org/get-started/locally/
 3. Install Axolotl:
    ```{.bash}
-   pip3 install packaging
+   pip3 install -U packaging setuptools wheel ninja
    pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
    ```
 4. (Optional) Login to Hugging Face:
diff --git a/docs/lora_optims.qmd b/docs/lora_optims.qmd
index 8bee20402..a7555a0a3 100644
--- a/docs/lora_optims.qmd
+++ b/docs/lora_optims.qmd
@@ -66,6 +66,10 @@ logic to be compatible with more of them.
 
+::: {.callout-tip}
+Check out our [LoRA optimizations blog](https://axolotlai.substack.com/p/accelerating-lora-fine-tuning-with).
+:::
+
 ## Usage
 
 These optimizations can be enabled in your Axolotl config YAML file. The
diff --git a/docs/reward_modelling.qmd b/docs/reward_modelling.qmd
index c9ac5f801..386dc1f57 100644
--- a/docs/reward_modelling.qmd
+++ b/docs/reward_modelling.qmd
@@ -41,6 +41,10 @@ Bradley-Terry chat templates expect single-turn conversations in the following f
 
 ### Process Reward Models (PRM)
 
+::: {.callout-tip}
+Check out our [PRM blog](https://axolotlai.substack.com/p/process-reward-models).
+:::
+
 Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
 
 ```yaml
 base_model: Qwen/Qwen2.5-3B
diff --git a/docs/rlhf.qmd b/docs/rlhf.qmd
index 773b159e8..6bef7c831 100644
--- a/docs/rlhf.qmd
+++ b/docs/rlhf.qmd
@@ -298,7 +298,7 @@ The input format is a simple JSON input with customizable fields based on the ab
 
 ### IPO
 
-As IPO is just DPO with a different loss function, all supported options for DPO works here.
+As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.
 
 ```yaml
 rl: ipo
@@ -344,8 +344,9 @@ ORPO supports the following types with the following dataset format:
 ```yaml
 rl: kto
-rl_beta: 0.5
-kto_desirable_weight: 0.2
+rl_beta: 0.1 # default
+kto_desirable_weight: 1.0 # default
+kto_undesirable_weight: 1.0 # default
 
 remove_unused_columns: false
@@ -497,6 +498,10 @@ The input format is a simple JSON input with customizable fields based on the ab
 
 ### GRPO
 
+::: {.callout-tip}
+Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
+:::
+
 GRPO uses custom reward functions and transformations. Please have them ready locally. For ex, to load OpenAI's GSM8K and use a random reward for completions:
@@ -540,6 +545,19 @@ To see other examples of custom reward functions, please see [TRL GRPO Docs](htt
 
 To see description of the configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/utils/config/models/input/v0_4_1/trl.py).
 
+### SimPO
+
+SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with an alternative loss function.
+
+```yaml
+rl: simpo
+rl_beta: 0.1 # default in CPOTrainer
+cpo_alpha: 1.0 # default in CPOTrainer
+simpo_gamma: 0.5 # default in CPOTrainer
+```
+
+This method uses the same dataset format as [DPO](#dpo).
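+
+For illustration, one preference record in a simple prompt/chosen/rejected layout might look like the following; the exact field names depend on the DPO dataset type you configure (see [DPO](#dpo)):
+
+```json
+{"prompt": "What is the capital of France?", "chosen": "The capital of France is Paris.", "rejected": "London."}
+```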
+
 ### Using local dataset files
 
 ```yaml
diff --git a/requirements.txt b/requirements.txt
index cd5690f0b..c3a9eb2f3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -62,5 +62,5 @@ antlr4-python3-runtime==4.13.2
 torchao==0.7.0
 schedulefree==1.3.0
 
-axolotl-contribs-lgpl==0.0.3
+axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.3
diff --git a/src/axolotl/cli/train.py b/src/axolotl/cli/train.py
index 032f12b66..e991105e6 100644
--- a/src/axolotl/cli/train.py
+++ b/src/axolotl/cli/train.py
@@ -1,6 +1,7 @@
 """CLI to run training on a model."""
 
 import logging
+import os
 from pathlib import Path
 from typing import Union
 
@@ -34,7 +35,8 @@ def do_train(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
     """
     print_axolotl_text_art()
     check_accelerate_default_config()
-    check_user_token()
+    if int(os.getenv("LOCAL_RANK", "0")) == 0:
+        check_user_token()
 
     if cfg.rl:
         dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
diff --git a/src/axolotl/core/datasets/chat.py b/src/axolotl/core/datasets/chat.py
index e74c247d2..ba257071d 100644
--- a/src/axolotl/core/datasets/chat.py
+++ b/src/axolotl/core/datasets/chat.py
@@ -43,7 +43,7 @@ class TokenizedChatDataset(Dataset):
         process_or_cpu_count: int = (
             process_count or os.cpu_count()  # type: ignore[assignment]
         )
-        num_proc = min(64, process_or_cpu_count)
+        num_proc = min(32, process_or_cpu_count)
         features = data.features.keys()
         tokenized_data = data.map(
             map_fn,
diff --git a/src/axolotl/train.py b/src/axolotl/train.py
index 178b90f7b..1ceb5babd 100644
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -7,7 +7,7 @@ import signal
 import sys
 import weakref
 from pathlib import Path
-from typing import Any
+from typing import Any, Dict
 
 import torch
 import transformers.modelcard
@@ -20,7 +20,7 @@ from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.trainer import Trainer
 
 from axolotl.common.datasets import TrainDatasetMeta
-from axolotl.contribs.lgpl.unsloth import (  # pylint: disable = no-name-in-module
+from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
     fix_untrained_tokens,
 )
 from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
@@ -382,21 +382,23 @@ def handle_untrained_tokens_fix(
     if not cfg.fix_untrained_tokens:
         return
 
+    is_ds_zero3: bool = False
+    if os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
+        is_ds_zero3 = True
+
     # Check if the `token_ids_to_fix` kwarg exists in the fix_untrained_tokens args
     sig = inspect.signature(fix_untrained_tokens)
+    fix_kwargs: Dict[str, Any] = {}
 
     # If the function has the `token_ids_to_fix` arg, and fix_untrained_tokens is a list
     if "token_ids_to_fix" in sig.parameters and isinstance(
         cfg.fix_untrained_tokens, list
     ):
-        fix_untrained_tokens(
-            model,
-            tokenizer,
-            train_dataset,
-            token_ids_to_fix=cfg.fix_untrained_tokens,
-        )
-    else:
-        fix_untrained_tokens(model, tokenizer, train_dataset)
+        fix_kwargs["token_ids_to_fix"] = cfg.fix_untrained_tokens
+    if "is_ds_zero3" in sig.parameters:
+        fix_kwargs["is_ds_zero3"] = is_ds_zero3
+
+    fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)
 
     if cfg.local_rank == 0:
         model.save_pretrained(
diff --git a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
index cca8b92a1..8136c5046 100644
--- a/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
+++ b/src/axolotl/utils/config/models/input/v0_4_1/__init__.py
@@ -693,7 +693,7 @@ class AxolotlInputConfig(
         default=None,
         json_schema_extra={"description": "streaming dataset to use for pretraining"},
     )
-    dataset_processes: Optional[int] = Field(default=os.cpu_count())
+    dataset_processes: Optional[int] = Field(default=min(32, os.cpu_count() or 1))
     dataset_exact_deduplication: Optional[bool] = None
     dataset_keep_in_memory: Optional[bool] = None
     dataloader_pin_memory: Optional[bool] = None
@@ -1632,6 +1632,14 @@ class AxolotlConfigWCapabilities(AxolotlInputConfig):
             data["torch_compile"] = False
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def check_beta_and_trl_beta_match(cls, data):
+        if data.get("beta") and (data.get("trl") or {}).get("beta"):
+            if data["beta"] != data["trl"]["beta"]:
+                raise ValueError("beta and trl.beta must match or one must be removed")
+        return data
+
 
 def handle_legacy_message_fields_logic(data: dict) -> dict:
     """
diff --git a/src/axolotl/utils/models.py b/src/axolotl/utils/models.py
index 1805a749a..93d0f13c0 100644
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -24,7 +24,6 @@ from peft import (
     PeftModelForCausalLM,
     prepare_model_for_kbit_training,
 )
-from peft.tuners.lora import QuantLinear
 from torch import nn
 from transformers import (  # noqa: F401
     AddedToken,
@@ -1360,7 +1359,7 @@ def load_llama_adapter(model, cfg):
 
 def find_all_linear_names(model):
-    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear, QuantLinear)
+    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
     lora_module_names = set()
     for name, module in model.named_modules():
         if (
diff --git a/tests/e2e/multigpu/test_llama.py b/tests/e2e/multigpu/test_llama.py
index 0f91fe056..60b194090 100644
--- a/tests/e2e/multigpu/test_llama.py
+++ b/tests/e2e/multigpu/test_llama.py
@@ -750,3 +750,66 @@ class TestMultiGPULlama:
         check_tensorboard(
             temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss is too high"
         )
+
+    def test_fix_untrained_tokens(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "fix_untrained_tokens": True,
+                "sequence_len": 512,
+                "val_set_size": 0.0,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                    "bos_token": "<|custom_im_start|>",
+                    "eos_token": "<|custom_im_end|>",
+                },
+                "datasets": [
+                    {
+                        "chat_template": "jinja",
+                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
+                        "path": "mlabonne/FineTome-100k",
+                        "type": "chat_template",
+                        "split": "train[:10%]",
+                        "field_messages": "conversations",
+                        "message_field_role": "from",
+                        "message_field_content": "value",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_torch_fused",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "sample_packing": True,
+                "bf16": True,
+                "save_safetensors": True,
+                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero3_bf16.json"),
+                "use_tensorboard": True,
+            }
+        )
+
+        # write cfg to yaml file
+        Path(temp_dir).mkdir(parents=True, exist_ok=True)
+        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+        execute_subprocess_async(
+            [
+                "axolotl",
+                "train",
+                str(Path(temp_dir) / "config.yaml"),
+                "--num-processes",
+                "2",
+                "--main-process-port",
+                f"{get_torch_dist_unique_port()}",
+            ]
+        )
+
+        check_tensorboard(
+            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss is too high"
+        )
diff --git a/tests/e2e/test_llama.py b/tests/e2e/test_llama.py
index 77e70d8c2..644744240 100644
--- a/tests/e2e/test_llama.py
+++ b/tests/e2e/test_llama.py
@@ -66,6 +66,54 @@ class TestLlama:
         check_model_output_exists(temp_dir, cfg)
 
     def test_fix_untrained_tokens(self, temp_dir):
+        # pylint: disable=duplicate-code
+        cfg = DictDefault(
+            {
+                "base_model": "HuggingFaceTB/SmolLM2-135M",
+                "fix_untrained_tokens": True,
+                "sequence_len": 512,
+                "val_set_size": 0.0,
+                "special_tokens": {
+                    "pad_token": "<|endoftext|>",
+                    "bos_token": "<|custom_im_start|>",
+                    "eos_token": "<|custom_im_end|>",
+                },
+                "datasets": [
+                    {
+                        "chat_template": "jinja",
+                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
+                        "path": "mlabonne/FineTome-100k",
+                        "type": "chat_template",
+                        "split": "train[:10%]",
+                        "field_messages": "conversations",
+                        "message_field_role": "from",
+                        "message_field_content": "value",
+                    },
+                ],
+                "num_epochs": 1,
+                "max_steps": 5,
+                "micro_batch_size": 1,
+                "gradient_accumulation_steps": 1,
+                "output_dir": temp_dir,
+                "learning_rate": 0.00001,
+                "optimizer": "adamw_8bit",
+                "lr_scheduler": "cosine",
+                "flash_attention": True,
+                "sample_packing": True,
+                "bf16": True,
+                "save_safetensors": True,
+            }
+        )
+
+        cfg = validate_config(cfg)
+        normalize_config(cfg)
+        cli_args = TrainerCliArgs()
+        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
+
+        train(cfg=cfg, dataset_meta=dataset_meta)
+        check_model_output_exists(temp_dir, cfg)
+
+    def test_fix_untrained_tokens_already_trained(self, temp_dir):
         # pylint: disable=duplicate-code
         cfg = DictDefault(
             {