Compare commits
5 Commits
0134093acc
...
train-refa
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
5c0510a876 | ||
|
|
e1bc18763a | ||
|
|
ed5178cd3d | ||
|
|
a3224c7c3c | ||
|
|
c4104fc10c |
@@ -19,6 +19,9 @@
|
||||
<br/>
|
||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
|
||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
|
||||
<a href="https://www.phorm.ai/query?projectId=e315ba4a-4e14-421f-ab05-38a1f9076f25">
|
||||
<img alt="phorm.ai" src="https://img.shields.io/badge/Phorm-Ask_AI-%23F2777A.svg?&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iNSIgaGVpZ2h0PSI0IiBmaWxsPSJub25lIiB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciPgogIDxwYXRoIGQ9Ik00LjQzIDEuODgyYTEuNDQgMS40NCAwIDAgMS0uMDk4LjQyNmMtLjA1LjEyMy0uMTE1LjIzLS4xOTIuMzIyLS4wNzUuMDktLjE2LjE2NS0uMjU1LjIyNmExLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxMmMtLjA5OS4wMTItLjE5Mi4wMTQtLjI3OS4wMDZsLTEuNTkzLS4xNHYtLjQwNmgxLjY1OGMuMDkuMDAxLjE3LS4xNjkuMjQ2LS4xOTFhLjYwMy42MDMgMCAwIDAgLjItLjEwNi41MjkuNTI5IDAgMCAwIC4xMzgtLjE3LjY1NC42NTQgMCAwIDAgLjA2NS0uMjRsLjAyOC0uMzJhLjkzLjkzIDAgMCAwLS4wMzYtLjI0OS41NjcuNTY3IDAgMCAwLS4xMDMtLjIuNTAyLjUwMiAwIDAgMC0uMTY4LS4xMzguNjA4LjYwOCAwIDAgMC0uMjQtLjA2N0wyLjQzNy43MjkgMS42MjUuNjcxYS4zMjIuMzIyIDAgMCAwLS4yMzIuMDU4LjM3NS4zNzUgMCAwIDAtLjExNi4yMzJsLS4xMTYgMS40NS0uMDU4LjY5Ny0uMDU4Ljc1NEwuNzA1IDRsLS4zNTctLjA3OUwuNjAyLjkwNkMuNjE3LjcyNi42NjMuNTc0LjczOS40NTRhLjk1OC45NTggMCAwIDEgLjI3NC0uMjg1Ljk3MS45NzEgMCAwIDEgLjMzNy0uMTRjLjExOS0uMDI2LjIyNy0uMDM0LjMyNS0uMDI2TDMuMjMyLjE2Yy4xNTkuMDE0LjMzNi4wMy40NTkuMDgyYTEuMTczIDEuMTczIDAgMCAxIC41NDUuNDQ3Yy4wNi4wOTQuMTA5LjE5Mi4xNDQuMjkzYTEuMzkyIDEuMzkyIDAgMCAxIC4wNzguNThsLS4wMjkuMzJaIiBmaWxsPSIjRjI3NzdBIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+CiAgPHBhdGggZD0iTTQuMDgyIDIuMDA3YTEuNDU1IDEuNDU1IDAgMCAxLS4wOTguNDI3Yy0uMDUuMTI0LS4xMTQuMjMyLS4xOTIuMzI0YTEuMTMgMS4xMyAwIDAgMS0uMjU0LjIyNyAxLjM1MyAxLjM1MyAwIDAgMS0uNTk1LjIxNGMtLjEuMDEyLS4xOTMuMDE0LS4yOC4wMDZsLTEuNTYtLjEwOC4wMzQtLjQwNi4wMy0uMzQ4IDEuNTU5LjE1NGMuMDkgMCAuMTczLS4wMS4yNDgtLjAzM2EuNjAzLjYwMyAwIDAgMCAuMi0uMTA2LjUzMi41MzIgMCAwIDAgLjEzOS0uMTcyLjY2LjY2IDAgMCAwIC4wNjQtLjI0MWwuMDI5LS4zMjFhLjk0Ljk0IDAgMCAwLS4wMzYtLjI1LjU3LjU3IDAgMCAwLS4xMDMtLjIwMi41MDIuNTAyIDAgMCAwLS4xNjgtLjEzOC42MDUuNjA1IDAgMCAwLS4yNC0uMDY3TDEuMjczLjgyN2MtLjA5NC0uMDA4LS4xNjguMDEtLjIyMS4wNTUtLjA1My4wNDUtLjA4NC4xMTQtLjA5Mi4yMDZMLjcwNSA0IDAgMy45MzhsLjI1NS0yLjkxMUExLjAxIDEuMDEgMCAwIDEgLjM5My41NzIuOTYyLjk2MiAwIDAgMSAuNjY2LjI4NmEuOTcuOTcgMCAwIDEgLjMzOC0uMTRDMS4xMjIuMTIgMS4yMy4xMSAxLjMyOC4xMTlsMS41OTMuMTRjLjE2LjAxNC4zLjA0Ny40MjMuMWExLjE3IDEuMTcgMCAwIDEgLjU0NS40NDhjLjA2MS4wOTUuMTA5LjE5My4xNDQuMjk1YTEuNDA2IDEuNDA2IDAgMCAxIC4wNzcuNTgzbC0uMDI4LjMyMloiIGZpbGw9IndoaXRlIi8+Cjwvc3ZnPgo=">
|
||||
</a>
|
||||
</p>
|
||||
|
||||
Axolotl is a tool designed to streamline post-training for various AI models.
|
||||
|
||||
@@ -40,7 +40,6 @@ website:
|
||||
|
||||
- section: "Deployments"
|
||||
contents:
|
||||
- docs/docker.qmd
|
||||
- docs/multi-gpu.qmd
|
||||
- docs/multi-node.qmd
|
||||
- docs/ray-integration.qmd
|
||||
|
||||
@@ -163,12 +163,6 @@ datasets:
|
||||
system: ["system"]
|
||||
tool: ["tool"]
|
||||
|
||||
# Optional[bool]. Whether to drop the system turn from the dataset. Only works with chat_template.
|
||||
# This does not drop the default system message from chat_template if it exists. If you wish to,
|
||||
# we recommend using a custom jinja template with the default system message removed or
|
||||
# adding a system turn with empty content.
|
||||
drop_system_message:
|
||||
|
||||
# IMPORTANT: The following fields determine which parts of the conversation to train on.
|
||||
# Priority order: message_field_training > message_field_training_detail > train_on_inputs or role in roles_to_train
|
||||
# See examples at `docs/dataset-formats/conversation.qmd`
|
||||
@@ -228,8 +222,8 @@ process_reward_model:
|
||||
chat_template: tokenizer_default
|
||||
# custom jinja template for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null.
|
||||
chat_template_jinja: null
|
||||
# Changes the default system message. Currently only supports chatml.
|
||||
default_system_message: You are a helpful assistant. Please give a long and detailed answer.
|
||||
# Changes the default system message
|
||||
default_system_message: You are a helpful assistant. Please give a long and detailed answer. # Currently only supports chatml.
|
||||
# Axolotl attempts to save the dataset as an arrow after packing the data together so
|
||||
# subsequent training attempts load faster, relative path
|
||||
dataset_prepared_path: data/last_run_prepared
|
||||
@@ -451,7 +445,7 @@ gradient_checkpointing: false
|
||||
early_stopping_patience: 3
|
||||
|
||||
# Specify a scheduler and kwargs to use with the optimizer
|
||||
lr_scheduler: # 'one_cycle' | 'rex' | 'log_sweep' | empty for cosine
|
||||
lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
|
||||
lr_scheduler_kwargs:
|
||||
cosine_min_lr_ratio: # decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr
|
||||
cosine_constant_lr_ratio: # freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step (https://arxiv.org/pdf/2308.04014.pdf)
|
||||
@@ -534,8 +528,6 @@ flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
|
||||
sdp_attention:
|
||||
# Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf
|
||||
s2_attention:
|
||||
# Optional[bool]. Whether to use low_cpu_mem_usage
|
||||
low_cpu_mem_usage:
|
||||
# Resume from a specific checkpoint dir
|
||||
resume_from_checkpoint:
|
||||
# If resume_from_checkpoint isn't set and you simply want it to start where it left off.
|
||||
|
||||
@@ -129,7 +129,6 @@ You can mix and match within each approach or across approaches to train a model
|
||||
We suggest this approach when you want to bring your own tokenized dataset.
|
||||
|
||||
Axolotl expects the dataset to have three keys:
|
||||
|
||||
- `input_ids`: from tokenizing formatted prompt
|
||||
- `attention_mask`: for masking padding. If you don't add padding, it would be equal to `len(input_ids) * [1]`
|
||||
- `labels`: this is the same as `input_ids`, however, if you want to mask certain tokens, you would set those indices to `-100`.
|
||||
|
||||
140
docs/docker.qmd
140
docs/docker.qmd
@@ -1,140 +0,0 @@
|
||||
---
|
||||
title: "Docker"
|
||||
format:
|
||||
html:
|
||||
toc: true
|
||||
toc-depth: 4
|
||||
---
|
||||
|
||||
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
||||
|
||||
## Base
|
||||
|
||||
The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl-base
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)
|
||||
|
||||
#### Tags format
|
||||
|
||||
```bash
|
||||
main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||
```
|
||||
|
||||
Tags examples:
|
||||
|
||||
- `main-base-py3.11-cu124-2.6.0`
|
||||
- `main-base-py3.11-cu124-2.5.1`
|
||||
- `main-base-py3.11-cu124-2.4.1`
|
||||
|
||||
## Main
|
||||
|
||||
The main image is the image that is used to run Axolotl. It is based on the `axolotlai/axolotl-base` image and includes the Axolotl codebase, dependencies, and more.
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
|
||||
|
||||
#### Tags format {#sec-main-tags}
|
||||
|
||||
```bash
|
||||
# on push to main
|
||||
main-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||
|
||||
# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
|
||||
main-latest
|
||||
|
||||
# nightly build
|
||||
{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}
|
||||
|
||||
# tagged release
|
||||
{version}
|
||||
```
|
||||
|
||||
:::{.callout-tip}
|
||||
|
||||
There may be some extra tags appended to the image, like `-vllm` which installs those packages.
|
||||
|
||||
:::
|
||||
|
||||
Tags examples:
|
||||
|
||||
- `main-py3.11-cu124-2.6.0`
|
||||
- `main-py3.11-cu124-2.5.1`
|
||||
- `main-py3.11-cu124-2.4.1`
|
||||
- `main-latest`
|
||||
- `main-20250303-py3.11-cu124-2.6.0`
|
||||
- `main-20250303-py3.11-cu124-2.5.1`
|
||||
- `main-20250303-py3.11-cu124-2.4.1`
|
||||
- `0.7.1`
|
||||
|
||||
## Cloud
|
||||
|
||||
The cloud image is the image that is used to run Axolotl in the cloud. It is based on the `axolotlai/axolotl` image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.
|
||||
|
||||
:::{.callout-tip}
|
||||
|
||||
Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variables to disable it.
|
||||
|
||||
:::
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl-cloud
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)
|
||||
|
||||
#### Tags format
|
||||
|
||||
This uses the same tags as the [`main` image](#sec-main-tags).
|
||||
|
||||
#### Environment variables
|
||||
|
||||
- `JUPYTER_DISABLE`: Disable Jupyter lab.
|
||||
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
|
||||
- `PUBLIC_KEY`: Add a public key for the SSH service.
|
||||
- `SSH_KEY`: Add a private key for the SSH service.
|
||||
|
||||
#### Volume mounts
|
||||
|
||||
:::{.callout-tip}
|
||||
|
||||
We recommend mounting volumes to `/workspace/data` for data persistence. `/workspace/axolotl` contains the source code and is ephemeral.
|
||||
|
||||
:::
|
||||
|
||||
- `/workspace/data/axolotl-artifacts`: Directory to store Axolotl artifacts.
|
||||
- `/workspace/data/huggingface-cache`: Directory to store HuggingFace cache.
|
||||
|
||||
## Cloud-no-tmux
|
||||
|
||||
This is the same as the [`cloud` image](#sec-cloud) but without tmux.
|
||||
|
||||
#### Image
|
||||
|
||||
```
|
||||
axolotlai/axolotl-cloud-term
|
||||
```
|
||||
|
||||
Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud-term)
|
||||
|
||||
:::{.callout-note}
|
||||
|
||||
The naming may be a bit confusing as it has `-term` appended to the end.
|
||||
|
||||
:::
|
||||
|
||||
#### Tags format
|
||||
|
||||
This uses the same tags as the [`cloud` image](#sec-cloud-tags).
|
||||
@@ -19,9 +19,7 @@ description: Frequently asked questions
|
||||
|
||||
**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**
|
||||
|
||||
**Q: ModuleNotFoundError: No module named 'mpi4py' using single GPU with deepspeed**
|
||||
|
||||
> A: You may be using deepspeed with single gpu. Please remove the `deepspeed:` section in the yaml file or `--deepspeed` CLI flag.
|
||||
> A: You may be using deepspeed with single gpu. Please don't set `deepspeed:` in yaml or cli.
|
||||
|
||||
**Q: The codes is stuck on saving preprocessed datasets.**
|
||||
|
||||
|
||||
@@ -65,8 +65,6 @@ docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
|
||||
```
|
||||
:::
|
||||
|
||||
Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.
|
||||
|
||||
## Cloud Environments {#sec-cloud}
|
||||
|
||||
### Cloud GPU Providers {#sec-cloud-gpu}
|
||||
|
||||
@@ -572,7 +572,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
||||
training_arguments_kwargs["embedding_lr_scale"] = self.cfg.embedding_lr_scale
|
||||
training_arguments_kwargs["lr_groups"] = self.cfg.lr_groups
|
||||
|
||||
if self.cfg.lr_scheduler in ["one_cycle", "rex", "log_sweep"]:
|
||||
if self.cfg.lr_scheduler in ["one_cycle", "log_sweep"]:
|
||||
training_arguments_kwargs["lr_scheduler_type"] = "cosine"
|
||||
training_arguments_kwargs[
|
||||
"alternate_lr_scheduler_type"
|
||||
|
||||
@@ -25,7 +25,6 @@ from trl.trainer.utils import pad_to_length
|
||||
from axolotl.monkeypatch.relora import ReLoRAScheduler
|
||||
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
||||
from axolotl.utils.schedulers import (
|
||||
RexLR,
|
||||
get_cosine_schedule_with_min_lr,
|
||||
get_cosine_schedule_with_quadratic_warmup,
|
||||
get_cosine_schedule_with_warmup_decay_constant,
|
||||
@@ -116,17 +115,6 @@ class SchedulerMixin(Trainer):
|
||||
**extra_lr_kwargs,
|
||||
**self.args.lr_scheduler_kwargs,
|
||||
)
|
||||
elif self.args.alternate_lr_scheduler_type == "rex":
|
||||
if use_cosine_min_lr:
|
||||
assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
|
||||
|
||||
self.lr_scheduler = RexLR(
|
||||
optimizer=optimizer,
|
||||
max_lr=self.args.learning_rate,
|
||||
min_lr=0 if not use_cosine_min_lr else (self.args.learning_rate * self.args.cosine_min_lr_ratio),
|
||||
total_steps=num_training_steps,
|
||||
num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
|
||||
)
|
||||
elif use_cosine_quadratic:
|
||||
if use_cosine_min_lr:
|
||||
LOG.warning("Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")
|
||||
|
||||
@@ -9,7 +9,6 @@ import logging
|
||||
from trl.trainer.grpo_trainer import RewardFunc
|
||||
|
||||
from axolotl.core.trainers.grpo.trainer import AxolotlGRPOTrainer
|
||||
from axolotl.utils.config.models.input.v0_4_1.trl import TRLConfig
|
||||
|
||||
LOG = logging.getLogger("axolotl")
|
||||
|
||||
@@ -32,44 +31,30 @@ class GRPOStrategy:
|
||||
@classmethod
|
||||
def set_training_args_kwargs(cls, cfg):
|
||||
grpo_args_kwargs = {}
|
||||
|
||||
if not hasattr(cfg, "trl") or not cfg.trl:
|
||||
return grpo_args_kwargs
|
||||
|
||||
trl: TRLConfig = cfg.trl # type: ignore
|
||||
|
||||
if trl.use_vllm:
|
||||
grpo_args_kwargs["use_vllm"] = trl.use_vllm
|
||||
grpo_args_kwargs["vllm_device"] = (
|
||||
trl.vllm_device if trl.vllm_device else "auto"
|
||||
)
|
||||
|
||||
if trl.vllm_gpu_memory_utilization:
|
||||
if cfg.trl and cfg.trl.use_vllm:
|
||||
grpo_args_kwargs["use_vllm"] = cfg.trl.use_vllm
|
||||
if cfg.trl and cfg.trl.vllm_device:
|
||||
grpo_args_kwargs["vllm_device"] = cfg.trl.vllm_device
|
||||
else:
|
||||
grpo_args_kwargs["vllm_device"] = "auto"
|
||||
if cfg.trl and cfg.trl.vllm_gpu_memory_utilization:
|
||||
grpo_args_kwargs[
|
||||
"vllm_gpu_memory_utilization"
|
||||
] = trl.vllm_gpu_memory_utilization
|
||||
|
||||
if trl.vllm_max_model_len:
|
||||
grpo_args_kwargs["vllm_max_model_len"] = trl.vllm_max_model_len
|
||||
|
||||
if trl.num_generations:
|
||||
grpo_args_kwargs["num_generations"] = trl.num_generations
|
||||
|
||||
if trl.sync_ref_model:
|
||||
grpo_args_kwargs["sync_ref_model"] = trl.sync_ref_model
|
||||
|
||||
if trl.ref_model_mixup_alpha:
|
||||
grpo_args_kwargs["ref_model_mixup_alpha"] = trl.ref_model_mixup_alpha
|
||||
|
||||
if trl.ref_model_sync_steps:
|
||||
grpo_args_kwargs["ref_model_sync_steps"] = trl.ref_model_sync_steps
|
||||
|
||||
grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
|
||||
grpo_args_kwargs["log_completions"] = trl.log_completions
|
||||
|
||||
if trl.reward_weights:
|
||||
grpo_args_kwargs["reward_weights"] = trl.reward_weights
|
||||
|
||||
] = cfg.trl.vllm_gpu_memory_utilization
|
||||
if cfg.trl and cfg.trl.vllm_max_model_len:
|
||||
grpo_args_kwargs["vllm_max_model_len"] = cfg.trl.vllm_max_model_len
|
||||
if cfg.trl and cfg.trl.num_generations:
|
||||
grpo_args_kwargs["num_generations"] = cfg.trl.num_generations
|
||||
if cfg.trl and cfg.trl.sync_ref_model:
|
||||
grpo_args_kwargs["sync_ref_model"] = cfg.trl.sync_ref_model
|
||||
if cfg.trl and cfg.trl.ref_model_mixup_alpha:
|
||||
grpo_args_kwargs[
|
||||
"ref_model_mixup_alpha"
|
||||
] = cfg.trl.ref_model_mixup_alpha
|
||||
if cfg.trl and cfg.trl.ref_model_sync_steps:
|
||||
grpo_args_kwargs["ref_model_sync_steps"] = cfg.trl.ref_model_sync_steps
|
||||
grpo_args_kwargs["max_completion_length"] = cfg.trl.max_completion_length
|
||||
grpo_args_kwargs["log_completions"] = cfg.trl.log_completions
|
||||
return grpo_args_kwargs
|
||||
|
||||
@classmethod
|
||||
|
||||
@@ -4,22 +4,6 @@ Cut Cross Entropy reduces VRAM usage through optimization on the cross-entropy o
|
||||
|
||||
See https://github.com/apple/ml-cross-entropy
|
||||
|
||||
## Requirements
|
||||
|
||||
- PyTorch 2.4.0 or higher
|
||||
|
||||
## Installation
|
||||
|
||||
Run the following command to install `cut_cross_entropy[transformers]` if you don't have it already.
|
||||
|
||||
```bash
|
||||
# if you are in dev environment
|
||||
python scripts/cutcrossentropy_install.py | sh
|
||||
|
||||
# if you are not in dev environment
|
||||
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy @ git+https://github.com/apple/ml-cross-entropy.git@9c297c905f55b73594b5d650722d1e78183b77bd"'
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
||||
```yaml
|
||||
|
||||
@@ -518,7 +518,7 @@ class HyperparametersConfig(BaseModel):
|
||||
)
|
||||
torchdistx_path: Optional[str] = None
|
||||
lr_scheduler: Optional[
|
||||
Union[SchedulerType, Literal["one_cycle"], Literal["rex"]]
|
||||
Union[SchedulerType, Literal["one_cycle"]]
|
||||
] = SchedulerType.COSINE
|
||||
lr_scheduler_kwargs: Optional[Dict[str, Any]] = None
|
||||
lr_quadratic_warmup: Optional[bool] = None
|
||||
|
||||
@@ -27,7 +27,6 @@ class TRLConfig(BaseModel):
|
||||
vllm_dtype: Optional[str] = "auto"
|
||||
|
||||
reward_funcs: Optional[List[str]] = None
|
||||
reward_weights: Optional[List[float]] = None
|
||||
num_generations: Optional[int] = None
|
||||
log_completions: Optional[bool] = False
|
||||
|
||||
|
||||
@@ -6,80 +6,6 @@ from torch.optim import Optimizer
|
||||
from torch.optim.lr_scheduler import LambdaLR, LRScheduler
|
||||
|
||||
|
||||
class RexLR(LRScheduler):
|
||||
"""
|
||||
Reflected Exponential (REX) learning rate scheduler.
|
||||
|
||||
- Original implementation: https://github.com/IvanVassi/REX_LR
|
||||
- Original license: Apache 2.0
|
||||
- Based on: https://arxiv.org/abs/2107.04197
|
||||
|
||||
Args:
|
||||
optimizer (torch.optim.Optimizer): The optimizer to schedule the learning rate for.
|
||||
max_lr (float): The maximum learning rate.
|
||||
min_lr (float): The minimum learning rate.
|
||||
total_steps (int): The total number of training steps.
|
||||
num_warmup_steps (int): The number of warmup steps.
|
||||
last_step (int): The index of last step.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self, optimizer, max_lr, min_lr, total_steps=0, num_warmup_steps=0, last_step=0
|
||||
):
|
||||
if min_lr > max_lr:
|
||||
raise ValueError(
|
||||
f'Value of "min_lr" should be less than value of "max_lr". Got min_lr={min_lr} and max_lr={max_lr}'
|
||||
)
|
||||
if num_warmup_steps > total_steps:
|
||||
raise ValueError(
|
||||
f"num_warmup_steps ({num_warmup_steps}) must be less than or equal to total_steps ({total_steps})."
|
||||
)
|
||||
|
||||
self.min_lr = min_lr
|
||||
self.max_lr = max_lr
|
||||
self.total_steps = total_steps
|
||||
self.num_warmup_steps = num_warmup_steps
|
||||
self.last_step = last_step - 1
|
||||
|
||||
# Ensure each parameter group has an "initial_lr" key to avoid issues when resuming.
|
||||
for group in optimizer.param_groups:
|
||||
group.setdefault("initial_lr", group["lr"])
|
||||
|
||||
# Pass self.last_step as last_epoch to the parent.
|
||||
super().__init__(optimizer, last_epoch=self.last_step)
|
||||
|
||||
@property
|
||||
def last_step(self):
|
||||
return self.last_epoch
|
||||
|
||||
@last_step.setter
|
||||
def last_step(self, value):
|
||||
self.last_epoch = value
|
||||
|
||||
def get_lr(self):
|
||||
# Warmup phase: if defined, increase lr linearly from 0 to max_lr.
|
||||
if 1 <= self.last_step <= self.num_warmup_steps:
|
||||
return [
|
||||
base_lr * self.last_step / self.num_warmup_steps
|
||||
for base_lr in self.base_lrs
|
||||
]
|
||||
|
||||
# Post-warmup phase: adjust step relative to the end of warmup.
|
||||
step_after = self.last_step - self.num_warmup_steps
|
||||
remaining_steps = self.total_steps - self.num_warmup_steps
|
||||
|
||||
# Avoid LR spiking
|
||||
if step_after >= remaining_steps or step_after == -1 or remaining_steps <= 0:
|
||||
return [self.min_lr for _ in self.base_lrs]
|
||||
|
||||
mod_iter = step_after % remaining_steps
|
||||
z = (remaining_steps - mod_iter) / remaining_steps
|
||||
rex_factor = self.min_lr / self.max_lr + (1.0 - self.min_lr / self.max_lr) * (
|
||||
z / (0.1 + 0.9 * z)
|
||||
)
|
||||
return [base_lr * rex_factor for base_lr in self.base_lrs]
|
||||
|
||||
|
||||
class InterpolatingLogScheduler(LRScheduler):
|
||||
"""
|
||||
A scheduler that interpolates learning rates in a logarithmic fashion
|
||||
|
||||
@@ -1,71 +0,0 @@
|
||||
"""
|
||||
E2E tests for custom schedulers using Llama
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import unittest
|
||||
|
||||
from axolotl.cli.args import TrainerCliArgs
|
||||
from axolotl.common.datasets import load_datasets
|
||||
from axolotl.train import train
|
||||
from axolotl.utils.config import normalize_config, validate_config
|
||||
from axolotl.utils.dict import DictDefault
|
||||
|
||||
from .utils import check_model_output_exists, with_temp_dir
|
||||
|
||||
LOG = logging.getLogger("axolotl.tests.e2e")
|
||||
os.environ["WANDB_DISABLED"] = "true"
|
||||
|
||||
|
||||
class TestCustomSchedulers(unittest.TestCase):
|
||||
"""
|
||||
Test case for Llama models using LoRA
|
||||
"""
|
||||
|
||||
@with_temp_dir
|
||||
def test_rex_scheduler(self, temp_dir):
|
||||
# pylint: disable=duplicate-code
|
||||
cfg = DictDefault(
|
||||
{
|
||||
"base_model": "JackFram/llama-68m",
|
||||
"tokenizer_type": "LlamaTokenizer",
|
||||
"sequence_len": 1024,
|
||||
"load_in_8bit": True,
|
||||
"adapter": "lora",
|
||||
"lora_r": 8,
|
||||
"lora_alpha": 16,
|
||||
"lora_dropout": 0.05,
|
||||
"lora_target_linear": True,
|
||||
"val_set_size": 0.1,
|
||||
"special_tokens": {
|
||||
"unk_token": "<unk>",
|
||||
"bos_token": "<s>",
|
||||
"eos_token": "</s>",
|
||||
},
|
||||
"datasets": [
|
||||
{
|
||||
"path": "mhenrichsen/alpaca_2k_test",
|
||||
"type": "alpaca",
|
||||
},
|
||||
],
|
||||
"num_epochs": 1,
|
||||
"micro_batch_size": 8,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"output_dir": temp_dir,
|
||||
"learning_rate": 0.00001,
|
||||
"optimizer": "adamw_hf",
|
||||
"max_steps": 20,
|
||||
"lr_scheduler": "rex",
|
||||
"warmup_steps": 5,
|
||||
"cosine_min_lr_ratio": 0.05,
|
||||
}
|
||||
)
|
||||
|
||||
cfg = validate_config(cfg)
|
||||
normalize_config(cfg)
|
||||
cli_args = TrainerCliArgs()
|
||||
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||
|
||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||
check_model_output_exists(temp_dir, cfg)
|
||||
Reference in New Issue
Block a user