Compare commits

..

26 Commits

Author SHA1 Message Date
Dan Saunders
926dc4af90 updates 2025-04-25 02:28:38 +00:00
Dan Saunders
6810f0ee19 minimize diffs to GRPO trainer 2025-04-24 22:40:33 +00:00
Dan Saunders
6c65eeaaf7 finalizing SP + GRPO trainer 2025-04-24 22:40:32 +00:00
Dan Saunders
7f4e4076e1 progress 2025-04-24 22:39:31 +00:00
Dan Saunders
4f2d092216 subclassing constructor 2025-04-24 22:39:31 +00:00
Dan Saunders
b13b6e185f stronger subclassing of TRL GRPO trainer; custom distributed sampler 2025-04-24 22:39:29 +00:00
Dan Saunders
76e2d2e60b progress 2025-04-24 22:38:42 +00:00
Dan Saunders
11b6803ff4 grpo sp support 2025-04-24 22:38:22 +00:00
Dan Saunders
e55dce9995 fix 2025-04-24 22:38:21 +00:00
Dan Saunders
9640aacfc9 fixes for batch API funcs, simplify 2025-04-24 22:37:14 +00:00
Dan Saunders
5306c6acbb fix 2025-04-24 22:35:18 +00:00
Dan Saunders
4ae8df16a9 adding all batch ring-flash-attn methods via single adapter 2025-04-24 22:35:18 +00:00
Dan Saunders
74e7cfd28f update 2025-04-24 22:35:18 +00:00
Dan Saunders
2bb5c1fe7e batch api HF adapter for ring-flash-attn; cleanup and improvements 2025-04-24 22:35:18 +00:00
Dan Saunders
3f1873cc62 pytest 2025-04-24 22:31:24 +00:00
Dan Saunders
072df89e0e add gather post hook, simplify, fixes 2025-04-24 22:31:24 +00:00
Dan Saunders
cb7c3ee847 tweak codecov yaml 2025-04-24 22:31:24 +00:00
Dan Saunders
d92ac7a41d reorg 2025-04-24 22:31:24 +00:00
Dan Saunders
5816433121 nit 2025-04-24 22:31:24 +00:00
Dan Saunders
e5a4e21497 simplifying 2025-04-24 22:31:24 +00:00
Dan Saunders
65ae78009c simplifying 2025-04-24 22:31:24 +00:00
Dan Saunders
7e5168ad74 accommodate both training context managers 2025-04-24 22:31:24 +00:00
Dan Saunders
cd393fecc3 further simplifying 2025-04-24 22:31:24 +00:00
Dan Saunders
bac5568bda update 2025-04-24 22:31:24 +00:00
Dan Saunders
69aeae80ed updates 2025-04-24 22:31:24 +00:00
Dan Saunders
cafda804ec ctx manager for SP 2025-04-24 22:31:24 +00:00
39 changed files with 1136 additions and 1761 deletions

View File

@@ -8,7 +8,6 @@ on:
- 'setup.py'
- 'pyproject.toml'
- '.github/workflows/multi-gpu-e2e.yml'
- 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
workflow_dispatch:
schedule:
- cron: '0 0 * * 1,4' # Runs at 00:00 UTC every monday & thursday

161
.runpod/.gitignore vendored
View File

@@ -1,161 +0,0 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
pod/scripts/config.yaml

View File

@@ -1,18 +0,0 @@
FROM runpod/pytorch:3.10-2.0.0-117
COPY .runpod/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade pip && \
python3 -m pip install --upgrade -r /requirements.txt
# Environment settings
ARG BASE_VOLUME="/runpod-volume"
ENV BASE_VOLUME=$BASE_VOLUME
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
COPY .runpod/src /src
CMD ["python3", "/src/handler.py"]

View File

@@ -1,335 +0,0 @@
<h1>LLM Post Training- Full fine-tune, LoRA, QLoRa etc. Llama/Mistral/Gemma and more</h1>
# Configuration Options
This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
## Usage
You can use these configuration Options:
1. As a JSON request body:
```json
{
"input": {
"user_id": "user",
"model_id": "model-name",
"run_id": "run-id",
"credentials": {
"wandb_api_key": "", # add your Weights & biases key. TODO: you will be able to set this in Enviornment variables.
"hf_token": "", # add your HF_token. TODO: you will be able to set this in Enviornment variables.
},
"args": {
"base_model": "NousResearch/Llama-3.2-1B",
// ... other options
}
}
}
```
## Configuration Options
### Model Configuration
| Option | Description | Default |
| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
| `base_model` | Path to the base model (local or HuggingFace) | Required |
| `base_model_config` | Configuration path for the base model | Same as base_model |
| `revision_of_model` | Specific model revision from HuggingFace hub | Latest |
| `tokenizer_config` | Custom tokenizer configuration path | Optional |
| `model_type` | Type of model to load | AutoModelForCausalLM |
| `tokenizer_type` | Type of tokenizer to use | AutoTokenizer |
| `hub_model_id` | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional |
## Model Family Identification
| Option | Default | Description |
| -------------------------- | ------- | ------------------------------ |
| `is_falcon_derived_model` | `false` | Whether model is Falcon-based |
| `is_llama_derived_model` | `false` | Whether model is LLaMA-based |
| `is_qwen_derived_model` | `false` | Whether model is Qwen-based |
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
## Model Configuration Overrides
| Option | Default | Description |
| ----------------------------------------------- | ---------- | ---------------------------------- |
| `overrides_of_model_config.rope_scaling.type` | `"linear"` | RoPE scaling type (linear/dynamic) |
| `overrides_of_model_config.rope_scaling.factor` | `1.0` | RoPE scaling factor |
### Model Loading Options
| Option | Description | Default |
| -------------- | ----------------------------- | ------- |
| `load_in_8bit` | Load model in 8-bit precision | false |
| `load_in_4bit` | Load model in 4-bit precision | false |
| `bf16` | Use bfloat16 precision | false |
| `fp16` | Use float16 precision | false |
| `tf32` | Use tensor float 32 precision | false |
## Memory and Device Settings
| Option | Default | Description |
| ------------------ | --------- | ----------------------- |
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit |
| `lora_on_cpu` | `false` | Load LoRA on CPU |
| `device_map` | `"auto"` | Device mapping strategy |
| `max_memory` | `null` | Max memory per device |
## Training Hyperparameters
| Option | Default | Description |
| ----------------------------- | --------- | --------------------------- |
| `gradient_accumulation_steps` | `1` | Gradient accumulation steps |
| `micro_batch_size` | `2` | Batch size per GPU |
| `eval_batch_size` | `null` | Evaluation batch size |
| `num_epochs` | `4` | Number of training epochs |
| `warmup_steps` | `100` | Warmup steps |
| `warmup_ratio` | `0.05` | Warmup ratio |
| `learning_rate` | `0.00003` | Learning rate |
| `lr_quadratic_warmup` | `false` | Quadratic warmup |
| `logging_steps` | `null` | Logging frequency |
| `eval_steps` | `null` | Evaluation frequency |
| `evals_per_epoch` | `null` | Evaluations per epoch |
| `save_strategy` | `"epoch"` | Checkpoint saving strategy |
| `save_steps` | `null` | Saving frequency |
| `saves_per_epoch` | `null` | Saves per epoch |
| `save_total_limit` | `null` | Maximum checkpoints to keep |
| `max_steps` | `null` | Maximum training steps |
### Dataset Configuration
```yaml
datasets:
- path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
ds_type: json # Dataset type
data_files: path/to/data # Source data files
train_on_split: train # Dataset split to use
```
## Chat Template Settings
| Option | Default | Description |
| ------------------------ | -------------------------------- | ---------------------- |
| `chat_template` | `"tokenizer_default"` | Chat template type |
| `chat_template_jinja` | `null` | Custom Jinja template |
| `default_system_message` | `"You are a helpful assistant."` | Default system message |
## Dataset Processing
| Option | Default | Description |
| ----------------------------- | -------------------------- | --------------------------------- |
| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
| `dataset_processes` | `4` | Number of preprocessing processes |
| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
## LoRA Configuration
| Option | Default | Description |
| -------------------------- | ---------------------- | ------------------------------ |
| `adapter` | `"lora"` | Adapter type (lora/qlora) |
| `lora_model_dir` | `""` | Directory with pretrained LoRA |
| `lora_r` | `8` | LoRA attention dimension |
| `lora_alpha` | `16` | LoRA alpha parameter |
| `lora_dropout` | `0.05` | LoRA dropout |
| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA |
| `lora_target_linear` | `false` | Target all linear modules |
| `peft_layers_to_transform` | `[]` | Layers to transform |
| `lora_modules_to_save` | `[]` | Modules to save |
| `lora_fan_in_fan_out` | `false` | Fan in/out structure |
## Optimization Settings
| Option | Default | Description |
| ------------------------- | ------- | -------------------------- |
| `train_on_inputs` | `false` | Train on input prompts |
| `group_by_length` | `false` | Group by sequence length |
| `gradient_checkpointing` | `false` | Use gradient checkpointing |
| `early_stopping_patience` | `3` | Early stopping patience |
## Learning Rate Scheduling
| Option | Default | Description |
| -------------------------- | ---------- | -------------------- |
| `lr_scheduler` | `"cosine"` | Scheduler type |
| `lr_scheduler_kwargs` | `{}` | Scheduler parameters |
| `cosine_min_lr_ratio` | `null` | Minimum LR ratio |
| `cosine_constant_lr_ratio` | `null` | Constant LR ratio |
| `lr_div_factor` | `null` | LR division factor |
## Optimizer Settings
| Option | Default | Description |
| ---------------------- | ------------ | ------------------- |
| `optimizer` | `"adamw_hf"` | Optimizer choice |
| `optim_args` | `{}` | Optimizer arguments |
| `optim_target_modules` | `[]` | Target modules |
| `weight_decay` | `null` | Weight decay |
| `adam_beta1` | `null` | Adam beta1 |
| `adam_beta2` | `null` | Adam beta2 |
| `adam_epsilon` | `null` | Adam epsilon |
| `max_grad_norm` | `null` | Gradient clipping |
## Attention Implementations
| Option | Default | Description |
| -------------------------- | ------- | ----------------------------- |
| `flash_optimum` | `false` | Use better transformers |
| `xformers_attention` | `false` | Use xformers |
| `flash_attention` | `false` | Use flash attention |
| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
| `flash_attn_rms_norm` | `false` | Flash attention RMS norm |
| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations |
| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations |
| `sdp_attention` | `false` | Use scaled dot product |
| `s2_attention` | `false` | Use shifted sparse attention |
## Tokenizer Modifications
| Option | Default | Description |
| ---------------- | ------- | ---------------------------- |
| `special_tokens` | - | Special tokens to add/modify |
| `tokens` | `[]` | Additional tokens |
## Distributed Training
| Option | Default | Description |
| ----------------------- | ------- | --------------------- |
| `fsdp` | `null` | FSDP configuration |
| `fsdp_config` | `null` | FSDP config options |
| `deepspeed` | `null` | Deepspeed config path |
| `ddp_timeout` | `null` | DDP timeout |
| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity |
| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers |
<details>
<summary><h3>Example Configuration Request:</h3></summary>
Here's a complete example for fine-tuning a LLaMA model using LoRA:
```json
{
"input": {
"user_id": "user",
"model_id": "llama-test",
"run_id": "test-run",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "NousResearch/Llama-3.2-1B",
"load_in_8bit": false,
"load_in_4bit": false,
"strict": false,
"datasets": [
{
"path": "teknium/GPT4-LLM-Cleaned",
"type": "alpaca"
}
],
"dataset_prepared_path": "last_run_prepared",
"val_set_size": 0.1,
"output_dir": "./outputs/lora-out",
"adapter": "lora",
"sequence_len": 2048,
"sample_packing": true,
"eval_sample_packing": true,
"pad_to_sequence_len": true,
"lora_r": 16,
"lora_alpha": 32,
"lora_dropout": 0.05,
"lora_target_modules": [
"gate_proj",
"down_proj",
"up_proj",
"q_proj",
"v_proj",
"k_proj",
"o_proj"
],
"gradient_accumulation_steps": 2,
"micro_batch_size": 2,
"num_epochs": 1,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": false,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"loss_watchdog_threshold": 5,
"loss_watchdog_patience": 3,
"warmup_steps": 10,
"evals_per_epoch": 4,
"saves_per_epoch": 1,
"weight_decay": 0,
"hub_model_id": "runpod/llama-fr-lora",
"wandb_name": "test-run-1",
"wandb_project": "test-run-1",
"wandb_entity": "axo-test",
"special_tokens": {
"pad_token": "<|end_of_text|>"
}
}
}
}
```
</details>
### Advanced Features
#### Wandb Integration
- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run
#### Performance Optimization
- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use Flash Attention implementation
- `xformers_attention`: Use xFormers attention implementation
### Available Optimizers
The following optimizers are supported:
- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer
## Notes
- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).
### Errors:
- if you face any issues with the Flash Attention-2, Delete yoor worker and Re-start.

View File

@@ -1,93 +0,0 @@
{
"title": "Axolotl Fine-Tuning",
"description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
"type": "serverless",
"category": "language",
"iconUrl": "https://avatars.githubusercontent.com/u/167502477",
"config": {
"runsOn": "GPU",
"containerDiskInGb": 200,
"gpuCount": 1,
"allowedCudaVersions": [
"12.8",
"12.7",
"12.6",
"12.5",
"12.4"
],
"presets": [],
"env": [
{
"key": "TOKENIZER",
"input": {
"name": "Tokenizer",
"type": "string",
"description": "Name or path of the Hugging Face tokenizer to use.",
"default": "",
"advanced": true
}
},
{
"key": "MAX_NUM_SEQS",
"input": {
"name": "Max Num Seqs",
"type": "number",
"description": "Maximum number of sequences per iteration.",
"default": 256,
"advanced": true
}
},
{
"key": "DISABLE_LOG_STATS",
"input": {
"name": "Disable Log Stats",
"type": "boolean",
"description": "Disable logging statistics.",
"default": false,
"trueValue": "true",
"falseValue": "false"
}
},
{
"key": "LOAD_FORMAT",
"input": {
"name": "Load Format",
"type": "string",
"description": "The format of the model weights to load.",
"default": "auto",
"options": [
{
"label": "auto",
"value": "auto"
},
{
"label": "pt",
"value": "pt"
},
{
"label": "safetensors",
"value": "safetensors"
},
{
"label": "npcache",
"value": "npcache"
},
{
"label": "dummy",
"value": "dummy"
},
{
"label": "tensorizer",
"value": "tensorizer"
},
{
"label": "bitsandbytes",
"value": "bitsandbytes"
}
],
"advanced": true
}
}
]
}
}

View File

@@ -1,15 +0,0 @@
# Required Python packages get listed here, one per line.
# Reccomended to lock the version number to avoid unexpected changes.
# You can also install packages from a git repository, e.g.:
# git+https://github.com/runpod/runpod-python.git
# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
runpod~=1.7.0
huggingface_hub
typing-extensions
pydantic
pydantic-settings
hf-transfer
setuptools
numpy==2.0.0
axolotl[flash-attn,deepspeed]

View File

@@ -1,577 +0,0 @@
# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# # This can also be a relative path to a model on disk
# base_model: ./llama-7b-hf
# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
# base_model_ignore_patterns:
# # If the base_model repo on hf hub doesn't include configuration .json files,
# # You can set that here, or leave this empty to default to base_model
# base_model_config: ./llama-7b-hf
# # You can specify to choose a specific model revision from huggingface hub
# model_revision:
# # Optional tokenizer configuration override in case you want to use a different tokenizer
# # than the one defined in the base model
# tokenizer_config:
# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
# model_type: AutoModelForCausalLM
# # Corresponding tokenizer for the model AutoTokenizer is a good choice
# tokenizer_type: AutoTokenizer
# # Trust remote code for untrusted source
# trust_remote_code:
# # use_fast option for tokenizer loading from_pretrained, default to True
# tokenizer_use_fast:
# # Whether to use the legacy tokenizer setting, defaults to True
# tokenizer_legacy:
# # Resize the model embeddings when new tokens are added to multiples of 32
# # This is reported to improve training speed on some models
# resize_token_embeddings_to_32x:
# # Used to identify which the model is based on
# is_falcon_derived_model:
# is_llama_derived_model:
# # Please note that if you set this to true, `padding_side` will be set to "left" by default
# is_mistral_derived_model:
# is_qwen_derived_model:
# # optional overrides to the base model configuration
# model_config:
# # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
# rope_scaling:
# type: # linear | dynamic
# factor: # float
# # Whether you are training a 4-bit GPTQ quantized model
# gptq: true
# gptq_groupsize: 128 # group size
# gptq_model_v1: false # v1 or v2
# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
# load_in_8bit: true
# # Use bitsandbytes 4 bit
# load_in_4bit:
# # Use CUDA bf16
# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# # Use CUDA fp16
# fp16: true
# # Use CUDA tf32
# tf32: true # require >=ampere
# # No AMP (automatic mixed precision)
# bfloat16: true # require >=ampere
# float16: true
# # A list of one or more datasets to finetune the model with
# datasets:
# # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
# - path: vicgalle/alpaca-gpt4
# # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
# type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
# ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
# data_files: # Optional[str] path to source data files
# shards: # Optional[int] number of shards to split data into
# name: # Optional[str] name of dataset configuration to load
# train_on_split: train # Optional[str] name of dataset split to load from
# # Optional[str] fastchat conversation type, only used with type: sharegpt
# conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
# field_human: # Optional[str]. Human key to use for conversation.
# field_model: # Optional[str]. Assistant key to use for conversation.
# # Custom user prompt
# - path: repo
# type:
# # The below are defaults. only set what's needed.
# system_prompt: ""
# system_format: "{system}"
# field_system: system
# field_instruction: instruction
# field_input: input
# field_output: output
# # Customizable to be single line or multi-line
# # 'format' can include {input}
# format: |-
# User: {instruction} {input}
# Assistant:
# # 'no_input_format' cannot include {input}
# no_input_format: "{instruction} "
# # For `completion` datsets only, uses the provided field instead of `text` column
# field:
# # Axolotl attempts to save the dataset as an arrow after packing the data together so
# # subsequent training attempts load faster, relative path
# dataset_prepared_path: data/last_run_prepared
# # Push prepared dataset to hub
# push_dataset_to_hub: # repo path
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# # if not set.
# dataset_processes: # defaults to os.cpu_count() if not set
# # push checkpoints to hub
# hub_model_id: # repo path to push finetuned model
# # how to push checkpoints to hub
# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
# hub_strategy:
# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# # Required to be true when used in combination with `push_dataset_to_hub`
# hf_use_auth_token: # boolean
# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
# val_set_size: 0.04
# # Num shards for whole dataset
# dataset_shard_num:
# # Index of shard to use for whole dataset
# dataset_shard_idx:
# # The maximum length of an input to train with, this should typically be less than 2048
# # as most models have a token/context limit of 2048
# sequence_len: 2048
# # Pad inputs so each step uses constant sized buffers
# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
# pad_to_sequence_len:
# # Max sequence length to concatenate training samples together up to
# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# # FutureWarning: This will soon be DEPRECATED
# max_packed_sequence_len: 1024
# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
# sample_packing:
# # Set to 'false' if getting errors during eval with sample_packing on.
# eval_sample_packing:
# # You can set these packing optimizations AFTER starting a training at least once.
# # The trainer will provide recommended values for these values.
# sample_packing_eff_est:
# total_num_tokens:
# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
# adapter: lora
# # If you already have a lora model trained that you want to load, put that here.
# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
# lora_model_dir:
# # LoRA hyperparameters
# # For more details about the following options, see:
# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
# lora_r: 8
# lora_alpha: 16
# lora_dropout: 0.05
# lora_target_modules:
# - q_proj
# - v_proj
# # - k_proj
# # - o_proj
# # - gate_proj
# # - down_proj
# # - up_proj
# lora_target_linear: # If true, will target all linear layers
# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
# lora_modules_to_save:
# # - embed_tokens
# # - lm_head
# # Once you complete training, the model will be saved to the following directory.
# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
# lora_out_dir:
# lora_fan_in_fan_out: false
# # ReLoRA configuration
# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
# relora_steps: # Number of steps per ReLoRA restart
# relora_warmup_steps: # Number of per-restart warmup steps
# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
# # wandb configuration if you're using it
# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
# wandb_project: # Your wandb project name
# wandb_entity: # A wandb Team name if using a Team
# wandb_watch:
# wandb_run_id: # Set the name of your wandb run
# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
# # Where to save the full-finetuned model to
# output_dir: ./completed-model
# # Whether to use torch.compile and which backend to use
# torch_compile: # bool
# torch_compile_backend: # Optional[str]
# # Training hyperparameters
# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
# gradient_accumulation_steps: 1
# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
# micro_batch_size: 2
# eval_batch_size:
# num_epochs: 4
# warmup_steps: 100 # cannot use with warmup_ratio
# warmup_ratio: 0.05 # cannot use with warmup_steps
# learning_rate: 0.00003
# lr_quadratic_warmup:
# logging_steps:
# save_strategy: # Set to `no` to skip checkpoint saves
# save_steps: # Leave empty to save at each epoch
# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
# save_total_limit: # Checkpoints saved at a time
# # Maximum number of iterations to train for. It precedes num_epochs which means that
# # if both are set, num_epochs will not be guaranteed.
# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
# max_steps:
# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
# # Save model as safetensors (require safetensors package)
# save_safetensors:
# # Whether to mask out or include the human's prompt from the training labels
# train_on_inputs: false
# # Group similarly sized data to minimize padding.
# # May be slower to start, as it must download and sort the entire dataset.
# # Note that training loss may have an oscillating pattern with this enabled.
# group_by_length: false
# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
# gradient_checkpointing: false
# # Stop training after this many evaluation losses have increased in a row
# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
# early_stopping_patience: 3
# # Specify a scheduler and kwargs to use with the optimizer
# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
# lr_scheduler_kwargs:
# # For one_cycle optim
# lr_div_factor: # Learning rate div factor
# # For log_sweep optim
# log_sweep_min_lr:
# log_sweep_max_lr:
# # Specify optimizer
# # Valid values are driven by the Transformers OptimizerNames class, see:
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
# #
# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# # in the examples/ for your model and fine-tuning use case.
# #
# # Valid values for 'optimizer' include:
# # - adamw_hf
# # - adamw_torch
# # - adamw_torch_fused
# # - adamw_torch_xla
# # - adamw_apex_fused
# # - adafactor
# # - adamw_anyprecision
# # - sgd
# # - adagrad
# # - adamw_bnb_8bit
# # - lion_8bit
# # - lion_32bit
# # - paged_adamw_32bit
# # - paged_adamw_8bit
# # - paged_lion_32bit
# # - paged_lion_8bit
# optimizer:
# # Specify weight decay
# weight_decay:
# # adamw hyperparams
# adam_beta1:
# adam_beta2:
# adam_epsilon:
# # Gradient clipping max norm
# max_grad_norm:
# # Augmentation techniques
# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
# # currently only supported on Llama and Mistral
# noisy_embedding_alpha:
# # Whether to bettertransformers
# flash_optimum:
# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
# xformers_attention:
# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
# flash_attention:
# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# # Whether to use scaled-dot-product attention
# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
# sdp_attention:
# # Landmark attention (only llama)
# landmark_attention:
# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
# # LLaMA only
# xpos_rope:
# # Resume from a specific checkpoint dir
# resume_from_checkpoint:
# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# # Be careful with this being turned on between different models.
# auto_resume_from_checkpoints: false
# # Don't mess with this, it's here for accelerate and torchrun
# local_rank:
# # Add or change special tokens.
# # If you add tokens here, you don't need to add them to the `tokens` list.
# special_tokens:
# # bos_token: "<s>"
# # eos_token: "</s>"
# # unk_token: "<unk>"
# # Add extra tokens.
# tokens:
# # FSDP
# fsdp:
# fsdp_config:
# # Deepspeed config path. e.g., deepspeed/zero3.json
# deepspeed:
# # Advanced DDP Arguments
# ddp_timeout:
# ddp_bucket_cap_mb:
# ddp_broadcast_buffers:
# # Path to torch distx for optim 'adamw_anyprecision'
# torchdistx_path:
# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
# pretraining_dataset:
# # Debug mode
# debug:
# # Seed
# seed:
# # Allow overwrite yml config using from cli
# strict:
base_model: ${BASE_MODEL}
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
base_model_config: ${BASE_MODEL_CONFIG}
revision_of_model: ${REVISION_OF_MODEL}
tokenizer_config: ${TOKENIZER_CONFIG}
model_type: ${MODEL_TYPE}
tokenizer_type: ${TOKENIZER_TYPE}
trust_remote_code: ${TRUST_REMOTE_CODE}
tokenizer_use_fast: ${TOKENIZER_USE_FAST}
tokenizer_legacy: ${TOKENIZER_LEGACY}
resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
overrides_of_model_config:
rope_scaling:
type: ${ROPE_SCALING_TYPE}
factor: ${ROPE_SCALING_FACTOR}
bnb_config_kwargs:
llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
gptq: ${GPTQ}
load_in_8bit: ${LOAD_IN_8BIT}
load_in_4bit: ${LOAD_IN_4BIT}
bf16: ${BF16}
fp16: ${FP16}
tf32: ${TF32}
bfloat16: ${BFLOAT16}
float16: ${FLOAT16}
gpu_memory_limit: ${GPU_MEMORY_LIMIT}
lora_on_cpu: ${LORA_ON_CPU}
datasets:
- path: ${DATASET_PATH}
type: ${DATASET_TYPE}
ds_type: ${DATASET_DS_TYPE}
data_files: ${DATASET_DATA_FILES}
shards: ${DATASET_SHARDS}
name: ${DATASET_NAME}
train_on_split: ${DATASET_TRAIN_ON_SPLIT}
revision: ${DATASET_REVISION}
trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
rl: ${RL}
dpo_use_weighting: ${DPO_USE_WEIGHTING}
chat_template: ${CHAT_TEMPLATE}
chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
dataset_prepared_path: ${DATASET_PREPARED_PATH}
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
dataset_processes: ${DATASET_PROCESSES}
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
hub_model_id: ${HUB_MODEL_ID}
hub_strategy: ${HUB_STRATEGY}
hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
val_set_size: ${VAL_SET_SIZE}
dataset_shard_num: ${DATASET_SHARD_NUM}
dataset_shard_idx: ${DATASET_SHARD_IDX}
sequence_len: ${SEQUENCE_LEN}
pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
sample_packing: ${SAMPLE_PACKING}
eval_sample_packing: ${EVAL_SAMPLE_PACKING}
sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
total_num_tokens: ${TOTAL_NUM_TOKENS}
sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
batch_flattening: ${BATCH_FLATTENING}
device_map: ${DEVICE_MAP}
max_memory: ${MAX_MEMORY}
adapter: ${ADAPTER}
lora_model_dir: ${LORA_MODEL_DIR}
lora_r: ${LORA_R}
lora_alpha: ${LORA_ALPHA}
lora_dropout: ${LORA_DROPOUT}
lora_target_modules:
- ${LORA_TARGET_MODULES}
lora_target_linear: ${LORA_TARGET_LINEAR}
peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
peft:
loftq_config:
loftq_bits: ${LOFTQ_BITS}
relora_steps: ${RELORA_STEPS}
relora_warmup_steps: ${RELORA_WARMUP_STEPS}
relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
relora_prune_ratio: ${RELORA_PRUNE_RATIO}
relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
wandb_mode: ${WANDB_MODE}
wandb_project: ${WANDB_PROJECT}
wandb_entity: ${WANDB_ENTITY}
wandb_watch: ${WANDB_WATCH}
wandb_name: ${WANDB_NAME}
wandb_run_id: ${WANDB_RUN_ID}
wandb_log_model: ${WANDB_LOG_MODEL}
mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
mlflow_run_name: ${MLFLOW_RUN_NAME}
hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
use_comet: ${USE_COMET}
comet_api_key: ${COMET_API_KEY}
comet_workspace: ${COMET_WORKSPACE}
comet_project_name: ${COMET_PROJECT_NAME}
comet_experiment_key: ${COMET_EXPERIMENT_KEY}
comet_mode: ${COMET_MODE}
comet_online: ${COMET_ONLINE}
comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
output_dir: ${OUTPUT_DIR}
torch_compile: ${TORCH_COMPILE}
torch_compile_backend: ${TORCH_COMPILE_BACKEND}
gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
micro_batch_size: ${MICRO_BATCH_SIZE}
eval_batch_size: ${EVAL_BATCH_SIZE}
num_epochs: ${NUM_EPOCHS}
warmup_steps: ${WARMUP_STEPS}
warmup_ratio: ${WARMUP_RATIO}
learning_rate: ${LEARNING_RATE}
lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
logging_steps: ${LOGGING_STEPS}
eval_steps: ${EVAL_STEPS}
evals_per_epoch: ${EVALS_PER_EPOCH}
save_strategy: ${SAVE_STRATEGY}
save_steps: ${SAVE_STEPS}
saves_per_epoch: ${SAVES_PER_EPOCH}
save_total_limit: ${SAVE_TOTAL_LIMIT}
max_steps: ${MAX_STEPS}
eval_table_size: ${EVAL_TABLE_SIZE}
eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
profiler_steps: ${PROFILER_STEPS}
loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
save_safetensors: ${SAVE_SAFETENSORS}
train_on_inputs: ${TRAIN_ON_INPUTS}
group_by_length: ${GROUP_BY_LENGTH}
gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
lr_scheduler: ${LR_SCHEDULER}
lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
lr_div_factor: ${LR_DIV_FACTOR}
optimizer: ${OPTIMIZER}
optim_args: ${OPTIM_ARGS}
optim_target_modules: ${OPTIM_TARGET_MODULES}
weight_decay: ${WEIGHT_DECAY}
adam_beta1: ${ADAM_BETA1}
adam_beta2: ${ADAM_BETA2}
adam_epsilon: ${ADAM_EPSILON}
max_grad_norm: ${MAX_GRAD_NORM}
neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
flash_optimum: ${FLASH_OPTIMUM}
xformers_attention: ${XFORMERS_ATTENTION}
flash_attention: ${FLASH_ATTENTION}
flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
sdp_attention: ${SDP_ATTENTION}
s2_attention: ${S2_ATTENTION}
resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
local_rank: ${LOCAL_RANK}
special_tokens:
bos_token: ${SPECIAL_TOKEN_BOS}
eos_token: ${SPECIAL_TOKEN_EOS}
unk_token: ${SPECIAL_TOKEN_UNK}
pad_token: ${SPECIAL_TOKEN_PAD}
tokens: ${TOKENS}
fsdp: ${FSDP}
fsdp_config: ${FSDP_CONFIG}
deepspeed: ${DEEPSPEED}
ddp_timeout: ${DDP_TIMEOUT}
ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
torchdistx_path: ${TORCHDISTX_PATH}
pretraining_dataset: ${PRETRAINING_DATASET}
debug: ${DEBUG}
seed: ${SEED}
strict: ${STRICT}

View File

@@ -1,64 +0,0 @@
"""
Runpod serverless entrypoint handler
"""
import os
import runpod
import yaml
from huggingface_hub._login import login
from train import train
from utils import get_output_dir
BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
if not os.path.exists(BASE_VOLUME):
os.makedirs(BASE_VOLUME)
logger = runpod.RunPodLogger()
async def handler(job):
runpod_job_id = job["id"]
inputs = job["input"]
run_id = inputs.get("run_id", "default_run_id")
args = inputs.get("args", {})
# Set output directory
output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
args["output_dir"] = output_dir
# First save args to a temporary config file
config_path = "/workspace/test_config.yaml"
# Add run_name and job_id to args before saving
args["run_name"] = run_id
args["runpod_job_id"] = runpod_job_id
yaml_data = yaml.dump(args, default_flow_style=False)
with open(config_path, "w", encoding="utf-8") as file:
file.write(yaml_data)
# Handle credentials
credentials = inputs.get("credentials", {})
if "wandb_api_key" in credentials:
os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
if "hf_token" in credentials:
os.environ["HF_TOKEN"] = credentials["hf_token"]
if os.environ.get("HF_TOKEN"):
login(token=os.environ["HF_TOKEN"])
else:
logger.info("No HF_TOKEN provided. Skipping login.")
logger.info("Starting Training.")
async for result in train(config_path): # Pass the config path instead of args
logger.info(result)
logger.info("Training Complete.")
# Cleanup
del os.environ["WANDB_API_KEY"]
del os.environ["HF_TOKEN"]
runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})

View File

@@ -1,61 +0,0 @@
{
"input": {
"user_id": "user",
"model_id": "llama-test",
"run_id": "llama-test",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "NousResearch/Meta-Llama-3-8B",
"model_type": "LlamaForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_8bit": true,
"load_in_4bit": false,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca"
}
],
"val_set_size": 0.05,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "lora",
"lora_r": 32,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 4,
"micro_batch_size": 2,
"num_epochs": 1,
"optimizer": "adamw_bnb_8bit",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": false,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"warmup_steps": 1,
"evals_per_epoch": 1,
"eval_max_new_tokens": 128,
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|end_of_text|>"
}
}
}
}

View File

@@ -1,45 +0,0 @@
"""
Runpod train entrypoint
"""
import asyncio
async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
"""
Run preprocessing (if enabled) and training with the given config file
:param config_path: Path to the YAML config file
:param gpu_id: GPU ID to use (default: "0")
:param preprocess: Whether to run preprocessing (default: True)
"""
# First check if preprocessing is needed
if preprocess:
# Preprocess command
preprocess_cmd = (
f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
)
process = await asyncio.create_subprocess_shell(
preprocess_cmd,
stdout=asyncio.subprocess.PIPE,
stderr=asyncio.subprocess.STDOUT,
)
if process.stdout is not None:
async for line in process.stdout:
yield f"Preprocessing: {line.decode().strip()}"
await process.wait()
yield "Preprocessing completed."
else:
yield "Skipping preprocessing step."
# Training command
train_cmd = f"axolotl train {config_path}"
process = await asyncio.create_subprocess_shell(
train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
)
if process.stdout is not None:
async for line in process.stdout:
yield f"Training: {line.decode().strip()}"
await process.wait()

View File

@@ -1,89 +0,0 @@
"""
Runpod launcher utils
"""
import os
import yaml
def get_output_dir(run_id):
path = f"fine-tuning/{run_id}"
return path
def make_valid_config(input_args):
"""
Creates and saves updated config file, returns the path to the new config
:param input_args: dict of input args
:return: str, path to the updated config file
"""
# Load default config
with open("config/config.yaml", "r", encoding="utf-8") as fin:
all_args = yaml.safe_load(fin)
if not input_args:
print("No args provided, using defaults")
else:
all_args.update(input_args)
# Create updated config path
updated_config_path = "config/updated_config.yaml"
# Save updated config to new file
with open(updated_config_path, "w", encoding="utf-8") as f:
yaml.dump(all_args, f)
return updated_config_path
def set_config_env_vars(args: dict):
"""
Convert API arguments into environment variables.
Handles nested dictionaries, lists, and special values.
Args:
args (dict): The arguments dictionary from the API request
"""
def process_value(value):
"""Convert Python values to string format for environment variables"""
if value is None:
return ""
if isinstance(value, bool):
return str(value).lower()
if isinstance(value, (list, dict)):
return str(value)
return str(value)
def set_env_vars(data, prefix=""):
"""Recursively set environment variables from nested dictionary"""
for key, value in data.items():
env_key = prefix + key.upper()
# Handle special cases
if isinstance(value, dict):
# For nested dictionaries (like special_tokens)
set_env_vars(value, f"{env_key}_")
elif isinstance(value, list):
# Handle list of dictionaries (like datasets)
if value and isinstance(value[0], dict):
for i, item in enumerate(value):
set_env_vars(item, f"{env_key}_{i}_")
else:
# For simple lists (like lora_target_modules)
os.environ[env_key] = process_value(value)
else:
# Handle all other cases
os.environ[env_key] = process_value(value)
# Clear any existing related environment variables
# This prevents old values from persisting
for key in list(os.environ.keys()):
if key.startswith(
("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
):
del os.environ[key]
# Set new environment variables
set_env_vars(args)

View File

@@ -1,89 +0,0 @@
{
"tests": [
{
"name": "quick_smoke_test_sft",
"input": {
"user_id": "user",
"model_id": "llama-test",
"run_id": "llama-test",
"credentials": {
"wandb_api_key": "",
"hf_token": ""
},
"args": {
"base_model": "NousResearch/Meta-Llama-3-8B",
"model_type": "LlamaForCausalLM",
"tokenizer_type": "AutoTokenizer",
"load_in_8bit": true,
"load_in_4bit": false,
"strict": false,
"datasets": [
{
"path": "mhenrichsen/alpaca_2k_test",
"type": "alpaca"
}
],
"val_set_size": 0.05,
"output_dir": "./outputs/lora-out",
"sequence_len": 4096,
"sample_packing": true,
"eval_sample_packing": false,
"pad_to_sequence_len": true,
"adapter": "lora",
"lora_r": 32,
"lora_alpha": 16,
"lora_dropout": 0.05,
"lora_target_linear": true,
"lora_modules_to_save": [
"embed_tokens",
"lm_head"
],
"gradient_accumulation_steps": 4,
"micro_batch_size": 2,
"num_epochs": 1,
"optimizer": "adamw_bnb_8bit",
"lr_scheduler": "cosine",
"learning_rate": 0.0002,
"train_on_inputs": false,
"group_by_length": false,
"bf16": "auto",
"tf32": false,
"gradient_checkpointing": true,
"logging_steps": 1,
"flash_attention": true,
"warmup_steps": 1,
"evals_per_epoch": 1,
"eval_max_new_tokens": 128,
"saves_per_epoch": 1,
"weight_decay": 0.0,
"special_tokens": {
"pad_token": "<|end_of_text|>"
}
}
},
"timeout": 100000
}
],
"config": {
"gpuTypeId": "NVIDIA GeForce RTX 4090",
"gpuCount": 1,
"containerDiskInGb": 200,
"env": [
{
"key": "TOKENIZER",
"value": ""
},
{
"key": "DISABLE_LOG_STATS",
"value": "true"
}
],
"allowedCudaVersions": [
"12.8",
"12.7",
"12.6",
"12.5",
"12.4"
]
}
}

View File

@@ -52,4 +52,4 @@ pytest -v --durations=10 \
--cov-append \
--cov-report=xml:e2e-coverage.xml
codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true
codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}

View File

@@ -28,8 +28,6 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
Tags examples:
- `main-base-py3.11-cu128-2.7.0`
- `main-base-py3.11-cu126-2.7.0`
- `main-base-py3.11-cu124-2.6.0`
- `main-base-py3.11-cu124-2.5.1`
- `main-base-py3.11-cu124-2.4.1`
@@ -52,7 +50,7 @@ Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)
# on push to main
main-py{python_version}-cu{cuda_version}-{pytorch_version}
# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
# latest main (currently torch 2.5.1, python 3.11, cuda 12.4)
main-latest
# nightly build
@@ -70,7 +68,6 @@ There may be some extra tags appended to the image, like `-vllm` which installs
Tags examples:
- `main-py3.11-cu126-2.7.0`
- `main-py3.11-cu124-2.6.0`
- `main-py3.11-cu124-2.5.1`
- `main-py3.11-cu124-2.4.1`

View File

@@ -10,6 +10,7 @@ plugins:
liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true
cut_cross_entropy: true
llama4_linearized_experts: true # needed with custom linearized experts model
load_in_4bit: true

View File

@@ -14,6 +14,7 @@ from axolotl.utils.data import prepare_dataset
from axolotl.utils.data.rl import load_prepare_preference_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.models import load_processor, load_tokenizer
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.tokenization import check_dataset_labels
LOG = logging.getLogger(__name__)
@@ -125,7 +126,7 @@ def load_preference_datasets(
total_num_steps: Optional[int] = int(
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
)
if cfg.rl == "grpo":
if cfg.rl is RLType.GRPO:
total_num_steps = None
if cli_args.debug or cfg.debug:

View File

@@ -84,7 +84,7 @@ from axolotl.utils.collators import (
)
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
from axolotl.utils.models import ensure_dtype
from axolotl.utils.schemas.enums import CustomSupportedOptimizers
from axolotl.utils.schemas.enums import CustomSupportedOptimizers, RLType
try:
import torch._dynamo # pylint: disable=ungrouped-imports
@@ -538,8 +538,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
report_to = []
if self.cfg.use_wandb:
report_to.append("wandb")
if self.cfg.wandb_name:
training_arguments_kwargs["run_name"] = self.cfg.wandb_name
if self.cfg.use_mlflow:
report_to.append("mlflow")
if self.cfg.use_tensorboard:
@@ -1011,6 +1009,8 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
training_args_kwargs["dataloader_prefetch_factor"] = (
self.cfg.dataloader_prefetch_factor
)
if self.cfg.seed:
training_args_kwargs["seed"] = self.cfg.seed
if self.cfg.gradient_checkpointing:
training_args_kwargs["gradient_checkpointing"] = (
self.cfg.gradient_checkpointing
@@ -1048,12 +1048,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.rpo_alpha is not None:
training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha
if self.cfg.use_wandb:
training_args_kwargs["run_name"] = self.cfg.wandb_name
training_args_kwargs["sequence_parallel_degree"] = (
self.cfg.sequence_parallel_degree
)
training_args_cls = None
blocklist_args_kwargs = []
if self.cfg.rl == "simpo":
if self.cfg.rl is RLType.SIMPO:
training_args_cls = AxolotlCPOConfig
training_args_kwargs["loss_type"] = "simpo"
training_args_kwargs["max_length"] = self.cfg.sequence_len
@@ -1061,13 +1062,13 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.cpo_alpha is not None:
training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha
elif self.cfg.rl == "orpo":
elif self.cfg.rl is RLType.ORPO:
training_args_cls = AxolotlORPOConfig
training_args_kwargs["max_length"] = self.cfg.sequence_len
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
elif self.cfg.rl == "kto":
elif self.cfg.rl is RLType.KTO:
training_args_cls = AxolotlKTOConfig
training_args_kwargs["desirable_weight"] = (
@@ -1081,14 +1082,14 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
if self.cfg.max_prompt_len:
training_args_kwargs["max_prompt_length"] = self.cfg.max_prompt_len
elif self.cfg.rl == "grpo":
elif self.cfg.rl is RLType.GRPO:
training_args_cls = GRPOStrategy.get_training_args_class()
training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
else:
training_args_cls = AxolotlDPOConfig
if self.cfg.rl == "ipo":
if self.cfg.rl is RLType.IPO:
training_args_kwargs["loss_type"] = "ipo"
training_args_kwargs["max_length"] = self.cfg.sequence_len
training_args_kwargs["max_completion_length"] = None
@@ -1121,43 +1122,37 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
**training_args_kwargs,
)
# unset run_name so wandb sets up experiment names
if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
training_args.run_name = ( # pylint: disable=attribute-defined-outside-init
None
)
return training_args
def build(self, total_num_steps):
training_args = self.build_training_arguments(total_num_steps)
dpo_trainer_kwargs = {}
if self.cfg.rl == "ipo":
trainer_kwargs = {}
if self.cfg.rl is RLType.IPO:
if self.cfg.dpo_label_smoothing:
dpo_trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
trainer_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
if self.eval_dataset:
dpo_trainer_kwargs["eval_dataset"] = self.eval_dataset
trainer_kwargs["eval_dataset"] = self.eval_dataset
if self.cfg.adapter and self.peft_config:
dpo_trainer_kwargs["peft_config"] = self.peft_config
trainer_kwargs["peft_config"] = self.peft_config
if self.cfg.precompute_ref_log_probs is not None:
dpo_trainer_kwargs["precompute_ref_log_probs"] = (
trainer_kwargs["precompute_ref_log_probs"] = (
self.cfg.precompute_ref_log_probs
)
if self.cfg.rl == "grpo":
if self.cfg.rl is RLType.GRPO:
trainer_cls = GRPOStrategy.get_trainer_class()
trainer_cls_args = [self.model]
trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
dpo_trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
elif self.cfg.rl in ["dpo", "ipo"]:
trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))
elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
trainer_cls = DPOStrategy.get_trainer_class()
trainer_cls_args = [self.model, self.model_ref]
elif self.cfg.rl == "orpo":
elif self.cfg.rl is RLType.ORPO:
trainer_cls = AxolotlORPOTrainer
trainer_cls_args = [self.model]
elif self.cfg.rl in ["kto"]:
elif self.cfg.rl is RLType.KTO:
trainer_cls = AxolotlKTOTrainer
trainer_cls_args = [self.model]
elif self.cfg.rl in ["simpo"]:
elif self.cfg.rl is RLType.SIMPO:
trainer_cls = AxolotlCPOTrainer
trainer_cls_args = [self.model]
else:
@@ -1165,33 +1160,33 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
sig = inspect.signature(trainer_cls)
if "tokenizer" in sig.parameters.keys():
dpo_trainer_kwargs["tokenizer"] = self.tokenizer
trainer_kwargs["tokenizer"] = self.tokenizer
else:
dpo_trainer_kwargs["processing_class"] = self.tokenizer
trainer_kwargs["processing_class"] = self.tokenizer
if self.cfg.datasets is not None and (
trainer_cls is DPOStrategy.get_trainer_class()
):
dpo_trainer_kwargs["dataset_tags"] = [
trainer_kwargs["dataset_tags"] = [
d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
]
dpo_trainer = trainer_cls(
trainer = trainer_cls(
*trainer_cls_args,
args=training_args,
train_dataset=self.train_dataset,
callbacks=self.get_callbacks(),
**dpo_trainer_kwargs,
**trainer_kwargs,
)
if self.cfg.fsdp:
ensure_dtype(dpo_trainer.model, dtype=self.cfg.torch_dtype)
if self.cfg.rl in ["dpo", "ipo"] and dpo_trainer.ref_model:
ensure_dtype(dpo_trainer.ref_model, dtype=self.cfg.torch_dtype)
ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)
dpo_trainer = self.hook_post_create_trainer(dpo_trainer)
for callback in self.get_post_trainer_create_callbacks(dpo_trainer):
dpo_trainer.add_callback(callback)
trainer = self.hook_post_create_trainer(trainer)
for callback in self.get_post_trainer_create_callbacks(trainer):
trainer.add_callback(callback)
return dpo_trainer
return trainer
class HFPPOTrainerBuilder(TrainerBuilderBase):

View File

@@ -3,6 +3,7 @@ DPO Specific Strategy for training
"""
from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer
from axolotl.utils.schemas.enums import RLType
class DPOStrategy:
@@ -23,7 +24,7 @@ class DPOStrategy:
@classmethod
def set_training_args_kwargs(cls, cfg):
training_args_kwargs = {}
if cfg.rl == "ipo":
if cfg.rl is RLType.IPO:
training_args_kwargs["loss_type"] = "ipo"
training_args_kwargs["max_length"] = cfg.sequence_len
training_args_kwargs["max_completion_length"] = None

View File

@@ -11,6 +11,4 @@ from axolotl.core.training_args import AxolotlTrainingMixins
@dataclass
class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
"""
Axolotl GRPO Config for GRPO training
"""
"""Axolotl GRPO Config for GRPO training"""

View File

@@ -0,0 +1,124 @@
"""
Repeat random sampler (akin to the one implemented in
https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
sequence parallelism functionality; i.e., duplicating data across ranks in the same
sequencee parallel group.
"""
from typing import Sized
import torch
from torch.utils.data import Sampler
class SequenceParallelRepeatRandomSampler(Sampler):
"""
Sampler for GRPO training with sequence parallelism that ensures:
1. Ranks in the same sequence parallel group receive identical data
2. Each index is repeated multiple times for sampling different completions
3. Entire batches are repeated for reuse in multiple updates
"""
def __init__(
self,
dataset: Sized,
mini_repeat_count: int,
world_size: int,
rank: int,
batch_size: int = 1,
repeat_count: int = 1,
sequence_parallel_degree: int = 1,
shuffle: bool = True,
seed: int = 0,
drop_last: bool = False,
):
self.dataset = dataset
self.mini_repeat_count = mini_repeat_count
self.batch_size = batch_size
self.repeat_count = repeat_count
self.shuffle = shuffle
self.seed = seed
self.drop_last = drop_last
self.epoch = 0
self.world_size = world_size
self.rank = rank
# Sequence parallelism parameters
self.sequence_parallel_degree = sequence_parallel_degree
self.num_sp_groups = world_size // sequence_parallel_degree
self.sp_group_id = rank // sequence_parallel_degree
# Adjust dataset size for distributed sampling
self.num_samples = len(self.dataset)
self.total_size = self.num_samples
# Calculate effective number of samples per SP group
if (
self.drop_last
and self.total_size % (self.num_sp_groups * self.batch_size) != 0
):
# Drop last incomplete batch if drop_last is True
self.num_samples_per_sp_group = (
self.total_size // self.batch_size // self.num_sp_groups
) * self.batch_size
else:
# Round up to include last batch if drop_last is False
self.num_samples_per_sp_group = (
(self.total_size + self.batch_size * self.num_sp_groups - 1)
// (self.batch_size * self.num_sp_groups)
* self.batch_size
)
def __iter__(self):
# Deterministically shuffle based on epoch and seed
if self.shuffle:
# Use same seed for all ranks in the same SP group
g = torch.Generator()
seed_value = self.seed + self.epoch + self.sp_group_id * 10000
g.manual_seed(seed_value)
indices = torch.randperm(len(self.dataset), generator=g).tolist()
else:
indices = list(range(len(self.dataset)))
# Add extra samples to make it evenly divisible by batch_size
if len(indices) % self.batch_size != 0:
padding = indices[: self.batch_size - len(indices) % self.batch_size]
indices += padding
# Subsample based on SP group ID
# Each SP group gets distinct batches of data
batch_indices = []
for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
start_idx = i + self.sp_group_id * self.batch_size
end_idx = min(start_idx + self.batch_size, len(indices))
if start_idx < len(indices):
for j in range(self.batch_size):
if start_idx + j < end_idx:
batch_indices.append(indices[start_idx + j])
# Make sure batch_indices is exactly batch_size * num_batches_per_sp_group
if self.drop_last:
num_batches_per_sp_group = self.num_samples_per_sp_group // self.batch_size
target_len = self.batch_size * num_batches_per_sp_group
if len(batch_indices) > target_len:
batch_indices = batch_indices[:target_len]
# Apply the GRPO repeat pattern
final_indices = []
for _ in range(self.repeat_count):
for idx in batch_indices:
for _ in range(self.mini_repeat_count):
final_indices.append(idx)
return iter(final_indices)
def __len__(self):
# Total length including all repetitions
return (
self.num_samples_per_sp_group * self.mini_repeat_count * self.repeat_count
)
def set_epoch(self, epoch):
"""Sets the epoch for this sampler"""
self.epoch = epoch

View File

@@ -1,26 +1,279 @@
"""
Axolotl GRPO trainer
"""
"""Axolotl GRPO trainer"""
# pylint: disable=too-many-lines,duplicate-code
import warnings
from contextlib import nullcontext
from typing import Any
from accelerate.utils import is_deepspeed_available, is_peft_model
import datasets
import torch
import torch.distributed as dist
from accelerate.utils import (
broadcast_object_list,
gather,
gather_object,
is_peft_model,
)
from datasets import Dataset, IterableDataset
from torch import nn
from torch.utils.data import (
BatchSampler,
DataLoader,
Sampler,
)
from transformers import (
PreTrainedModel,
PreTrainedTokenizerBase,
Trainer,
TrainerCallback,
is_wandb_available,
)
from transformers.trainer_utils import seed_worker
from transformers.utils import is_peft_available
from trl import GRPOTrainer
from trl.extras.profiling import profiling_decorator
from trl.data_utils import (
apply_chat_template,
is_conversational,
maybe_apply_chat_template,
)
from trl.extras.profiling import profiling_context, profiling_decorator
from trl.import_utils import (
is_deepspeed_available,
is_rich_available,
)
from trl.models import (
unwrap_model_for_generation,
)
from trl.trainer.grpo_config import GRPOConfig
from trl.trainer.grpo_trainer import RewardFunc
from trl.trainer.utils import (
pad,
print_prompt_completions_sample,
selective_log_softmax,
)
from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
from axolotl.core.trainers.mixins import RngLoaderMixin, SchedulerMixin
from axolotl.monkeypatch.attention.ring_attn.patch import get_ring_attn_group
if is_peft_available():
# pylint: disable=unused-import
from peft import PeftConfig
if is_deepspeed_available():
import deepspeed
if is_wandb_available():
import wandb
class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
"""
Extend the base GRPOTrainer for axolotl helpers
"""
"""Extend the base GRPOTrainer for axolotl helpers"""
_tag_names = ["trl", "grpo", "axolotl"]
def __init__(
self,
model: str | PreTrainedModel,
reward_funcs: RewardFunc | list[RewardFunc],
args: GRPOConfig | None = None,
train_dataset: Dataset | IterableDataset | None = None,
eval_dataset: (
Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None
) = None,
processing_class: PreTrainedTokenizerBase | None = None,
reward_processing_classes: (
PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None
) = None,
callbacks: list[TrainerCallback] | None = None,
optimizers: tuple[
torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
] = (None, None),
peft_config: "PeftConfig | None" = None,
):
# First call the superclass constructor with all arguments
super().__init__(
model=model,
reward_funcs=reward_funcs,
args=args,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
processing_class=processing_class,
reward_processing_classes=reward_processing_classes,
callbacks=callbacks,
optimizers=optimizers,
peft_config=peft_config,
)
# Now execute your custom logic
# Get number of SP groups (number of processes divided by SP degree)
num_processes = self.accelerator.num_processes
num_sp_groups = num_processes // self.args.sequence_parallel_degree
# Calculate batch size per SP group (not per process)
sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
possible_values = [
n_gen
for n_gen in range(2, sp_group_batch_size + 1)
if (sp_group_batch_size) % n_gen == 0
]
if self.num_generations not in possible_values:
raise ValueError(
f"The batch size per SP group ({num_sp_groups} x "
f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
f"the number of generations per prompt ({self.num_generations}). Given "
"the current configuration, the valid values for the number of "
f"generations are: {possible_values}."
)
if self.args.eval_strategy != "no":
# If sequence parallelism is enabled, calculate batch size per SP group
sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups # type: ignore[union-attr]
possible_values = [
n_gen
for n_gen in range(2, sp_group_eval_batch_size + 1)
if (sp_group_eval_batch_size) % n_gen == 0
]
if self.num_generations not in possible_values:
raise ValueError(
f"With sequence parallelism (degree {self.args.sequence_parallel_degree}), "
f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
f"must be evenly divisible by the number of generations per prompt "
f"({self.num_generations}). Given the current eval batch size, "
f"the valid values for the number of generations are: {possible_values}."
)
# Initialize the SP group
self.sp_group = get_ring_attn_group()
self.local_rank = dist.get_rank(group=self.sp_group)
self.local_world_size = dist.get_world_size(group=self.sp_group)
print("end of trainer init")
def _get_train_sampler(self) -> Sampler:
# Get distributed training info
world_size = dist.get_world_size()
rank = dist.get_rank()
effective_batch_size = (
self.args.per_device_train_batch_size
* world_size
* self.args.gradient_accumulation_steps
)
return SequenceParallelRepeatRandomSampler(
dataset=self.train_dataset,
mini_repeat_count=self.num_generations,
world_size=world_size,
rank=rank,
batch_size=effective_batch_size
// self.num_generations
// self.args.sequence_parallel_degree,
repeat_count=self.num_iterations,
sequence_parallel_degree=self.args.sequence_parallel_degree,
shuffle=True,
seed=self.args.seed,
drop_last=True,
)
def _create_dataloader_params(self, is_eval=False, custom_batch_size=None):
"""Create common dataloader parameters for train or eval."""
batch_size = custom_batch_size or (
self.args.eval_batch_size if is_eval else self._train_batch_size
)
params = {
"batch_size": batch_size,
"collate_fn": self.data_collator,
"num_workers": self.args.dataloader_num_workers,
"pin_memory": self.args.dataloader_pin_memory,
}
# Add persistent workers only for training
if not is_eval and hasattr(self.args, "dataloader_persistent_workers"):
params["persistent_workers"] = self.args.dataloader_persistent_workers
# Add prefetch factor if specified
if self.args.dataloader_prefetch_factor:
params["prefetch_factor"] = self.args.dataloader_prefetch_factor
return params
def _prepare_dataloader(
self, dataset, sampler, is_eval=False, custom_batch_size=None
):
"""Prepare a dataloader with the given dataset and sampler."""
# Get base parameters
dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size)
# Add sampler configuration
if not isinstance(dataset, torch.utils.data.IterableDataset):
if isinstance(sampler, BatchSampler):
# batch_size and batch_sampler are mutually exclusive
dataloader_params["batch_sampler"] = sampler
del dataloader_params["batch_size"]
else:
dataloader_params["sampler"] = sampler
dataloader_params["drop_last"] = self.args.dataloader_drop_last
if not is_eval:
dataloader_params["worker_init_fn"] = seed_worker
# Create the dataloader
dataloader = DataLoader(dataset, **dataloader_params)
if self.args.sample_packing and (
(not is_eval and not self.args.pretraining)
or (is_eval and self.args.eval_sample_packing is not False)
):
self.accelerator.even_batches = False
# Return unprepared dataloader if using sequence parallelism
# TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
# if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
# slice each batch along the sequence dimension).
if self.args.sequence_parallel_degree > 1:
return dataloader
# Otherwise prepare with accelerator
return self.accelerator.prepare_data_loader(dataloader)
def get_train_dataloader(self) -> DataLoader:
"""Get dataloader for training"""
train_dataset = self.train_dataset
# pylint: disable=access-member-before-definition
data_collator = self.data_collator # type: ignore
# Initialize SP group attributes if sequence parallelism is enabled
if self.args.sequence_parallel_degree > 1:
self.sp_group = get_ring_attn_group()
self.local_rank = dist.get_rank(group=self.sp_group)
self.local_world_size = dist.get_world_size(group=self.sp_group)
# Handle dataset preprocessing
if isinstance(train_dataset, datasets.Dataset):
# Add debug print before any modifications
if self.args.sample_packing and not self.args.pretraining:
train_dataset = train_dataset.remove_columns(["length"])
if not self.args.sample_packing or self.args.pretraining:
train_dataset = self._remove_unused_columns(
train_dataset, description="training"
)
else:
self.data_collator = self._get_collator_with_removed_columns( # pylint: disable=attribute-defined-outside-init
data_collator,
description="training",
)
# Get sampler and create dataloader
sampler = self._get_train_sampler()
dataloader = self._prepare_dataloader(train_dataset, sampler, is_eval=False)
return dataloader
@profiling_decorator
def _move_model_to_vllm(self):
# For DeepSpeed ZeRO-3, we need to gather all parameters before operations
@@ -67,3 +320,577 @@ class AxolotlGRPOTrainer(RngLoaderMixin, SchedulerMixin, GRPOTrainer):
# Reset cache on main process
if self.accelerator.is_main_process:
self.vllm_client.reset_prefix_cache()
# def _generate_and_score_completions(
# self, inputs: list[dict[str, torch.Tensor | Any]]
# ) -> dict[str, torch.Tensor | Any]:
# device = self.accelerator.device
# prompts = [x["prompt"] for x in inputs]
# prompts_text = [
# maybe_apply_chat_template(example, self.processing_class)["prompt"]
# for example in inputs
# ]
# prompt_inputs = self.processing_class(
# text=prompts_text,
# return_tensors="pt",
# padding=True,
# padding_side="left",
# add_special_tokens=False,
# )
# # pylint: disable=protected-access
# prompt_inputs = Trainer._prepare_inputs(self, prompt_inputs)
# prompt_ids, prompt_mask = (
# prompt_inputs["input_ids"],
# prompt_inputs["attention_mask"],
# )
# if self.max_prompt_length is not None:
# prompt_ids = prompt_ids[:, -self.max_prompt_length :]
# prompt_mask = prompt_mask[:, -self.max_prompt_length :]
# # Generate completions using either vLLM or regular generation
# if self.args.use_vllm:
# # First, have main process load weights if needed
# # pylint: disable=access-member-before-definition
# if self.state.global_step != self._last_loaded_step: # type: ignore[has-type]
# self._move_model_to_vllm()
# # pylint: disable=attribute-defined-outside-init
# self._last_loaded_step = self.state.global_step
# all_prompts_text = gather_object(prompts_text)
# if self.accelerator.is_main_process:
# # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
# # num_generations outputs for each one. This is faster than generating outputs for each duplicate
# # prompt individually.
# # ordered_set_of_prompts = all_prompts_text[:: self.num_generations]
# ordered_set_of_prompts = all_prompts_text[
# :: self.num_generations * self.args.sequence_parallel_degree
# ]
# with profiling_context(self, "vLLM.generate"):
# completion_ids = self.vllm_client.generate(
# prompts=ordered_set_of_prompts,
# n=self.num_generations,
# repetition_penalty=self.repetition_penalty,
# temperature=self.temperature,
# top_p=self.top_p,
# top_k=-1 if self.top_k is None else self.top_k,
# min_p=0.0 if self.min_p is None else self.min_p,
# max_tokens=self.max_completion_length,
# guided_decoding_regex=self.guided_decoding_regex,
# )
# else:
# completion_ids = [None] * (
# len(all_prompts_text) // self.args.sequence_parallel_degree
# )
# # Broadcast the completions from the main process to all processes
# completion_ids = broadcast_object_list(completion_ids, from_process=0)
# # Determine the appropriate slice based on sequence parallelism
# if self.args.sequence_parallel_degree > 1:
# # Calculate SP group ID (which group of ranks this rank belongs to)
# sp_group_id = self.accelerator.process_index // self.local_world_size
# # Calculate the start index for this SP group
# sp_group_start = sp_group_id * len(prompts) * self.local_world_size
# # All ranks in the same SP group get the same data slice
# process_slice = slice(
# sp_group_start,
# sp_group_start + len(prompts),
# )
# completion_ids = completion_ids[process_slice]
# else:
# # Original behavior for non-sequence parallel case
# process_slice = slice(
# self.accelerator.process_index * len(prompts),
# (self.accelerator.process_index + 1) * len(prompts),
# )
# completion_ids = completion_ids[process_slice]
# # Pad the completions, and concatenate them with the prompts
# completion_ids = [
# torch.tensor(ids, device=device) for ids in completion_ids
# ]
# completion_ids = pad(
# completion_ids, padding_value=self.processing_class.pad_token_id
# )
# else:
# # Regular generation path
# with unwrap_model_for_generation(
# self.model_wrapped,
# self.accelerator,
# gather_deepspeed3_params=self.args.ds3_gather_for_generation,
# ) as unwrapped_model:
# prompt_completion_ids = unwrapped_model.generate(
# prompt_ids,
# attention_mask=prompt_mask,
# generation_config=self.generation_config,
# )
# # Compute prompt length and extract completion ids
# prompt_length = prompt_ids.size(1)
# prompt_ids = prompt_completion_ids[:, :prompt_length]
# completion_ids = prompt_completion_ids[:, prompt_length:]
# prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
# # Mask everything after the first EOS token
# is_eos = completion_ids == self.processing_class.eos_token_id
# eos_idx = torch.full(
# (is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device
# )
# eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
# sequence_indices = torch.arange(is_eos.size(1), device=device).expand(
# is_eos.size(0), -1
# )
# completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()
# # Concatenate prompt_mask with completion_mask for logit computation
# attention_mask = torch.cat([prompt_mask, completion_mask], dim=1) # (B, P+C)
# logits_to_keep = completion_ids.size(
# 1
# ) # we only need to compute the logits for the completion tokens
# with torch.no_grad():
# # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip it's
# # computation here, and use per_token_logps.detach() instead.
# if self.num_iterations > 1:
# if self.args.sequence_parallel_degree > 1:
# old_per_token_logps, _ = self._get_per_token_logps_v2(
# self.model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# )
# else:
# old_per_token_logps = super()._get_per_token_logps(
# self.model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# )
# else:
# old_per_token_logps = None
# if self.beta == 0.0:
# ref_per_token_logps = None
# elif self.ref_model is not None:
# if self.args.sequence_parallel_degree > 1:
# ref_per_token_logps, _ = self._get_per_token_logps_v2(
# self.ref_model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# )
# else:
# ref_per_token_logps = super()._get_per_token_logps(
# self.ref_model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# )
# else:
# with self.accelerator.unwrap_model(self.model).disable_adapter():
# if self.args.sequence_parallel_degree > 1:
# ref_per_token_logps, _ = self._get_per_token_logps_v2(
# self.model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# )
# else:
# ref_per_token_logps = super()._get_per_token_logps(
# self.model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# )
# # Decode the generated completions
# completions_text = self.processing_class.batch_decode(
# completion_ids, skip_special_tokens=True
# )
# if is_conversational(inputs[0]):
# completions = []
# for prompt, completion in zip(prompts, completions_text):
# bootstrap = (
# prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
# )
# completions.append(
# [{"role": "assistant", "content": bootstrap + completion}]
# )
# else:
# completions = completions_text
# rewards_per_func = torch.zeros(
# len(prompts), len(self.reward_funcs), device=device
# )
# for i, (reward_func, reward_processing_class) in enumerate(
# zip(self.reward_funcs, self.reward_processing_classes)
# ):
# if isinstance(
# reward_func, nn.Module
# ): # Module instead of PretrainedModel for compat with compiled models
# reward_func_name = (
# f"reward {reward_func.config._name_or_path.split('/')[-1]}"
# )
# else:
# # pylint: disable=protected-access
# reward_func_name = reward_func.__name__
# with profiling_context(self, reward_func_name):
# if isinstance(
# reward_func, nn.Module
# ): # Module instead of PretrainedModel for compat with compiled models
# if is_conversational(inputs[0]):
# messages = [
# {"messages": p + c} for p, c in zip(prompts, completions)
# ]
# texts = [
# apply_chat_template(x, reward_processing_class)["text"]
# for x in messages
# ]
# else:
# texts = [p + c for p, c in zip(prompts, completions)]
# reward_inputs = reward_processing_class(
# text=texts,
# return_tensors="pt",
# padding=True,
# padding_side="right",
# add_special_tokens=False,
# )
# # pylint: disable=protected-access
# reward_inputs = Trainer._prepare_inputs(self, reward_inputs)
# with torch.inference_mode():
# rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
# :, 0
# ] # Shape (B*G,)
# else:
# # Repeat all input columns (but "prompt" and "completion") to match the number of generations
# keys = [
# key for key in inputs[0] if key not in ["prompt", "completion"]
# ]
# reward_kwargs = {
# key: [example[key] for example in inputs] for key in keys
# }
# output_reward_func = reward_func(
# prompts=prompts, completions=completions, **reward_kwargs
# )
# # Convert None values to NaN
# output_reward_func = [
# reward if reward is not None else torch.nan
# for reward in output_reward_func
# ]
# rewards_per_func[:, i] = torch.tensor(
# output_reward_func, dtype=torch.float32, device=device
# )
# # If all reward functions return None for a given row, issue a detailed warning
# if torch.isnan(rewards_per_func).all(dim=1).any():
# nan_row_idx = (
# torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
# )
# row_reward_kwargs = {
# key: value[nan_row_idx] for key, value in reward_kwargs.items()
# }
# row_reward_kwargs["prompt"] = prompts[nan_row_idx]
# row_reward_kwargs["completion"] = completions[nan_row_idx]
# warnings.warn(
# f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
# "Please ensure that at least one reward function returns a valid reward."
# )
# # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
# # completions may be distributed across processes
# rewards_per_func = gather(rewards_per_func)
# # Apply weights to each reward function's output and sum
# rewards = (
# rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
# ).nansum(dim=1)
# # Compute grouped-wise rewards
# mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
# std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)
# # Normalize the rewards to compute the advantages
# mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
# self.num_generations, dim=0
# )
# std_grouped_rewards = std_grouped_rewards.repeat_interleave(
# self.num_generations, dim=0
# )
# advantages = rewards - mean_grouped_rewards
# if self.args.scale_rewards:
# advantages = advantages / (std_grouped_rewards + 1e-4)
# # Slice to keep only the local part of the data
# process_slice = slice(
# self.accelerator.process_index * len(prompts),
# (self.accelerator.process_index + 1) * len(prompts),
# )
# advantages = advantages[process_slice]
# # Log the metrics
# mode = "eval" if self.control.should_evaluate else "train"
# if mode == "train":
# # pylint: disable=no-member
# self._total_train_tokens += (
# self.accelerator.gather_for_metrics(attention_mask.sum()).sum().item()
# )
# # pylint: disable=no-member
# self._metrics[mode]["num_tokens"] = [self._total_train_tokens]
# completion_length = (
# self.accelerator.gather_for_metrics(completion_mask.sum(1))
# .float()
# .mean()
# .item()
# )
# self._metrics[mode]["completion_length"].append(completion_length)
# # Calculate mean reward per function, but only for samples where the function was applied
# for i, reward_func in enumerate(self.reward_funcs):
# if isinstance(
# reward_func, nn.Module
# ): # Module instead of PretrainedModel for compat with compiled models
# reward_func_name = reward_func.config._name_or_path.split("/")[-1]
# else:
# # pylint: disable=protected-access
# reward_func_name = reward_func.__name__
# # Only calculate mean for samples where this reward function was applied (non-NaN values)
# mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
# self._metrics[mode][f"rewards/{reward_func_name}"].append(mean_rewards)
# self._metrics[mode]["reward"].append(rewards.mean().item())
# self._metrics[mode]["reward_std"].append(std_grouped_rewards.mean().item())
# if (
# self.log_completions
# and self.state.global_step % self.args.logging_steps == 0
# ):
# prompts_to_log = gather_object(prompts_text)
# completions_to_log = gather_object(completions_text)
# rewards_to_log = rewards.tolist()
# if self.accelerator.is_main_process:
# if is_rich_available():
# print_prompt_completions_sample(
# prompts_to_log,
# completions_to_log,
# rewards_to_log,
# self.state.global_step,
# )
# if (
# self.args.report_to
# and "wandb" in self.args.report_to
# and wandb.run is not None
# ):
# import pandas as pd
# # For logging
# table = {
# "step": [str(self.state.global_step)] * len(rewards),
# "prompt": prompts_to_log,
# "completion": completions_to_log,
# "reward": rewards.tolist(),
# }
# df = pd.DataFrame(table)
# wandb.log({"completions": wandb.Table(dataframe=df)})
# return {
# "prompt_ids": prompt_ids,
# "prompt_mask": prompt_mask,
# "completion_ids": completion_ids,
# "completion_mask": completion_mask,
# "old_per_token_logps": old_per_token_logps,
# "ref_per_token_logps": ref_per_token_logps,
# "advantages": advantages,
# }
# def _get_per_token_logps_v2(
# self, model, input_ids, attention_mask, logits_to_keep, completion_mask=None
# ):
# # Pad sequence to be divisible by SP degree if needed
# total_seq_len = input_ids.shape[1]
# if total_seq_len % self.local_world_size != 0:
# pad_len = self.local_world_size - (total_seq_len % self.local_world_size)
# pad_token_id = self.processing_class.pad_token_id or 0
# # Pad input_ids and attention_mask
# padding = torch.full(
# (input_ids.shape[0], pad_len),
# pad_token_id,
# dtype=input_ids.dtype,
# device=input_ids.device,
# )
# input_ids = torch.cat([input_ids, padding], dim=1)
# attn_padding = torch.zeros(
# (attention_mask.shape[0], pad_len),
# dtype=attention_mask.dtype,
# device=attention_mask.device,
# )
# attention_mask = torch.cat([attention_mask, attn_padding], dim=1)
# if completion_mask is not None:
# completion_mask = torch.cat([completion_mask, attn_padding], dim=1)
# total_seq_len += pad_len
# logits_to_keep += pad_len
# # Split the sequence
# slice_size = total_seq_len // self.local_world_size
# start = self.local_rank * slice_size
# end = start + slice_size
# # Get our slice
# input_ids_slice = input_ids[:, start:end]
# attention_mask_slice = attention_mask[:, start:end]
# # Calculate where our slice starts and ends relative to the completion tokens
# local_completion_mask = None
# prompt_len = input_ids.size(1) - logits_to_keep
# if start >= prompt_len:
# # Slice starts within the completion section
# start_in_completion = start - prompt_len
# end_in_completion = min(end - prompt_len, logits_to_keep)
# local_logits_to_keep = end_in_completion - start_in_completion
# if completion_mask is not None:
# local_completion_mask = completion_mask[
# :, start_in_completion:end_in_completion
# ]
# elif end <= prompt_len:
# # Slice is entirely within the prompt section (no completion tokens)
# local_logits_to_keep = 0
# if completion_mask is not None:
# local_completion_mask = torch.zeros(
# (completion_mask.size(0), 0), device=completion_mask.device
# )
# else:
# # Slice contains the boundary between prompt and completion
# start_in_completion = 0
# end_in_completion = min(end - prompt_len, logits_to_keep)
# local_logits_to_keep = end_in_completion - start_in_completion
# if completion_mask is not None:
# local_completion_mask = completion_mask[
# :, start_in_completion:end_in_completion
# ]
# # Get logits with enough context to compute log probs
# logits = model(
# input_ids=input_ids_slice,
# attention_mask=attention_mask_slice,
# logits_to_keep=local_logits_to_keep + 1,
# ).logits
# # Only the last rank that contains completion tokens needs to remove the last logit
# is_last_rank_with_completions = (
# self.local_rank == self.local_world_size - 1 # Last rank overall
# or end
# >= prompt_len
# + logits_to_keep # Our slice includes the last completion token
# )
# if is_last_rank_with_completions:
# logits = logits[:, :-1]
# if local_completion_mask is not None:
# local_completion_mask = local_completion_mask[:, :-1]
# local_logits_to_keep -= 1
# if start >= prompt_len:
# # For ranks where slice is all completion tokens,
# # we need to offset to match the logits (which predict the next token)
# offset = 1 # Skip the first token as it's predicted by the last token of the previous rank
# local_input_ids = input_ids_slice[:, offset : offset + local_logits_to_keep]
# else:
# # For the rank that contains the prompt-completion boundary,
# # we need to take completion tokens only
# offset = prompt_len - start # Where completions start in our slice
# local_input_ids = input_ids_slice[:, offset : offset + local_logits_to_keep]
# logits = logits[
# :, -local_logits_to_keep:
# ] # Take only logits for completion tokens
# logits = logits / self.temperature
# per_token_logps = selective_log_softmax(logits, local_input_ids)
# return per_token_logps, local_completion_mask
# # pylint: disable=unused-argument
# @profiling_decorator
# def compute_loss(
# self, model, inputs, return_outputs=False, num_items_in_batch=None
# ):
# if return_outputs:
# raise ValueError("The GRPOTrainer does not support returning outputs")
# # Unpack inputs
# prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
# completion_ids, completion_mask = (
# inputs["completion_ids"],
# inputs["completion_mask"],
# )
# prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
# attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
# logits_to_keep = completion_ids.size(1)
# if self.args.sequence_parallel_degree > 1:
# per_token_logps, completion_mask = self._get_per_token_logps_v2(
# model,
# prompt_completion_ids,
# attention_mask,
# logits_to_keep,
# completion_mask,
# )
# else:
# per_token_logps = super()._get_per_token_logps(
# model, prompt_completion_ids, attention_mask, logits_to_keep
# )
# # Compute the KL divergence between the model and the reference model
# if self.beta != 0.0:
# ref_per_token_logps = inputs["ref_per_token_logps"]
# per_token_kl = (
# torch.exp(ref_per_token_logps - per_token_logps)
# - (ref_per_token_logps - per_token_logps)
# - 1
# )
# # Compute the loss
# advantages = inputs["advantages"]
# # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip its computation
# # and use per_token_logps.detach() instead.
# old_per_token_logps = (
# inputs["old_per_token_logps"]
# if self.num_iterations > 1
# else per_token_logps.detach()
# )
# coef_1 = torch.exp(per_token_logps - old_per_token_logps)
# coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
# per_token_loss1 = coef_1 * advantages.unsqueeze(1)
# per_token_loss2 = coef_2 * advantages.unsqueeze(1)
# per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
# if self.beta != 0.0:
# per_token_loss = per_token_loss + self.beta * per_token_kl
# loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()
# # Log metrics
# mode = "eval" if self.control.should_evaluate else "train"
# if self.beta != 0.0:
# mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum()
# self._metrics[mode]["kl"].append(
# self.accelerator.gather_for_metrics(mean_kl).mean().item()
# )
# is_clipped = (per_token_loss1 < per_token_loss2).float()
# clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum()
# self._metrics[mode]["clip_ratio"].append(
# self.accelerator.gather_for_metrics(clip_ratio).mean().item()
# )
# return loss

View File

@@ -13,14 +13,66 @@ from torch.utils.data import DistributedSampler, Sampler
from torch.utils.hooks import RemovableHandle
from axolotl.monkeypatch.attention.ring_attn import (
RingAttnFunc,
get_ring_attn_group,
update_ring_attn_params,
)
from axolotl.utils.schemas.enums import RingAttnFunc
LOG = logging.getLogger(__name__)
def _handle_logits_to_keep(
logits_to_keep,
local_rank: int,
local_world_size: int,
ring_attn_func: RingAttnFunc,
total_seq_len: int,
):
"""
Handle logits_to_keep parameter for sequence parallelism.
Args:
logits_to_keep: Integer or tensor indicating which positions to compute logits
for.
local_rank: Rank in the sequence parallel group.
local_world_size: World size of the sequence parallel group.
ring_attn_func: Ring attention function being used.
total_seq_len: Full sequence length.
Returns:
Adjusted logits_to_keep appropriate for this rank's sharded sequence
"""
print("start of _handle_logits_to_keep")
print(dist.get_rank(), logits_to_keep)
# No transformation needed if logits_to_keep is None
if logits_to_keep is None:
return None
assert isinstance(
logits_to_keep, int
), "sequence parallelism currently only supports integer logits_to_keep"
assert ring_attn_func in [
RingAttnFunc.VARLEN_LLAMA3,
RingAttnFunc.BATCH_RING,
], "if specifying logits_to_keep, sequence parallelism currently only supports 'batch_ring' and 'varlen_llama3' `ring_attn_func`s"
# For standard sharding, each rank gets a contiguous chunk
chunk_size = total_seq_len // local_world_size
start_idx = local_rank * chunk_size
end_idx = start_idx + chunk_size
# Check if logits_to_keep is in this rank's range
if start_idx <= logits_to_keep < end_idx:
print("end of _handle_logits_to_keep")
print(dist.get_rank(), logits_to_keep - start_idx)
return logits_to_keep - start_idx
else:
print("end of _handle_logits_to_keep")
print(dist.get_rank(), -1)
return -1
def apply_sequence_parallelism(
batch: dict[str, torch.Tensor],
local_rank: int,
@@ -31,10 +83,10 @@ def apply_sequence_parallelism(
Apply sequence parallelism slicing to a batch.
Args:
batch: Batch dictionary (e.g., input_ids, attention_mask, etc.)
local_rank: Local rank in the sequence parallel group
local_world_size: World size of the sequence parallel group
ring_attn_func: The ring attention function to use
batch: Batch dictionary (e.g., input_ids, attention_mask, etc.).
local_rank: Local rank in the sequence parallel group.
local_world_size: World size of the sequence parallel group.
ring_attn_func: The ring attention function to use.
Returns:
Sliced batch dictionary.
@@ -47,12 +99,10 @@ def apply_sequence_parallelism(
total_seq_len = batch["input_ids"].size(1)
for key in batch:
if (
key in batch
and isinstance(batch[key], torch.Tensor)
isinstance(batch[key], torch.Tensor)
and batch[key].dim() > 1
and batch[key].size(1) == total_seq_len
):
if ring_attn_func in [
RingAttnFunc.VARLEN_LLAMA3,
RingAttnFunc.BATCH_RING,
@@ -77,6 +127,14 @@ def apply_sequence_parallelism(
dim=1,
).transpose(1, 2)
batch[key] = tensor[:, local_rank].contiguous()
if key == "logits_to_keep":
batch[key] = _handle_logits_to_keep(
logits_to_keep=batch[key],
local_rank=local_rank,
local_world_size=local_world_size,
ring_attn_func=ring_attn_func,
total_seq_len=total_seq_len,
)
return batch
@@ -204,8 +262,11 @@ class SequenceParallelContextManager:
# Forward post-hook to gather outputs
def sequence_parallel_post_hook(_, __, output):
print("start of sequence_parallel_post_hook")
# Gather the sharded outputs
return self.gather_outputs(output)
output = self.gather_outputs(output)
print("end of sequence_parallel_post_hook")
return output
# Register both hooks
self.hook_handles.append(

View File

@@ -9,7 +9,7 @@ from PIL.Image import Resampling
from transformers import TrainingArguments
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
from axolotl.utils.schemas.enums import RingAttnFunc
@dataclass

View File

@@ -27,6 +27,8 @@ pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transform
```yaml
plugins:
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true
```
## Supported Models

View File

@@ -28,7 +28,7 @@ class CutCrossEntropyArgs(BaseModel):
Input args for Cut Cross Entropy.
"""
cut_cross_entropy: Optional[bool] = True
cut_cross_entropy: Optional[bool] = None
@model_validator(mode="before")
@classmethod

View File

@@ -4,7 +4,6 @@
# flake8: noqa
from .patch import (
RingAttnFunc,
get_ring_attn_group,
register_ring_attn,
set_ring_attn_group,

View File

@@ -28,7 +28,7 @@ from transformers.modeling_flash_attention_utils import (
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
from axolotl.utils.schemas.enums import RingAttnFunc
RING_ATTN_FUNC_MAPPING = {
RingAttnFunc.BATCH_RING: ring_flash_attn_func,

View File

@@ -6,14 +6,13 @@ package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patc
their sequence parallel version of Flash Attention 2.
"""
from enum import Enum
import torch
import torch.distributed as dist
from accelerate.logging import get_logger
from axolotl.logging_config import configure_logging
from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
from axolotl.utils.schemas.enums import RingAttnFunc
configure_logging()
LOG = get_logger(__name__)
@@ -43,17 +42,6 @@ def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
RING_ATTN_GROUP = ring_attn_group
class RingAttnFunc(str, Enum):
"""Enum class for supported `ring-flash-attn` implementations"""
# VARLEN_RING = "varlen_ring"
# VARLEN_ZIGZAG = "varlen_zigzag"
VARLEN_LLAMA3 = "varlen_llama3"
BATCH_RING = "batch_ring"
BATCH_ZIGZAG = "batch_zigzag"
BATCH_STRIPE = "batch_stripe"
def register_ring_attn(
sequence_parallel_degree: int,
heads_k_stride: int | None,

View File

@@ -34,6 +34,7 @@ from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.models import load_model, load_processor, load_tokenizer
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.trainer import setup_trainer
try:
@@ -108,7 +109,7 @@ def setup_reference_model(
Reference model if needed for RL training, `None` otherwise.
"""
model_ref = None
if cfg.rl and cfg.rl != "orpo":
if cfg.rl and cfg.rl != RLType.ORPO:
if cfg.adapter and not cfg.rl_adapter_ref_model:
# use built-in trl autounwrap
LOG.debug("Passing model_ref: None to RL trainer")

View File

@@ -18,8 +18,9 @@ from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_main_process, zero_first
from axolotl.utils.models import load_tokenizer
from axolotl.utils.schemas.enums import RLType
LOG = logging.getLogger("axolotl")
LOG = logging.getLogger(__name__)
def _get_path(ds_hash, cfg):
@@ -80,7 +81,7 @@ def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
def drop_long_rl_seq(
sample, rl, tokenizer, sequence_len # pylint: disable=invalid-name
):
if rl in ("dpo", "ipo", "orpo", "simpo"):
if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO):
if not (
sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
):
@@ -100,7 +101,7 @@ def drop_long_rl_seq(
len_prompt + len_rejected
) <= sequence_len
if rl == "kto":
if rl is RLType.KTO:
if not (sample.get("prompt") and sample.get("completion")):
raise ValueError("Prompt and completion keys are required for KTO datasets")
@@ -114,7 +115,7 @@ def drop_long_rl_seq(
return (len_prompt + len_completion) <= sequence_len
if rl == "grpo":
if rl is RLType.GRPO:
return True
raise ValueError("Unknown RL type")
@@ -137,9 +138,9 @@ def load_prepare_preference_datasets(cfg):
if _type:
if isinstance(_type, DictDefault):
_type = "user_defined.default"
if _cfg.rl == "orpo":
if _cfg.rl is RLType.ORPO:
ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i)
elif _cfg.rl == "kto":
elif _cfg.rl is RLType.KTO:
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
else:
ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
@@ -150,7 +151,7 @@ def load_prepare_preference_datasets(cfg):
split_datasets[i] = map_dataset(
cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
)
elif _cfg.rl == "kto":
elif _cfg.rl is RLType.KTO:
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
map_kwargs = {}
if isinstance(ds_transform_fn, tuple):

View File

@@ -134,9 +134,10 @@ def prepare_dataset(cfg, tokenizer, processor=None, preprocess_iterable=None):
"csv", data_files=f.name, split="train", streaming=True
)
else:
iter_ds = load_dataset(
path, streaming=True, split=split, name=name, data_files=data_files
)
if is_local_main_process():
iter_ds = load_dataset(
path, streaming=True, split=split, name=name, data_files=data_files
)
if skip:
LOG.info(f"Skipping {skip} samples from the dataset")

View File

@@ -1,7 +1,5 @@
"""custom checkpointing utils"""
from functools import partial
from axolotl.utils.gradient_checkpointing.unsloth import (
Unsloth_Offloaded_Gradient_Checkpointer,
)
@@ -11,10 +9,6 @@ def hf_grad_checkpoint_offload_wrapper(
decoder_layer, *args, use_reentrant=None
): # pylint: disable=unused-argument
return Unsloth_Offloaded_Gradient_Checkpointer.apply(
(
decoder_layer.func.__self__
if isinstance(decoder_layer, partial)
else decoder_layer.__self__
),
decoder_layer.__self__,
*args,
)

View File

@@ -72,6 +72,7 @@ from axolotl.utils.distributed import (
from axolotl.utils.gradient_checkpointing import hf_grad_checkpoint_offload_wrapper
from axolotl.utils.lora_embeddings import get_linear_embedding_layers
from axolotl.utils.model_shard_quant import load_sharded_model, load_sharded_model_quant
from axolotl.utils.schemas.enums import RLType
LOG = logging.getLogger(__name__)
@@ -1340,7 +1341,7 @@ class ModelLoader:
# then the dpo trainer doesn't want the peft model loaded over it, it just wants the lora/peft config
if (
self.cfg.adapter
and self.cfg.rl in ["dpo", "ipo", "kto"]
and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
and not self.cfg.merge_lora
):
_, lora_config = load_lora(

View File

@@ -18,6 +18,7 @@ from pydantic import (
)
from transformers.utils.import_utils import is_torch_npu_available
from axolotl.utils.distributed import is_main_process
from axolotl.utils.schemas.datasets import (
DatasetConfig,
DPODataset,
@@ -27,7 +28,7 @@ from axolotl.utils.schemas.datasets import (
StepwiseSupervisedDataset,
)
from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters
from axolotl.utils.schemas.enums import ChatTemplate, RLType
from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
from axolotl.utils.schemas.integrations import (
CometConfig,
GradioConfig,
@@ -259,7 +260,7 @@ class AxolotlInputConfig(
sequence_parallel_degree: int | None = None
heads_k_stride: int | None = None
ring_attn_func: str | None = None
ring_attn_func: RingAttnFunc | None = None
special_tokens: SpecialTokensConfig | None = None
tokens: list[str] | None = None
@@ -718,9 +719,10 @@ class AxolotlInputConfig(
and data.get("eval_sample_packing") is None
and not data.get("eval_table_size")
):
LOG.info(
"explicitly setting `eval_sample_packing` to match `sample_packing`"
)
if is_main_process():
LOG.info(
"explicitly setting `eval_sample_packing` to match `sample_packing`"
)
data["eval_sample_packing"] = True
if (
@@ -782,7 +784,7 @@ class AxolotlInputConfig(
@model_validator(mode="after")
def check_simpo_warmup(self):
if self.rl == "simpo" and self.warmup_ratio:
if self.rl is RLType.SIMPO and self.warmup_ratio:
raise ValueError(
"warmup_ratio is not supported with the simpo trainer. Please use `warmup_steps` instead"
)
@@ -1177,14 +1179,15 @@ class AxolotlInputConfig(
# TODO: monkeypatch / callback to average losses correctly across SP ranks
# / fix gradient scaling across SP ranks. Losses, grads should be scaled
# according to the proportion of non-padding tokens per rank.
LOG.warning(
"Sequence parallelism (SP) is enabled with "
f"sequence_parallel_degree={self.sequence_parallel_degree}. "
"Please note that logged losses may differ slightly to the non-SP "
"losses due to transformers Trainer implementation details. "
"Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
"for more details."
)
if is_main_process():
LOG.warning(
"Sequence parallelism (SP) is enabled with "
f"sequence_parallel_degree={self.sequence_parallel_degree}. "
"Please note that logged losses may differ slightly to the non-SP "
"losses due to transformers Trainer implementation details. "
"Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
"for more details."
)
return self
@@ -1193,8 +1196,6 @@ class AxolotlInputConfig(
if getattr(self, "sequence_parallel_degree", 1) == 1:
return self
from axolotl.monkeypatch.attention.ring_attn.patch import RingAttnFunc
if self.ring_attn_func is not None:
valid_funcs = list(RingAttnFunc)
if self.ring_attn_func in valid_funcs:

View File

@@ -6,12 +6,12 @@ from enum import Enum
class RLType(str, Enum):
"""RL trainer type configuration subset"""
dpo = "dpo" # pylint: disable=invalid-name
grpo = "grpo" # pylint: disable=invalid-name
ipo = "ipo" # pylint: disable=invalid-name
orpo = "orpo" # pylint: disable=invalid-name
kto = "kto" # pylint: disable=invalid-name
simpo = "simpo" # pylint: disable=invalid-name
DPO = "dpo" # pylint: disable=invalid-name
GRPO = "grpo" # pylint: disable=invalid-name
IPO = "ipo" # pylint: disable=invalid-name
ORPO = "orpo" # pylint: disable=invalid-name
KTO = "kto" # pylint: disable=invalid-name
SIMPO = "simpo" # pylint: disable=invalid-name
class ChatTemplate(str, Enum):
@@ -53,3 +53,14 @@ class CustomSupportedOptimizers(str, Enum):
ao_adamw_fp8 = "ao_adamw_fp8" # pylint: disable=invalid-name
adopt_adamw = "adopt_adamw" # pylint: disable=invalid-name
muon = "muon" # pylint: disable=invalid-name
class RingAttnFunc(str, Enum):
"""Enum class for supported `ring-flash-attn` implementations"""
# VARLEN_RING = "varlen_ring"
# VARLEN_ZIGZAG = "varlen_zigzag"
VARLEN_LLAMA3 = "varlen_llama3"
BATCH_RING = "batch_ring"
BATCH_ZIGZAG = "batch_zigzag"
BATCH_STRIPE = "batch_stripe"

View File

@@ -528,13 +528,6 @@ def setup_torch_compile_env(cfg):
def setup_deepspeed_env(cfg, stage=None):
from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig
from axolotl.utils.distributed import distributed_state
if distributed_state and distributed_state.initialized:
raise RuntimeError(
"Distributed State already initialized before Deepspeed setup"
)
os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
if stage:

View File

@@ -1,77 +0,0 @@
"""
E2E tests for activation checkpointing
"""
import pytest
import transformers
from torch.utils.checkpoint import checkpoint
from axolotl.cli.args import TrainerCliArgs
from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault
from ..utils import check_model_output_exists
@pytest.fixture()
def fix_checkpoint_after_test():
yield
transformers.modeling_utils.checkpoint = checkpoint
class TestActivationCheckpointing:
"""
E2E tests for activation checkpointing
"""
def test_activation_checkpointing_offload(
self,
temp_dir,
fix_checkpoint_after_test, # pylint: disable=unused-argument,redefined-outer-name
):
# pylint: disable=duplicate-code
cfg = DictDefault(
{
"base_model": "HuggingFaceTB/SmolLM2-135M",
"sequence_len": 1024,
"val_set_size": 0.0,
"special_tokens": {
"pad_token": "<|endoftext|>",
"eos_token": "<|im_end|>",
},
"datasets": [
{
"chat_template": "chatml",
"path": "mlabonne/FineTome-100k",
"type": "chat_template",
"split": "train[:10%]",
"field_messages": "conversations",
"message_field_role": "from",
"message_field_content": "value",
},
],
"num_epochs": 1,
"max_steps": 5,
"micro_batch_size": 1,
"gradient_accumulation_steps": 1,
"output_dir": temp_dir,
"learning_rate": 0.00001,
"optimizer": "adamw_8bit",
"lr_scheduler": "cosine",
"flash_attention": True,
"sample_packing": True,
"bf16": True,
"save_safetensors": True,
"gradient_checkpointing": "offload",
}
)
cfg = validate_config(cfg)
normalize_config(cfg)
cli_args = TrainerCliArgs()
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
train(cfg=cfg, dataset_meta=dataset_meta)
check_model_output_exists(temp_dir, cfg)

View File

@@ -1,6 +1,4 @@
"""
E2E tests for mixtral
"""
"""E2E tests for mixtral"""
import logging
import os

View File

@@ -12,12 +12,12 @@ from accelerate.state import PartialState
from axolotl.core.trainers.mixins.sequence_parallel import apply_sequence_parallelism
from axolotl.monkeypatch.attention.ring_attn import (
RingAttnFunc,
get_ring_attn_group,
register_ring_attn,
set_ring_attn_group,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.schemas.enums import RingAttnFunc
@pytest.fixture
@@ -131,6 +131,11 @@ class TestConfigValidation:
# Mock the ring_flash_attn module
monkeypatch.setitem(sys.modules, "ring_flash_attn", MagicMock())
# Mock the is_main_process function to return True
monkeypatch.setattr(
"axolotl.utils.schemas.config.is_main_process", lambda: True
)
@pytest.fixture
def base_cfg(self):
"""Create a base configuration for testing."""