Compare commits: llama4-pat...transforme (23 Commits)
| Author | SHA1 | Date |
|---|---|---|
| | 0aa7c72c59 | |
| | e4307fb7d7 | |
| | dd8bad06d0 | |
| | de8a625dd7 | |
| | 51267ded04 | |
| | 756a0559c1 | |
| | 9a8e3e9c7b | |
| | 7e7180fa10 | |
| | 22c562533d | |
| | 16823e1de6 | |
| | e0420b3528 | |
| | 9f986f5e71 | |
| | f85861a0b2 | |
| | 630e40dd13 | |
| | bf9efe2a09 | |
| | 0dac2ddeac | |
| | a6c03217f5 | |
| | 59cd472504 | |
| | 9b89591ead | |
| | 31498d0230 | |
| | d25daebea9 | |
| | e0e5d9b1d6 | |
| | 8bbad21bfd | |
README.md (22 changed lines)
````diff
@@ -63,7 +63,7 @@ axolotl fetch examples
 axolotl fetch deepspeed_configs  # OPTIONAL
 ```
 
-Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
+Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
 
 ### Your First Fine-tune
 
@@ -78,7 +78,7 @@ axolotl fetch examples --dest path/to/folder
 axolotl train examples/llama-3/lora-1b.yml
 ```
 
-That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
+That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
 
 ## ✨ Key Features
 
@@ -91,20 +91,20 @@ That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github
 
 ## 📚 Documentation
 
-- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
-- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
-- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
-- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
-- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
-- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
-- [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
-- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
+- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
+- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
+- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
+- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
+- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
+- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
+- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
+- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
 
 ## 🤝 Getting Help
 
 - Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
 - Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
-- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
+- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
 - Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
 
 ## 🌟 Contributing

@@ -231,6 +231,7 @@ website:
       - docs/reward_modelling.qmd
       - docs/lr_groups.qmd
       - docs/lora_optims.qmd
+      - docs/dataset_loading.qmd
 
   - section: "Core Concepts"
     contents:
````
````diff
@@ -68,7 +68,7 @@ def run_cmd(cmd: str, run_folder: str):
 @app.function(
     image=cicd_image,
     gpu=GPU_CONFIG,
-    timeout=60 * 60,
+    timeout=90 * 60,
     cpu=8.0,
     memory=131072 * N_GPUS,
     volumes=VOLUME_CONFIG,
````
````diff
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
 WORKDIR /workspace
 
 RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
-    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
+    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
````
````diff
@@ -90,7 +90,7 @@ lora_on_cpu: true
 
 # List[str]. Add plugins to extend the pipeline.
 # See `src/axolotl/integrations` for the available plugins or doc below for more details.
-# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
+# https://docs.axolotl.ai/docs/custom_integrations.html
 plugins:
   # - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

@@ -109,7 +109,7 @@ datasets:
   preprocess_shards: # Optional[int] process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)
 
   name: # Optional[str] name of dataset configuration to load
-  train_on_split: train # Optional[str] name of dataset split to load from
+  split: train # Optional[str] name of dataset split to load from
   revision: # Optional[str] The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets.
   trust_remote_code: # Optional[bool] Trust remote code for untrusted source

@@ -165,7 +165,9 @@ datasets:
       content: value
   # ...
 
-  # Optional[Dict[str, List]]. Roles mapping in the messages. The default is:
+  # Optional[Dict[str, List]]. Roles mapping in the messages.
+  # The format is {target_role: [source_roles]}. All source roles will be mapped to the target role.
+  # The default is:
   roles:
     user: ["human", "user"]
     assistant: ["gpt", "assistant"]

@@ -392,7 +394,7 @@ lora_fan_in_fan_out: false
 
 # Apply custom LoRA autograd functions and activation function Triton kernels for
 # speed and memory savings
-# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
+# See: https://docs.axolotl.ai/docs/lora_optims.html
 lora_mlp_kernel: true
 lora_qkv_kernel: true
 lora_o_kernel: true

@@ -686,7 +688,7 @@ ddp_broadcast_buffers:
 # Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
 # E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
 # subsequences, or set to 4 to split into four equal-sized subsequences.
-# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
+# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
 sequence_parallel_degree:
 # Optional; strides across the key dimension. Larger values use more memory but should make training faster.
 # Must evenly divide the number of KV heads in your model.
````
````diff
@@ -13,6 +13,13 @@ As there are a lot of available options in Axolotl, this guide aims to provide a
 
 Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has its own dataset formats, which are described below.
 
+::: {.callout-tip}
+
+This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources.
+
+For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training).
+
+:::
 
 ## Pre-training
 
 When aiming to train on large corpora of text, pre-training is your go-to choice. Due to the size of these datasets, downloading them in full before training begins would be prohibitively time-consuming, so Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to load only a batch at a time into memory.

@@ -450,10 +457,7 @@ datasets:
     type: alpaca
 ```
 
-Axolotl supports many kinds of instruction datasets. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
-
-
-Reference: [Instruction Dataset Documentation](inst_tune.qmd).
+Axolotl supports many kinds of instruction datasets. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd), with their respective type and sample row format.
 
 #### Custom Instruct Prompt Format
````
docs/dataset_loading.qmd (new file, 276 lines)

---
title: Dataset Loading
description: Understanding how to load datasets from different sources
back-to-top-navigation: true
toc: true
toc-depth: 5
---

## Overview

Datasets can be loaded in a number of different ways, depending on how they are saved (the extension of the file) and where they are stored.

## Loading Datasets

We use the `datasets` library, with a mix of `load_dataset` and `load_from_disk`, to load datasets.

You may recognize the similarly named configs shared between `load_dataset` and the `datasets` section of the config file.

```yaml
datasets:
  - path:
    name:
    data_files:
    split:
    revision:
    trust_remote_code:
```

::: {.callout-tip}

Do not feel overwhelmed by the number of options here. Most of them are optional. In fact, the most commonly used are `path` and sometimes `data_files`.

:::

This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home.

For HuggingFace's guide to loading different dataset types, see [here](https://huggingface.co/docs/datasets/loading).

For full details on the config, see [config.qmd](config.qmd).
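To make the correspondence concrete, here is a minimal sketch of how the config keys above map onto a `datasets.load_dataset` call. The repository name and values are placeholders, not a real dataset:

```python
from datasets import load_dataset

# Each key under `datasets:` in the YAML config corresponds to the
# `load_dataset` argument of the same name (all values are placeholders).
dataset = load_dataset(
    path="org/dataset-name",      # path:
    name="default",               # name: dataset configuration to load
    data_files=["file1.jsonl"],   # data_files: specific files to load
    split="train",                # split: which split to return
    revision="main",              # revision: commit hash, tag, or branch
)
```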
::: {.callout-note}

You can set multiple datasets in the config file by adding more than one entry under `datasets`.

```yaml
datasets:
  - path: /path/to/your/dataset
  - path: /path/to/your/other/dataset
```

:::

### Local dataset

#### Files

Usually, to load a JSON file, you would do something like this:

```python
from datasets import load_dataset

dataset = load_dataset("json", data_files="data.json")
```

which translates to the following config:

```yaml
datasets:
  - path: json
    data_files: /path/to/your/file.jsonl
```

However, to make things easier, we have added a few shortcuts for loading local dataset files.

You can simply point `path` at the file or directory and set `ds_type` to the dataset type. The example below shows this for a JSON file:

```yaml
datasets:
  - path: /path/to/your/file.jsonl
    ds_type: json
```

This works for CSV, JSON, Parquet, and Arrow files.

::: {.callout-tip}

If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you can omit `ds_type` if you'd like.

:::
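Under the hood, this inference amounts to a lookup from file extension to `datasets` loader type. A rough sketch of the idea, for illustration only (this helper is hypothetical, not Axolotl's actual implementation):

```python
from pathlib import Path

# Hypothetical mapping from file extension to a `datasets` loader type.
EXTENSION_TO_DS_TYPE = {
    ".json": "json",
    ".jsonl": "json",
    ".csv": "csv",
    ".parquet": "parquet",
    ".arrow": "arrow",
}

def infer_ds_type(path: str) -> str:
    """Infer the dataset type from a file extension, mirroring the
    behavior described above for local files."""
    suffix = Path(path).suffix.lower()
    if suffix not in EXTENSION_TO_DS_TYPE:
        raise ValueError(f"Cannot infer dataset type for {path!r}; set `ds_type` explicitly.")
    return EXTENSION_TO_DS_TYPE[suffix]

print(infer_ds_type("/path/to/your/file.jsonl"))  # -> json
```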
#### Directory

If you're loading a directory, point `path` to the directory.

Then, you have two options:

##### Loading an entire directory

You do not need any additional configs. We will attempt to load in the following order (a rough sketch of this fallback follows the example below):

- datasets saved with `datasets.save_to_disk`
- the entire directory of files (such as with Parquet/Arrow files)

```yaml
datasets:
  - path: /path/to/your/directory
```
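A rough sketch of that fallback order in plain `datasets` code, assuming only the standard `load_from_disk`/`load_dataset` API (illustrative and simplified, not Axolotl's exact implementation):

```python
from datasets import load_dataset, load_from_disk

def load_directory(path: str):
    """Try a `save_to_disk` layout first, then fall back to treating the
    directory as a folder of data files (e.g. parquet/arrow)."""
    try:
        # Succeeds when the directory was written by `Dataset.save_to_disk`.
        return load_from_disk(path)
    except FileNotFoundError:
        # Otherwise let `datasets` load the directory's files directly.
        return load_dataset(path)
```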
##### Loading specific files in a directory

Provide `data_files` with the file or list of files to load.

```yaml
datasets:
  # single file
  - path: /path/to/your/directory
    ds_type: csv
    data_files: file1.csv

  # multiple files
  - path: /path/to/your/directory
    ds_type: json
    data_files:
      - file1.jsonl
      - file2.jsonl

  # multiple files for parquet
  - path: /path/to/your/directory
    ds_type: parquet
    data_files:
      - file1.parquet
      - file2.parquet
```

### HuggingFace Hub

The method you use to load the dataset depends on how the dataset was created: whether a folder was uploaded directly, or a HuggingFace Dataset was pushed.

::: {.callout-note}

If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag at the root level of the config file.

:::

#### Folder uploaded

This means that the dataset is a single file or several files uploaded directly to the Hub.

```yaml
datasets:
  - path: org/dataset-name
    data_files:
      - file1.jsonl
      - file2.jsonl
```

#### HuggingFace Dataset

This means that the dataset was created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`.

```yaml
datasets:
  - path: org/dataset-name
```

::: {.callout-note}

Depending on the dataset, some other configs may be required, such as `name`, `split`, `revision`, or `trust_remote_code`.

:::

### Remote Filesystems

Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.

::: {.callout-warning}

This is currently experimental. Please let us know if you run into any issues!

:::

The only difference between the providers is that you need to prepend the path with the respective protocol.

```yaml
datasets:
  # Single file
  - path: s3://bucket-name/path/to/your/file.jsonl

  # Directory
  - path: s3://bucket-name/path/to/your/directory
```

For directories, we load via `load_from_disk`.
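For reference, this is roughly what the equivalent plain `datasets` calls look like, assuming `s3fs` is installed. The bucket name is a placeholder, and the explicit `storage_options` values shown are placeholders for when you prefer not to rely on environment credentials:

```python
from datasets import load_dataset, load_from_disk

# Single remote file: the protocol prefix selects the fsspec filesystem.
dataset = load_dataset(
    "json",
    data_files="s3://bucket-name/path/to/your/file.jsonl",
)

# Directory previously written to S3 with `save_to_disk`; credentials may
# also be passed explicitly via `storage_options` (placeholder values).
dataset = load_from_disk(
    "s3://bucket-name/path/to/your/directory",
    storage_options={"key": "<access key id>", "secret": "<secret access key>"},
)
```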
#### S3

Prepend the path with `s3://`.

The credentials are pulled in the following order:

- the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables
- the `~/.aws/credentials` file
- for nodes on EC2, the IAM metadata provider

::: {.callout-note}

We assume you have credentials set up and are not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.

:::

Other environment variables that can be set can be found in the [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables).

#### GCS

Prepend the path with `gs://` or `gcs://`.

The credentials are loaded in the following order:

- gcloud credentials
- for nodes on GCP, the Google metadata service
- anonymous access

#### Azure

##### Gen 1

Prepend the path with `adl://`.

Ensure you have the following environment variables set:

- `AZURE_STORAGE_TENANT_ID`
- `AZURE_STORAGE_CLIENT_ID`
- `AZURE_STORAGE_CLIENT_SECRET`

##### Gen 2

Prepend the path with `abfs://` or `az://`.

Ensure you have the following environment variables set:

- `AZURE_STORAGE_ACCOUNT_NAME`
- `AZURE_STORAGE_ACCOUNT_KEY`

Other environment variables that can be set can be found in the [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials).

#### OCI

Prepend the path with `oci://`.

Credentials are read in the following order:

- the `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables
- when on an OCI resource, the resource principal

Other environment variables:

- `OCI_REGION_METADATA`

Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables).

### HTTPS

The path should start with `https://`.

```yaml
datasets:
  - path: https://path/to/your/dataset/file.jsonl
```

The file must be publicly accessible.

## Next steps

Now that you know how to load datasets, see the [dataset formats docs](dataset-formats) to learn how to map your specific dataset format into your target output format.
````diff
@@ -36,6 +36,9 @@ deepspeed: deepspeed_configs/zero1.json
 ### Usage {#sec-deepspeed-usage}
 
 ```{.bash}
+# Fetch deepspeed configs (if not already present)
+axolotl fetch deepspeed_configs
+
 # Passing arg via config
 axolotl train config.yml
 
@@ -48,10 +51,20 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
 We provide default configurations for:
 
 - ZeRO Stage 1 (`zero1.json`)
+- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
 - ZeRO Stage 2 (`zero2.json`)
 - ZeRO Stage 3 (`zero3.json`)
 - ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
 - ZeRO Stage 3 with bf16 and CPU offload params (`zero3_bf16_cpuoffload_params.json`)
 - ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)
 
-Choose based on your memory requirements and performance needs.
+::: {.callout-tip}
+
+For best performance, choose the configuration that offloads the least while still fitting into VRAM.
+
+Start from Stage 1 -> Stage 2 -> Stage 3.
+
+:::
 
 ## FSDP {#sec-fsdp}
````
````diff
@@ -9,6 +9,7 @@ format:
 ## Supported Models
 
 - [Mllama](#sec-mllama)
+- [Llama4](#sec-llama4)
 - [Pixtral](#sec-pixtral)
 - [Llava-1.5](#sec-llava-15)
 - [Mistral-Small-3.1](#sec-mistral-small-31)

@@ -63,6 +64,14 @@ base_model: meta-llama/Llama-3.2-11B-Vision-Instruct
 chat_template: llama3_2_vision
 ```
 
+### Llama4 {#sec-llama4}
+
+```yaml
+base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct
+
+chat_template: llama4
+```
+
 ### Pixtral {#sec-pixtral}
 
 ```yaml
````
````diff
@@ -530,7 +530,7 @@ trl:
 ```
 
 ```bash
-CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
+CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
 ```
 
 Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
````
````diff
@@ -8,7 +8,6 @@ tokenizer_type: GPT2Tokenizer
 trust_remote_code: true
 tokenizer_use_fast: true
 tokenizer_legacy: true
-strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:

@@ -4,7 +4,6 @@ base_model: cerebras/Cerebras-GPT-1.3B
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -4,7 +4,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 # huggingface repo
 chat_template: cohere

@@ -3,7 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -6,7 +6,6 @@ trust_remote_code: true
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -3,7 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca
````
examples/deepcoder/deepcoder-14B-preview-lora.yml (new file, 58 lines)

```yaml
base_model: agentica-org/DeepCoder-14B-Preview
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
```
examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml (new file, 58 lines)

```yaml
base_model: deepcogito/cogito-v1-preview-llama-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
```
examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml (new file, 58 lines)

```yaml
base_model: deepcogito/cogito-v1-preview-qwen-14B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    field_messages: messages
    message_property_mappings:
      role: role
      content: content

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
```
````diff
@@ -2,7 +2,6 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 trust_remote_code: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -6,7 +6,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 
 plugins:

@@ -11,7 +11,6 @@ trust_remote_code: true
 load_in_8bit: true
 load_in_4bit: false
 gptq: false
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -15,7 +15,6 @@ load_in_8bit: false
 # enable 4bit for QLoRA
 load_in_4bit: true
 gptq: false
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: QingyiSi/Alpaca-CoT

@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer
 # required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
 trust_remote_code: true
 gptq: false
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 # huggingface repo
 datasets:

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 # huggingface repo
 chat_template: gemma

@@ -5,7 +5,6 @@ num_labels: 1
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 reward_model: true
 chat_template: gemma

@@ -10,7 +10,6 @@ ddp_find_unused_parameters: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 # huggingface repo
 chat_template: gemma3

@@ -1,5 +1,4 @@
 base_model: google/gemma-3-4b-it
-strict: false
 
 load_in_4bit: true
 

@@ -1,6 +1,5 @@
 base_model: google/gemma-3-4b-it
 processor_type: AutoProcessor
-strict: false
 
 load_in_4bit: true
 
````
````diff
@@ -4,7 +4,6 @@ base_model: EleutherAI/gpt-j-6b
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -6,7 +6,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -5,7 +5,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 
 load_in_4bit: true
-strict: false
 use_tensorboard: true
 chat_template: jamba
 datasets:

@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -10,7 +10,6 @@ gptq_disable_exllama: true
 
 tokenizer_use_fast: true
 tokenizer_legacy: true
-strict: false
 push_dataset_to_hub:
 hf_use_auth_token: true
 datasets:

@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: yahma/alpaca-cleaned

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -5,7 +5,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -4,7 +4,6 @@ processor_type: AutoProcessor
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
-strict: false
 
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true

@@ -9,7 +9,6 @@ liger_rms_norm: true
 liger_glu_activation: true
 liger_fused_linear_cross_entropy: true
 
-strict: false
 
 chat_template: llama3
 datasets:

@@ -1,7 +1,6 @@
 base_model: NousResearch/Meta-Llama-3.1-8B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 chat_template: llama3
 rl: dpo

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 chat_template: llama3
 datasets:

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 chat_template: llama3
 rl: dpo

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -1,7 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -1,7 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -1,7 +1,6 @@
 base_model: NousResearch/Llama-3.2-1B
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -4,7 +4,6 @@ base_model: meta-llama/Llama-3.2-1B
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 rl: kto
 rl_beta: 0.5

@@ -4,7 +4,6 @@ base_model: NousResearch/Llama-3.2-1B
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: aaditya/alpaca_subset_1
````
examples/llama-4/README.md (new file, 16 lines)

# Llama 4 by Meta AI

## Available Examples

### Llama 4 Scout 17Bx16Experts (109B)

- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml)
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml)
- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml)

Our single-H100 implementation for Llama 4 Scout uses only 68.5GB VRAM for post-training with 4k context length @ 546 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-sft/runs/zic56rhd)

### Llama 4 Maverick 17Bx128Experts (400B)

- [Text Multi GPU QLoRA w/ FSDP1](./maverick-qlora-fsdp1.yaml)

Our 4xH100 implementation for Llama 4 Maverick uses 79.5GB VRAM/GPU for post-training with 4k context length @ 206 tokens/second. [WandB logs here.](https://wandb.ai/axolotl-ai/llama-sft/runs/siyvwuxc?nw=nwuserwinglian)
examples/llama-4/maverick-qlora-fsdp1.yaml (new file, 88 lines)

```yaml
base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head
  # - embed_tokens

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
```
````diff
@@ -1,13 +1,20 @@
-base_model: meta-llama/Llama-4-Scout-17B-16E
+base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
 model_type: Llama4ForConditionalGeneration
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
 
 strict: false
 
 # torch_compile: true
 plugins:
   - axolotl.integrations.liger.LigerPlugin
 
-adapter: lora
+liger_glu_activation: true
+liger_rms_norm: true
+liger_layer_norm: true
+
+llama4_linearized_experts: true
+load_in_4bit: true
+adapter: qlora
 lora_r: 32
 lora_alpha: 64
 lora_target_modules:

@@ -15,6 +22,12 @@ lora_target_modules:
   - self_attn.k_proj
   - self_attn.v_proj
   - self_attn.o_proj
+  - shared_expert.gate_proj
+  - shared_expert.up_proj
+  - shared_expert.down_proj
+  # - experts.gate_projs.[0-9]+$
+  # - experts.up_projs.[0-9]+$
+  # - experts.down_projs.[0-9]+$
 lora_modules_to_save:
   - lm_head
   - embed_tokens

@@ -37,38 +50,42 @@ sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 
 wandb_project:
 wandb_entity:
 wandb_watch:
 wandb_name:
 wandb_log_model:
 
 gradient_accumulation_steps: 1
 micro_batch_size: 1
 num_epochs: 1
-optimizer: adamw_torch_8bit
+optimizer: adamw_torch_fused
 lr_scheduler: cosine
 learning_rate: 2e-5
 
 bf16: true
 tf32: true
 
 # gradient_checkpointing: true
 # gradient_checkpointing_kwargs:
 #   use_reentrant: false
 logging_steps: 1
 flash_attention: true
 
 warmup_steps: 100
-evals_per_epoch: 2
+evals_per_epoch: 1
 saves_per_epoch: 1
 weight_decay: 0.0
 fsdp:
   - auto_wrap
   - full_shard
 fsdp_config:
-  fsdp_version: 2
-  fsdp_offload_params: false
-  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
-  fsdp_cpu_ram_efficient_loading: true
-  fsdp_state_dict_type: SHARDED_STATE_DICT
-  fsdp_reshard_after_forward: true
-  fsdp_activation_checkpointing: true
+  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
+  fsdp_limit_all_gathers: true
+  fsdp_sync_module_states: true
+  fsdp_offload_params: true
+  fsdp_use_orig_params: false
+  fsdp_cpu_ram_efficient_loading: true
+  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+  fsdp_state_dict_type: FULL_STATE_DICT
+  fsdp_sharding_strategy: FULL_SHARD
 special_tokens:
   pad_token: <|finetune_right_pad_id|>
````
examples/llama-4/scout-qlora-single-h100.yaml (new file, 85 lines)

```yaml
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head
  # - embed_tokens

lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096 # up to 8k will work on a single H100
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

warmup_steps: 20
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
```
examples/llama-4/scout-vision-qlora-fsdp.yaml (new file, 88 lines)

```yaml
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
processor_type: Llama4Processor
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

sequence_len: 4096

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true # use Axolotl's customized model
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  - vision_adapter.mlp.fc1
  - vision_adapter.mlp.fc2
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  - lm_head
  - embed_tokens

chat_template: llama4
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

warmup_steps: 100
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_activation_checkpointing: true
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot|>
```
````diff
@@ -1,6 +1,5 @@
 base_model: llava-hf/llava-1.5-7b-hf
 processor_type: AutoProcessor
-strict: false
 
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true

@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
 tokenizer_config: EleutherAI/gpt-neox-20b
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -6,7 +6,6 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
-strict: false
 
 unfrozen_parameters:
   - ^lm_head.weight$

@@ -4,7 +4,6 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -4,7 +4,6 @@ model_type: MistralForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -12,7 +12,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 chat_template: chatml
 rl: dpo

@@ -9,7 +9,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 rl: orpo
 orpo_alpha: 0.1

@@ -1,6 +1,5 @@
 base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
 processor_type: AutoProcessor
-strict: false
 
 load_in_8bit: true
 

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -9,7 +9,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -9,7 +9,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: tatsu-lab/alpaca

@@ -6,7 +6,6 @@ tokenizer_type: LlamaTokenizer
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
-strict: false
 
 unfrozen_parameters:
   - ^lm_head.weight$

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
 tokenizer_type: LlamaTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 push_dataset_to_hub:
 datasets:
   - path: teknium/GPT4-LLM-Cleaned

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 chat_template: phi_3
 datasets:

@@ -4,7 +4,6 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: garage-bAInd/Open-Platypus

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: garage-bAInd/Open-Platypus

@@ -4,7 +4,6 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: garage-bAInd/Open-Platypus

@@ -4,7 +4,6 @@ model_type: AutoModelForCausalLM
 tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
 # hub_model_id: username/custom_model_name
 
 chat_template: phi_3
-strict: false
 
 datasets:
   - path: garage-bAInd/Open-Platypus

@@ -1,6 +1,5 @@
 base_model: mistral-community/pixtral-12b
 processor_type: AutoProcessor
-strict: false
 
 # these 3 lines are needed for now to handle vision chat templates w images
 skip_prepare_dataset: true

@@ -9,7 +9,6 @@ trust_remote_code: true
 
 load_in_8bit: true
 load_in_4bit: false
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -9,7 +9,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -3,7 +3,6 @@ base_model: Qwen/Qwen1.5-MoE-A2.7B
 # hub_model_id: username/custom_model_name
 
 trust_remote_code: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test

@@ -6,7 +6,6 @@ trust_remote_code: true
 
 load_in_8bit: false
 load_in_4bit: true
-strict: false
 
 datasets:
   - path: mhenrichsen/alpaca_2k_test
````
Some files were not shown because too many files have changed in this diff.