Compare commits
35 Commits
fix/doc-ke
...
preprocess
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
f8e92407ff | ||
|
|
c12906134d | ||
|
|
8154d26614 | ||
|
|
fefcbc300d | ||
|
|
7d479348ee | ||
|
|
ce0259db13 | ||
|
|
2798817cf9 | ||
|
|
0e1b081e49 | ||
|
|
8df37ad91f | ||
|
|
9b74298328 | ||
|
|
ae8738aa87 | ||
|
|
ec52561a0c | ||
|
|
eadb16c709 | ||
|
|
9da730d6a4 | ||
|
|
32637fad00 | ||
|
|
f776f889a1 | ||
|
|
69eda209a6 | ||
|
|
b8c633aa97 | ||
|
|
682a9cf79b | ||
|
|
271b24cccc | ||
|
|
198d775d6d | ||
|
|
e4307fb7d7 | ||
|
|
dd8bad06d0 | ||
|
|
de8a625dd7 | ||
|
|
51267ded04 | ||
|
|
756a0559c1 | ||
|
|
9a8e3e9c7b | ||
|
|
7e7180fa10 | ||
|
|
22c562533d | ||
|
|
16823e1de6 | ||
|
|
e0420b3528 | ||
|
|
9f986f5e71 | ||
|
|
f85861a0b2 | ||
|
|
630e40dd13 | ||
|
|
bf9efe2a09 |
14
.coveragerc
Normal file
14
.coveragerc
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
[run]
|
||||||
|
source = axolotl
|
||||||
|
omit =
|
||||||
|
*/tests/*
|
||||||
|
setup.py
|
||||||
|
|
||||||
|
[report]
|
||||||
|
exclude_lines =
|
||||||
|
pragma: no cover
|
||||||
|
def __repr__
|
||||||
|
raise NotImplementedError
|
||||||
|
if __name__ == .__main__.:
|
||||||
|
pass
|
||||||
|
raise ImportError
|
||||||
2
.github/workflows/main.yml
vendored
2
.github/workflows/main.yml
vendored
@@ -29,7 +29,7 @@ jobs:
|
|||||||
cuda_version: 12.4.1
|
cuda_version: 12.4.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.6.0
|
pytorch: 2.6.0
|
||||||
axolotl_extras:
|
axolotl_extras: vllm
|
||||||
is_latest: true
|
is_latest: true
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
13
.github/workflows/tests.yml
vendored
13
.github/workflows/tests.yml
vendored
@@ -102,9 +102,16 @@ jobs:
|
|||||||
|
|
||||||
- name: Run tests
|
- name: Run tests
|
||||||
run: |
|
run: |
|
||||||
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
|
pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/ --cov=axolotl --cov-report=xml
|
||||||
pytest -v tests/patched/
|
pytest -v tests/patched/ --cov=axolotl --cov-append --cov-report=xml
|
||||||
pytest -v tests/cli/
|
pytest -v tests/cli/ --cov=axolotl --cov-append --cov-report=xml
|
||||||
|
|
||||||
|
- name: Upload coverage to Codecov
|
||||||
|
uses: codecov/codecov-action@v5
|
||||||
|
with:
|
||||||
|
files: ./coverage.xml
|
||||||
|
flags: unittests,pytorch-${{ matrix.pytorch_version }}
|
||||||
|
fail_ci_if_error: false
|
||||||
|
|
||||||
- name: cleanup pip cache
|
- name: cleanup pip cache
|
||||||
run: |
|
run: |
|
||||||
|
|||||||
23
README.md
23
README.md
@@ -9,6 +9,7 @@
|
|||||||
<p align="center">
|
<p align="center">
|
||||||
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
|
<img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
|
||||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
|
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
|
||||||
|
<a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
|
||||||
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
|
<a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
|
||||||
<br/>
|
<br/>
|
||||||
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
|
<a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
|
||||||
@@ -63,7 +64,7 @@ axolotl fetch examples
|
|||||||
axolotl fetch deepspeed_configs # OPTIONAL
|
axolotl fetch deepspeed_configs # OPTIONAL
|
||||||
```
|
```
|
||||||
|
|
||||||
Other installation approaches are described [here](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html).
|
Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).
|
||||||
|
|
||||||
### Your First Fine-tune
|
### Your First Fine-tune
|
||||||
|
|
||||||
@@ -78,7 +79,7 @@ axolotl fetch examples --dest path/to/folder
|
|||||||
axolotl train examples/llama-3/lora-1b.yml
|
axolotl train examples/llama-3/lora-1b.yml
|
||||||
```
|
```
|
||||||
|
|
||||||
That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/getting-started.html) for a more detailed walkthrough.
|
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
|
||||||
|
|
||||||
## ✨ Key Features
|
## ✨ Key Features
|
||||||
|
|
||||||
@@ -91,20 +92,20 @@ That's it! Check out our [Getting Started Guide](https://axolotl-ai-cloud.github
|
|||||||
|
|
||||||
## 📚 Documentation
|
## 📚 Documentation
|
||||||
|
|
||||||
- [Installation Options](https://axolotl-ai-cloud.github.io/axolotl/docs/installation.html) - Detailed setup instructions for different environments
|
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
|
||||||
- [Configuration Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html) - Full configuration options and examples
|
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
|
||||||
- [Dataset Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/) - Supported formats and how to use them
|
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
|
||||||
- [Multi-GPU Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-gpu.html)
|
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||||
- [Multi-Node Training](https://axolotl-ai-cloud.github.io/axolotl/docs/multi-node.html)
|
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||||
- [Multipacking](https://axolotl-ai-cloud.github.io/axolotl/docs/multipack.html)
|
- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
|
||||||
- [API Reference](https://axolotl-ai-cloud.github.io/axolotl/docs/api/) - Auto-generated code documentation
|
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
|
||||||
- [FAQ](https://axolotl-ai-cloud.github.io/axolotl/docs/faq.html) - Frequently asked questions
|
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions
|
||||||
|
|
||||||
## 🤝 Getting Help
|
## 🤝 Getting Help
|
||||||
|
|
||||||
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
|
- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
|
||||||
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
|
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
|
||||||
- Read our [Debugging Guide](https://axolotl-ai-cloud.github.io/axolotl/docs/debugging.html)
|
- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
|
||||||
- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
|
- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options
|
||||||
|
|
||||||
## 🌟 Contributing
|
## 🌟 Contributing
|
||||||
|
|||||||
63
cicd/cicd.sh
63
cicd/cicd.sh
@@ -3,10 +3,59 @@ set -e
|
|||||||
|
|
||||||
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"
|
||||||
|
|
||||||
pytest -v --durations=10 -n8 --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli /workspace/axolotl/tests/
|
# Run unit tests with initial coverage report
|
||||||
pytest -v --durations=10 /workspace/axolotl/tests/e2e/patched/lora_kernels # running these with the other patches causes a failure
|
pytest -v --durations=10 -n8 \
|
||||||
pytest -v --durations=10 --ignore=tests/e2e/patched/lora_kernels /workspace/axolotl/tests/e2e/patched
|
--ignore=tests/e2e/ \
|
||||||
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/solo/
|
--ignore=tests/patched/ \
|
||||||
pytest -v --durations=10 /workspace/axolotl/tests/e2e/integrations/
|
--ignore=tests/cli \
|
||||||
pytest -v --durations=10 /workspace/axolotl/tests/cli
|
/workspace/axolotl/tests/ \
|
||||||
pytest -v --durations=10 --ignore=tests/e2e/solo/ --ignore=tests/e2e/patched/ --ignore=tests/e2e/multigpu/ --ignore=tests/e2e/integrations/ --ignore=tests/cli /workspace/axolotl/tests/e2e/
|
--cov=axolotl \
|
||||||
|
--cov-report=xml:coverage.xml
|
||||||
|
|
||||||
|
# Run lora kernels tests with coverage append
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
/workspace/axolotl/tests/e2e/patched/lora_kernels \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run patched tests excluding lora kernels with coverage append
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
--ignore=tests/e2e/patched/lora_kernels \
|
||||||
|
/workspace/axolotl/tests/e2e/patched \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run solo tests with coverage append
|
||||||
|
pytest -v --durations=10 -n1 \
|
||||||
|
/workspace/axolotl/tests/e2e/solo/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run integration tests with coverage append
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
/workspace/axolotl/tests/e2e/integrations/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
pytest -v --durations=10 /workspace/axolotl/tests/cli \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append
|
||||||
|
|
||||||
|
# Run remaining e2e tests with coverage append and final report
|
||||||
|
pytest -v --durations=10 \
|
||||||
|
--ignore=tests/e2e/solo/ \
|
||||||
|
--ignore=tests/e2e/patched/ \
|
||||||
|
--ignore=tests/e2e/multigpu/ \
|
||||||
|
--ignore=tests/e2e/integrations/ \
|
||||||
|
--ignore=tests/cli \
|
||||||
|
/workspace/axolotl/tests/e2e/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append \
|
||||||
|
--cov-report=xml:coverage.xml
|
||||||
|
|
||||||
|
# Upload coverage to Codecov
|
||||||
|
if [ -f e2e-coverage.xml ]; then
|
||||||
|
codecov -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION}
|
||||||
|
else
|
||||||
|
echo "Coverage file not found. Coverage report may have failed."
|
||||||
|
fi
|
||||||
|
|||||||
@@ -68,7 +68,7 @@ def run_cmd(cmd: str, run_folder: str):
|
|||||||
@app.function(
|
@app.function(
|
||||||
image=cicd_image,
|
image=cicd_image,
|
||||||
gpu=GPU_CONFIG,
|
gpu=GPU_CONFIG,
|
||||||
timeout=60 * 60,
|
timeout=90 * 60,
|
||||||
cpu=8.0,
|
cpu=8.0,
|
||||||
memory=131072 * N_GPUS,
|
memory=131072 * N_GPUS,
|
||||||
volumes=VOLUME_CONFIG,
|
volumes=VOLUME_CONFIG,
|
||||||
|
|||||||
@@ -4,3 +4,22 @@ set -e
|
|||||||
# only run one test at a time so as not to OOM the GPU
|
# only run one test at a time so as not to OOM the GPU
|
||||||
pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
|
pytest -v --durations=10 -n2 /workspace/axolotl/tests/e2e/multigpu/ --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
|
||||||
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
|
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/
|
||||||
|
|
||||||
|
# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
|
||||||
|
pytest -v -n2 \
|
||||||
|
--ignore=/workspace/axolotl/tests/e2e/multigpu/solo/
|
||||||
|
/workspace/axolotl/tests/e2e/multigpu/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-report=xml:multigpu-coverage.xml
|
||||||
|
|
||||||
|
pytest -v --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/solo/ \
|
||||||
|
--cov=axolotl \
|
||||||
|
--cov-append \
|
||||||
|
--cov-report=xml:multigpu-coverage.xml
|
||||||
|
|
||||||
|
# Upload coverage to Codecov
|
||||||
|
if [ -f multigpu-coverage.xml ]; then
|
||||||
|
codecov -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION}
|
||||||
|
else
|
||||||
|
echo "Coverage file not found. Coverage report may have failed."
|
||||||
|
fi
|
||||||
|
|||||||
51
codecov.yml
Normal file
51
codecov.yml
Normal file
@@ -0,0 +1,51 @@
|
|||||||
|
codecov:
|
||||||
|
require_ci_to_pass: yes
|
||||||
|
|
||||||
|
coverage:
|
||||||
|
precision: 2
|
||||||
|
round: down
|
||||||
|
range: "70...100"
|
||||||
|
status:
|
||||||
|
project:
|
||||||
|
default:
|
||||||
|
# basic
|
||||||
|
target: auto
|
||||||
|
threshold: 0%
|
||||||
|
base: auto
|
||||||
|
# advanced
|
||||||
|
branches: null
|
||||||
|
if_no_uploads: error
|
||||||
|
if_not_found: success
|
||||||
|
if_ci_failed: error
|
||||||
|
only_pulls: false
|
||||||
|
flags: null
|
||||||
|
paths: null
|
||||||
|
patch:
|
||||||
|
default:
|
||||||
|
# basic
|
||||||
|
target: auto
|
||||||
|
threshold: 0%
|
||||||
|
base: auto
|
||||||
|
# advanced
|
||||||
|
branches: null
|
||||||
|
if_no_uploads: error
|
||||||
|
if_not_found: success
|
||||||
|
if_ci_failed: error
|
||||||
|
only_pulls: false
|
||||||
|
flags: null
|
||||||
|
paths: null
|
||||||
|
|
||||||
|
parsers:
|
||||||
|
gcov:
|
||||||
|
branch_detection:
|
||||||
|
conditional: yes
|
||||||
|
loop: yes
|
||||||
|
method: no
|
||||||
|
macro: no
|
||||||
|
|
||||||
|
comment:
|
||||||
|
layout: "reach,diff,flags,files,footer"
|
||||||
|
behavior: default
|
||||||
|
require_changes: no
|
||||||
|
require_base: no
|
||||||
|
require_head: yes
|
||||||
@@ -90,7 +90,7 @@ lora_on_cpu: true
|
|||||||
|
|
||||||
# List[str]. Add plugins to extend the pipeline.
|
# List[str]. Add plugins to extend the pipeline.
|
||||||
# See `src/axolotl/integrations` for the available plugins or doc below for more details.
|
# See `src/axolotl/integrations` for the available plugins or doc below for more details.
|
||||||
# https://axolotl-ai-cloud.github.io/axolotl/docs/custom_integrations.html
|
# https://docs.axolotl.ai/docs/custom_integrations.html
|
||||||
plugins:
|
plugins:
|
||||||
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
# - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|
||||||
@@ -394,7 +394,7 @@ lora_fan_in_fan_out: false
|
|||||||
|
|
||||||
# Apply custom LoRA autograd functions and activation function Triton kernels for
|
# Apply custom LoRA autograd functions and activation function Triton kernels for
|
||||||
# speed and memory savings
|
# speed and memory savings
|
||||||
# See: https://axolotl-ai-cloud.github.io/axolotl/docs/lora_optims.html
|
# See: https://docs.axolotl.ai/docs/lora_optims.html
|
||||||
lora_mlp_kernel: true
|
lora_mlp_kernel: true
|
||||||
lora_qkv_kernel: true
|
lora_qkv_kernel: true
|
||||||
lora_o_kernel: true
|
lora_o_kernel: true
|
||||||
@@ -688,11 +688,14 @@ ddp_broadcast_buffers:
|
|||||||
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
|
# Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM.
|
||||||
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
|
# E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized
|
||||||
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
# subsequences, or set to 4 to split into four equal-sized subsequences.
|
||||||
# See https://axolotl-ai-cloud.github.io/axolotl/docs/sequence_parallelism.html for more details.
|
# See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details.
|
||||||
sequence_parallel_degree:
|
sequence_parallel_degree:
|
||||||
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
# Must evenly divide the number of KV heads in your model.
|
# Must evenly divide the number of KV heads in your model.
|
||||||
heads_k_stride: 1
|
heads_k_stride: 1
|
||||||
|
# One of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to "varlen_llama3"
|
||||||
|
# in the sample packing case, and "batch_ring" in the non-sample packing case.
|
||||||
|
ring_attn_func:
|
||||||
|
|
||||||
# Path to torch distx for optim 'adamw_anyprecision'
|
# Path to torch distx for optim 'adamw_anyprecision'
|
||||||
torchdistx_path:
|
torchdistx_path:
|
||||||
|
|||||||
@@ -457,10 +457,7 @@ datasets:
|
|||||||
type: alpaca
|
type: alpaca
|
||||||
```
|
```
|
||||||
|
|
||||||
Axolotl supports many kinds of instruction dataset. All of them can be found here (https://axolotl-ai-cloud.github.io/axolotl/docs/dataset-formats/inst_tune.html) with their respective type and sample row format.
|
Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.
|
||||||
|
|
||||||
|
|
||||||
Reference: [Instruction Dataset Documentation](inst_tune.qmd).
|
|
||||||
|
|
||||||
#### Custom Instruct Prompt Format
|
#### Custom Instruct Prompt Format
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,9 @@ deepspeed: deepspeed_configs/zero1.json
|
|||||||
### Usage {#sec-deepspeed-usage}
|
### Usage {#sec-deepspeed-usage}
|
||||||
|
|
||||||
```{.bash}
|
```{.bash}
|
||||||
|
# Fetch deepspeed configs (if not already present)
|
||||||
|
axolotl fetch deepspeed_configs
|
||||||
|
|
||||||
# Passing arg via config
|
# Passing arg via config
|
||||||
axolotl train config.yml
|
axolotl train config.yml
|
||||||
|
|
||||||
@@ -48,10 +51,20 @@ axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
|
|||||||
We provide default configurations for:
|
We provide default configurations for:
|
||||||
|
|
||||||
- ZeRO Stage 1 (`zero1.json`)
|
- ZeRO Stage 1 (`zero1.json`)
|
||||||
|
- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
|
||||||
- ZeRO Stage 2 (`zero2.json`)
|
- ZeRO Stage 2 (`zero2.json`)
|
||||||
- ZeRO Stage 3 (`zero3.json`)
|
- ZeRO Stage 3 (`zero3.json`)
|
||||||
|
- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
|
||||||
|
- ZeRO Stage 3 with bf16 and CPU offload params(`zero3_bf16_cpuoffload_params.json`)
|
||||||
|
- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)
|
||||||
|
|
||||||
Choose based on your memory requirements and performance needs.
|
::: {.callout-tip}
|
||||||
|
|
||||||
|
Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.
|
||||||
|
|
||||||
|
Start from Stage 1 -> Stage 2 -> Stage 3.
|
||||||
|
|
||||||
|
:::
|
||||||
|
|
||||||
## FSDP {#sec-fsdp}
|
## FSDP {#sec-fsdp}
|
||||||
|
|
||||||
|
|||||||
@@ -164,7 +164,7 @@ Here is an example of a multi-modal dataset:
|
|||||||
{
|
{
|
||||||
"role": "user",
|
"role": "user",
|
||||||
"content": [
|
"content": [
|
||||||
{"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
|
{"type": "image", "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
|
||||||
{"type": "text", "text": "Describe this image in detail."}
|
{"type": "text", "text": "Describe this image in detail."}
|
||||||
]
|
]
|
||||||
},
|
},
|
||||||
|
|||||||
@@ -530,7 +530,7 @@ trl:
|
|||||||
```
|
```
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm_serve grpo.yaml
|
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
|
||||||
```
|
```
|
||||||
|
|
||||||
Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
|
Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:
|
||||||
|
|||||||
@@ -27,6 +27,9 @@ To enable sequence parallelism, add the following to your configuration file:
|
|||||||
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
|
sequence_parallel_degree: 4 # Split sequences across 4 GPUs
|
||||||
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
|
||||||
heads_k_stride: 1
|
heads_k_stride: 1
|
||||||
|
# Optional; one of "varlen_llama3", "batch_ring", "batch_zigzag", "batch_stripe". Defaults to
|
||||||
|
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
|
||||||
|
ring_attn_func:
|
||||||
```
|
```
|
||||||
|
|
||||||
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
The `sequence_parallel_degree` should be a divisor of the total number of GPUs. For example:
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ tokenizer_type: GPT2Tokenizer
|
|||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
tokenizer_use_fast: true
|
tokenizer_use_fast: true
|
||||||
tokenizer_legacy: true
|
tokenizer_legacy: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
hf_use_auth_token: true
|
hf_use_auth_token: true
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: cerebras/Cerebras-GPT-1.3B
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: CodeLlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
chat_template: cohere
|
chat_template: cohere
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ base_model: LnL-AI/dbrx-base-converted-v2
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
58
examples/deepcoder/deepcoder-14B-preview-lora.yml
Normal file
58
examples/deepcoder/deepcoder-14B-preview-lora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: agentica-org/DeepCoder-14B-Preview
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: fozziethebeat/alpaca_messages_2k_test
|
||||||
|
type: chat_template
|
||||||
|
field_messages: messages
|
||||||
|
message_property_mappings:
|
||||||
|
role: role
|
||||||
|
content: content
|
||||||
|
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 4
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
58
examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
Normal file
58
examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: deepcogito/cogito-v1-preview-llama-3B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: fozziethebeat/alpaca_messages_2k_test
|
||||||
|
type: chat_template
|
||||||
|
field_messages: messages
|
||||||
|
message_property_mappings:
|
||||||
|
role: role
|
||||||
|
content: content
|
||||||
|
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
58
examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
Normal file
58
examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
Normal file
@@ -0,0 +1,58 @@
|
|||||||
|
base_model: deepcogito/cogito-v1-preview-qwen-14B
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
load_in_8bit: true
|
||||||
|
load_in_4bit: false
|
||||||
|
strict: false
|
||||||
|
|
||||||
|
datasets:
|
||||||
|
- path: fozziethebeat/alpaca_messages_2k_test
|
||||||
|
type: chat_template
|
||||||
|
field_messages: messages
|
||||||
|
message_property_mappings:
|
||||||
|
role: role
|
||||||
|
content: content
|
||||||
|
|
||||||
|
dataset_prepared_path:
|
||||||
|
val_set_size: 0.05
|
||||||
|
output_dir: ./outputs/lora-out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
eval_sample_packing: false
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
adapter: lora
|
||||||
|
lora_model_dir:
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 16
|
||||||
|
lora_dropout: 0.05
|
||||||
|
lora_target_linear: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 2
|
||||||
|
micro_batch_size: 2
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_bnb_8bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 0.0002
|
||||||
|
|
||||||
|
bf16: auto
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
gradient_checkpointing: true
|
||||||
|
resume_from_checkpoint:
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
@@ -2,7 +2,6 @@ base_model: deepseek-ai/DeepSeek-V2-Lite
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
|
|
||||||
plugins:
|
plugins:
|
||||||
|
|||||||
@@ -11,7 +11,6 @@ trust_remote_code: true
|
|||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
gptq: false
|
gptq: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -15,7 +15,6 @@ load_in_8bit: false
|
|||||||
# enable 4bit for QLoRA
|
# enable 4bit for QLoRA
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
gptq: false
|
gptq: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: QingyiSi/Alpaca-CoT
|
- path: QingyiSi/Alpaca-CoT
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
gptq: false
|
gptq: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -8,7 +8,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
chat_template: gemma
|
chat_template: gemma
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ num_labels: 1
|
|||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
reward_model: true
|
reward_model: true
|
||||||
chat_template: gemma
|
chat_template: gemma
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ ddp_find_unused_parameters: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
# huggingface repo
|
# huggingface repo
|
||||||
chat_template: gemma3
|
chat_template: gemma3
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
base_model: google/gemma-3-4b-it
|
base_model: google/gemma-3-4b-it
|
||||||
strict: false
|
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
base_model: google/gemma-3-4b-it
|
base_model: google/gemma-3-4b-it
|
||||||
processor_type: AutoProcessor
|
processor_type: AutoProcessor
|
||||||
strict: false
|
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
|
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: EleutherAI/gpt-j-6b
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
use_tensorboard: true
|
use_tensorboard: true
|
||||||
chat_template: jamba
|
chat_template: jamba
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -10,7 +10,6 @@ gptq_disable_exllama: true
|
|||||||
|
|
||||||
tokenizer_use_fast: true
|
tokenizer_use_fast: true
|
||||||
tokenizer_legacy: true
|
tokenizer_legacy: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
hf_use_auth_token: true
|
hf_use_auth_token: true
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: yahma/alpaca-cleaned
|
- path: yahma/alpaca-cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ processor_type: AutoProcessor
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
skip_prepare_dataset: true
|
skip_prepare_dataset: true
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ liger_rms_norm: true
|
|||||||
liger_glu_activation: true
|
liger_glu_activation: true
|
||||||
liger_fused_linear_cross_entropy: true
|
liger_fused_linear_cross_entropy: true
|
||||||
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
base_model: NousResearch/Meta-Llama-3.1-8B
|
base_model: NousResearch/Meta-Llama-3.1-8B
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
rl: dpo
|
rl: dpo
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: llama3
|
chat_template: llama3
|
||||||
rl: dpo
|
rl: dpo
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
base_model: NousResearch/Llama-3.2-1B
|
base_model: NousResearch/Llama-3.2-1B
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
base_model: NousResearch/Llama-3.2-1B
|
base_model: NousResearch/Llama-3.2-1B
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -1,7 +1,6 @@
|
|||||||
base_model: NousResearch/Llama-3.2-1B
|
base_model: NousResearch/Llama-3.2-1B
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: meta-llama/Llama-3.2-1B
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
rl: kto
|
rl: kto
|
||||||
rl_beta: 0.5
|
rl_beta: 0.5
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ base_model: NousResearch/Llama-3.2-1B
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer # PreTrainedTokenizerFast
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: aaditya/alpaca_subset_1
|
- path: aaditya/alpaca_subset_1
|
||||||
|
|||||||
28
examples/llama-4/README.md
Normal file
28
examples/llama-4/README.md
Normal file
@@ -0,0 +1,28 @@
|
|||||||
|
# Llama 4 by Meta AI
|
||||||
|
|
||||||
|
## Flash Attention vs Flex Attention
|
||||||
|
|
||||||
|
While Flash Attention to support is "enabled" for Llama-4, the upstream implementation is not correct and usage of Flex Attention is recommended.
|
||||||
|
|
||||||
|
## Available Examples
|
||||||
|
|
||||||
|
### Llama 4 Scout 17Bx16Experts (109B)
|
||||||
|
|
||||||
|
Flex Attention
|
||||||
|
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
|
||||||
|
- [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)
|
||||||
|
|
||||||
|
[//]: # (Flash Attention (Do not use))
|
||||||
|
|
||||||
|
[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1](./scout-vision-qlora-fsdp.yaml))
|
||||||
|
|
||||||
|
[//]: # (- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100.yaml))
|
||||||
|
|
||||||
|
[//]: # (- [Text Multi GPU QLoRA w/ FSDP1](./scout-qlora-fsdp1.yaml))
|
||||||
|
|
||||||
|
Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
|
||||||
|
Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k contenxt length @ 280tps/gpu, [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)
|
||||||
|
|
||||||
|
### Llama 4 Maverick 17Bx128Experts (400B)
|
||||||
|
|
||||||
|
Coming Soon
|
||||||
88
examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
Normal file
88
examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_fused
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
gradient_checkpointing: offload
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
|
||||||
|
warmup_steps: 20
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_limit_all_gathers: true
|
||||||
|
fsdp_sync_module_states: true
|
||||||
|
fsdp_offload_params: true
|
||||||
|
fsdp_use_orig_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
@@ -3,7 +3,6 @@ model_type: Llama4ForConditionalGeneration
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
strict: false
|
|
||||||
|
|
||||||
# torch_compile: true
|
# torch_compile: true
|
||||||
plugins:
|
plugins:
|
||||||
85
examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
Normal file
85
examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
lora_mlp_kernel: true
|
||||||
|
lora_qkv_kernel: true
|
||||||
|
lora_o_kernel: true
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096 # up to 8k will work on a single H100
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
wandb_project:
|
||||||
|
wandb_entity:
|
||||||
|
wandb_watch:
|
||||||
|
wandb_name:
|
||||||
|
wandb_log_model:
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
gradient_checkpointing: offload
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
|
||||||
|
warmup_steps: 20
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
88
examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
Normal file
88
examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
processor_type: Llama4Processor
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
|
||||||
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false
|
||||||
|
sample_packing: false
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true # use Axolotl's customized model
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
- vision_adapter.mlp.fc1
|
||||||
|
- vision_adapter.mlp.fc2
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
- lm_head
|
||||||
|
- embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
|
type: chat_template
|
||||||
|
split: train[:1%]
|
||||||
|
field_messages: messages
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 2e-5
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flash_attention: true
|
||||||
|
|
||||||
|
warmup_steps: 100
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_limit_all_gathers: true
|
||||||
|
fsdp_sync_module_states: true
|
||||||
|
fsdp_offload_params: true
|
||||||
|
fsdp_use_orig_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_state_dict_type: FULL_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
fsdp_activation_checkpointing: true
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
@@ -1,13 +1,18 @@
|
|||||||
base_model: meta-llama/Llama-4-Scout-17B-16E
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
model_type: Llama4ForConditionalGeneration
|
model_type: Llama4ForConditionalGeneration
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
strict: false
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
# torch_compile: true
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
adapter: lora
|
llama4_linearized_experts: true
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 64
|
lora_alpha: 64
|
||||||
lora_target_modules:
|
lora_target_modules:
|
||||||
@@ -15,9 +20,15 @@ lora_target_modules:
|
|||||||
- self_attn.k_proj
|
- self_attn.k_proj
|
||||||
- self_attn.v_proj
|
- self_attn.v_proj
|
||||||
- self_attn.o_proj
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
lora_modules_to_save:
|
lora_modules_to_save:
|
||||||
- lm_head
|
# - lm_head
|
||||||
- embed_tokens
|
# - embed_tokens
|
||||||
|
|
||||||
chat_template: llama4
|
chat_template: llama4
|
||||||
datasets:
|
datasets:
|
||||||
@@ -38,23 +49,23 @@ sample_packing: true
|
|||||||
pad_to_sequence_len: true
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
gradient_accumulation_steps: 1
|
gradient_accumulation_steps: 1
|
||||||
micro_batch_size: 1
|
micro_batch_size: 2
|
||||||
num_epochs: 1
|
num_epochs: 3
|
||||||
optimizer: adamw_torch_8bit
|
optimizer: adamw_torch_4bit
|
||||||
lr_scheduler: cosine
|
lr_scheduler: cosine
|
||||||
learning_rate: 2e-5
|
learning_rate: 1e-4
|
||||||
|
|
||||||
bf16: true
|
bf16: true
|
||||||
tf32: true
|
tf32: true
|
||||||
|
|
||||||
# gradient_checkpointing: true
|
|
||||||
# gradient_checkpointing_kwargs:
|
|
||||||
# use_reentrant: false
|
|
||||||
logging_steps: 1
|
logging_steps: 1
|
||||||
flash_attention: true
|
flex_attention: true
|
||||||
|
flex_attn_compile_kwargs:
|
||||||
|
dynamic: false
|
||||||
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
warmup_steps: 100
|
warmup_steps: 10
|
||||||
evals_per_epoch: 2
|
evals_per_epoch: 1
|
||||||
saves_per_epoch: 1
|
saves_per_epoch: 1
|
||||||
weight_decay: 0.0
|
weight_decay: 0.0
|
||||||
fsdp:
|
fsdp:
|
||||||
85
examples/llama-4/scout-qlora-single-h100-flex.yaml
Normal file
85
examples/llama-4/scout-qlora-single-h100-flex.yaml
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
cut_cross_entropy: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true # needed with custom linearized experts model
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
# - experts.gate_projs.[0-9]+$ # optionally train the moe experts
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
# - lm_head # needed if modifying vocabulary
|
||||||
|
# - embed_tokens
|
||||||
|
|
||||||
|
lora_mlp_kernel: true
|
||||||
|
lora_qkv_kernel: true
|
||||||
|
lora_o_kernel: true
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: mlabonne/FineTome-100k
|
||||||
|
type: chat_template
|
||||||
|
split: train[:20%]
|
||||||
|
field_messages: conversations
|
||||||
|
message_property_mappings:
|
||||||
|
role: from
|
||||||
|
content: value
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
sequence_len: 4096 # up to 8k will work on a single H100
|
||||||
|
sample_packing: true
|
||||||
|
pad_to_sequence_len: true
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
torch_compile: true
|
||||||
|
flex_attention: true
|
||||||
|
flex_attn_compile_kwargs:
|
||||||
|
dynamic: false
|
||||||
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
|
gradient_checkpointing: offload
|
||||||
|
gradient_checkpointing_kwargs:
|
||||||
|
use_reentrant: false
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
warmup_steps: 20
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
|
||||||
|
weight_decay: 0.0
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
89
examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
Normal file
89
examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
Normal file
@@ -0,0 +1,89 @@
|
|||||||
|
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
|
||||||
|
model_type: Llama4ForConditionalGeneration
|
||||||
|
processor_type: Llama4Processor
|
||||||
|
# Automatically upload checkpoint and final model to HF
|
||||||
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
|
skip_prepare_dataset: true
|
||||||
|
remove_unused_columns: false
|
||||||
|
sample_packing: false
|
||||||
|
|
||||||
|
sequence_len: 4096
|
||||||
|
|
||||||
|
plugins:
|
||||||
|
- axolotl.integrations.liger.LigerPlugin
|
||||||
|
|
||||||
|
liger_glu_activation: true
|
||||||
|
liger_rms_norm: true
|
||||||
|
liger_layer_norm: true
|
||||||
|
|
||||||
|
llama4_linearized_experts: true # use Axolotl's customized model
|
||||||
|
load_in_4bit: true
|
||||||
|
adapter: qlora
|
||||||
|
lora_r: 32
|
||||||
|
lora_alpha: 64
|
||||||
|
lora_target_modules:
|
||||||
|
- self_attn.q_proj
|
||||||
|
- self_attn.k_proj
|
||||||
|
- self_attn.v_proj
|
||||||
|
- self_attn.o_proj
|
||||||
|
- shared_expert.gate_proj
|
||||||
|
- shared_expert.up_proj
|
||||||
|
- shared_expert.down_proj
|
||||||
|
- vision_adapter.mlp.fc1
|
||||||
|
- vision_adapter.mlp.fc2
|
||||||
|
# - experts.gate_projs.[0-9]+$
|
||||||
|
# - experts.up_projs.[0-9]+$
|
||||||
|
# - experts.down_projs.[0-9]+$
|
||||||
|
lora_modules_to_save:
|
||||||
|
- lm_head
|
||||||
|
- embed_tokens
|
||||||
|
|
||||||
|
chat_template: llama4
|
||||||
|
datasets:
|
||||||
|
- path: HuggingFaceH4/llava-instruct-mix-vsft
|
||||||
|
type: chat_template
|
||||||
|
split: train[:1%]
|
||||||
|
field_messages: messages
|
||||||
|
|
||||||
|
dataset_prepared_path: last_run_prepared
|
||||||
|
val_set_size: 0.0
|
||||||
|
output_dir: ./outputs/out
|
||||||
|
|
||||||
|
gradient_accumulation_steps: 1
|
||||||
|
micro_batch_size: 1
|
||||||
|
num_epochs: 1
|
||||||
|
optimizer: adamw_torch_4bit
|
||||||
|
lr_scheduler: cosine
|
||||||
|
learning_rate: 1e-4
|
||||||
|
|
||||||
|
bf16: true
|
||||||
|
tf32: true
|
||||||
|
|
||||||
|
logging_steps: 1
|
||||||
|
flex_attention: true
|
||||||
|
flex_attn_compile_kwargs:
|
||||||
|
dynamic: false
|
||||||
|
mode: max-autotune-no-cudagraphs
|
||||||
|
|
||||||
|
warmup_steps: 10
|
||||||
|
evals_per_epoch: 1
|
||||||
|
saves_per_epoch: 1
|
||||||
|
weight_decay: 0.0
|
||||||
|
fsdp:
|
||||||
|
- auto_wrap
|
||||||
|
- full_shard
|
||||||
|
fsdp_config:
|
||||||
|
fsdp_version: 2
|
||||||
|
fsdp_offload_params: false
|
||||||
|
fsdp_cpu_ram_efficient_loading: true
|
||||||
|
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||||
|
fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
|
||||||
|
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||||
|
fsdp_sharding_strategy: FULL_SHARD
|
||||||
|
fsdp_reshard_after_forward: true
|
||||||
|
fsdp_activation_checkpointing: true
|
||||||
|
special_tokens:
|
||||||
|
pad_token: <|finetune_right_pad_id|>
|
||||||
|
eos_token: <|eot|>
|
||||||
@@ -1,6 +1,5 @@
|
|||||||
base_model: llava-hf/llava-1.5-7b-hf
|
base_model: llava-hf/llava-1.5-7b-hf
|
||||||
processor_type: AutoProcessor
|
processor_type: AutoProcessor
|
||||||
strict: false
|
|
||||||
|
|
||||||
# these 3 lines are needed for now to handle vision chat templates w images
|
# these 3 lines are needed for now to handle vision chat templates w images
|
||||||
skip_prepare_dataset: true
|
skip_prepare_dataset: true
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
tokenizer_config: EleutherAI/gpt-neox-20b
|
tokenizer_config: EleutherAI/gpt-neox-20b
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
unfrozen_parameters:
|
unfrozen_parameters:
|
||||||
- ^lm_head.weight$
|
- ^lm_head.weight$
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: MistralForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: MistralForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -12,7 +12,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: chatml
|
chat_template: chatml
|
||||||
rl: dpo
|
rl: dpo
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
rl: orpo
|
rl: orpo
|
||||||
orpo_alpha: 0.1
|
orpo_alpha: 0.1
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
|
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
|
||||||
processor_type: AutoProcessor
|
processor_type: AutoProcessor
|
||||||
strict: false
|
|
||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
|
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -9,7 +9,6 @@ trust_remote_code: true
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: tatsu-lab/alpaca
|
- path: tatsu-lab/alpaca
|
||||||
|
|||||||
@@ -6,7 +6,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
trust_remote_code: true
|
trust_remote_code: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
unfrozen_parameters:
|
unfrozen_parameters:
|
||||||
- ^lm_head.weight$
|
- ^lm_head.weight$
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: mhenrichsen/alpaca_2k_test
|
- path: mhenrichsen/alpaca_2k_test
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: LlamaForCausalLM
|
|||||||
tokenizer_type: LlamaTokenizer
|
tokenizer_type: LlamaTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: LlamaTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
push_dataset_to_hub:
|
push_dataset_to_hub:
|
||||||
datasets:
|
datasets:
|
||||||
- path: teknium/GPT4-LLM-Cleaned
|
- path: teknium/GPT4-LLM-Cleaned
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
strict: false
|
|
||||||
|
|
||||||
chat_template: phi_3
|
chat_template: phi_3
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: AutoModelForCausalLM
|
|||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: garage-bAInd/Open-Platypus
|
- path: garage-bAInd/Open-Platypus
|
||||||
|
|||||||
@@ -7,7 +7,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
|
|
||||||
load_in_8bit: false
|
load_in_8bit: false
|
||||||
load_in_4bit: true
|
load_in_4bit: true
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: garage-bAInd/Open-Platypus
|
- path: garage-bAInd/Open-Platypus
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ model_type: AutoModelForCausalLM
|
|||||||
tokenizer_type: AutoTokenizer
|
tokenizer_type: AutoTokenizer
|
||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
strict: false
|
|
||||||
|
|
||||||
datasets:
|
datasets:
|
||||||
- path: garage-bAInd/Open-Platypus
|
- path: garage-bAInd/Open-Platypus
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user