Compare commits
33 Commits
v0.10.0
...
telemetry-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
345a159796 | ||
|
|
657bffd85f | ||
|
|
f0dde8e2d5 | ||
|
|
25fa4df70f | ||
|
|
e735f4270b | ||
|
|
035e7a2f4c | ||
|
|
2d36c11264 | ||
|
|
b8ec5bdccf | ||
|
|
249405b46e | ||
|
|
d3be84fec2 | ||
|
|
1c74ab175f | ||
|
|
b2f1fc109a | ||
|
|
5a2a80cc48 | ||
|
|
4033fe74f8 | ||
|
|
e9df4444be | ||
|
|
ffd2985750 | ||
|
|
17310f9acc | ||
|
|
71ae6f9f87 | ||
|
|
9dd1092f8f | ||
|
|
2c2f2647a9 | ||
|
|
98313a6b3f | ||
|
|
8b75205d3b | ||
|
|
ef4990f304 | ||
|
|
db3297b090 | ||
|
|
86ed554bda | ||
|
|
f254d7d5a2 | ||
|
|
d8b0522ea0 | ||
|
|
1edd6b9524 | ||
|
|
66c6fb56cb | ||
|
|
90b39ce112 | ||
|
|
5afab46cc6 | ||
|
|
bd152c6115 | ||
|
|
76336743ff |
8
.github/workflows/base.yml
vendored
8
.github/workflows/base.yml
vendored
@@ -16,7 +16,6 @@ on:
|
|||||||
jobs:
|
jobs:
|
||||||
build-base:
|
build-base:
|
||||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||||
timeout-minutes: 480
|
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
runs-on: ubuntu-latest-m
|
runs-on: ubuntu-latest-m
|
||||||
strategy:
|
strategy:
|
||||||
@@ -48,14 +47,14 @@ jobs:
|
|||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
dockerfile: "Dockerfile-base"
|
||||||
- cuda: "128"
|
- cuda: "128"
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-base"
|
dockerfile: "Dockerfile-base"
|
||||||
- cuda: "128"
|
- cuda: "128"
|
||||||
@@ -107,7 +106,6 @@ jobs:
|
|||||||
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
|
TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
|
||||||
build-base-uv:
|
build-base-uv:
|
||||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||||
timeout-minutes: 480
|
|
||||||
runs-on: ubuntu-latest-m
|
runs-on: ubuntu-latest-m
|
||||||
strategy:
|
strategy:
|
||||||
fail-fast: false
|
fail-fast: false
|
||||||
@@ -124,7 +122,7 @@ jobs:
|
|||||||
cuda_version: 12.8.1
|
cuda_version: 12.8.1
|
||||||
cudnn_version: ""
|
cudnn_version: ""
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
|
||||||
dockerfile: "Dockerfile-uv-base"
|
dockerfile: "Dockerfile-uv-base"
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
8
.github/workflows/main.yml
vendored
8
.github/workflows/main.yml
vendored
@@ -29,12 +29,12 @@ jobs:
|
|||||||
- cuda: 126
|
- cuda: 126
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 128
|
- cuda: 128
|
||||||
cuda_version: 12.8.1
|
cuda_version: 12.8.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
@@ -97,12 +97,12 @@ jobs:
|
|||||||
- cuda: 126
|
- cuda: 126
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 128
|
- cuda: 128
|
||||||
cuda_version: 12.8.1
|
cuda_version: 12.8.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
runs-on: axolotl-gpu-runner
|
runs-on: axolotl-gpu-runner
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
2
.github/workflows/multi-gpu-e2e.yml
vendored
2
.github/workflows/multi-gpu-e2e.yml
vendored
@@ -43,7 +43,7 @@ jobs:
|
|||||||
- cuda: 126
|
- cuda: 126
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
num_gpus: 2
|
num_gpus: 2
|
||||||
nightly_build: "true"
|
nightly_build: "true"
|
||||||
|
|||||||
12
.github/workflows/tests.yml
vendored
12
.github/workflows/tests.yml
vendored
@@ -52,7 +52,7 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python_version: ["3.11"]
|
python_version: ["3.11"]
|
||||||
pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
|
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
|
||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
@@ -125,7 +125,7 @@ jobs:
|
|||||||
fail-fast: false
|
fail-fast: false
|
||||||
matrix:
|
matrix:
|
||||||
python_version: ["3.11"]
|
python_version: ["3.11"]
|
||||||
pytorch_version: ["2.5.1", "2.6.0", "2.7.1"]
|
pytorch_version: ["2.5.1", "2.6.0", "2.7.0"]
|
||||||
timeout-minutes: 20
|
timeout-minutes: 20
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
@@ -188,7 +188,7 @@ jobs:
|
|||||||
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' }}
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
runs-on: [self-hosted, modal]
|
runs-on: [self-hosted, modal]
|
||||||
timeout-minutes: 120
|
timeout-minutes: 90
|
||||||
needs: [pre-commit, pytest, pytest-sdist]
|
needs: [pre-commit, pytest, pytest-sdist]
|
||||||
|
|
||||||
strategy:
|
strategy:
|
||||||
@@ -238,7 +238,7 @@ jobs:
|
|||||||
if: github.repository_owner == 'axolotl-ai-cloud'
|
if: github.repository_owner == 'axolotl-ai-cloud'
|
||||||
# this job needs to be run on self-hosted GPU runners...
|
# this job needs to be run on self-hosted GPU runners...
|
||||||
runs-on: [self-hosted, modal]
|
runs-on: [self-hosted, modal]
|
||||||
timeout-minutes: 120
|
timeout-minutes: 90
|
||||||
# Only run the remainder of the matrix if the first e2e check passed;
|
# Only run the remainder of the matrix if the first e2e check passed;
|
||||||
# this is to save on wasted compute costs for known failures that get caught in the first run
|
# this is to save on wasted compute costs for known failures that get caught in the first run
|
||||||
needs: [pre-commit, pytest, docker-e2e-tests-1st]
|
needs: [pre-commit, pytest, docker-e2e-tests-1st]
|
||||||
@@ -262,13 +262,13 @@ jobs:
|
|||||||
- cuda: 126
|
- cuda: 126
|
||||||
cuda_version: 12.6.3
|
cuda_version: 12.6.3
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
num_gpus: 1
|
num_gpus: 1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
- cuda: 128
|
- cuda: 128
|
||||||
cuda_version: 12.8.1
|
cuda_version: 12.8.1
|
||||||
python_version: "3.11"
|
python_version: "3.11"
|
||||||
pytorch: 2.7.1
|
pytorch: 2.7.0
|
||||||
num_gpus: 1
|
num_gpus: 1
|
||||||
axolotl_extras:
|
axolotl_extras:
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
81
README.md
81
README.md
@@ -22,32 +22,28 @@
|
|||||||
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
|
<img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
|
||||||
</p>
|
</p>
|
||||||
|
|
||||||
|
|
||||||
## 🎉 Latest Updates
|
|
||||||
|
|
||||||
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/magistral) to start training your own Magistral models with Axolotl!
|
|
||||||
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
|
|
||||||
- 2025/04: Llama 4 support has been added in Axolotl. See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-4) to start training your own Llama 4 models with Axolotl's linearized version!
|
|
||||||
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
|
|
||||||
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
|
|
||||||
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
|
|
||||||
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
|
|
||||||
- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).
|
|
||||||
|
|
||||||
## ✨ Overview
|
|
||||||
|
|
||||||
Axolotl is a tool designed to streamline post-training for various AI models.
|
Axolotl is a tool designed to streamline post-training for various AI models.
|
||||||
|
Post-training refers to any modifications or additional training performed on
|
||||||
|
pre-trained models - including full model fine-tuning, parameter-efficient tuning (like
|
||||||
|
LoRA and QLoRA), supervised fine-tuning (SFT), instruction tuning, and alignment
|
||||||
|
techniques. With support for multiple model architectures and training configurations,
|
||||||
|
Axolotl makes it easy to get started with these techniques.
|
||||||
|
|
||||||
|
Axolotl is designed to work with YAML config files that contain everything you need to
|
||||||
|
preprocess a dataset, train or fine-tune a model, run model inference or evaluation,
|
||||||
|
and much more.
|
||||||
|
|
||||||
Features:
|
Features:
|
||||||
|
|
||||||
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more. We are compatible with HuggingFace transformers causal language models.
|
- Train various Huggingface models such as llama, pythia, falcon, mpt
|
||||||
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO), Multimodal, and Reward Modelling (RM) / Process Reward Modelling (PRM).
|
- Supports fullfinetune, lora, qlora, relora, and gptq
|
||||||
- **Easy Configuration**: Re-use a single YAML file between dataset preprocess, training, evaluation, quantization, and inference.
|
- Customize configurations using a simple yaml file or CLI overwrite
|
||||||
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention](https://github.com/Dao-AILab/flash-attention), [Xformers](https://github.com/facebookresearch/xformers), [Flex Attention](https://pytorch.org/blog/flexattention/), [Liger Kernel](https://github.com/linkedin/Liger-Kernel), [Cut Cross Entropy](https://github.com/apple/ml-cross-entropy/tree/main), Sequence Parallelism (SP), LoRA optimizations, Multi-GPU training (FSDP1, FSDP2, DeepSpeed), Multi-node training (Torchrun, Ray), and many more!
|
- Load different dataset formats, use custom formats, or bring your own tokenized datasets
|
||||||
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
|
- Integrated with [xformers](https://github.com/facebookresearch/xformers), flash attention, [liger kernel](https://github.com/linkedin/Liger-Kernel), rope scaling, and multipacking
|
||||||
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.
|
- Works with single GPU or multiple GPUs via FSDP or Deepspeed
|
||||||
|
- Easily run with Docker locally or on the cloud
|
||||||
|
- Log results and optionally checkpoints to wandb, mlflow or Comet
|
||||||
|
- And more!
|
||||||
|
|
||||||
## 🚀 Quick Start
|
## 🚀 Quick Start
|
||||||
|
|
||||||
@@ -85,12 +81,19 @@ axolotl train examples/llama-3/lora-1b.yml
|
|||||||
|
|
||||||
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
|
That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.
|
||||||
|
|
||||||
|
## ✨ Key Features
|
||||||
|
|
||||||
|
- **Multiple Model Support**: Train various models like LLaMA, Mistral, Mixtral, Pythia, and more
|
||||||
|
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, and more
|
||||||
|
- **Easy Configuration**: Simple YAML files to control your training setup
|
||||||
|
- **Performance Optimizations**: Flash Attention, xformers, multi-GPU training
|
||||||
|
- **Flexible Dataset Handling**: Use various formats and custom datasets
|
||||||
|
- **Cloud Ready**: Run on cloud platforms or local hardware
|
||||||
|
|
||||||
## 📚 Documentation
|
## 📚 Documentation
|
||||||
|
|
||||||
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
|
- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
|
||||||
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
|
- [Configuration Guide](https://docs.axolotl.ai/docs/config.html) - Full configuration options and examples
|
||||||
- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
|
|
||||||
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
|
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
|
||||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
||||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
||||||
@@ -109,6 +112,38 @@ That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/ge
|
|||||||
|
|
||||||
Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
|
Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.
|
||||||
|
|
||||||
|
## 📈 Telemetry
|
||||||
|
|
||||||
|
Axolotl has opt-in telemetry that helps us understand how the project is being used
|
||||||
|
and prioritize improvements. We collect basic system information, model types, and
|
||||||
|
error rates—never personal data or file paths. Telemetry is disabled by default. To
|
||||||
|
enable it, set AXOLOTL_DO_NOT_TRACK=0. For more details, see our [telemetry documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html).
|
||||||
|
|
||||||
|
## Supported Models
|
||||||
|
|
||||||
|
| | fp16/fp32 | lora | qlora | gptq | gptq w/flash attn | flash attn | xformers attn |
|
||||||
|
|-------------|:----------|:-----|-------|------|-------------------|------------|--------------|
|
||||||
|
| llama | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
|
| Mistral | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
|
| Mixtral-MoE | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
|
||||||
|
| Mixtral8X22 | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
|
||||||
|
| Pythia | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||||
|
| cerebras | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||||
|
| btlm | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||||
|
| mpt | ✅ | ❌ | ❓ | ❌ | ❌ | ❌ | ❓ |
|
||||||
|
| falcon | ✅ | ✅ | ✅ | ❌ | ❌ | ❌ | ❓ |
|
||||||
|
| gpt-j | ✅ | ✅ | ✅ | ❌ | ❌ | ❓ | ❓ |
|
||||||
|
| XGen | ✅ | ❓ | ✅ | ❓ | ❓ | ❓ | ✅ |
|
||||||
|
| phi | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
|
||||||
|
| RWKV | ✅ | ❓ | ❓ | ❓ | ❓ | ❓ | ❓ |
|
||||||
|
| Qwen | ✅ | ✅ | ✅ | ❓ | ❓ | ❓ | ❓ |
|
||||||
|
| Gemma | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
|
||||||
|
| Jamba | ✅ | ✅ | ✅ | ❓ | ❓ | ✅ | ❓ |
|
||||||
|
|
||||||
|
✅: supported
|
||||||
|
❌: not supported
|
||||||
|
❓: untested
|
||||||
|
|
||||||
## ❤️ Sponsors
|
## ❤️ Sponsors
|
||||||
|
|
||||||
Thank you to our sponsors who help make Axolotl possible:
|
Thank you to our sponsors who help make Axolotl possible:
|
||||||
|
|||||||
@@ -236,6 +236,7 @@ website:
|
|||||||
- docs/inference.qmd
|
- docs/inference.qmd
|
||||||
- docs/cli.qmd
|
- docs/cli.qmd
|
||||||
- docs/config.qmd
|
- docs/config.qmd
|
||||||
|
- docs/telemetry.qmd
|
||||||
- text: "API Reference"
|
- text: "API Reference"
|
||||||
href: docs/api
|
href: docs/api
|
||||||
|
|
||||||
|
|||||||
@@ -1,31 +0,0 @@
|
|||||||
{
|
|
||||||
"compile": {
|
|
||||||
"disable": false,
|
|
||||||
"backend": "inductor"
|
|
||||||
},
|
|
||||||
"zero_optimization": {
|
|
||||||
"stage": 2,
|
|
||||||
"offload_optimizer": {
|
|
||||||
"device": "cpu"
|
|
||||||
},
|
|
||||||
"contiguous_gradients": true,
|
|
||||||
"overlap_comm": true
|
|
||||||
},
|
|
||||||
"bf16": {
|
|
||||||
"enabled": "auto"
|
|
||||||
},
|
|
||||||
"fp16": {
|
|
||||||
"enabled": "auto",
|
|
||||||
"auto_cast": false,
|
|
||||||
"loss_scale": 0,
|
|
||||||
"initial_scale_power": 32,
|
|
||||||
"loss_scale_window": 1000,
|
|
||||||
"hysteresis": 2,
|
|
||||||
"min_loss_scale": 1
|
|
||||||
},
|
|
||||||
"gradient_accumulation_steps": "auto",
|
|
||||||
"gradient_clipping": "auto",
|
|
||||||
"train_batch_size": "auto",
|
|
||||||
"train_micro_batch_size_per_gpu": "auto",
|
|
||||||
"wall_clock_breakdown": false
|
|
||||||
}
|
|
||||||
@@ -38,6 +38,6 @@ RUN git lfs install --skip-repo && \
|
|||||||
# The base image ships with `pydantic==1.8.2` which is not working
|
# The base image ships with `pydantic==1.8.2` which is not working
|
||||||
pip3 install -U --no-cache-dir pydantic==1.10.10
|
pip3 install -U --no-cache-dir pydantic==1.10.10
|
||||||
|
|
||||||
RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
|
RUN if [ "$PYTORCH_VERSION" = "2.7.0" ] ; then \
|
||||||
pip3 install flash-attn==2.7.4.post1; \
|
pip3 install flash-attn==2.7.4.post1; \
|
||||||
fi
|
fi
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
|
|||||||
WORKDIR /workspace
|
WORKDIR /workspace
|
||||||
|
|
||||||
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
|
||||||
python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
|
python3 -m pip install --no-cache-dir -U torch==2.7.0 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
|
||||||
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
|
||||||
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"
|
||||||
|
|
||||||
|
|||||||
@@ -29,12 +29,8 @@ RUN uv venv --no-project --relocatable axolotl-venv
|
|||||||
|
|
||||||
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
ENV PATH="/workspace/axolotl-venv/bin:${PATH}"
|
||||||
|
|
||||||
RUN uv pip install packaging setuptools wheel psutil \
|
RUN uv pip install packaging setuptools wheel \
|
||||||
&& uv pip install torch==${PYTORCH_VERSION} \
|
&& uv pip install torch==${PYTORCH_VERSION} \
|
||||||
&& uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
|
&& uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" \
|
||||||
&& uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
|
&& uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" \
|
||||||
&& uv pip install awscli pydantic
|
&& uv pip install awscli pydantic
|
||||||
|
|
||||||
RUN if [ "$PYTORCH_VERSION" = "2.7.1" ] ; then \
|
|
||||||
uv pip install --no-build-isolation flash-attn==2.7.4.post1; \
|
|
||||||
fi
|
|
||||||
|
|||||||
@@ -27,8 +27,6 @@ trust_remote_code:
|
|||||||
tokenizer_use_fast:
|
tokenizer_use_fast:
|
||||||
# Whether to use the legacy tokenizer setting, defaults to True
|
# Whether to use the legacy tokenizer setting, defaults to True
|
||||||
tokenizer_legacy:
|
tokenizer_legacy:
|
||||||
# Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer.
|
|
||||||
tokenizer_use_mistral_common:
|
|
||||||
# Resize the model embeddings when new tokens are added to multiples of 32
|
# Resize the model embeddings when new tokens are added to multiples of 32
|
||||||
# This is reported to improve training speed on some models
|
# This is reported to improve training speed on some models
|
||||||
resize_token_embeddings_to_32x:
|
resize_token_embeddings_to_32x:
|
||||||
@@ -175,10 +173,6 @@ datasets:
|
|||||||
# Key containing the messages (default: "messages")
|
# Key containing the messages (default: "messages")
|
||||||
field_messages: messages
|
field_messages: messages
|
||||||
|
|
||||||
# Key containing the tools (default: "tools")
|
|
||||||
# Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
|
|
||||||
field_tools: tools
|
|
||||||
|
|
||||||
# Key containing the system message (default: "system")
|
# Key containing the system message (default: "system")
|
||||||
# If the system message is not present in the dataset sample, it will be loaded from the field_system property.
|
# If the system message is not present in the dataset sample, it will be loaded from the field_system property.
|
||||||
field_system: system
|
field_system: system
|
||||||
|
|||||||
@@ -52,9 +52,7 @@ We recommend checking the below examples for other usecases.
|
|||||||
|
|
||||||
### Examples
|
### Examples
|
||||||
|
|
||||||
#### Training on last message
|
1. (Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
|
||||||
|
|
||||||
(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
datasets:
|
datasets:
|
||||||
@@ -68,9 +66,7 @@ datasets:
|
|||||||
If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
|
If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
#### Overriding default chat template
|
2. Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
|
||||||
|
|
||||||
Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
chat_template: gemma # this overwrites the tokenizer's chat_template
|
chat_template: gemma # this overwrites the tokenizer's chat_template
|
||||||
@@ -80,13 +76,7 @@ datasets:
|
|||||||
roles_to_train: ["assistant"] # default value
|
roles_to_train: ["assistant"] # default value
|
||||||
```
|
```
|
||||||
|
|
||||||
::: {.callout-note}
|
3. Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
|
||||||
If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
|
|
||||||
:::
|
|
||||||
|
|
||||||
#### Using default chat template with fallback
|
|
||||||
|
|
||||||
Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
|
chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
|
||||||
@@ -95,9 +85,7 @@ datasets:
|
|||||||
type: chat_template
|
type: chat_template
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Custom Jinja template
|
4. Using a custom jinja template on OpenAI messages format, training on all assistant messages.
|
||||||
|
|
||||||
Using a custom jinja template on OpenAI messages format, training on all assistant messages.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
|
# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
|
||||||
@@ -112,9 +100,7 @@ datasets:
|
|||||||
Please make sure that your `tokenizer.eos_token` is the same as the EOS (End-of-Sequence) token in the template. Otherwise, set `eos_token` under `special_tokens: `.
|
Please make sure that your `tokenizer.eos_token` is the same as the EOS (End-of-Sequence) token in the template. Otherwise, set `eos_token` under `special_tokens: `.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
#### Using template with different token for EOT and EOS
|
5. If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
|
||||||
|
|
||||||
- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
eot_tokens:
|
eot_tokens:
|
||||||
@@ -139,7 +125,7 @@ Using `eot_tokens` requires each token that exists in `chat_template` to be a si
|
|||||||
You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
|
You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config.qmd) for more details.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
|
6. Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
eot_tokens:
|
eot_tokens:
|
||||||
@@ -159,73 +145,7 @@ If EOS token only appears at the end of a prompt, `train_on_eos: last` is equiva
|
|||||||
:::
|
:::
|
||||||
|
|
||||||
|
|
||||||
#### Using tool use
|
7. (Advanced) Using fine-grained control over tokens and turns to train in a conversation
|
||||||
|
|
||||||
Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it.
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"tools": [
|
|
||||||
{
|
|
||||||
"type": "...",
|
|
||||||
"function": {
|
|
||||||
"name": "...",
|
|
||||||
"description": "...",
|
|
||||||
"parameters": {
|
|
||||||
"type": "...",
|
|
||||||
"properties": {
|
|
||||||
// ...
|
|
||||||
},
|
|
||||||
"required": ["..."],
|
|
||||||
},
|
|
||||||
},
|
|
||||||
},
|
|
||||||
],
|
|
||||||
"messages": [
|
|
||||||
// ...
|
|
||||||
{
|
|
||||||
"role": "assistant", // call the function via assistant
|
|
||||||
"tool_calls": [
|
|
||||||
{
|
|
||||||
"type": "function",
|
|
||||||
"function": {
|
|
||||||
"name": "...",
|
|
||||||
"arguments": {
|
|
||||||
"...": "...",
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"role": "tool",
|
|
||||||
"name": "...",
|
|
||||||
"content": "..."
|
|
||||||
},
|
|
||||||
],
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-note}
|
|
||||||
Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
|
|
||||||
:::
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
chat_template: llama4
|
|
||||||
datasets:
|
|
||||||
- path: ...
|
|
||||||
type: chat_template
|
|
||||||
# field_tools: tools # default is `tools`
|
|
||||||
```
|
|
||||||
|
|
||||||
::: {.callout-tip}
|
|
||||||
Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
|
|
||||||
:::
|
|
||||||
|
|
||||||
|
|
||||||
#### Using fine-grained control over token masking
|
|
||||||
|
|
||||||
(Advanced) Using fine-grained control over tokens and turns to train in a conversation
|
|
||||||
|
|
||||||
For a data sample that looks like:
|
For a data sample that looks like:
|
||||||
|
|
||||||
@@ -276,9 +196,7 @@ datasets:
|
|||||||
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
|
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
#### Reasoning split
|
8. (For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
|
||||||
|
|
||||||
(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
datasets:
|
datasets:
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ format:
|
|||||||
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).
|
||||||
|
|
||||||
::: {.callout-important}
|
::: {.callout-important}
|
||||||
For Blackwell GPUs, please use the tags with Pytorch 2.7.1 and CUDA 12.8.
|
For Blackwell GPUs, please use the tags with Pytorch 2.7.0 and CUDA 12.8.
|
||||||
:::
|
:::
|
||||||
|
|
||||||
## Base
|
## Base
|
||||||
@@ -32,8 +32,8 @@ main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
|
|||||||
|
|
||||||
Tags examples:
|
Tags examples:
|
||||||
|
|
||||||
- `main-base-py3.11-cu128-2.7.1`
|
- `main-base-py3.11-cu128-2.7.0`
|
||||||
- `main-base-py3.11-cu126-2.7.1`
|
- `main-base-py3.11-cu126-2.7.0`
|
||||||
- `main-base-py3.11-cu124-2.6.0`
|
- `main-base-py3.11-cu124-2.6.0`
|
||||||
- `main-base-py3.11-cu124-2.5.1`
|
- `main-base-py3.11-cu124-2.5.1`
|
||||||
|
|
||||||
|
|||||||
@@ -29,4 +29,4 @@ qat:
|
|||||||
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
|
fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
|
||||||
```
|
```
|
||||||
|
|
||||||
Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.
|
Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize` command](./quantize.md) to do this.
|
||||||
|
|||||||
@@ -500,7 +500,7 @@ The input format is a simple JSON input with customizable fields based on the ab
|
|||||||
### GRPO
|
### GRPO
|
||||||
|
|
||||||
::: {.callout-tip}
|
::: {.callout-tip}
|
||||||
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
|
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/axolotl-cookbook/tree/main/grpo#training-an-r1-style-large-language-model-using-grpo).
|
||||||
:::
|
:::
|
||||||
|
|
||||||
In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
|
In the latest GRPO implementation, `vLLM` is used to significantly speed up trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:
|
||||||
|
|||||||
59
docs/telemetry.qmd
Normal file
59
docs/telemetry.qmd
Normal file
@@ -0,0 +1,59 @@
|
|||||||
|
---
|
||||||
|
title: Telemetry
|
||||||
|
description: A description of the opt-in telemetry implementation in Axolotl.
|
||||||
|
---
|
||||||
|
|
||||||
|
# Telemetry in Axolotl
|
||||||
|
|
||||||
|
Axolotl implements anonymous telemetry to help maintainers understand how the library
|
||||||
|
is used and where users encounter issues. This data helps prioritize features, optimize
|
||||||
|
performance, and fix bugs.
|
||||||
|
|
||||||
|
## Data Collection
|
||||||
|
|
||||||
|
We collect:
|
||||||
|
|
||||||
|
- System info: OS, Python version, Axolotl version, PyTorch version, Transformers
|
||||||
|
version, etc.
|
||||||
|
- Hardware info: CPU count, memory, GPU count and models
|
||||||
|
- Runtime metrics: Training progress, memory usage, timing information
|
||||||
|
- Usage patterns: Models (from a whitelist) and configurations used
|
||||||
|
- Error tracking: Stack traces and error messages (sanitized to remove personal
|
||||||
|
information)
|
||||||
|
|
||||||
|
Personally identifiable information (PII) is not collected.
|
||||||
|
|
||||||
|
## Implementation
|
||||||
|
|
||||||
|
Telemetry is implemented using PostHog and consists of:
|
||||||
|
|
||||||
|
- `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the
|
||||||
|
telemetry system and provides methods for tracking events.
|
||||||
|
- `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
|
||||||
|
sends sanitized stack traces.
|
||||||
|
- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
|
||||||
|
runtime metrics during training.
|
||||||
|
- `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
|
||||||
|
runtime metrics telemetry.
|
||||||
|
|
||||||
|
The telemetry system will block training startup for 15 seconds to ensure users are
|
||||||
|
aware of data collection, unless telemetry is explicitly enabled or disabled.
|
||||||
|
|
||||||
|
## Opt-In Mechanism
|
||||||
|
|
||||||
|
Telemetry is opt-in and **disabled by default**. To enable it, set `AXOLOTL_DO_NOT_TRACK=0`.
|
||||||
|
|
||||||
|
To remove the telemetry warning message that is displayed at startup (of `train`, etc.),
|
||||||
|
explicitly set: `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or `AXOLOTL_DO_NOT_TRACK=1`
|
||||||
|
(explicitly disable telemetry).
|
||||||
|
|
||||||
|
**Note**: Telemetry will move to an opt-out model in a later release.
|
||||||
|
|
||||||
|
## Privacy
|
||||||
|
|
||||||
|
- All path-like config information is automatically redacted from telemetry data
|
||||||
|
- Model information is only collected for whitelisted organizations
|
||||||
|
- See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations
|
||||||
|
- Each run generates a unique anonymous ID
|
||||||
|
- This allows us to link different telemetry events within the same training run
|
||||||
|
- Telemetry is only sent from the main process to avoid duplicate events
|
||||||
@@ -5,10 +5,6 @@ tokenizer_type: AutoTokenizer
|
|||||||
# Automatically upload checkpoint and final model to HF
|
# Automatically upload checkpoint and final model to HF
|
||||||
# hub_model_id: username/custom_model_name
|
# hub_model_id: username/custom_model_name
|
||||||
|
|
||||||
special_tokens:
|
|
||||||
pad_token: <|finetune_right_pad_id|>
|
|
||||||
eos_token: <|eot_id|>
|
|
||||||
|
|
||||||
load_in_8bit: true
|
load_in_8bit: true
|
||||||
load_in_4bit: false
|
load_in_4bit: false
|
||||||
|
|
||||||
|
|||||||
@@ -1,71 +0,0 @@
|
|||||||
# Finetune Magistral Small with Axolotl
|
|
||||||
|
|
||||||
Magistral Small is a 24B parameter opensource model from MistralAI found on [HuggingFace](https://huggingface.co/mistralai/Magistral-Small-2506). This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.
|
|
||||||
|
|
||||||
MistralAI has also released a proprietary medium-sized version called Magistral Medium.
|
|
||||||
|
|
||||||
Thanks to the team at MistralAI for giving us early access to prepare for this release.
|
|
||||||
|
|
||||||
## Getting started
|
|
||||||
|
|
||||||
1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Magistral is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).
|
|
||||||
|
|
||||||
Here is an example of how to install from main for pip:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Ensure you have Pytorch installed (Pytorch 2.6.0 recommended)
|
|
||||||
git clone https://github.com/axolotl-ai-cloud/axolotl.git
|
|
||||||
cd axolotl
|
|
||||||
|
|
||||||
pip3 install packaging==23.2 setuptools==75.8.0 wheel ninja
|
|
||||||
pip3 install --no-build-isolation -e '.[flash-attn,mistral]'
|
|
||||||
```
|
|
||||||
|
|
||||||
2. Download the example config:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
axolotl fetch examples
|
|
||||||
```
|
|
||||||
|
|
||||||
3. Run the finetuning example:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
axolotl train examples/magistral/magistral-small-qlora.yaml
|
|
||||||
```
|
|
||||||
|
|
||||||
This config uses about 24GB VRAM.
|
|
||||||
|
|
||||||
Let us know how it goes. Happy finetuning! 🚀
|
|
||||||
|
|
||||||
### TIPS
|
|
||||||
|
|
||||||
- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
|
|
||||||
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
|
|
||||||
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
|
|
||||||
- The dataset format is the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
|
|
||||||
|
|
||||||
## Optimization Guides
|
|
||||||
|
|
||||||
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
|
|
||||||
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
|
|
||||||
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
|
|
||||||
|
|
||||||
## Limitations
|
|
||||||
|
|
||||||
We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.
|
|
||||||
|
|
||||||
The tokenizer does not work with `dataset.map` with multiprocessing, so we had to disable it. In addition, we do not support overriding tokens yet.
|
|
||||||
|
|
||||||
## Related Resources
|
|
||||||
|
|
||||||
- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
|
|
||||||
- [Axolotl Docs](https://docs.axolotl.ai)
|
|
||||||
- [Axolotl Website](https://axolotl.ai)
|
|
||||||
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
|
|
||||||
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)
|
|
||||||
|
|
||||||
|
|
||||||
## Future Work
|
|
||||||
|
|
||||||
- Add parity to Preference Tuning, RL, Multi-modal, etc.
|
|
||||||
- Add parity to other tokenizer configs like overriding tokens.
|
|
||||||
@@ -1,72 +0,0 @@
|
|||||||
base_model: mistralai/Magistral-Small-2506
|
|
||||||
|
|
||||||
# Enable to use mistral-common tokenizer
|
|
||||||
tokenizer_use_mistral_common: true
|
|
||||||
|
|
||||||
# Automatically upload checkpoint and final model to HF
|
|
||||||
# hub_model_id: username/custom_model_name
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: fozziethebeat/alpaca_messages_2k_test
|
|
||||||
type: chat_template
|
|
||||||
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.1
|
|
||||||
output_dir: ./outputs/lora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 2048
|
|
||||||
sample_packing: true
|
|
||||||
eval_sample_packing: false
|
|
||||||
pad_to_sequence_len: true
|
|
||||||
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_target_modules:
|
|
||||||
- gate_proj
|
|
||||||
- down_proj
|
|
||||||
- up_proj
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
- k_proj
|
|
||||||
- o_proj
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_name:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_torch_fused
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
bf16: auto
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing:
|
|
||||||
resume_from_checkpoint:
|
|
||||||
logging_steps: 1
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
evals_per_epoch: 1
|
|
||||||
saves_per_epoch: 1
|
|
||||||
|
|
||||||
fsdp:
|
|
||||||
- full_shard
|
|
||||||
- auto_wrap
|
|
||||||
fsdp_config:
|
|
||||||
fsdp_state_dict_type: FULL_STATE_DICT
|
|
||||||
fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
|
|
||||||
fsdp_activation_checkpointing: true
|
|
||||||
@@ -1,63 +0,0 @@
|
|||||||
base_model: mistralai/Magistral-Small-2506
|
|
||||||
|
|
||||||
# Enable to use mistral-common tokenizer
|
|
||||||
tokenizer_use_mistral_common: true
|
|
||||||
|
|
||||||
# Automatically upload checkpoint and final model to HF
|
|
||||||
# hub_model_id: username/custom_model_name
|
|
||||||
|
|
||||||
load_in_8bit: false
|
|
||||||
load_in_4bit: true
|
|
||||||
|
|
||||||
datasets:
|
|
||||||
- path: fozziethebeat/alpaca_messages_2k_test
|
|
||||||
type: chat_template
|
|
||||||
|
|
||||||
dataset_prepared_path: last_run_prepared
|
|
||||||
val_set_size: 0.1
|
|
||||||
output_dir: ./outputs/lora-out
|
|
||||||
|
|
||||||
adapter: qlora
|
|
||||||
lora_model_dir:
|
|
||||||
|
|
||||||
sequence_len: 2048
|
|
||||||
sample_packing: true
|
|
||||||
pad_to_sequence_len: true
|
|
||||||
|
|
||||||
lora_r: 32
|
|
||||||
lora_alpha: 16
|
|
||||||
lora_dropout: 0.05
|
|
||||||
lora_target_linear: true
|
|
||||||
lora_target_modules:
|
|
||||||
- gate_proj
|
|
||||||
- down_proj
|
|
||||||
- up_proj
|
|
||||||
- q_proj
|
|
||||||
- v_proj
|
|
||||||
- k_proj
|
|
||||||
- o_proj
|
|
||||||
|
|
||||||
wandb_project:
|
|
||||||
wandb_entity:
|
|
||||||
wandb_watch:
|
|
||||||
wandb_name:
|
|
||||||
wandb_log_model:
|
|
||||||
|
|
||||||
gradient_accumulation_steps: 4
|
|
||||||
micro_batch_size: 2
|
|
||||||
num_epochs: 1
|
|
||||||
optimizer: adamw_bnb_8bit
|
|
||||||
lr_scheduler: cosine
|
|
||||||
learning_rate: 0.0002
|
|
||||||
|
|
||||||
bf16: auto
|
|
||||||
tf32: false
|
|
||||||
|
|
||||||
gradient_checkpointing: true
|
|
||||||
resume_from_checkpoint:
|
|
||||||
logging_steps: 1
|
|
||||||
flash_attention: true
|
|
||||||
|
|
||||||
warmup_ratio: 0.1
|
|
||||||
evals_per_epoch: 1
|
|
||||||
saves_per_epoch: 1
|
|
||||||
@@ -25,7 +25,7 @@ pad_to_sequence_len: false
|
|||||||
lora_r: 32
|
lora_r: 32
|
||||||
lora_alpha: 16
|
lora_alpha: 16
|
||||||
lora_dropout: 0.05
|
lora_dropout: 0.05
|
||||||
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'
|
||||||
|
|
||||||
wandb_project:
|
wandb_project:
|
||||||
wandb_entity:
|
wandb_entity:
|
||||||
|
|||||||
@@ -68,4 +68,5 @@ schedulefree==1.4.1
|
|||||||
axolotl-contribs-lgpl==0.0.6
|
axolotl-contribs-lgpl==0.0.6
|
||||||
axolotl-contribs-mit==0.0.3
|
axolotl-contribs-mit==0.0.3
|
||||||
|
|
||||||
mistral-common==1.6.0
|
# telemetry
|
||||||
|
posthog>=4.2.0
|
||||||
|
|||||||
@@ -4,4 +4,4 @@ import pkgutil
|
|||||||
|
|
||||||
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
|
__path__ = pkgutil.extend_path(__path__, __name__) # Make this a namespace package
|
||||||
|
|
||||||
__version__ = "0.10.0"
|
__version__ = "0.10.0.dev0"
|
||||||
|
|||||||
@@ -14,6 +14,8 @@ import yaml
|
|||||||
from transformers.utils import is_torch_bf16_gpu_available
|
from transformers.utils import is_torch_bf16_gpu_available
|
||||||
|
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
|
from axolotl.telemetry.manager import TelemetryManager
|
||||||
from axolotl.utils.comet_ import setup_comet_env_vars
|
from axolotl.utils.comet_ import setup_comet_env_vars
|
||||||
from axolotl.utils.config import (
|
from axolotl.utils.config import (
|
||||||
normalize_cfg_datasets,
|
normalize_cfg_datasets,
|
||||||
@@ -28,6 +30,8 @@ from axolotl.utils.wandb_ import setup_wandb_env_vars
|
|||||||
|
|
||||||
LOG = get_logger(__name__, use_environ=True)
|
LOG = get_logger(__name__, use_environ=True)
|
||||||
|
|
||||||
|
TELEMETRY_MANAGER = TelemetryManager.get_instance()
|
||||||
|
|
||||||
|
|
||||||
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
|
def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
|
||||||
"""
|
"""
|
||||||
@@ -159,6 +163,7 @@ def plugin_set_cfg(cfg: DictDefault):
|
|||||||
plugin_manager.cfg = cfg
|
plugin_manager.cfg = cfg
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def load_cfg(
|
def load_cfg(
|
||||||
config: str | Path | DictDefault = Path("examples/"), **kwargs
|
config: str | Path | DictDefault = Path("examples/"), **kwargs
|
||||||
) -> DictDefault:
|
) -> DictDefault:
|
||||||
@@ -192,6 +197,8 @@ def load_cfg(
|
|||||||
temp_file.close()
|
temp_file.close()
|
||||||
cfg.axolotl_config_path = temp_file.name
|
cfg.axolotl_config_path = temp_file.name
|
||||||
|
|
||||||
|
TELEMETRY_MANAGER.send_event(event_type="config-loaded", properties=cfg)
|
||||||
|
|
||||||
# If there are any options passed in the cli, if it is something that seems valid
|
# If there are any options passed in the cli, if it is something that seems valid
|
||||||
# from the yaml, then overwrite the value
|
# from the yaml, then overwrite the value
|
||||||
cfg_keys = cfg.keys()
|
cfg_keys = cfg.keys()
|
||||||
@@ -233,4 +240,6 @@ def load_cfg(
|
|||||||
setup_comet_env_vars(cfg)
|
setup_comet_env_vars(cfg)
|
||||||
plugin_set_cfg(cfg)
|
plugin_set_cfg(cfg)
|
||||||
|
|
||||||
|
TELEMETRY_MANAGER.send_event(event_type="config-processed", properties=cfg)
|
||||||
|
|
||||||
return cfg
|
return cfg
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from axolotl.cli.args import InferenceCliArgs
|
|||||||
from axolotl.cli.art import print_axolotl_text_art
|
from axolotl.cli.art import print_axolotl_text_art
|
||||||
from axolotl.cli.config import load_cfg
|
from axolotl.cli.config import load_cfg
|
||||||
from axolotl.cli.utils import load_model_and_tokenizer
|
from axolotl.cli.utils import load_model_and_tokenizer
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.chat_templates import (
|
from axolotl.utils.chat_templates import (
|
||||||
get_chat_template,
|
get_chat_template,
|
||||||
get_chat_template_from_config,
|
get_chat_template_from_config,
|
||||||
@@ -42,6 +43,7 @@ def get_multi_line_input() -> str:
|
|||||||
return instruction
|
return instruction
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def do_inference(
|
def do_inference(
|
||||||
*,
|
*,
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
@@ -135,6 +137,7 @@ def do_inference(
|
|||||||
print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
|
print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def do_inference_gradio(
|
def do_inference_gradio(
|
||||||
*,
|
*,
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
|
|||||||
@@ -9,12 +9,14 @@ from dotenv import load_dotenv
|
|||||||
from axolotl.cli.art import print_axolotl_text_art
|
from axolotl.cli.art import print_axolotl_text_art
|
||||||
from axolotl.cli.config import load_cfg
|
from axolotl.cli.config import load_cfg
|
||||||
from axolotl.cli.utils import load_model_and_tokenizer
|
from axolotl.cli.utils import load_model_and_tokenizer
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def do_merge_lora(*, cfg: DictDefault) -> None:
|
def do_merge_lora(*, cfg: DictDefault) -> None:
|
||||||
"""
|
"""
|
||||||
Calls `transformers`' `merge_and_unload` on the model given in the `axolotl` config
|
Calls `transformers`' `merge_and_unload` on the model given in the `axolotl` config
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner
|
|||||||
|
|
||||||
from axolotl.cli.art import print_axolotl_text_art
|
from axolotl.cli.art import print_axolotl_text_art
|
||||||
from axolotl.cli.config import load_cfg
|
from axolotl.cli.config import load_cfg
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
@@ -118,6 +119,7 @@ def _distributed_checkpoint_to_merged_weights(
|
|||||||
return save_path_
|
return save_path_
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def merge_fsdp_weights(
|
def merge_fsdp_weights(
|
||||||
checkpoint_dir: str,
|
checkpoint_dir: str,
|
||||||
output_path: str,
|
output_path: str,
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ from axolotl.cli.config import load_cfg
|
|||||||
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
||||||
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
from axolotl.common.datasets import load_datasets, load_preference_datasets
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
from axolotl.utils.trainer import disable_datasets_caching
|
from axolotl.utils.trainer import disable_datasets_caching
|
||||||
@@ -25,6 +26,7 @@ from axolotl.utils.trainer import disable_datasets_caching
|
|||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
|
def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
|
||||||
"""
|
"""
|
||||||
Preprocesses dataset specified in axolotl config.
|
Preprocesses dataset specified in axolotl config.
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
"""Various shared constants"""
|
"""
|
||||||
|
Various shared constants
|
||||||
|
"""
|
||||||
|
|
||||||
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"
|
||||||
|
|||||||
@@ -3,13 +3,16 @@
|
|||||||
import math
|
import math
|
||||||
import random
|
import random
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional, Union
|
||||||
|
|
||||||
from datasets import Dataset
|
from datasets import Dataset
|
||||||
|
|
||||||
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
|
import axolotl.monkeypatch.data.batch_dataset_fetcher # pylint: disable=unused-import # noqa: F401
|
||||||
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
|
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
|
||||||
from axolotl.loaders import load_processor, load_tokenizer
|
from axolotl.loaders import load_processor, load_tokenizer
|
||||||
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
|
from axolotl.telemetry.errors import send_errors
|
||||||
|
from axolotl.utils.data import prepare_dataset
|
||||||
|
from axolotl.utils.data.rl import load_prepare_preference_datasets
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
from axolotl.utils.schemas.enums import RLType
|
from axolotl.utils.schemas.enums import RLType
|
||||||
@@ -28,49 +31,66 @@ class TrainDatasetMeta:
|
|||||||
|
|
||||||
|
|
||||||
def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
|
def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
|
||||||
"""Randomly sample `num_samples` samples with replacement from `dataset`."""
|
"""
|
||||||
|
Randomly sample `num_samples` samples from `dataset`.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
dataset: Dataset.
|
||||||
|
num_samples: Number of samples to return.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Random sample (with replacement) of examples in `dataset`.
|
||||||
|
"""
|
||||||
return dataset.select(
|
return dataset.select(
|
||||||
[random.randrange(0, len(dataset) - 1) for _ in range(num_samples)] # nosec
|
[random.randrange(0, len(dataset) - 1) for _ in range(num_samples)] # nosec
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def load_datasets(
|
def load_datasets(
|
||||||
*,
|
*,
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
|
cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
|
||||||
debug: bool = False,
|
debug: bool = False,
|
||||||
) -> TrainDatasetMeta:
|
) -> TrainDatasetMeta:
|
||||||
"""Loads one or more training or evaluation datasets, calling
|
"""
|
||||||
`axolotl.utils.data.prepare_datasets`. Optionally, logs out debug information.
|
Loads one or more training or evaluation datasets, calling
|
||||||
|
`axolotl.utils.data.prepare_dataset`. Optionally, logs out debug information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
cfg: Dictionary mapping `axolotl` config keys to values.
|
cfg: Dictionary mapping `axolotl` config keys to values.
|
||||||
cli_args: Command-specific CLI arguments.
|
cli_args: Command-specific CLI arguments.
|
||||||
debug: Whether to print out tokenization of sample. This is duplicated in
|
debug: Whether to print out tokenization of sample
|
||||||
`cfg` and `cli_args`, but is kept due to use in our Colab notebooks.
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Dataclass with fields for training and evaluation datasets and the computed
|
Dataclass with fields for training and evaluation datasets and the computed
|
||||||
`total_num_steps`.
|
`total_num_steps`.
|
||||||
"""
|
"""
|
||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
|
processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None
|
||||||
preprocess_iterable = getattr(cli_args, "iterable", False)
|
preprocess_iterable = (
|
||||||
|
cli_args
|
||||||
|
and hasattr(cli_args, "iterable")
|
||||||
|
and cli_args.iterable is not None
|
||||||
|
and cli_args.iterable
|
||||||
|
)
|
||||||
|
|
||||||
train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
|
train_dataset, eval_dataset, total_num_steps, prompters = prepare_dataset(
|
||||||
cfg,
|
cfg,
|
||||||
tokenizer,
|
tokenizer,
|
||||||
processor=processor,
|
processor=processor,
|
||||||
preprocess_iterable=preprocess_iterable,
|
preprocess_iterable=preprocess_iterable,
|
||||||
)
|
)
|
||||||
|
|
||||||
if (
|
if ( # pylint: disable=too-many-boolean-expressions
|
||||||
cfg.debug
|
cli_args
|
||||||
or getattr(cli_args, "debug", False)
|
and (
|
||||||
or getattr(cli_args, "debug_text_only", False)
|
cli_args.debug
|
||||||
or getattr(cli_args, "debug_num_examples", 0) > 0
|
or cfg.debug
|
||||||
or debug
|
or cli_args.debug_text_only
|
||||||
):
|
or int(cli_args.debug_num_examples) > 0
|
||||||
|
)
|
||||||
|
) or debug:
|
||||||
LOG.info("check_dataset_labels...")
|
LOG.info("check_dataset_labels...")
|
||||||
|
|
||||||
num_examples = cli_args.debug_num_examples if cli_args else 1
|
num_examples = cli_args.debug_num_examples if cli_args else 1
|
||||||
@@ -94,11 +114,15 @@ def load_datasets(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def load_preference_datasets(
|
def load_preference_datasets(
|
||||||
*, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None
|
*,
|
||||||
|
cfg: DictDefault,
|
||||||
|
cli_args: Union[PreprocessCliArgs, TrainerCliArgs],
|
||||||
) -> TrainDatasetMeta:
|
) -> TrainDatasetMeta:
|
||||||
"""Loads one or more training or evaluation datasets for RL training using paired
|
"""
|
||||||
preference data, calling `axolotl.utils.data.rl.prepare_preference_datasets`.
|
Loads one or more training or evaluation datasets for RL training using paired
|
||||||
|
preference data, calling `axolotl.utils.data.rl.load_prepare_preference_datasets`.
|
||||||
Optionally, logs out debug information.
|
Optionally, logs out debug information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
@@ -109,28 +133,23 @@ def load_preference_datasets(
|
|||||||
Dataclass with fields for training and evaluation datasets and the computed
|
Dataclass with fields for training and evaluation datasets and the computed
|
||||||
`total_num_steps`.
|
`total_num_steps`.
|
||||||
"""
|
"""
|
||||||
tokenizer = load_tokenizer(cfg)
|
train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
|
||||||
train_dataset, eval_dataset = prepare_preference_datasets(cfg, tokenizer)
|
total_num_steps: Optional[int] = int(
|
||||||
|
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
|
||||||
|
)
|
||||||
|
if cfg.rl is RLType.GRPO:
|
||||||
|
total_num_steps = None
|
||||||
|
|
||||||
total_num_steps: int | None = None
|
if cli_args.debug or cfg.debug:
|
||||||
if cfg.rl is not RLType.GRPO:
|
|
||||||
total_num_steps = int(
|
|
||||||
math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
|
|
||||||
)
|
|
||||||
|
|
||||||
if (cli_args and cli_args.debug) or cfg.debug:
|
|
||||||
LOG.info("check_dataset_labels...")
|
LOG.info("check_dataset_labels...")
|
||||||
|
|
||||||
num_examples = cli_args.debug_num_examples if cli_args else 1
|
|
||||||
text_only = cli_args.debug_text_only if cli_args else False
|
|
||||||
|
|
||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
train_samples = sample_dataset(train_dataset, num_examples)
|
train_samples = sample_dataset(train_dataset, cli_args.debug_num_examples)
|
||||||
check_dataset_labels(
|
check_dataset_labels(
|
||||||
dataset=train_samples,
|
train_samples,
|
||||||
tokenizer=tokenizer,
|
tokenizer,
|
||||||
num_examples=num_examples,
|
num_examples=cli_args.debug_num_examples,
|
||||||
text_only=text_only,
|
text_only=cli_args.debug_text_only,
|
||||||
rl_mode=True,
|
rl_mode=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -31,6 +31,8 @@ from transformers.training_args import OptimizerNames
|
|||||||
|
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
|
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
|
||||||
|
from axolotl.telemetry.callbacks import TelemetryCallback
|
||||||
|
from axolotl.telemetry.manager import TelemetryManager
|
||||||
from axolotl.utils import is_comet_available, is_mlflow_available
|
from axolotl.utils import is_comet_available, is_mlflow_available
|
||||||
from axolotl.utils.callbacks import (
|
from axolotl.utils.callbacks import (
|
||||||
GCCallback,
|
GCCallback,
|
||||||
@@ -145,6 +147,10 @@ class TrainerBuilderBase(abc.ABC):
|
|||||||
|
|
||||||
callbacks.append(GPUStatsCallback(cfg=self.cfg))
|
callbacks.append(GPUStatsCallback(cfg=self.cfg))
|
||||||
|
|
||||||
|
telemetry_manager = TelemetryManager.get_instance()
|
||||||
|
if telemetry_manager.enabled:
|
||||||
|
callbacks.append(TelemetryCallback())
|
||||||
|
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def get_post_trainer_create_callbacks(self, trainer):
|
def get_post_trainer_create_callbacks(self, trainer):
|
||||||
@@ -380,16 +386,14 @@ class TrainerBuilderBase(abc.ABC):
|
|||||||
)
|
)
|
||||||
|
|
||||||
# eval_strategy and eval_steps
|
# eval_strategy and eval_steps
|
||||||
if not self.eval_dataset and self.cfg.val_set_size == 0:
|
if not self.eval_dataset or self.cfg.val_set_size == 0:
|
||||||
# do not eval if no eval_dataset and val_set_size=0
|
# do not eval if no eval_dataset or val_set_size=0
|
||||||
training_args_kwargs["eval_strategy"] = "no"
|
training_args_kwargs["eval_strategy"] = "no"
|
||||||
elif self.cfg.eval_steps:
|
elif self.cfg.eval_steps:
|
||||||
training_args_kwargs["eval_strategy"] = "steps"
|
training_args_kwargs["eval_strategy"] = "steps"
|
||||||
training_args_kwargs["eval_steps"] = self.cfg.eval_steps
|
training_args_kwargs["eval_steps"] = self.cfg.eval_steps
|
||||||
training_args_kwargs["eval_on_start"] = True
|
|
||||||
elif self.cfg.eval_strategy:
|
elif self.cfg.eval_strategy:
|
||||||
training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy
|
training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy
|
||||||
training_args_kwargs["eval_on_start"] = True
|
|
||||||
|
|
||||||
def _configure_reporting(self, training_args_kwargs: dict):
|
def _configure_reporting(self, training_args_kwargs: dict):
|
||||||
report_to = []
|
report_to = []
|
||||||
@@ -492,9 +496,6 @@ class TrainerBuilderBase(abc.ABC):
|
|||||||
training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
|
training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
|
||||||
training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs
|
training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs
|
||||||
|
|
||||||
if self.cfg.dataset_processes:
|
|
||||||
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
|
||||||
|
|
||||||
# max_length is not used in CausalTrainer
|
# max_length is not used in CausalTrainer
|
||||||
if self.cfg.reward_model or self.cfg.rl:
|
if self.cfg.reward_model or self.cfg.rl:
|
||||||
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
training_args_kwargs["max_length"] = self.cfg.sequence_len
|
||||||
|
|||||||
@@ -21,12 +21,18 @@ from axolotl.core.trainers import (
|
|||||||
AxolotlTrainer,
|
AxolotlTrainer,
|
||||||
ReLoRATrainer,
|
ReLoRATrainer,
|
||||||
)
|
)
|
||||||
|
from axolotl.core.training_args import (
|
||||||
|
AxolotlPRMConfig,
|
||||||
|
AxolotlRewardConfig,
|
||||||
|
AxolotlTrainingArguments,
|
||||||
|
)
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
|
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
|
||||||
from axolotl.monkeypatch.relora import ReLoRACallback
|
from axolotl.monkeypatch.relora import ReLoRACallback
|
||||||
from axolotl.processing_strategies import get_processing_strategy
|
from axolotl.processing_strategies import get_processing_strategy
|
||||||
from axolotl.utils import is_comet_available, is_mlflow_available
|
from axolotl.utils import is_comet_available, is_mlflow_available
|
||||||
from axolotl.utils.callbacks import (
|
from axolotl.utils.callbacks import (
|
||||||
|
EvalFirstStepCallback,
|
||||||
LossWatchDogCallback,
|
LossWatchDogCallback,
|
||||||
SaveBetterTransformerModelCallback,
|
SaveBetterTransformerModelCallback,
|
||||||
bench_eval_callback_factory,
|
bench_eval_callback_factory,
|
||||||
@@ -57,6 +63,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
|
|
||||||
def get_callbacks(self):
|
def get_callbacks(self):
|
||||||
callbacks = super().get_callbacks()
|
callbacks = super().get_callbacks()
|
||||||
|
callbacks.append(EvalFirstStepCallback())
|
||||||
|
|
||||||
if self.cfg.relora_steps:
|
if self.cfg.relora_steps:
|
||||||
callbacks.append(ReLoRACallback(self.cfg))
|
callbacks.append(ReLoRACallback(self.cfg))
|
||||||
@@ -123,9 +130,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def _get_trainer_cls(self):
|
def _get_trainer_cls(self):
|
||||||
"""
|
|
||||||
Gets the trainer class for the given configuration.
|
|
||||||
"""
|
|
||||||
if self.cfg.plugins:
|
if self.cfg.plugins:
|
||||||
plugin_manager = PluginManager.get_instance()
|
plugin_manager = PluginManager.get_instance()
|
||||||
trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
|
trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
|
||||||
@@ -142,12 +146,6 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
return AxolotlTrainer
|
return AxolotlTrainer
|
||||||
|
|
||||||
def build(self, total_num_steps):
|
def build(self, total_num_steps):
|
||||||
from axolotl.core.training_args import (
|
|
||||||
AxolotlPRMConfig,
|
|
||||||
AxolotlRewardConfig,
|
|
||||||
AxolotlTrainingArguments,
|
|
||||||
)
|
|
||||||
|
|
||||||
training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
|
training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
|
||||||
total_num_steps
|
total_num_steps
|
||||||
)
|
)
|
||||||
@@ -316,12 +314,20 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
training_arguments_kwargs["image_resize_algorithm"] = (
|
training_arguments_kwargs["image_resize_algorithm"] = (
|
||||||
self.cfg.image_resize_algorithm
|
self.cfg.image_resize_algorithm
|
||||||
)
|
)
|
||||||
|
if self.cfg.kd_ce_alpha is not None:
|
||||||
if self.cfg.plugins:
|
training_arguments_kwargs["kd_ce_alpha"] = self.cfg.kd_ce_alpha
|
||||||
plugin_manager = PluginManager.get_instance()
|
if self.cfg.kd_alpha is not None:
|
||||||
plugin_training_args = plugin_manager.get_training_args(self.cfg)
|
training_arguments_kwargs["kd_alpha"] = self.cfg.kd_alpha
|
||||||
if plugin_training_args:
|
if self.cfg.kd_temperature is not None:
|
||||||
training_arguments_kwargs.update(plugin_training_args)
|
training_arguments_kwargs["kd_temperature"] = self.cfg.kd_temperature
|
||||||
|
if self.cfg.kd_zscore_base_temp is not None:
|
||||||
|
training_arguments_kwargs["kd_zscore_base_temp"] = (
|
||||||
|
self.cfg.kd_zscore_base_temp
|
||||||
|
)
|
||||||
|
if self.cfg.kd_top_k_before_softmax is not None:
|
||||||
|
training_arguments_kwargs["kd_top_k_before_softmax"] = (
|
||||||
|
self.cfg.kd_top_k_before_softmax
|
||||||
|
)
|
||||||
|
|
||||||
if self.cfg.reward_model:
|
if self.cfg.reward_model:
|
||||||
training_args_cls = AxolotlRewardConfig
|
training_args_cls = AxolotlRewardConfig
|
||||||
@@ -375,7 +381,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
elif "tokenizer" in sig.parameters:
|
elif "tokenizer" in sig.parameters:
|
||||||
trainer_kwargs["tokenizer"] = self.tokenizer
|
trainer_kwargs["tokenizer"] = self.tokenizer
|
||||||
if (
|
if (
|
||||||
trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
|
not (trainer_cls in [AxolotlRewardTrainer, AxolotlPRMTrainer])
|
||||||
and self.cfg.datasets is not None
|
and self.cfg.datasets is not None
|
||||||
):
|
):
|
||||||
trainer_kwargs["dataset_tags"] = [
|
trainer_kwargs["dataset_tags"] = [
|
||||||
@@ -402,10 +408,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
return trainer
|
return trainer
|
||||||
|
|
||||||
def build_collator(
|
def build_collator(
|
||||||
self,
|
self, training_args: AxolotlTrainingArguments, is_eval=False, **kwargs
|
||||||
training_args, # type: "AxolotlTrainingArguments" # type: ignore
|
|
||||||
is_eval=False,
|
|
||||||
**kwargs,
|
|
||||||
):
|
):
|
||||||
if training_args.pretraining:
|
if training_args.pretraining:
|
||||||
if (
|
if (
|
||||||
@@ -434,19 +437,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
]
|
]
|
||||||
]
|
]
|
||||||
collator_args = [self.tokenizer]
|
collator_args = [self.tokenizer]
|
||||||
|
if self.cfg.reward_model:
|
||||||
collator_cls_and_kwargs = None
|
|
||||||
if self.cfg.plugins:
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
|
||||||
collator_cls_and_kwargs = plugin_manager.get_collator_cls_and_kwargs(
|
|
||||||
self.cfg, is_eval=is_eval
|
|
||||||
)
|
|
||||||
|
|
||||||
if collator_cls_and_kwargs:
|
|
||||||
collator = collator_cls_and_kwargs[0]
|
|
||||||
if kwargs and isinstance(kwargs, dict):
|
|
||||||
kwargs.update(collator_cls_and_kwargs[1])
|
|
||||||
elif self.cfg.reward_model:
|
|
||||||
collator = RewardDataCollatorWithPadding
|
collator = RewardDataCollatorWithPadding
|
||||||
elif use_batch_sampler_collator:
|
elif use_batch_sampler_collator:
|
||||||
# Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
|
# Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
|
||||||
@@ -477,6 +468,16 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
|
|||||||
collator_args.pop(0)
|
collator_args.pop(0)
|
||||||
kwargs.pop("pad_to_multiple_of", None)
|
kwargs.pop("pad_to_multiple_of", None)
|
||||||
kwargs.pop("padding", None)
|
kwargs.pop("padding", None)
|
||||||
|
elif self.cfg.kd_trainer:
|
||||||
|
from axolotl.integrations.kd.collator import (
|
||||||
|
DataCollatorForKD,
|
||||||
|
KDBatchSamplerDataCollatorForSeq2Seq,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.cfg.sample_packing:
|
||||||
|
collator = KDBatchSamplerDataCollatorForSeq2Seq
|
||||||
|
else:
|
||||||
|
collator = DataCollatorForKD
|
||||||
else:
|
else:
|
||||||
collator = DataCollatorForSeq2Seq
|
collator = DataCollatorForSeq2Seq
|
||||||
|
|
||||||
|
|||||||
@@ -12,9 +12,13 @@ from axolotl.core.trainers import (
|
|||||||
from axolotl.core.trainers.dpo import DPOStrategy
|
from axolotl.core.trainers.dpo import DPOStrategy
|
||||||
from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
|
from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
|
||||||
from axolotl.core.trainers.grpo import GRPOStrategy
|
from axolotl.core.trainers.grpo import GRPOStrategy
|
||||||
|
from axolotl.core.training_args import (
|
||||||
|
AxolotlCPOConfig,
|
||||||
|
AxolotlKTOConfig,
|
||||||
|
AxolotlORPOConfig,
|
||||||
|
)
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.loaders.utils import ensure_dtype
|
from axolotl.loaders.utils import ensure_dtype
|
||||||
from axolotl.utils.callbacks.qat import QATCallback
|
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
from axolotl.utils.schemas.enums import RLType
|
from axolotl.utils.schemas.enums import RLType
|
||||||
|
|
||||||
@@ -27,9 +31,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
def get_callbacks(self):
|
def get_callbacks(self):
|
||||||
callbacks = super().get_callbacks()
|
callbacks = super().get_callbacks()
|
||||||
|
|
||||||
if self.cfg.qat:
|
|
||||||
callbacks.append(QATCallback(self.cfg.qat))
|
|
||||||
|
|
||||||
return callbacks
|
return callbacks
|
||||||
|
|
||||||
def get_post_trainer_create_callbacks(self, trainer):
|
def get_post_trainer_create_callbacks(self, trainer):
|
||||||
@@ -78,12 +79,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
"""
|
"""
|
||||||
Returns training_args and trainer_kwargs
|
Returns training_args and trainer_kwargs
|
||||||
"""
|
"""
|
||||||
from axolotl.core.training_args import (
|
|
||||||
AxolotlCPOConfig,
|
|
||||||
AxolotlKTOConfig,
|
|
||||||
AxolotlORPOConfig,
|
|
||||||
)
|
|
||||||
|
|
||||||
training_args_kwargs, trainer_kwargs = self._set_base_training_args(
|
training_args_kwargs, trainer_kwargs = self._set_base_training_args(
|
||||||
total_num_steps=total_num_steps
|
total_num_steps=total_num_steps
|
||||||
)
|
)
|
||||||
@@ -95,6 +90,10 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
else:
|
else:
|
||||||
training_args_kwargs["remove_unused_columns"] = False
|
training_args_kwargs["remove_unused_columns"] = False
|
||||||
|
|
||||||
|
# only rlhf
|
||||||
|
if self.cfg.dataset_processes:
|
||||||
|
training_args_kwargs["dataset_num_proc"] = self.cfg.dataset_processes
|
||||||
|
|
||||||
if self.cfg.trl and self.cfg.trl.beta is not None:
|
if self.cfg.trl and self.cfg.trl.beta is not None:
|
||||||
training_args_kwargs["beta"] = self.cfg.trl.beta
|
training_args_kwargs["beta"] = self.cfg.trl.beta
|
||||||
elif self.cfg.rl_beta is not None:
|
elif self.cfg.rl_beta is not None:
|
||||||
@@ -143,7 +142,22 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
|
|
||||||
elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
|
elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
|
||||||
training_args_cls = AxolotlDPOConfig
|
training_args_cls = AxolotlDPOConfig
|
||||||
training_args_kwargs.update(DPOStrategy.set_training_args_kwargs(self.cfg))
|
if self.cfg.rl is RLType.IPO:
|
||||||
|
training_args_kwargs["loss_type"] = "ipo"
|
||||||
|
|
||||||
|
# Not compatible with IPO
|
||||||
|
if self.cfg.rl is RLType.DPO and self.cfg.dpo_label_smoothing:
|
||||||
|
training_args_kwargs["label_smoothing"] = self.cfg.dpo_label_smoothing
|
||||||
|
|
||||||
|
training_args_kwargs["max_completion_length"] = None
|
||||||
|
training_args_kwargs["max_prompt_length"] = self.cfg.sequence_len
|
||||||
|
training_args_kwargs["generate_during_eval"] = self.cfg.use_wandb
|
||||||
|
if self.cfg.dpo_use_weighting is not None:
|
||||||
|
training_args_kwargs["use_weighting"] = self.cfg.dpo_use_weighting
|
||||||
|
if self.cfg.dpo_use_logits_to_keep is not None:
|
||||||
|
training_args_kwargs["use_logits_to_keep"] = (
|
||||||
|
self.cfg.dpo_use_logits_to_keep
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
|
raise ValueError(f"Unsupported RL: {self.cfg.rl}")
|
||||||
|
|
||||||
@@ -151,12 +165,6 @@ class HFRLTrainerBuilder(TrainerBuilderBase):
|
|||||||
if blocklist_key in training_args_kwargs:
|
if blocklist_key in training_args_kwargs:
|
||||||
del training_args_kwargs[blocklist_key]
|
del training_args_kwargs[blocklist_key]
|
||||||
|
|
||||||
if self.cfg.plugins:
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
|
||||||
plugin_training_args = plugin_manager.get_training_args(self.cfg)
|
|
||||||
if plugin_training_args:
|
|
||||||
training_args_kwargs.update(plugin_training_args)
|
|
||||||
|
|
||||||
training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
|
training_args = training_args_cls( # pylint: disable=unexpected-keyword-arg
|
||||||
logging_first_step=True,
|
logging_first_step=True,
|
||||||
**training_args_kwargs,
|
**training_args_kwargs,
|
||||||
|
|||||||
@@ -25,7 +25,6 @@ from trl.trainer.utils import pad_to_length
|
|||||||
from typing_extensions import override
|
from typing_extensions import override
|
||||||
|
|
||||||
from axolotl.core.trainers.mixins import (
|
from axolotl.core.trainers.mixins import (
|
||||||
CheckpointSaveMixin,
|
|
||||||
OptimizerMixin,
|
OptimizerMixin,
|
||||||
RngLoaderMixin,
|
RngLoaderMixin,
|
||||||
SchedulerMixin,
|
SchedulerMixin,
|
||||||
@@ -34,16 +33,13 @@ from axolotl.core.trainers.utils import (
|
|||||||
sanitize_kwargs_for_ds_tagging,
|
sanitize_kwargs_for_ds_tagging,
|
||||||
sanitize_kwargs_for_tagging,
|
sanitize_kwargs_for_tagging,
|
||||||
)
|
)
|
||||||
from axolotl.utils import get_not_null
|
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class AxolotlTrainer(
|
class AxolotlTrainer(SchedulerMixin, OptimizerMixin, RngLoaderMixin, Trainer):
|
||||||
SchedulerMixin, OptimizerMixin, RngLoaderMixin, CheckpointSaveMixin, Trainer
|
|
||||||
):
|
|
||||||
"""Extend the base Trainer for axolotl helpers"""
|
"""Extend the base Trainer for axolotl helpers"""
|
||||||
|
|
||||||
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
|
args = None # type: "AxolotlTrainingArguments" # type: ignore[name-defined]
|
||||||
@@ -105,7 +101,7 @@ class AxolotlTrainer(
|
|||||||
)
|
)
|
||||||
batch_max_len = train_batch_size * self.args.max_seq_length
|
batch_max_len = train_batch_size * self.args.max_seq_length
|
||||||
|
|
||||||
sampler = MultipackBatchSampler(
|
return MultipackBatchSampler(
|
||||||
base_sampler,
|
base_sampler,
|
||||||
lengths=get_dataset_lengths(dataset),
|
lengths=get_dataset_lengths(dataset),
|
||||||
packing_efficiency_estimate=self.args.sample_packing_efficiency,
|
packing_efficiency_estimate=self.args.sample_packing_efficiency,
|
||||||
@@ -115,12 +111,8 @@ class AxolotlTrainer(
|
|||||||
bin_size=self.args.sample_packing_bin_size,
|
bin_size=self.args.sample_packing_bin_size,
|
||||||
sequential=self.args.sample_packing_sequentially,
|
sequential=self.args.sample_packing_sequentially,
|
||||||
drop_last=True,
|
drop_last=True,
|
||||||
num_processes=self.args.dataset_num_proc,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
len(sampler)
|
|
||||||
return sampler
|
|
||||||
|
|
||||||
def _get_train_sampler(
|
def _get_train_sampler(
|
||||||
self, train_dataset: Optional[Dataset] = None
|
self, train_dataset: Optional[Dataset] = None
|
||||||
) -> Optional[Sampler]:
|
) -> Optional[Sampler]:
|
||||||
@@ -228,9 +220,7 @@ class AxolotlTrainer(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if not isinstance(dataset, torch.utils.data.IterableDataset):
|
if not isinstance(dataset, torch.utils.data.IterableDataset):
|
||||||
dataloader_params["drop_last"] = get_not_null(
|
dataloader_params["drop_last"] = self.args.dataloader_drop_last
|
||||||
self.args.dataloader_drop_last, True
|
|
||||||
)
|
|
||||||
if sampler_fn is not None:
|
if sampler_fn is not None:
|
||||||
sampler = sampler_fn(dataset)
|
sampler = sampler_fn(dataset)
|
||||||
if isinstance(sampler, BatchSampler):
|
if isinstance(sampler, BatchSampler):
|
||||||
|
|||||||
@@ -22,19 +22,10 @@ class DPOStrategy:
|
|||||||
training_args_kwargs = {}
|
training_args_kwargs = {}
|
||||||
if cfg.rl is RLType.IPO:
|
if cfg.rl is RLType.IPO:
|
||||||
training_args_kwargs["loss_type"] = "ipo"
|
training_args_kwargs["loss_type"] = "ipo"
|
||||||
# Label smoothing is not compatible with IPO
|
|
||||||
if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
|
|
||||||
training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
|
|
||||||
training_args_kwargs["max_completion_length"] = None
|
|
||||||
training_args_kwargs["max_length"] = cfg.sequence_len
|
training_args_kwargs["max_length"] = cfg.sequence_len
|
||||||
|
training_args_kwargs["max_completion_length"] = None
|
||||||
training_args_kwargs["max_prompt_length"] = cfg.sequence_len
|
training_args_kwargs["max_prompt_length"] = cfg.sequence_len
|
||||||
training_args_kwargs["generate_during_eval"] = cfg.use_wandb
|
training_args_kwargs["generate_during_eval"] = cfg.use_wandb
|
||||||
if cfg.dpo_use_weighting is not None:
|
if cfg.dpo_use_weighting is not None:
|
||||||
training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
|
training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
|
||||||
if cfg.dpo_padding_free is not None:
|
|
||||||
training_args_kwargs["padding_free"] = cfg.dpo_padding_free
|
|
||||||
if cfg.dpo_norm_loss is not None:
|
|
||||||
training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss
|
|
||||||
if cfg.dpo_use_logits_to_keep is not None:
|
|
||||||
training_args_kwargs["use_logits_to_keep"] = cfg.dpo_use_logits_to_keep
|
|
||||||
return training_args_kwargs
|
return training_args_kwargs
|
||||||
|
|||||||
@@ -14,5 +14,3 @@ class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig):
|
|||||||
"""
|
"""
|
||||||
DPO config for DPO training
|
DPO config for DPO training
|
||||||
"""
|
"""
|
||||||
|
|
||||||
dpo_norm_loss: bool | None = False
|
|
||||||
|
|||||||
@@ -83,20 +83,3 @@ class AxolotlDPOTrainer(
|
|||||||
gc.collect()
|
gc.collect()
|
||||||
torch.cuda.empty_cache()
|
torch.cuda.empty_cache()
|
||||||
return loss
|
return loss
|
||||||
|
|
||||||
def concatenated_forward(
|
|
||||||
self,
|
|
||||||
model: nn.Module,
|
|
||||||
batch: dict[str, Union[list, torch.LongTensor]],
|
|
||||||
is_ref_model: bool = False,
|
|
||||||
) -> dict[str, torch.Tensor]:
|
|
||||||
if self.args.dpo_norm_loss:
|
|
||||||
# fmt: off
|
|
||||||
loss_type: str = self.loss_type # type: ignore[has-type] # pylint: disable=access-member-before-definition
|
|
||||||
# fmt: on
|
|
||||||
# concatenated_forward handles avg token logprob for ipo case already
|
|
||||||
self.loss_type = "ipo" # pylint: disable=attribute-defined-outside-init
|
|
||||||
res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
|
|
||||||
self.loss_type = loss_type # pylint: disable=attribute-defined-outside-init
|
|
||||||
return res
|
|
||||||
return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
|
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
|
# pylint: disable=too-many-lines,duplicate-code,protected-access,no-member
|
||||||
|
|
||||||
import warnings
|
import warnings
|
||||||
from functools import partial
|
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
import datasets
|
import datasets
|
||||||
@@ -59,42 +58,6 @@ class AxolotlGRPOTrainer(
|
|||||||
|
|
||||||
_tag_names = ["trl", "grpo", "axolotl"]
|
_tag_names = ["trl", "grpo", "axolotl"]
|
||||||
|
|
||||||
def get_train_dataloader(self):
|
|
||||||
if self.train_dataset is None:
|
|
||||||
raise ValueError("Trainer: training requires a train_dataset.")
|
|
||||||
|
|
||||||
train_dataset = self.train_dataset
|
|
||||||
data_collator = self.data_collator
|
|
||||||
if isinstance(train_dataset, datasets.Dataset):
|
|
||||||
train_dataset = self._remove_unused_columns(
|
|
||||||
train_dataset, description="training"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
data_collator = self._get_collator_with_removed_columns(
|
|
||||||
data_collator, description="training"
|
|
||||||
)
|
|
||||||
|
|
||||||
dataloader_params = {
|
|
||||||
"batch_size": self._train_batch_size
|
|
||||||
* self.args.steps_per_generation, # < this is the change
|
|
||||||
"collate_fn": data_collator,
|
|
||||||
"num_workers": self.args.dataloader_num_workers,
|
|
||||||
"pin_memory": self.args.dataloader_pin_memory,
|
|
||||||
"persistent_workers": self.args.dataloader_persistent_workers,
|
|
||||||
}
|
|
||||||
|
|
||||||
if not isinstance(train_dataset, torch.utils.data.IterableDataset):
|
|
||||||
dataloader_params["sampler"] = self._get_train_sampler()
|
|
||||||
dataloader_params["drop_last"] = self.args.dataloader_drop_last
|
|
||||||
dataloader_params["worker_init_fn"] = partial(
|
|
||||||
seed_worker,
|
|
||||||
num_workers=self.args.dataloader_num_workers,
|
|
||||||
rank=self.args.process_index,
|
|
||||||
)
|
|
||||||
dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
|
|
||||||
|
|
||||||
return self.accelerator.prepare(DataLoader(train_dataset, **dataloader_params))
|
|
||||||
|
|
||||||
|
|
||||||
class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
|
class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
|
||||||
"""Extend the base GRPOTrainer for sequence parallelism handling"""
|
"""Extend the base GRPOTrainer for sequence parallelism handling"""
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
# pylint: disable=unused-import
|
# pylint: disable=unused-import
|
||||||
# flake8: noqa
|
# flake8: noqa
|
||||||
|
|
||||||
from .checkpoints import CheckpointSaveMixin
|
|
||||||
from .optimizer import OptimizerMixin
|
from .optimizer import OptimizerMixin
|
||||||
from .rng_state_loader import RngLoaderMixin
|
from .rng_state_loader import RngLoaderMixin
|
||||||
from .scheduler import SchedulerMixin
|
from .scheduler import SchedulerMixin
|
||||||
|
|||||||
@@ -1,21 +0,0 @@
|
|||||||
"""Custom handling to not fail training if fsdp optimizer is not savable"""
|
|
||||||
|
|
||||||
from transformers import Trainer
|
|
||||||
|
|
||||||
from axolotl.utils.logging import get_logger
|
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class CheckpointSaveMixin(Trainer):
|
|
||||||
"""Mixin to handle saving the optimizer and scheduler if they are not savable."""
|
|
||||||
|
|
||||||
def _save_optimizer_and_scheduler(self, output_dir):
|
|
||||||
try:
|
|
||||||
super()._save_optimizer_and_scheduler(output_dir)
|
|
||||||
except NotImplementedError as exc:
|
|
||||||
LOG.warning(
|
|
||||||
f"Trainer does not support saving optimizer and scheduler: {exc}\n"
|
|
||||||
"Optimizer and scheduler states were not saved - resuming from checkpoints "
|
|
||||||
"for this training run will not be possible."
|
|
||||||
)
|
|
||||||
@@ -2,17 +2,238 @@
|
|||||||
extra axolotl specific training args
|
extra axolotl specific training args
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
from typing import Optional, Type
|
from typing import Optional
|
||||||
|
|
||||||
|
from PIL.Image import Resampling
|
||||||
from transformers import TrainingArguments
|
from transformers import TrainingArguments
|
||||||
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
|
from trl import CPOConfig, KTOConfig, ORPOConfig, PRMConfig, RewardConfig
|
||||||
|
|
||||||
from axolotl.integrations.config import merge_training_args
|
|
||||||
|
|
||||||
AxolotlTrainingMixins: Type = merge_training_args()
|
@dataclass
|
||||||
|
class AxolotlTrainingMixins:
|
||||||
|
"""
|
||||||
|
Mixin class for the Axolotl training args.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
|
model_type: Optional[str] = field(
|
||||||
|
default=None, metadata={"help": "HF model configuration model_type."}
|
||||||
|
)
|
||||||
|
lr_quadratic_warmup: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={"help": "Use quadratic warmup for cosine scheduling."},
|
||||||
|
)
|
||||||
|
pretraining: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={
|
||||||
|
"help": "Indicates to trainer whether we are doing continued pretraining."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
sample_packing: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={"help": "Use sample packing for efficient training."},
|
||||||
|
)
|
||||||
|
sample_packing_sequentially: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={
|
||||||
|
"help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
multipack_real_batches: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={"help": "Use real batches for efficient training."},
|
||||||
|
)
|
||||||
|
eval_sample_packing: Optional[bool] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "Use sample packing for efficient evals."},
|
||||||
|
)
|
||||||
|
sample_packing_efficiency: float = field(
|
||||||
|
default=1.0,
|
||||||
|
metadata={"help": "Sample packing efficiency for calculating batch length."},
|
||||||
|
)
|
||||||
|
sample_packing_bin_size: int = field(
|
||||||
|
default=200,
|
||||||
|
metadata={
|
||||||
|
"help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
sample_packing_group_size: int = field(
|
||||||
|
default=100000,
|
||||||
|
metadata={
|
||||||
|
"help": "The number of samples to group together for packing. Increase for better packing."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
max_seq_length: int = field(
|
||||||
|
default=2048,
|
||||||
|
metadata={"help": "The maximum sequence length the model can handle"},
|
||||||
|
)
|
||||||
|
relora_steps: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how often to reset for ReLoRA"},
|
||||||
|
)
|
||||||
|
relora_warmup_steps: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
||||||
|
)
|
||||||
|
relora_anneal_steps: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
||||||
|
)
|
||||||
|
relora_prune_ratio: Optional[float] = field(
|
||||||
|
default=0.9,
|
||||||
|
metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
|
||||||
|
)
|
||||||
|
bench_split: Optional[str] = field(
|
||||||
|
default="eval", metadata={"help": "The benchmark split to run on"}
|
||||||
|
)
|
||||||
|
bench_dataset: Optional[str] = field(
|
||||||
|
default="pharaouk/dharma-1/dharma_1_mini.json",
|
||||||
|
metadata={
|
||||||
|
"help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
do_bench_eval: Optional[bool] = field(
|
||||||
|
default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
|
||||||
|
)
|
||||||
|
do_causal_lm_eval: Optional[bool] = field(
|
||||||
|
default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
|
||||||
|
)
|
||||||
|
max_bench_samples: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
|
||||||
|
},
|
||||||
|
)
|
||||||
|
bench_source_max_len: int = field(
|
||||||
|
default=2048, metadata={"help": "Maximum source sequence length for bench."}
|
||||||
|
)
|
||||||
|
dataloader_prefetch_factor: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "prefetch_factor argument to the dataloader"},
|
||||||
|
)
|
||||||
|
cosine_min_lr_ratio: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
|
||||||
|
)
|
||||||
|
cosine_constant_lr_ratio: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
loraplus_lr_ratio: Optional[float] = field(
|
||||||
|
default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
|
||||||
|
)
|
||||||
|
loraplus_lr_embedding: Optional[float] = field(
|
||||||
|
default=1e-6,
|
||||||
|
metadata={"help": "loraplus learning rate for lora embedding layers."},
|
||||||
|
)
|
||||||
|
embedding_lr_scale: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "Scale the learning rate for the embedding layers."},
|
||||||
|
)
|
||||||
|
lr_groups: Optional[list[dict]] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "Specify learning rate groups for with different LRs."},
|
||||||
|
)
|
||||||
|
embedding_lr: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "absolute learning rate for the embedding layers."},
|
||||||
|
)
|
||||||
|
qlora: bool = field(
|
||||||
|
default=False,
|
||||||
|
metadata={"help": "whether this is a qlora training"},
|
||||||
|
)
|
||||||
|
orpo_alpha: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
)
|
||||||
|
lisa_n_layers: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "the number of activate layers in LISA"},
|
||||||
|
)
|
||||||
|
lisa_step_interval: Optional[int] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "how often to switch layers in LISA"},
|
||||||
|
)
|
||||||
|
lisa_layers_attribute: Optional[str] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "path under the model to access the layers"},
|
||||||
|
)
|
||||||
|
curriculum_sampling: Optional[bool] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "whether to use sequential sampling for curriculum learning"},
|
||||||
|
)
|
||||||
|
alternate_lr_scheduler_type: Optional[str] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "workaround to pass an alternate lr scheduler to the HF trainer"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
chat_template: Optional[str] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "Chat template converting chat messages to text"},
|
||||||
|
)
|
||||||
|
|
||||||
|
kd_ce_alpha: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
kd_alpha: Optional[float] = field(
|
||||||
|
default=1.0,
|
||||||
|
metadata={"help": "The alpha scaling parameter for KD loss"},
|
||||||
|
)
|
||||||
|
|
||||||
|
kd_temperature: Optional[float] = field(
|
||||||
|
default=1.0,
|
||||||
|
metadata={
|
||||||
|
"help": "the temperature parameter for KL divergence loss when using KD"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
kd_zscore_base_temp: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "the base temperature parameter for KL divergence with z-score when using KD"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
kd_top_k_before_softmax: Optional[bool] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "Whether to apply top_k_before_softmax to the logits when using KD"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
adam_beta3: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "The beta3 hyperparameter used in some optimizers such as CAME"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
adam_epsilon2: Optional[float] = field(
|
||||||
|
default=None,
|
||||||
|
metadata={
|
||||||
|
"help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
# multi-modal section
|
||||||
|
|
||||||
|
image_size: int | tuple[int, int] | None = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "The size of the image to resize to"},
|
||||||
|
)
|
||||||
|
|
||||||
|
image_resize_algorithm: Resampling | None = field(
|
||||||
|
default=None,
|
||||||
|
metadata={"help": "The algorithm to use for image resizing"},
|
||||||
|
)
|
||||||
|
|
||||||
|
# end of multi-modal section
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
|
|||||||
@@ -1,224 +0,0 @@
|
|||||||
"""
|
|
||||||
Base Axolotl Training Mixins shared across various trainer configs
|
|
||||||
"""
|
|
||||||
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
from PIL.Image import Resampling
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class AxolotlTrainingMixins:
|
|
||||||
"""
|
|
||||||
Mixin class for the Axolotl training args.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
model_type: Optional[str] = field(
|
|
||||||
default=None, metadata={"help": "HF model configuration model_type."}
|
|
||||||
)
|
|
||||||
lr_quadratic_warmup: bool = field(
|
|
||||||
default=False,
|
|
||||||
metadata={"help": "Use quadratic warmup for cosine scheduling."},
|
|
||||||
)
|
|
||||||
pretraining: bool = field(
|
|
||||||
default=False,
|
|
||||||
metadata={
|
|
||||||
"help": "Indicates to trainer whether we are doing continued pretraining."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
sample_packing: bool = field(
|
|
||||||
default=False,
|
|
||||||
metadata={"help": "Use sample packing for efficient training."},
|
|
||||||
)
|
|
||||||
sample_packing_sequentially: bool = field(
|
|
||||||
default=False,
|
|
||||||
metadata={
|
|
||||||
"help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
multipack_real_batches: bool = field(
|
|
||||||
default=False,
|
|
||||||
metadata={"help": "Use real batches for efficient training."},
|
|
||||||
)
|
|
||||||
eval_sample_packing: Optional[bool] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "Use sample packing for efficient evals."},
|
|
||||||
)
|
|
||||||
sample_packing_efficiency: float = field(
|
|
||||||
default=1.0,
|
|
||||||
metadata={"help": "Sample packing efficiency for calculating batch length."},
|
|
||||||
)
|
|
||||||
sample_packing_bin_size: int = field(
|
|
||||||
default=200,
|
|
||||||
metadata={
|
|
||||||
"help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
sample_packing_group_size: int = field(
|
|
||||||
default=100000,
|
|
||||||
metadata={
|
|
||||||
"help": "The number of samples to group together for packing. Increase for better packing."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
max_seq_length: int = field(
|
|
||||||
default=2048,
|
|
||||||
metadata={"help": "The maximum sequence length the model can handle"},
|
|
||||||
)
|
|
||||||
dataset_num_proc: int | None = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "The number of processes to use for data processing"},
|
|
||||||
)
|
|
||||||
relora_steps: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "how often to reset for ReLoRA"},
|
|
||||||
)
|
|
||||||
relora_warmup_steps: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
|
||||||
)
|
|
||||||
relora_anneal_steps: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "how many warmup steps to take after reset for ReLoRA"},
|
|
||||||
)
|
|
||||||
relora_prune_ratio: Optional[float] = field(
|
|
||||||
default=0.9,
|
|
||||||
metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
|
|
||||||
)
|
|
||||||
bench_split: Optional[str] = field(
|
|
||||||
default="eval", metadata={"help": "The benchmark split to run on"}
|
|
||||||
)
|
|
||||||
bench_dataset: Optional[str] = field(
|
|
||||||
default="pharaouk/dharma-1/dharma_1_mini.json",
|
|
||||||
metadata={
|
|
||||||
"help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
do_bench_eval: Optional[bool] = field(
|
|
||||||
default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
|
|
||||||
)
|
|
||||||
do_causal_lm_eval: Optional[bool] = field(
|
|
||||||
default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
|
|
||||||
)
|
|
||||||
max_bench_samples: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
|
|
||||||
},
|
|
||||||
)
|
|
||||||
bench_source_max_len: int = field(
|
|
||||||
default=2048, metadata={"help": "Maximum source sequence length for bench."}
|
|
||||||
)
|
|
||||||
dataloader_prefetch_factor: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "prefetch_factor argument to the dataloader"},
|
|
||||||
)
|
|
||||||
cosine_min_lr_ratio: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
|
|
||||||
)
|
|
||||||
cosine_constant_lr_ratio: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
loraplus_lr_ratio: Optional[float] = field(
|
|
||||||
default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
|
|
||||||
)
|
|
||||||
loraplus_lr_embedding: Optional[float] = field(
|
|
||||||
default=1e-6,
|
|
||||||
metadata={"help": "loraplus learning rate for lora embedding layers."},
|
|
||||||
)
|
|
||||||
embedding_lr_scale: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "Scale the learning rate for the embedding layers."},
|
|
||||||
)
|
|
||||||
lr_groups: Optional[list[dict]] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "Specify learning rate groups for with different LRs."},
|
|
||||||
)
|
|
||||||
embedding_lr: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "absolute learning rate for the embedding layers."},
|
|
||||||
)
|
|
||||||
qlora: bool = field(
|
|
||||||
default=False,
|
|
||||||
metadata={"help": "whether this is a qlora training"},
|
|
||||||
)
|
|
||||||
orpo_alpha: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
)
|
|
||||||
lisa_n_layers: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "the number of activate layers in LISA"},
|
|
||||||
)
|
|
||||||
lisa_step_interval: Optional[int] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "how often to switch layers in LISA"},
|
|
||||||
)
|
|
||||||
lisa_layers_attribute: Optional[str] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "path under the model to access the layers"},
|
|
||||||
)
|
|
||||||
curriculum_sampling: Optional[bool] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "whether to use sequential sampling for curriculum learning"},
|
|
||||||
)
|
|
||||||
alternate_lr_scheduler_type: Optional[str] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "workaround to pass an alternate lr scheduler to the HF trainer"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
chat_template: Optional[str] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "Chat template converting chat messages to text"},
|
|
||||||
)
|
|
||||||
|
|
||||||
# kd_ce_alpha: Optional[float] = field(
|
|
||||||
# default=None,
|
|
||||||
# metadata={
|
|
||||||
# "help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
|
|
||||||
# },
|
|
||||||
# )
|
|
||||||
#
|
|
||||||
# kd_alpha: Optional[float] = field(
|
|
||||||
# default=1.0,
|
|
||||||
# metadata={"help": "The alpha scaling parameter for KD loss"},
|
|
||||||
# )
|
|
||||||
#
|
|
||||||
# kd_temperature: Optional[float] = field(
|
|
||||||
# default=1.0,
|
|
||||||
# metadata={
|
|
||||||
# "help": "the temperature parameter for KL divergence loss when using KD"
|
|
||||||
# },
|
|
||||||
# )
|
|
||||||
|
|
||||||
adam_beta3: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "The beta3 hyperparameter used in some optimizers such as CAME"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
adam_epsilon2: Optional[float] = field(
|
|
||||||
default=None,
|
|
||||||
metadata={
|
|
||||||
"help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
# multi-modal section
|
|
||||||
|
|
||||||
image_size: int | tuple[int, int] | None = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "The size of the image to resize to"},
|
|
||||||
)
|
|
||||||
|
|
||||||
image_resize_algorithm: Resampling | None = field(
|
|
||||||
default=None,
|
|
||||||
metadata={"help": "The algorithm to use for image resizing"},
|
|
||||||
)
|
|
||||||
|
|
||||||
# end of multi-modal section
|
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
"""Module containing Dataset functionality"""
|
"""Module containing Dataset functionality"""
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from typing import List, Optional, Union
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
from datasets import Dataset, IterableDataset
|
from datasets import Dataset, IterableDataset
|
||||||
@@ -19,21 +20,21 @@ LOG = get_logger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class TokenizedPromptDataset(Dataset):
|
class TokenizedPromptDataset(Dataset):
|
||||||
"""Dataset that returns tokenized prompts from a stream of text files.
|
"""
|
||||||
|
Dataset that returns tokenized prompts from a stream of text files.
|
||||||
Args:
|
Args:
|
||||||
prompt_tokenizer: The prompt tokenizing method for processing the data.
|
prompt_tokenizer (PromptTokenizingStrategy): The prompt tokenizing method for processing the data.
|
||||||
dataset: Dataset with text files.
|
dataset (dataset.Dataset): Dataset with text files.
|
||||||
process_count: Number of processes to use for tokenizing.
|
process_count (int): Number of processes to use for tokenizing.
|
||||||
keep_in_memory: Whether to keep the tokenized dataset in memory.
|
keep_in_memory (bool): Whether to keep the tokenized dataset in memory.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__( # pylint: disable=super-init-not-called
|
def __init__( # pylint: disable=super-init-not-called
|
||||||
self,
|
self,
|
||||||
prompt_tokenizer: PromptTokenizingStrategy,
|
prompt_tokenizer: PromptTokenizingStrategy,
|
||||||
dataset: Dataset,
|
dataset: Dataset,
|
||||||
process_count: int | None = None,
|
process_count: Optional[int] = None,
|
||||||
keep_in_memory: bool | None = False,
|
keep_in_memory: Optional[bool] = False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
self.prompt_tokenizer = prompt_tokenizer
|
self.prompt_tokenizer = prompt_tokenizer
|
||||||
@@ -48,13 +49,6 @@ class TokenizedPromptDataset(Dataset):
|
|||||||
features = dataset.features.keys()
|
features = dataset.features.keys()
|
||||||
num_proc = min(64, self.process_count if self.process_count else os.cpu_count())
|
num_proc = min(64, self.process_count if self.process_count else os.cpu_count())
|
||||||
|
|
||||||
# Disable multiprocessing if the tokenizer doesn't support it (e.g., mistral_common)
|
|
||||||
if not getattr(self.prompt_tokenizer, "supports_multiprocessing", True):
|
|
||||||
LOG.info(
|
|
||||||
"Disabling multiprocessing for tokenizer as it doesn't support it (e.g., mistral_common)"
|
|
||||||
)
|
|
||||||
num_proc = 1
|
|
||||||
|
|
||||||
map_kwargs = {}
|
map_kwargs = {}
|
||||||
if self.prompt_tokenizer.supports_batched:
|
if self.prompt_tokenizer.supports_batched:
|
||||||
map_kwargs["batched"] = True
|
map_kwargs["batched"] = True
|
||||||
@@ -82,14 +76,14 @@ class TokenizedPromptDataset(Dataset):
|
|||||||
|
|
||||||
def wrap_dataset_for_tokenized_prompt(
|
def wrap_dataset_for_tokenized_prompt(
|
||||||
prompt_tokenizer: PromptTokenizingStrategy,
|
prompt_tokenizer: PromptTokenizingStrategy,
|
||||||
dataset: Dataset | IterableDataset,
|
dataset: Union[Dataset, IterableDataset],
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
if isinstance(dataset, IterableDataset):
|
if isinstance(dataset, IterableDataset):
|
||||||
map_kwargs = {}
|
map_kwargs = {}
|
||||||
if prompt_tokenizer.supports_batched:
|
if prompt_tokenizer.supports_batched:
|
||||||
map_kwargs["batched"] = True
|
map_kwargs["batched"] = True
|
||||||
features = list(dataset.features.keys())
|
features = dataset.features.keys()
|
||||||
return dataset.map(
|
return dataset.map(
|
||||||
prompt_tokenizer.tokenize_prompt,
|
prompt_tokenizer.tokenize_prompt,
|
||||||
remove_columns=features,
|
remove_columns=features,
|
||||||
@@ -100,13 +94,12 @@ def wrap_dataset_for_tokenized_prompt(
|
|||||||
|
|
||||||
# TODO this isn't the best since it can't interleave datasets
|
# TODO this isn't the best since it can't interleave datasets
|
||||||
class ConstantLengthDataset(IterableDataset):
|
class ConstantLengthDataset(IterableDataset):
|
||||||
"""Iterable dataset that returns constant length chunks of tokens from stream of
|
"""
|
||||||
text files.
|
Iterable dataset that returns constant length chunks of tokens from stream of text files.
|
||||||
|
Args:
|
||||||
Args:
|
tokenizer (Tokenizer): The processor used for processing the data.
|
||||||
tokenizer: The processor used for processing the data.
|
dataset (dataset.Dataset): Dataset with text files.
|
||||||
dataset: Dataset with text files.
|
seq_length (int): Length of token sequences to return.
|
||||||
seq_length: Length of token sequences to return.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__( # pylint: disable=super-init-not-called
|
def __init__( # pylint: disable=super-init-not-called
|
||||||
@@ -117,7 +110,7 @@ class ConstantLengthDataset(IterableDataset):
|
|||||||
):
|
):
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.concat_token_id = tokenizer.eos_token_id
|
self.concat_token_id = tokenizer.eos_token_id
|
||||||
self.datasets: list[IterableDataset] = datasets
|
self.datasets: List[IterableDataset] = datasets
|
||||||
self.seq_length = seq_length
|
self.seq_length = seq_length
|
||||||
|
|
||||||
vocab_size = len(tokenizer.get_vocab())
|
vocab_size = len(tokenizer.get_vocab())
|
||||||
@@ -181,10 +174,7 @@ class ConstantLengthDataset(IterableDataset):
|
|||||||
}
|
}
|
||||||
else:
|
else:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"Dropping batch due to tensor size mismatch "
|
f"dropping batch due to tensor size mismatch input_ids: {input_ids.size()}, labels: {labels.size()}, attention_mask: {attention_mask.size()}"
|
||||||
f"input_ids: {input_ids.size()}, "
|
|
||||||
f"labels: {labels.size()}, "
|
|
||||||
f"attention_mask: {attention_mask.size()}"
|
|
||||||
)
|
)
|
||||||
buffer = {
|
buffer = {
|
||||||
"input_ids": [],
|
"input_ids": [],
|
||||||
|
|||||||
@@ -7,16 +7,17 @@ from pathlib import Path
|
|||||||
from typing import Dict, Optional
|
from typing import Dict, Optional
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
from accelerate.logging import get_logger
|
||||||
from datasets import Dataset
|
from datasets import Dataset
|
||||||
from transformers.trainer import Trainer
|
from transformers.trainer import Trainer
|
||||||
|
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.train import (
|
from axolotl.train import (
|
||||||
TrainDatasetMeta,
|
TrainDatasetMeta,
|
||||||
setup_model_and_tokenizer,
|
setup_model_and_tokenizer,
|
||||||
)
|
)
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.distributed import cleanup_distributed
|
from axolotl.utils.distributed import cleanup_distributed
|
||||||
from axolotl.utils.logging import get_logger
|
|
||||||
from axolotl.utils.trainer import setup_trainer
|
from axolotl.utils.trainer import setup_trainer
|
||||||
|
|
||||||
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
|
||||||
@@ -63,6 +64,7 @@ def evaluate_dataset(
|
|||||||
return metrics
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
|
def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
|
||||||
"""
|
"""
|
||||||
Evaluate a model on training and validation datasets.
|
Evaluate a model on training and validation datasets.
|
||||||
|
|||||||
@@ -22,7 +22,6 @@ from __future__ import annotations
|
|||||||
|
|
||||||
import collections
|
import collections
|
||||||
import importlib
|
import importlib
|
||||||
import traceback
|
|
||||||
from typing import TYPE_CHECKING, Callable, OrderedDict, Union
|
from typing import TYPE_CHECKING, Callable, OrderedDict, Union
|
||||||
|
|
||||||
from peft import PeftModel
|
from peft import PeftModel
|
||||||
@@ -84,11 +83,6 @@ class BasePlugin:
|
|||||||
def get_input_args(self) -> str | None:
|
def get_input_args(self) -> str | None:
|
||||||
"""Returns a pydantic model for the plugin's input arguments."""
|
"""Returns a pydantic model for the plugin's input arguments."""
|
||||||
|
|
||||||
def get_training_args_mixin(self) -> str | None:
|
|
||||||
"""
|
|
||||||
Returns a dataclass model for the plugin's training arguments.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def load_datasets(
|
def load_datasets(
|
||||||
self, cfg: DictDefault, preprocess: bool = False
|
self, cfg: DictDefault, preprocess: bool = False
|
||||||
) -> Union["TrainDatasetMeta", None]:
|
) -> Union["TrainDatasetMeta", None]:
|
||||||
@@ -164,31 +158,6 @@ class BasePlugin:
|
|||||||
trainer: The trainer object for training.
|
trainer: The trainer object for training.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def get_training_args(self, cfg: DictDefault): # pylint: disable=unused-argument):
|
|
||||||
"""
|
|
||||||
Returns custom training arguments to set on TrainingArgs.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The global axolotl configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
object: dict containing the training arguments.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def get_collator_cls_and_kwargs(
|
|
||||||
self, cfg: DictDefault, is_eval: bool = False
|
|
||||||
): # pylint: disable=unused-argument):
|
|
||||||
"""
|
|
||||||
Returns a custom class for the collator.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: The global axolotl configuration.
|
|
||||||
is_eval: Whether this is an eval split.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
class: The class for the collator.
|
|
||||||
"""
|
|
||||||
|
|
||||||
# pylint: disable=unused-argument
|
# pylint: disable=unused-argument
|
||||||
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
|
def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
|
||||||
"""Creates and returns an optimizer for training.
|
"""Creates and returns an optimizer for training.
|
||||||
@@ -309,7 +278,7 @@ def load_plugin(plugin_name: str) -> BasePlugin:
|
|||||||
return plugin
|
return plugin
|
||||||
|
|
||||||
|
|
||||||
class PluginManager: # pylint: disable=too-many-public-methods
|
class PluginManager:
|
||||||
"""The `PluginManager` class is responsible for loading and managing plugins. It
|
"""The `PluginManager` class is responsible for loading and managing plugins. It
|
||||||
should be a singleton so it can be accessed from anywhere in the codebase.
|
should be a singleton so it can be accessed from anywhere in the codebase.
|
||||||
|
|
||||||
@@ -368,11 +337,8 @@ class PluginManager: # pylint: disable=too-many-public-methods
|
|||||||
plugin = load_plugin(plugin_name)
|
plugin = load_plugin(plugin_name)
|
||||||
self.plugins[plugin_name] = plugin
|
self.plugins[plugin_name] = plugin
|
||||||
LOG.info(f"Plugin loaded successfully: {plugin_name}")
|
LOG.info(f"Plugin loaded successfully: {plugin_name}")
|
||||||
except ImportError as exc:
|
except ImportError:
|
||||||
LOG.error(f"Failed to load plugin: {plugin_name}")
|
LOG.error(f"Failed to load plugin: {plugin_name}")
|
||||||
# print stacktrace
|
|
||||||
traceback.print_exc()
|
|
||||||
print(f"Error: {exc}")
|
|
||||||
|
|
||||||
def get_input_args(self) -> list[str]:
|
def get_input_args(self) -> list[str]:
|
||||||
"""Returns a list of Pydantic classes for all registered plugins' input arguments.'
|
"""Returns a list of Pydantic classes for all registered plugins' input arguments.'
|
||||||
@@ -387,20 +353,6 @@ class PluginManager: # pylint: disable=too-many-public-methods
|
|||||||
input_args.append(input_args_from_plugin)
|
input_args.append(input_args_from_plugin)
|
||||||
return input_args
|
return input_args
|
||||||
|
|
||||||
def get_training_args_mixin(self):
|
|
||||||
"""
|
|
||||||
Returns a list of dataclasses for all registered plugins' training args mixins.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list[str]: A list of dataclasses
|
|
||||||
"""
|
|
||||||
training_args = []
|
|
||||||
for plugin in self.plugins.values():
|
|
||||||
training_args_from_plugin = plugin.get_training_args_mixin()
|
|
||||||
if training_args_from_plugin is not None:
|
|
||||||
training_args.append(training_args_from_plugin)
|
|
||||||
return training_args
|
|
||||||
|
|
||||||
def load_datasets(
|
def load_datasets(
|
||||||
self, cfg: DictDefault, preprocess: bool = False
|
self, cfg: DictDefault, preprocess: bool = False
|
||||||
) -> Union["TrainDatasetMeta", None]:
|
) -> Union["TrainDatasetMeta", None]:
|
||||||
@@ -490,42 +442,6 @@ class PluginManager: # pylint: disable=too-many-public-methods
|
|||||||
return trainer_cls
|
return trainer_cls
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_training_args(self, cfg):
|
|
||||||
"""
|
|
||||||
Calls the get_training_args method of all registered plugins and returns the combined training arguments.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
cfg (dict): The configuration for the plugins.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
object: The training arguments
|
|
||||||
"""
|
|
||||||
training_args_kwargs = {}
|
|
||||||
for plugin in self.plugins.values():
|
|
||||||
training_args = plugin.get_training_args(cfg)
|
|
||||||
if training_args is not None:
|
|
||||||
training_args_kwargs.update(training_args)
|
|
||||||
|
|
||||||
return training_args_kwargs
|
|
||||||
|
|
||||||
def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
|
|
||||||
"""
|
|
||||||
Calls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.
|
|
||||||
|
|
||||||
Parameters:
|
|
||||||
cfg (dict): The configuration for the plugins.
|
|
||||||
is_eval (bool): Whether this is an eval split.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
object: The collator class, or None if none was found.
|
|
||||||
"""
|
|
||||||
for plugin in self.plugins.values():
|
|
||||||
collator = plugin.get_collator_cls_and_kwargs(cfg, is_eval=is_eval)
|
|
||||||
if collator is not None:
|
|
||||||
collator_cls, collator_kwargs = collator
|
|
||||||
return collator_cls, collator_kwargs
|
|
||||||
return None
|
|
||||||
|
|
||||||
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
|
def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
|
||||||
"""Calls the `post_trainer_create` method of all registered plugins.
|
"""Calls the `post_trainer_create` method of all registered plugins.
|
||||||
|
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ Module to handle merging the plugins' input arguments with the base configuratio
|
|||||||
This was moved here to prevent circular imports.
|
This was moved here to prevent circular imports.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from typing import Any, Dict, List, Type
|
from typing import Any, Dict, List
|
||||||
|
|
||||||
from axolotl.utils.schemas.config import (
|
from axolotl.utils.schemas.config import (
|
||||||
AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
|
AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
|
||||||
@@ -61,43 +61,3 @@ def merge_input_args():
|
|||||||
]
|
]
|
||||||
return AxolotlConfigWCapabilities, AxolotlInputConfig
|
return AxolotlConfigWCapabilities, AxolotlInputConfig
|
||||||
return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
|
return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase
|
||||||
|
|
||||||
|
|
||||||
def merge_training_args() -> Type:
|
|
||||||
"""
|
|
||||||
Merges training arguments from registered plugins with the base TrainingArguments.
|
|
||||||
|
|
||||||
This function retrieves the training arguments from registered plugins using the PluginManager.
|
|
||||||
It then dynamically creates new classes, AxolotlTrainingMixins,
|
|
||||||
that inherit from the base configurations and include the training arguments from the plugins.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Type: The newly created AxolotlTrainingMixins class, or the base AxolotlTrainingMixins class when no plugin provides a mixin.
|
|
||||||
"""
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
from axolotl.core.training_args_base import (
|
|
||||||
AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
|
|
||||||
)
|
|
||||||
from axolotl.integrations.base import PluginManager
|
|
||||||
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
|
||||||
training_args_mixins: List[str] = plugin_manager.get_training_args_mixin()
|
|
||||||
mixin_classes = []
|
|
||||||
dynamic_input = ""
|
|
||||||
for plugin_args in training_args_mixins:
|
|
||||||
plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
|
|
||||||
dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
|
|
||||||
mixin_classes.append(plugin_cls)
|
|
||||||
if dynamic_input:
|
|
||||||
dynamic_input += f"class AxolotlTrainingMixins(AxolotlTrainingMixinsBase, {', '.join(mixin_classes)}):\n pass\n"
|
|
||||||
|
|
||||||
namespace: Dict[Any, Any] = {}
|
|
||||||
local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
|
|
||||||
exec( # pylint: disable=exec-used # nosec B102
|
|
||||||
dynamic_input, {**globals(), **local_vars}, namespace
|
|
||||||
)
|
|
||||||
AxolotlTrainingMixins = namespace[ # pylint: disable=invalid-name
|
|
||||||
"AxolotlTrainingMixins"
|
|
||||||
]
|
|
||||||
return AxolotlTrainingMixins
|
|
||||||
return AxolotlTrainingMixinsBase
|
|
||||||
|
|||||||
@@ -24,14 +24,6 @@ pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transform
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
**NOTE**: If you are training a VLM, please use an older version of Axolotl, as upstream has applied a major VLM refactor and our patches have not been updated yet.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git checkout 787880215b3ab32ccaf81c1b2e9588c6f3e6e764
|
|
||||||
|
|
||||||
pip3 install --no-build-isolation -e .
|
|
||||||
```
|
|
||||||
|
|
||||||
```yaml
|
```yaml
|
||||||
plugins:
|
plugins:
|
||||||
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
- axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
|
||||||
|
|||||||
@@ -15,12 +15,7 @@
|
|||||||
"""
|
"""
|
||||||
Plugin init to add KD support to Axolotl.
|
Plugin init to add KD support to Axolotl.
|
||||||
"""
|
"""
|
||||||
from typing import Any
|
|
||||||
|
|
||||||
from transformers import Trainer
|
|
||||||
|
|
||||||
from axolotl.integrations.base import BasePlugin
|
from axolotl.integrations.base import BasePlugin
|
||||||
from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback
|
|
||||||
|
|
||||||
from .args import KDArgs # pylint: disable=unused-import. # noqa: F401
|
from .args import KDArgs # pylint: disable=unused-import. # noqa: F401
|
||||||
|
|
||||||
@@ -33,75 +28,9 @@ class KDPlugin(BasePlugin):
|
|||||||
def get_input_args(self):
|
def get_input_args(self):
|
||||||
return "axolotl.integrations.kd.KDArgs"
|
return "axolotl.integrations.kd.KDArgs"
|
||||||
|
|
||||||
def get_training_args_mixin(self):
|
|
||||||
return "axolotl.integrations.kd.args.KDTrainingArgsMixin"
|
|
||||||
|
|
||||||
def get_trainer_cls(self, cfg):
|
def get_trainer_cls(self, cfg):
|
||||||
if cfg.kd_trainer:
|
if cfg.kd_trainer:
|
||||||
from .trainer import AxolotlKDTrainer
|
from .trainer import AxolotlKDTrainer
|
||||||
|
|
||||||
return AxolotlKDTrainer
|
return AxolotlKDTrainer
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def get_training_args(self, cfg):
|
|
||||||
return {
|
|
||||||
"kd_ce_alpha": cfg.kd_ce_alpha,
|
|
||||||
"kd_alpha": cfg.kd_alpha,
|
|
||||||
"kd_temperature": cfg.kd_temperature,
|
|
||||||
"kd_beta": cfg.kd_beta,
|
|
||||||
"kd_normalize_topk": cfg.kd_normalize_topk,
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
|
|
||||||
if not cfg.kd_trainer:
|
|
||||||
return None, None
|
|
||||||
|
|
||||||
from .collator import DataCollatorForKD, KDBatchSamplerDataCollatorForSeq2Seq
|
|
||||||
|
|
||||||
use_batch_sampler_collator = False
|
|
||||||
if is_eval is False and cfg.sample_packing:
|
|
||||||
use_batch_sampler_collator = True
|
|
||||||
if cfg.eval_sample_packing and is_eval:
|
|
||||||
use_batch_sampler_collator = True
|
|
||||||
|
|
||||||
if cfg.kd_online_server_base_url:
|
|
||||||
from .collator_online_teacher import OnlineTeacherCollator
|
|
||||||
|
|
||||||
return OnlineTeacherCollator, {
|
|
||||||
"kd_online_server_base_url": cfg.kd_online_server_base_url,
|
|
||||||
"kd_online_topk": cfg.kd_online_topk,
|
|
||||||
"kd_temperature": cfg.kd_temperature,
|
|
||||||
"kd_online_server": cfg.kd_online_server,
|
|
||||||
"kd_online_timeout": cfg.kd_online_timeout,
|
|
||||||
"kd_normalize_topk": cfg.kd_normalize_topk,
|
|
||||||
}
|
|
||||||
|
|
||||||
if use_batch_sampler_collator:
|
|
||||||
return KDBatchSamplerDataCollatorForSeq2Seq, {}
|
|
||||||
return DataCollatorForKD, {}
|
|
||||||
|
|
||||||
def pre_model_load(self, cfg):
|
|
||||||
from .kernels.models import apply_kernel
|
|
||||||
|
|
||||||
apply_kernel(cfg.model_config_type)
|
|
||||||
|
|
||||||
def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
|
|
||||||
"""
|
|
||||||
Adds temp scheduler callback to the Trainer instance.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg (Any): Configuration object providing the KD temperature and online server settings.
|
|
||||||
trainer (Trainer): Huggingface Trainer instance.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
list: List containing the configured callback instances.
|
|
||||||
"""
|
|
||||||
if cfg.kd_temperature_min is not None and cfg.kd_online_server_base_url:
|
|
||||||
callback = KDTemperatureSchedulerCallback(
|
|
||||||
cfg.kd_temperature,
|
|
||||||
cfg.kd_temperature_min,
|
|
||||||
trainer,
|
|
||||||
)
|
|
||||||
return [callback]
|
|
||||||
|
|
||||||
return []
|
|
||||||
|
|||||||
@@ -15,19 +15,9 @@
|
|||||||
"""
|
"""
|
||||||
Plugin args for KD support.
|
Plugin args for KD support.
|
||||||
"""
|
"""
|
||||||
from dataclasses import dataclass
|
from typing import Optional
|
||||||
from enum import Enum
|
|
||||||
|
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel
|
||||||
|
|
||||||
|
|
||||||
class InferenceServerType(str, Enum):
|
|
||||||
"""
|
|
||||||
Online inference server types to handle different request args
|
|
||||||
"""
|
|
||||||
|
|
||||||
vllm = "vllm" # pylint: disable=invalid-name
|
|
||||||
sglang = "sglang" # pylint: disable=invalid-name
|
|
||||||
|
|
||||||
|
|
||||||
class KDArgs(BaseModel):
|
class KDArgs(BaseModel):
|
||||||
@@ -35,41 +25,13 @@ class KDArgs(BaseModel):
|
|||||||
Input args for knowledge distillation.
|
Input args for knowledge distillation.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
kd_trainer: float | None = None # whether to use KD trainer
|
kd_trainer: Optional[bool] = None # whether to use KD trainer
|
||||||
kd_ce_alpha: float | None = (
|
kd_ce_alpha: Optional[float] = (
|
||||||
None # loss coefficient for cross-entropy loss during KD
|
None # loss coefficient for cross-entropy loss during KD
|
||||||
)
|
)
|
||||||
kd_alpha: float | None = None # loss coefficient for KD loss
|
kd_alpha: Optional[float] = None # loss coefficient for KD loss
|
||||||
kd_temperature: float | None = None # temperature for sampling during KD
|
kd_temperature: Optional[float] = None # temperature for sampling during KD
|
||||||
kd_beta: float | None = 0.0 # beta coefficient for ratio of fwd and reverse KL
|
kd_zscore_base_temp: Optional[float] = None # base temperature for zscore scaling
|
||||||
kd_normalize_topk: bool | None = (
|
kd_top_k_before_softmax: Optional[bool] = (
|
||||||
None # whether to normalize student logits during KD
|
None # whether to sample top k before softmax during KD
|
||||||
)
|
|
||||||
|
|
||||||
# TODO online kd
|
|
||||||
kd_online_server_base_url: str | None = None
|
|
||||||
kd_online_topk: int | None = None
|
|
||||||
kd_online_server: InferenceServerType | None = Field(
|
|
||||||
default_factory=lambda: InferenceServerType.vllm
|
|
||||||
)
|
|
||||||
kd_online_timeout: int | None = 120
|
|
||||||
kd_temperature_min: float | None = (
|
|
||||||
None # kd temperature scheduling during online kd
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
|
||||||
class KDTrainingArgsMixin:
|
|
||||||
"""
|
|
||||||
Additional args for KD training.
|
|
||||||
"""
|
|
||||||
|
|
||||||
kd_ce_alpha: float | None = (
|
|
||||||
None # loss coefficient for cross-entropy loss during KD
|
|
||||||
)
|
|
||||||
kd_alpha: float | None = None # loss coefficient for KD loss
|
|
||||||
kd_temperature: float | None = None # temperature for sampling during KD
|
|
||||||
kd_beta: float | None = None # beta coefficient for ratio of fwd and reverse KL
|
|
||||||
kd_normalize_topk: float | None = (
|
|
||||||
None # whether to normalize student logits during KD
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,36 +0,0 @@
|
|||||||
"""
|
|
||||||
Transformers trainer callbacks to schedule the KD temperature during training
|
|
||||||
"""
|
|
||||||
|
|
||||||
import math
|
|
||||||
|
|
||||||
from transformers.trainer_callback import TrainerCallback
|
|
||||||
|
|
||||||
|
|
||||||
class KDTemperatureSchedulerCallback(TrainerCallback):
|
|
||||||
"""
|
|
||||||
KD temperature scheduler callback for the trainer.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, temperature_start, temperature_min, trainer):
|
|
||||||
self.temperature_start = temperature_start
|
|
||||||
self.temperature_min = temperature_min
|
|
||||||
self.temperature = temperature_start
|
|
||||||
|
|
||||||
self.trainer = trainer
|
|
||||||
|
|
||||||
def on_step_end(
|
|
||||||
self, args, state, control, **kwargs
|
|
||||||
): # pylint: disable=unused-argument
|
|
||||||
# cosine decay temperature over the max steps
|
|
||||||
|
|
||||||
progress = state.global_step / state.max_steps
|
|
||||||
# Cosine decay factor: 0.5 * (1 + cos(pi * progress))
|
|
||||||
# This factor goes from 1 (at progress=0) to 0 (at progress=1)
|
|
||||||
decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
|
|
||||||
self.temperature = self.temperature_start - (
|
|
||||||
(self.temperature_start - self.temperature_min) * (1.0 - decay_factor)
|
|
||||||
)
|
|
||||||
|
|
||||||
if hasattr(self.trainer.data_collator, "kd_temperature"):
|
|
||||||
self.trainer.data_collator.kd_temperature = self.temperature
|
|
||||||
@@ -15,15 +15,12 @@
|
|||||||
"""
|
"""
|
||||||
Chat template prompt strategy loader with KD support
|
Chat template prompt strategy loader with KD support
|
||||||
"""
|
"""
|
||||||
import logging
|
|
||||||
from typing import Any, Dict
|
from typing import Any, Dict
|
||||||
|
|
||||||
import torch
|
import torch
|
||||||
|
|
||||||
from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader
|
from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
||||||
"""
|
"""
|
||||||
@@ -104,8 +101,10 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
|||||||
# fill with -inf for padding_len tokens for top_k tokens
|
# fill with -inf for padding_len tokens for top_k tokens
|
||||||
# extend target_logprobs with a padding_len x top_k 2D list filled with -inf
|
# extend target_logprobs with a padding_len x top_k 2D list filled with -inf
|
||||||
|
|
||||||
# we shift for causal models in the trainer, so start the range from 0
|
# for causal models, if we start the range at 1, then we don't need to shift in the trainer
|
||||||
for _ in range(0, input_padding_len):
|
# otherwise, we need to shift in the trainer
|
||||||
|
shift = 0
|
||||||
|
for _ in range(shift, input_padding_len):
|
||||||
target_logprobs.append([-float("inf")] * top_k)
|
target_logprobs.append([-float("inf")] * top_k)
|
||||||
target_token_ids.append(list(range(top_k)))
|
target_token_ids.append(list(range(top_k)))
|
||||||
target_mask.append([0] * top_k)
|
target_mask.append([0] * top_k)
|
||||||
@@ -144,10 +143,6 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
|||||||
#
|
#
|
||||||
# Convert from log to probability
|
# Convert from log to probability
|
||||||
teacher_probs_t1 = position_logprobs_tensor.exp()
|
teacher_probs_t1 = position_logprobs_tensor.exp()
|
||||||
# normalize probabilities to sum to 1 in case they aren't already
|
|
||||||
teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
|
|
||||||
if teacher_probs_t1_sum > 1e-9:
|
|
||||||
teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
|
|
||||||
if self.kd_temperature != self.gen_temperature:
|
if self.kd_temperature != self.gen_temperature:
|
||||||
# Exponentiate by factor (T1 / T2)
|
# Exponentiate by factor (T1 / T2)
|
||||||
exponent = self.gen_temperature / self.kd_temperature
|
exponent = self.gen_temperature / self.kd_temperature
|
||||||
@@ -167,115 +162,12 @@ class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
|
|||||||
target_logprobs.append(position_logprobs_scaled)
|
target_logprobs.append(position_logprobs_scaled)
|
||||||
target_token_ids.append(position_token_ids)
|
target_token_ids.append(position_token_ids)
|
||||||
|
|
||||||
# Update sample with transformed logprobs
|
if shift == 1:
|
||||||
sample["target_logprobs"] = target_logprobs
|
# since we started at index 1 for causal, we need one more padding token
|
||||||
sample["target_token_ids"] = target_token_ids
|
|
||||||
sample["target_mask"] = target_mask
|
|
||||||
|
|
||||||
return sample
|
|
||||||
|
|
||||||
def _tokenize_single_prompt(self, prompt):
|
|
||||||
logprobs = prompt.pop(self.logprobs_field)
|
|
||||||
tokenized_prompt = super()._tokenize_single_prompt(prompt)
|
|
||||||
tokenized_prompt[self.logprobs_field] = logprobs
|
|
||||||
tokenized_prompt = self.transform_logprobs(tokenized_prompt)
|
|
||||||
|
|
||||||
return tokenized_prompt
|
|
||||||
|
|
||||||
|
|
||||||
class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
|
|
||||||
"""
|
|
||||||
Strat for datasets with complete structured KD logprob data
|
|
||||||
"""
|
|
||||||
|
|
||||||
def transform_logprobs(self, sample):
|
|
||||||
"""
|
|
||||||
Transform logprobs to target format for KD training
|
|
||||||
"""
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
|
|
||||||
logprobs = sample.pop(self.logprobs_field)
|
|
||||||
target_seq_len = len(logprobs)
|
|
||||||
input_seq_len = len(sample["input_ids"])
|
|
||||||
input_padding_len = input_seq_len - target_seq_len
|
|
||||||
# get non-zero top-k (prune None logprobs from vllm data step)
|
|
||||||
top_k_vals = [
|
|
||||||
len(logprobs[i])
|
|
||||||
for i in range(len(logprobs))
|
|
||||||
if logprobs[i] is not None and len(logprobs[i])
|
|
||||||
]
|
|
||||||
max_top_k = max(set(top_k_vals), key=top_k_vals.count)
|
|
||||||
min_top_k = min(set(top_k_vals), key=top_k_vals.count)
|
|
||||||
top_k = min(max_top_k, min_top_k)
|
|
||||||
if top_k == 0:
|
|
||||||
raise ValueError("No non-zero top-k logprobs found.")
|
|
||||||
|
|
||||||
target_logprobs = []
|
|
||||||
target_token_ids = []
|
|
||||||
target_mask = []
|
|
||||||
|
|
||||||
if input_padding_len < 0:
|
|
||||||
# logprobs is longer than target_seq_len,
|
|
||||||
# so we need to slice from the left/beginning of logprobs
|
|
||||||
logprobs = logprobs[:-input_seq_len]
|
|
||||||
input_padding_len = 0
|
|
||||||
# target_seq_len = input_seq_len
|
|
||||||
|
|
||||||
# truncate the second dimension of the logprobs to top_k
|
|
||||||
logprobs = [row[:top_k] for row in logprobs]
|
|
||||||
|
|
||||||
# fill with -inf for padding_len tokens for top_k tokens
|
|
||||||
# extend target_logprobs with a padding_len x top_k 2D list filled with -inf
|
|
||||||
|
|
||||||
# we shift for causal models in the trainer, so start the range from 0
|
|
||||||
for _ in range(0, input_padding_len):
|
|
||||||
target_logprobs.append([-float("inf")] * top_k)
|
target_logprobs.append([-float("inf")] * top_k)
|
||||||
target_token_ids.append(list(range(top_k)))
|
target_token_ids.append(list(range(top_k)))
|
||||||
target_mask.append([0] * top_k)
|
target_mask.append([0] * top_k)
|
||||||
|
|
||||||
for position in range(input_padding_len, input_seq_len):
|
|
||||||
if sample["labels"][position] == -100:
|
|
||||||
target_mask.append([0] * top_k)
|
|
||||||
else:
|
|
||||||
target_mask.append([1] * top_k)
|
|
||||||
|
|
||||||
for token_pos_logprobs, pos_target_token_ids in zip(
|
|
||||||
logprobs, sample["target_token_ids"]
|
|
||||||
):
|
|
||||||
# Convert to a tensor for easier manipulation
|
|
||||||
position_logprobs_tensor = torch.tensor(
|
|
||||||
token_pos_logprobs, dtype=torch.float
|
|
||||||
)
|
|
||||||
|
|
||||||
# Now we have distribution at T1 in log form, i.e. log p_{T1}(k).
|
|
||||||
# Next, re-scale to T2 = self.kd_temperature via exponent-based trick
|
|
||||||
# p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z
|
|
||||||
#
|
|
||||||
# Convert from log to probability
|
|
||||||
teacher_probs_t1 = position_logprobs_tensor.exp()
|
|
||||||
# normalize probabilities to sum to 1 in case they aren't already
|
|
||||||
teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
|
|
||||||
if teacher_probs_t1_sum > 1e-9:
|
|
||||||
teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
|
|
||||||
if self.kd_temperature != self.gen_temperature:
|
|
||||||
# Exponentiate by factor (T1 / T2)
|
|
||||||
exponent = self.gen_temperature / self.kd_temperature
|
|
||||||
teacher_probs_t2 = teacher_probs_t1**exponent
|
|
||||||
else:
|
|
||||||
teacher_probs_t2 = teacher_probs_t1
|
|
||||||
# Re-normalize
|
|
||||||
teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum(
|
|
||||||
dim=0, keepdim=True
|
|
||||||
)
|
|
||||||
# Convert back to log
|
|
||||||
position_logprobs_tensor = torch.log(teacher_probs_t2)
|
|
||||||
|
|
||||||
# Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor
|
|
||||||
position_logprobs_scaled = position_logprobs_tensor.tolist()
|
|
||||||
|
|
||||||
target_logprobs.append(position_logprobs_scaled)
|
|
||||||
target_token_ids.append(pos_target_token_ids)
|
|
||||||
|
|
||||||
# Update sample with transformed logprobs
|
# Update sample with transformed logprobs
|
||||||
sample["target_logprobs"] = target_logprobs
|
sample["target_logprobs"] = target_logprobs
|
||||||
sample["target_token_ids"] = target_token_ids
|
sample["target_token_ids"] = target_token_ids
|
||||||
@@ -285,10 +177,8 @@ class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
|
|||||||
|
|
||||||
def _tokenize_single_prompt(self, prompt):
|
def _tokenize_single_prompt(self, prompt):
|
||||||
logprobs = prompt.pop(self.logprobs_field)
|
logprobs = prompt.pop(self.logprobs_field)
|
||||||
target_token_ids = prompt.pop("target_token_ids")
|
|
||||||
tokenized_prompt = super()._tokenize_single_prompt(prompt)
|
tokenized_prompt = super()._tokenize_single_prompt(prompt)
|
||||||
tokenized_prompt[self.logprobs_field] = logprobs
|
tokenized_prompt[self.logprobs_field] = logprobs
|
||||||
tokenized_prompt["target_token_ids"] = target_token_ids
|
|
||||||
tokenized_prompt = self.transform_logprobs(tokenized_prompt)
|
tokenized_prompt = self.transform_logprobs(tokenized_prompt)
|
||||||
|
|
||||||
return tokenized_prompt
|
return tokenized_prompt
|
||||||
@@ -299,7 +189,7 @@ class KDStrategyLoader(StrategyLoader):
|
|||||||
Load ChatTemplateStrategy with KD support using StrategyLoader.
|
Load ChatTemplateStrategy with KD support using StrategyLoader.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument
|
def _get_strategy_cls(self):
|
||||||
return ChatTemplateStrategyWithKD
|
return ChatTemplateStrategyWithKD
|
||||||
|
|
||||||
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
|
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
|
||||||
@@ -314,14 +204,4 @@ class KDStrategyLoader(StrategyLoader):
|
|||||||
return strategy_params
|
return strategy_params
|
||||||
|
|
||||||
|
|
||||||
class KDStrategyLoaderV2(KDStrategyLoader):
|
load = KDStrategyLoader()
|
||||||
"""
|
|
||||||
Load KD chat template datasets with pre-tokenized logprob data
|
|
||||||
"""
|
|
||||||
|
|
||||||
def _get_strategy_cls(self, cfg): # pylint: disable=unused-argument
|
|
||||||
return ChatTemplateStrategyWithKDv2
|
|
||||||
|
|
||||||
|
|
||||||
load_legacy = KDStrategyLoader()
|
|
||||||
load = KDStrategyLoaderV2()
|
|
||||||
|
|||||||
@@ -47,16 +47,11 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
|
|||||||
position_pad_token_id: int = 0
|
position_pad_token_id: int = 0
|
||||||
return_tensors: str = "pt"
|
return_tensors: str = "pt"
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
self.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
|
|
||||||
|
|
||||||
def __call__(self, features, return_tensors=None):
|
def __call__(self, features, return_tensors=None):
|
||||||
if return_tensors is None:
|
if return_tensors is None:
|
||||||
return_tensors = self.return_tensors
|
return_tensors = self.return_tensors
|
||||||
|
|
||||||
padding_side = self.tokenizer.padding_side
|
padding_side = self.tokenizer.padding_side
|
||||||
max_len = 0
|
|
||||||
|
|
||||||
# Pad labels and position_ids first
|
# Pad labels and position_ids first
|
||||||
for feature_name, pad_token_id in [
|
for feature_name, pad_token_id in [
|
||||||
@@ -107,9 +102,7 @@ class DataCollatorForKD(DataCollatorForSeq2Seq):
|
|||||||
target_mask_list.append(f.pop("target_mask"))
|
target_mask_list.append(f.pop("target_mask"))
|
||||||
|
|
||||||
# Determine max lengths
|
# Determine max lengths
|
||||||
max_teacher_seq_len = max_len or max(
|
max_teacher_seq_len = max(len(seq) for seq in target_logprobs_list)
|
||||||
len(seq) for seq in target_logprobs_list
|
|
||||||
)
|
|
||||||
max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq)
|
max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq)
|
||||||
|
|
||||||
padded_target_logprobs = []
|
padded_target_logprobs = []
|
||||||
@@ -216,9 +209,7 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
|
|||||||
# We want to produce a single "merged" feature dict for each sub-batch.
|
# We want to produce a single "merged" feature dict for each sub-batch.
|
||||||
out_features = [{} for _ in features]
|
out_features = [{} for _ in features]
|
||||||
|
|
||||||
for i, sub_features in enumerate( # pylint: disable=too-many-nested-blocks
|
for i, sub_features in enumerate(features):
|
||||||
features
|
|
||||||
):
|
|
||||||
# sub_features is a list of dicts, each dict = one sequence’s features
|
# sub_features is a list of dicts, each dict = one sequence’s features
|
||||||
# We'll merge them into out_features[i].
|
# We'll merge them into out_features[i].
|
||||||
#
|
#
|
||||||
@@ -252,17 +243,10 @@ class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
|
|||||||
# For example, input_ids or labels are often arrays.
|
# For example, input_ids or labels are often arrays.
|
||||||
arrays = []
|
arrays = []
|
||||||
for feat in sub_features:
|
for feat in sub_features:
|
||||||
if field_name in feat and isinstance(
|
if field_name in feat:
|
||||||
feat[field_name], (list, torch.Tensor)
|
|
||||||
):
|
|
||||||
if isinstance(
|
|
||||||
feat[field_name][0], (dict, str)
|
|
||||||
): # pylint: disable=too-many-nested-blocks
|
|
||||||
continue
|
|
||||||
arr = np.array(feat[field_name])
|
arr = np.array(feat[field_name])
|
||||||
arrays.append(arr)
|
arrays.append(arr)
|
||||||
if arrays:
|
out_features[i][field_name] = np.concatenate(arrays)
|
||||||
out_features[i][field_name] = np.concatenate(arrays)
|
|
||||||
|
|
||||||
# 3) Now call the parent collator, which will do:
|
# 3) Now call the parent collator, which will do:
|
||||||
# - padding of labels/position_ids
|
# - padding of labels/position_ids
|
||||||
|
|||||||
@@ -1,561 +0,0 @@
|
|||||||
"""
|
|
||||||
Packed data loader for online teacher training supporting vllm and sglang.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import hashlib
|
|
||||||
import hmac
|
|
||||||
import logging
|
|
||||||
from typing import Any, Dict, List, Optional
|
|
||||||
|
|
||||||
import requests
|
|
||||||
import torch
|
|
||||||
from orjson import orjson
|
|
||||||
|
|
||||||
from axolotl.integrations.kd.collator import KDBatchSamplerDataCollatorForSeq2Seq
|
|
||||||
from axolotl.integrations.kd.utils import normalize_logprobs
|
|
||||||
from axolotl.utils.data.utils import retry_on_request_exceptions
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def hmac_sha_from_int_list(int_list, key, hash_func=hashlib.sha256):
|
|
||||||
"""
|
|
||||||
Create HMAC-SHA hash from a list of integers
|
|
||||||
|
|
||||||
Args:
|
|
||||||
int_list: List of integers
|
|
||||||
key: Secret key (string or bytes)
|
|
||||||
hash_func: Hash function (default: sha256)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
HMAC digest as hex string
|
|
||||||
"""
|
|
||||||
# Convert key to bytes if it's a string
|
|
||||||
if isinstance(key, str):
|
|
||||||
key = key.encode("utf-8")
|
|
||||||
|
|
||||||
# Convert list of ints to bytes
|
|
||||||
# Method 1: Convert each int to bytes and concatenate
|
|
||||||
data = b"".join(i.to_bytes(4, byteorder="big") for i in int_list)
|
|
||||||
|
|
||||||
# Create HMAC
|
|
||||||
h = hmac.new(key, data, hash_func)
|
|
||||||
return h.hexdigest()
|
|
||||||
|
|
||||||
|
|
||||||
class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
|
|
||||||
"""
|
|
||||||
Collator for online teacher training.
|
|
||||||
"""
|
|
||||||
|
|
||||||
DEFAULT_LABEL_PAD_TOKEN_ID: int = -100
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
*args: Any,
|
|
||||||
kd_online_server_base_url: Optional[str] = None,
|
|
||||||
kd_online_topk: Optional[int] = None,
|
|
||||||
kd_temperature: Optional[float] = 1.0,
|
|
||||||
kd_online_server: Optional[str] = "vllm",
|
|
||||||
kd_online_timeout: Optional[int] = 120,
|
|
||||||
kd_cache_dir: Optional[str] = None,
|
|
||||||
kd_normalize_topk: Optional[bool] = True,
|
|
||||||
**kwargs: Any,
|
|
||||||
):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
if kd_online_server_base_url is None:
|
|
||||||
raise ValueError(
|
|
||||||
"kd_online_server_base_url must be provided for OnlineTeacherDataloader"
|
|
||||||
)
|
|
||||||
if kd_online_topk is None or kd_online_topk <= 0:
|
|
||||||
raise ValueError(
|
|
||||||
"kd_online_topk must be a positive integer for OnlineTeacherDataloader"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.kd_online_server_base_url = kd_online_server_base_url.rstrip("/")
|
|
||||||
self.kd_online_topk = kd_online_topk
|
|
||||||
self.kd_temperature = kd_temperature
|
|
||||||
self.kd_online_server = kd_online_server
|
|
||||||
self.http_session = requests.Session()
|
|
||||||
self.kd_online_timeout = kd_online_timeout
|
|
||||||
self.kd_cache_dir = kd_cache_dir
|
|
||||||
self.kd_normalize_topk = kd_normalize_topk
|
|
||||||
|
|
||||||
def _normalize_logprobs(self, raw_logprobs: List[float]) -> List[float]:
|
|
||||||
"""
|
|
||||||
Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs.
|
|
||||||
"""
|
|
||||||
if not raw_logprobs or self.kd_online_topk == 0:
|
|
||||||
return (
|
|
||||||
[-float("inf")] * self.kd_online_topk if self.kd_online_topk > 0 else []
|
|
||||||
)
|
|
||||||
|
|
||||||
raw_logprobs_tensor = torch.tensor(raw_logprobs, dtype=torch.float32)
|
|
||||||
return normalize_logprobs(raw_logprobs_tensor, self.kd_online_topk).tolist()
|
|
||||||
|
|
||||||
@retry_on_request_exceptions(max_retries=10, delay=5)
|
|
||||||
def fetch_online_logprobs_sglang(
|
|
||||||
self, batch_input_ids: List[List[int]], labels: List[List[int]]
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Fetches logprobs from an online teacher served by sglang for a batch of input_ids.
|
|
||||||
Assumes API returns token IDs as strings in logprob dictionary keys.
|
|
||||||
"""
|
|
||||||
api_endpoint = f"{self.kd_online_server_base_url}/generate"
|
|
||||||
|
|
||||||
payload = {
|
|
||||||
"input_ids": batch_input_ids,
|
|
||||||
"return_logprob": True,
|
|
||||||
"top_logprobs_num": self.kd_online_topk,
|
|
||||||
"logprob_start_len": 0,
|
|
||||||
"return_text_in_logprobs": True,
|
|
||||||
"echo": True,
|
|
||||||
"sampling_params": {
|
|
||||||
"max_new_tokens": 0,
|
|
||||||
"temperature": self.kd_temperature,
|
|
||||||
"skip_special_tokens": False,
|
|
||||||
},
|
|
||||||
}
|
|
||||||
|
|
||||||
# Initialize with empty lists, so if API call fails, these are returned.
|
|
||||||
ret_data_target_token_ids: List[List[List[int]]] = []
|
|
||||||
ret_data_target_logprobs: List[List[List[float]]] = []
|
|
||||||
ret_data_target_mask: List[List[List[int]]] = []
|
|
||||||
|
|
||||||
try:
|
|
||||||
response = self.http_session.post(
|
|
||||||
api_endpoint, json=payload, timeout=self.kd_online_timeout
|
|
||||||
)
|
|
||||||
response.raise_for_status()
|
|
||||||
api_data: list[dict] = response.json()
|
|
||||||
|
|
||||||
# Ensure api_data is a list, and its length matches batch_input_ids
|
|
||||||
if not isinstance(api_data, list) or len(api_data) != len(batch_input_ids):
|
|
||||||
LOG.error(
|
|
||||||
f"API response format error. Expected a list of {len(batch_input_ids)} "
|
|
||||||
f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
|
|
||||||
)
|
|
||||||
# Return empty data; items processed later will get default empty KD fields
|
|
||||||
return {
|
|
||||||
"target_token_ids": ret_data_target_token_ids,
|
|
||||||
"target_logprobs": ret_data_target_logprobs,
|
|
||||||
"target_mask": ret_data_target_mask,
|
|
||||||
}
|
|
||||||
|
|
||||||
for sequence_data, seq_input_ids, seq_labels in zip(
|
|
||||||
api_data, batch_input_ids, labels
|
|
||||||
):
|
|
||||||
current_target_logprobs = []
|
|
||||||
current_target_token_ids = []
|
|
||||||
current_target_mask = []
|
|
||||||
|
|
||||||
meta_info = sequence_data.pop("meta_info", {})
|
|
||||||
# Ensure input_top_logprobs is a list
|
|
||||||
input_top_logprobs: Optional[list[None | list[tuple]]] = meta_info.pop(
|
|
||||||
"input_top_logprobs", []
|
|
||||||
)
|
|
||||||
if not isinstance(input_top_logprobs, list):
|
|
||||||
LOG.warning(
|
|
||||||
f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
|
|
||||||
)
|
|
||||||
input_top_logprobs = [] # Treat as empty
|
|
||||||
|
|
||||||
# basic check that the logprob data len matches the input len, so no need to handle padding
|
|
||||||
assert len(seq_input_ids) == len(input_top_logprobs)
|
|
||||||
|
|
||||||
for i, _, label in zip(
|
|
||||||
range(len(seq_input_ids)), seq_input_ids, seq_labels
|
|
||||||
):
|
|
||||||
if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
|
|
||||||
# this is always the case for the first token.
|
|
||||||
# there is never logprob data for the first token since that's a true input
|
|
||||||
# so we replace the None value with padding data
|
|
||||||
current_target_logprobs.append(
|
|
||||||
[-float("inf")] * self.kd_online_topk
|
|
||||||
)
|
|
||||||
current_target_token_ids.append([0] * self.kd_online_topk)
|
|
||||||
current_target_mask.append([0] * self.kd_online_topk)
|
|
||||||
elif (
|
|
||||||
i < len(input_top_logprobs)
|
|
||||||
and input_top_logprobs[i] is not None
|
|
||||||
):
|
|
||||||
pos_top_logprobs_data = input_top_logprobs[i]
|
|
||||||
# Ensure pos_top_logprobs_data is a list of lists as expected
|
|
||||||
if not (
|
|
||||||
isinstance(pos_top_logprobs_data, list)
|
|
||||||
and all(
|
|
||||||
isinstance(item, list) for item in pos_top_logprobs_data
|
|
||||||
)
|
|
||||||
and len(pos_top_logprobs_data) > 0
|
|
||||||
and len(pos_top_logprobs_data[0]) == 3
|
|
||||||
): # [logprob, token_id, token_str]
|
|
||||||
LOG.warning(
|
|
||||||
f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
|
|
||||||
)
|
|
||||||
current_target_logprobs.append(
|
|
||||||
[-float("inf")] * self.kd_online_topk
|
|
||||||
)
|
|
||||||
current_target_token_ids.append([0] * self.kd_online_topk)
|
|
||||||
current_target_mask.append([0] * self.kd_online_topk)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
|
|
||||||
pos_logprobs_raw, pos_token_ids, _ = [
|
|
||||||
list(row) for row in zip(*pos_top_logprobs_data)
|
|
||||||
]
|
|
||||||
|
|
||||||
# Ensure correct length (top_k)
|
|
||||||
if len(pos_logprobs_raw) < self.kd_online_topk:
|
|
||||||
pad_len = self.kd_online_topk - len(pos_logprobs_raw)
|
|
||||||
pos_logprobs_raw.extend([-float("inf")] * pad_len)
|
|
||||||
pos_token_ids.extend([0] * pad_len) # Pad with 0 token_id
|
|
||||||
|
|
||||||
# truncate to top_k in case the response was longer
|
|
||||||
current_target_token_ids.append(
|
|
||||||
pos_token_ids[: self.kd_online_topk]
|
|
||||||
)
|
|
||||||
|
|
||||||
if self.kd_normalize_topk:
|
|
||||||
normalized_logprobs_for_position = self._normalize_logprobs(
|
|
||||||
pos_logprobs_raw[: self.kd_online_topk]
|
|
||||||
)
|
|
||||||
current_target_logprobs.append(
|
|
||||||
normalized_logprobs_for_position
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
current_target_logprobs.append(
|
|
||||||
pos_logprobs_raw[: self.kd_online_topk]
|
|
||||||
)
|
|
||||||
|
|
||||||
# Mask depends on the corresponding label for the student
|
|
||||||
if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
|
|
||||||
current_target_mask.append([0] * self.kd_online_topk)
|
|
||||||
else:
|
|
||||||
current_target_mask.append([1] * self.kd_online_topk)
|
|
||||||
else:
|
|
||||||
# Pad if no logprobs for this position (either due to length mismatch or None entry)
|
|
||||||
current_target_logprobs.append(
|
|
||||||
[-float("inf")] * self.kd_online_topk
|
|
||||||
)
|
|
||||||
current_target_token_ids.append([0] * self.kd_online_topk)
|
|
||||||
current_target_mask.append([0] * self.kd_online_topk)
|
|
||||||
|
|
||||||
ret_data_target_token_ids.append(current_target_token_ids)
|
|
||||||
ret_data_target_logprobs.append(current_target_logprobs)
|
|
||||||
ret_data_target_mask.append(current_target_mask)
|
|
||||||
|
|
||||||
except requests.exceptions.RequestException as e:
|
|
||||||
LOG.error(f"Error fetching logprobs from online teacher: {e}")
|
|
||||||
raise e
|
|
||||||
# ret_logprobs_data will be returned with empty lists, handled by the caller.
|
|
||||||
except Exception as e: # Catch other potential errors during processing
|
|
||||||
LOG.error(
|
|
||||||
f"Unexpected error processing API response in fetch_online_logprobs: {e}",
|
|
||||||
exc_info=True,
|
|
||||||
)
|
|
||||||
raise e
|
|
||||||
|
|
||||||
return {
|
|
||||||
"target_token_ids": ret_data_target_token_ids,
|
|
||||||
"target_logprobs": ret_data_target_logprobs,
|
|
||||||
"target_mask": ret_data_target_mask,
|
|
||||||
}
|
|
||||||
|
|
||||||
@retry_on_request_exceptions(max_retries=10, delay=5)
def fetch_online_logprobs_vllm(
    self, batch_input_ids: List[List[int]], labels: List[List[int]]
):
    """
    Fetches top-k prompt logprobs from an online teacher served by vllm for a batch of input_ids.

    Assumes API returns token IDs as strings in logprob dictionary keys.

    Args:
        batch_input_ids: One list of token ids per sequence; sent as the prompts.
        labels: One list of labels per sequence, parallel to ``batch_input_ids``.
            Positions labelled ``self.DEFAULT_LABEL_PAD_TOKEN_ID`` get an all-zero mask.

    Returns:
        Dict with the keys ``target_token_ids``, ``target_logprobs`` and ``target_mask``.
        Each value holds one entry per sequence; each entry holds one list of
        ``self.kd_online_topk`` values per input position. All three values are empty
        lists when the response does not contain exactly one choice per input sequence.

    Raises:
        requests.exceptions.RequestException: logged and re-raised when the request fails.
        Exception: any other error hit while processing the response is logged and re-raised.
    """
    topk = self.kd_online_topk
    api_endpoint = f"{self.kd_online_server_base_url}/v1/completions"

    payload = {
        "prompt": batch_input_ids,
        "echo": True,
        "logprobs": True,
        "prompt_logprobs": topk,
        "top_logprobs": topk,
        "max_new_tokens": 0,
        "skip_special_tokens": False,
        "temperature": self.kd_temperature,
        "sampling_params": {
            "max_tokens": 0,
        },
    }

    # These stay empty when the response has an unexpected shape (see early return below).
    ret_data_target_token_ids: List[List[List[int]]] = []
    ret_data_target_logprobs: List[List[List[float]]] = []
    ret_data_target_mask: List[List[List[int]]] = []

    def append_padding(token_ids, logprobs, mask):
        # Masked-out placeholder for one position. Fresh lists are built on every
        # call so padded positions never share list objects.
        logprobs.append([-float("inf")] * topk)
        token_ids.append(list(range(topk)))
        mask.append([0] * topk)

    try:
        headers = {"Accept-Encoding": "deflate, gzip, br, zstd"}
        response = self.http_session.post(
            api_endpoint,
            json=payload,
            headers=headers,
            timeout=self.kd_online_timeout,
        )
        response.raise_for_status()
        api_data: dict = orjson.loads(response.content)
        choices: list[dict] = api_data["choices"]

        # Ensure choices is a list, and its length matches batch_input_ids
        if not isinstance(choices, list) or len(choices) != len(batch_input_ids):
            # Report on `choices` (the object that was validated), not on `api_data`.
            LOG.error(
                f"API response format error. Expected a list of {len(batch_input_ids)} "
                f"choices, got {type(choices)} with length "
                f"{len(choices) if isinstance(choices, list) else 'N/A'}."
            )
            # Return empty data; items processed later will get default empty KD fields
            return {
                "target_token_ids": ret_data_target_token_ids,
                "target_logprobs": ret_data_target_logprobs,
                "target_mask": ret_data_target_mask,
            }

        for sequence_data, seq_input_ids, seq_labels in zip(
            choices, batch_input_ids, labels
        ):
            current_target_logprobs: List[List[float]] = []
            current_target_token_ids: List[List[int]] = []
            current_target_mask: List[List[int]] = []

            input_top_logprobs = sequence_data.pop("prompt_logprobs", [])
            if not isinstance(input_top_logprobs, list):
                LOG.warning(
                    f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
                )
                input_top_logprobs = []  # Treat as empty

            # basic check that the logprob data len matches the input len, so no need to handle padding
            assert len(seq_input_ids) == len(input_top_logprobs)

            seq_len = len(seq_input_ids)

            for i, label in zip(range(seq_len), seq_labels):
                if i >= len(input_top_logprobs):
                    # Defensive: only reachable when the length assert above is disabled.
                    append_padding(
                        current_target_token_ids,
                        current_target_logprobs,
                        current_target_mask,
                    )
                    continue

                pos_top_logprobs_data = input_top_logprobs[i]
                if pos_top_logprobs_data is None:
                    # this is always the case for the first token.
                    # there is never logprob data for the first token since that's a true input.
                    # Nothing is appended here; the tail padding below restores the length.
                    continue

                # Expect a non-empty dict mapping token-id strings to dicts
                if not (
                    isinstance(pos_top_logprobs_data, dict)
                    and all(
                        isinstance(item, dict)
                        for item in pos_top_logprobs_data.values()
                    )
                    and len(pos_top_logprobs_data) > 0
                ):
                    LOG.warning(
                        f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
                    )
                    append_padding(
                        current_target_token_ids,
                        current_target_logprobs,
                        current_target_mask,
                    )
                    continue

                pos_token_ids = [int(token_id) for token_id in pos_top_logprobs_data]
                pos_logprobs_raw = [
                    float(entry.get("logprob", -float("inf")))
                    for entry in pos_top_logprobs_data.values()
                ]

                # Ensure correct length (top_k)
                if len(pos_logprobs_raw) < topk:
                    pad_len = topk - len(pos_logprobs_raw)
                    LOG.warning(
                        f"Padding position {i} with {pad_len} top-k tokens and logprobs."
                    )
                    pos_logprobs_raw.extend([-float("inf")] * pad_len)
                    pos_token_ids.extend([0] * pad_len)  # Pad with 0 token_id

                # truncate to top_k in case the response was longer
                current_target_token_ids.append(pos_token_ids[:topk])

                top_logprobs = pos_logprobs_raw[:topk]
                if self.kd_normalize_topk:
                    top_logprobs = self._normalize_logprobs(top_logprobs)
                current_target_logprobs.append(top_logprobs)

                # Mask depends on the corresponding label for the student
                mask_value = 0 if label == self.DEFAULT_LABEL_PAD_TOKEN_ID else 1
                current_target_mask.append([mask_value] * topk)

            # Skipped positions (e.g. the first token) leave the lists short; pad the
            # tail with masked-out entries so each list has seq_len positions again.
            for _ in range(max(0, seq_len - len(current_target_logprobs))):
                append_padding(
                    current_target_token_ids,
                    current_target_logprobs,
                    current_target_mask,
                )

            ret_data_target_token_ids.append(current_target_token_ids)
            ret_data_target_logprobs.append(current_target_logprobs)
            ret_data_target_mask.append(current_target_mask)

            # TODO save and load targets to disk for caching for next epoch

    except requests.exceptions.RequestException as e:
        LOG.error(f"Error fetching logprobs from online teacher: {e}")
        raise
    except Exception as e:  # Catch other potential errors during processing
        LOG.error(
            f"Unexpected error processing API response in fetch_online_logprobs: {e}",
            exc_info=True,
        )
        raise

    return {
        "target_token_ids": ret_data_target_token_ids,
        "target_logprobs": ret_data_target_logprobs,
        "target_mask": ret_data_target_mask,
    }
|
|
||||||
|
|
||||||
def __call__(
    self, features: List[List[Dict[str, Any]]], return_tensors: Optional[str] = None
) -> Dict[str, Any]:
    """
    Attach online teacher logprobs to every item, then defer to the parent collator.

    For each sub-batch, items that carry both ``input_ids`` and ``labels`` are sent
    to the configured teacher server (sglang when ``self.kd_online_server`` is
    ``"sglang"``, vllm otherwise). The returned ``target_token_ids``,
    ``target_logprobs`` and ``target_mask`` are written into the item dicts in place.
    Items that cannot be sent, or for which no response entry came back, get empty
    lists for any of those keys they do not already have.

    Args:
        features: List of sub-batches; each sub-batch is a list of item dicts.
        return_tensors: Forwarded unchanged to the parent ``__call__``.

    Returns:
        Whatever the parent ``__call__`` returns for the updated ``features``.
    """
    if not features:
        return super().__call__(features, return_tensors=return_tensors)

    kd_keys = ("target_token_ids", "target_logprobs", "target_mask")

    def as_list(values):
        # Ensure the payload is plain lists for JSON serialization
        return values.tolist() if hasattr(values, "tolist") else list(values)

    for sub_batch_features in features:  # sub_batch_features is List[Dict[str, Any]]
        if not sub_batch_features:
            continue

        input_ids_for_api_call: List[List[int]] = []
        labels_for_api_call: List[List[int]] = []
        # Store references to the original item dictionaries to update them in-place
        items_for_api_call: List[Dict[str, Any]] = []

        for item_dict in sub_batch_features:
            if not isinstance(item_dict, dict):
                LOG.warning(
                    f"Skipping non-dict item in sub_batch_features: {item_dict}"
                )
                continue

            current_input_ids = item_dict.get("input_ids")
            current_labels = item_dict.get("labels")

            if current_input_ids is None or current_labels is None:
                # This item will not get teacher logprobs from the API.
                # Initialize KD fields to empty lists so downstream collators handle them uniformly.
                for key in kd_keys:
                    item_dict.setdefault(key, [])
                continue

            input_ids_for_api_call.append(as_list(current_input_ids))
            labels_for_api_call.append(as_list(current_labels))
            items_for_api_call.append(item_dict)

        if not items_for_api_call:  # Only call API if there's something to process
            continue

        if self.kd_online_server == "sglang":
            api_responses_for_sub_batch = self.fetch_online_logprobs_sglang(
                input_ids_for_api_call, labels_for_api_call
            )
        else:
            api_responses_for_sub_batch = self.fetch_online_logprobs_vllm(
                input_ids_for_api_call, labels_for_api_call
            )

        # api_responses_for_sub_batch has keys: "target_token_ids", "target_logprobs", "target_mask"
        # Each value is a list, corresponding to items_for_api_call
        for i, item_to_update in enumerate(items_for_api_call):
            # TODO make sure to figure out which input in sub_batch_features to update the batch in the original `features` object so the super class can handle it properly.
            if api_responses_for_sub_batch and i < len(
                api_responses_for_sub_batch["target_token_ids"]
            ):  # Check bounds
                assert len(
                    api_responses_for_sub_batch["target_token_ids"][i]
                ) == len(item_to_update["input_ids"])
                assert len(
                    api_responses_for_sub_batch["target_logprobs"][i]
                ) == len(item_to_update["input_ids"])
                assert len(api_responses_for_sub_batch["target_mask"][i]) == len(
                    item_to_update["labels"]
                )
                for key in kd_keys:
                    item_to_update[key] = api_responses_for_sub_batch[key][i]
            else:
                # API call failed for this item, or response was shorter than expected.
                # Ensure KD fields are initialized as empty lists.
                LOG.warning(
                    f"No teacher logprobs for item (index {i}): API call failed, "
                    "or API response was too short. "
                    f"API response keys: {list(api_responses_for_sub_batch.keys()) if api_responses_for_sub_batch else 'None'}"
                )
                for key in kd_keys:
                    item_to_update.setdefault(key, [])

    return super().__call__(features, return_tensors=return_tensors)
|
|
||||||
@@ -1,8 +0,0 @@
|
|||||||
"""
|
|
||||||
Liger Chunked loss optimizations module
|
|
||||||
"""
|
|
||||||
|
|
||||||
from .liger import LigerFusedLinearKLTopKLogprobLoss
|
|
||||||
from .models import apply_kernel
|
|
||||||
|
|
||||||
__all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"]
|
|
||||||
|
|||||||
@@ -1,485 +0,0 @@
|
|||||||
"""
|
|
||||||
Liger Kernels for Chunked Top-K Log-Prob Distillation
|
|
||||||
"""
|
|
||||||
|
|
||||||
import torch
|
|
||||||
import torch.nn.functional as F
|
|
||||||
from liger_kernel.chunked_loss.fused_linear_distillation import (
|
|
||||||
LigerFusedLinearDistillationBase,
|
|
||||||
)
|
|
||||||
|
|
||||||
from axolotl.integrations.kd.utils import normalize_logprobs
|
|
||||||
|
|
||||||
|
|
||||||
class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
    """
    Chunked kl-div loss for top-k logprobs

    The forward pass walks over the flattened (batch * seq) rows in chunks, computes the
    loss and its gradients for every chunk with ``torch.func.grad_and_value`` and stores
    the accumulated gradients on ``ctx``; ``backward`` only rescales and reshapes them.
    """

    @staticmethod
    def distillation_loss_fn(
        student_logits_temp_scaled: torch.Tensor,  # [chunk_size, vocab_size], already temp-scaled
        target_token_ids_chunk: torch.Tensor,  # [chunk_size, top_k]
        target_logprobs_chunk: torch.Tensor,  # [chunk_size, top_k], already temp-scaled and normalized logprobs
        target_mask_chunk: torch.Tensor,  # [chunk_size, top_k]
        beta: float = 0.0,
        normalize_topk: bool = True,
    ) -> torch.Tensor:
        """
        Compute Top-K KL divergence loss for a chunk.

        Args:
            student_logits_temp_scaled: Student logits, scaled by temperature. Shape: (N, V).
            target_token_ids_chunk: Top-k teacher token IDs. Shape: (N, K).
            target_logprobs_chunk: Top-k teacher log probabilities (temp-scaled, normalized). Shape: (N, K).
            target_mask_chunk: Mask for valid top-k tokens. Shape: (N, K).
            beta: Controls the type of divergence.
                0.0 for Forward KL (P_teacher || P_student).
                1.0 for Reverse KL (P_student || P_teacher).
                Any other value: generalized Jensen-Shannon divergence against the mixture
                M = (1 - beta) * P_student + beta * P_teacher, i.e.
                beta * KL(P_teacher || M) + (1 - beta) * KL(P_student || M).
            normalize_topk: Whether to renormalize the student log probabilities over the top-k tokens.

        Returns:
            Sum of the divergence over all valid (unmasked) entries of the chunk.
        """
        topk = target_token_ids_chunk.shape[-1]
        # Do the loss math in float32 regardless of the model dtype.
        student_logits_temp_scaled = student_logits_temp_scaled.float()
        target_logprobs_chunk = target_logprobs_chunk.float()

        # Gather student logits for the top-k teacher token IDs -> [chunk_size, top_k]
        student_logits_topk_temp_scaled = torch.gather(
            student_logits_temp_scaled, dim=-1, index=target_token_ids_chunk
        )

        # Student log-probabilities for the gathered top-k tokens (log-softmax over the full vocab)
        student_lse = torch.logsumexp(
            student_logits_temp_scaled, dim=-1, keepdim=True
        )  # [chunk_size, 1]
        student_logprobs_topk_temp_scaled = (
            student_logits_topk_temp_scaled - student_lse
        )

        # we have the top-k student logprobs, normalize them
        if normalize_topk:
            student_logprobs_topk_temp_scaled = normalize_logprobs(
                student_logprobs_topk_temp_scaled, topk
            )

        valid_mask = target_mask_chunk.to(torch.bool)  # [chunk_size, top_k]

        student_logprobs_topk_valid = student_logprobs_topk_temp_scaled[valid_mask]
        teacher_logprobs_valid = target_logprobs_chunk[valid_mask]

        # Probabilities recovered from the log-probabilities
        teacher_probs_valid = teacher_logprobs_valid.exp()
        student_probs_topk_valid = student_logprobs_topk_valid.exp()

        # KL divergence: sum(P_teacher * (log P_teacher - log P_student))
        # teacher_logprobs_valid are log_softmax_teacher,
        # student_logprobs_topk_valid are log_softmax_student (for the selected K indices).
        if beta == 0.0:  # Forward KL
            fwd_kl_per_token = teacher_probs_valid * (
                teacher_logprobs_valid - student_logprobs_topk_valid
            )
            kd_loss = fwd_kl_per_token.sum()
        elif beta == 1.0:  # Reverse KL
            rev_kl_per_token = student_probs_topk_valid * (
                student_logprobs_topk_valid - teacher_logprobs_valid
            )
            kd_loss = rev_kl_per_token.sum()
        else:
            # JSD - Jensen-Shannon Divergence / Symmetric
            mean_probs = (
                1 - beta
            ) * student_probs_topk_valid + beta * teacher_probs_valid
            log_mean_probs = mean_probs.log()
            student_kl = F.kl_div(
                log_mean_probs,
                student_logprobs_topk_valid,
                reduction="sum",
                log_target=True,
            )
            teacher_kl = F.kl_div(
                log_mean_probs, teacher_logprobs_valid, reduction="sum", log_target=True
            )
            kd_loss = beta * teacher_kl + (1 - beta) * student_kl

        return kd_loss

    @staticmethod
    def _compute_loss_kl_topk(
        student_input_chunk: torch.Tensor,
        student_weight: torch.Tensor,
        target_token_ids_chunk: torch.Tensor,
        target_logprobs_chunk: torch.Tensor,
        target_mask_chunk: torch.Tensor,
        target_chunk: torch.Tensor,  # For hard loss (true labels)
        student_bias: torch.Tensor = None,
        distillation_loss_fn=None,
        ignore_index: int = -100,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        compute_ce_loss: bool = True,
        temperature: float = 1.0,
        beta: float = 0.0,
        normalize_topk: bool = True,
    ):
        """
        Project one chunk of hidden states through the LM head and compute both losses.

        Args:
            student_input_chunk: Hidden states. Shape: [chunk_size, hidden_dim].
            student_weight: LM head weight as used by ``F.linear``. Shape: [vocab_size, hidden_dim].
            target_token_ids_chunk: Top-k teacher token IDs for the chunk.
            target_logprobs_chunk: Top-k teacher logprobs for the chunk.
            target_mask_chunk: Validity mask for the top-k entries.
            target_chunk: True labels for the cross-entropy term.
            student_bias: Optional LM head bias.
            distillation_loss_fn: Callable computing the soft loss from temperature-scaled logits.
            ignore_index: Label value ignored by the cross-entropy term.
            weight_hard_loss: The CE term is only computed when this is > 0 (and ``compute_ce_loss``).
            weight_soft_loss: The soft term is only computed when this is > 0.
            compute_ce_loss: Whether the CE term may be computed at all.
            temperature: Divisor applied to the student logits before the soft loss.
            beta: Forwarded to ``distillation_loss_fn``.
            normalize_topk: Forwarded to ``distillation_loss_fn``.

        Returns:
            Tuple ``(soft_loss, ce_loss)``; both are unweighted sums over the chunk, and
            a term that is not computed is a zero scalar.
        """
        # student_logits_chunk: [chunk_size, vocab_size]
        student_logits_chunk = F.linear(
            student_input_chunk, student_weight, student_bias
        )

        ce_loss = torch.tensor(
            0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype
        )
        if compute_ce_loss and weight_hard_loss > 0.0:
            ce_loss = F.cross_entropy(
                student_logits_chunk.view(-1, student_logits_chunk.shape[-1]),
                target_chunk.view(-1),
                reduction="sum",
                ignore_index=ignore_index,
            )

        soft_loss = torch.tensor(
            0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype
        )
        if weight_soft_loss > 0.0:
            student_logits_chunk_temp_scaled = student_logits_chunk / temperature

            # Assuming student_weight.shape[0] (vocab_size) is adequate for target_token_ids_chunk.max()
            # No explicit padding here; user must ensure vocab alignment or pre-pad student_weight.
            soft_loss = distillation_loss_fn(
                student_logits_chunk_temp_scaled,
                target_token_ids_chunk,
                target_logprobs_chunk,
                target_mask_chunk,
                beta=beta,
                normalize_topk=normalize_topk,
            )

        return soft_loss, ce_loss

    @classmethod
    def forward(
        cls,
        ctx,
        student_input: torch.Tensor,  # [batch_size, seq_len, dim]
        student_lm_head_weight: torch.Tensor,  # [vocab_size, dim] (consumed by F.linear)
        target_token_ids: torch.Tensor,  # [batch_size, seq_len, top_k]
        target_logprobs: torch.Tensor,  # [batch_size, seq_len, top_k]
        target_mask: torch.Tensor,  # [batch_size, seq_len, top_k]
        true_labels: torch.Tensor,  # [batch_size, seq_len]
        student_lm_head_bias: torch.Tensor = None,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        ignore_index: int = -100,
        temperature: float = 1.0,
        beta: float = 0.0,
        compiled: bool = False,
        chunk_size: int = 1024,
        compute_ce_loss: bool = True,
        normalize_topk: bool = True,
    ):
        """
        Compute the combined loss chunk by chunk and stash the gradients for ``backward``.

        Returns:
            Scalar ``weight_soft_loss * temperature**2 * kd_sum + weight_hard_loss * ce_sum``.
            The gradients saved on ``ctx`` are the gradients of exactly this value.
        """
        CHUNK_SIZE = chunk_size  # pylint: disable=invalid-name
        grad_weight_acc = torch.zeros_like(student_lm_head_weight)
        grad_inputs_list = []
        grad_bias_acc = (
            torch.zeros_like(student_lm_head_bias)
            if student_lm_head_bias is not None
            else None
        )
        kd_loss_acc = torch.zeros(
            (), device=student_input.device, dtype=student_input.dtype
        )
        ce_loss_acc = torch.zeros(
            (), device=student_input.device, dtype=student_input.dtype
        )

        # This function is what torch.func.grad_and_value differentiates.
        # The first three arguments are the candidates for differentiation; the
        # remaining ones are fixed data for the chunk.
        def loss_fn_for_grad(
            _student_input_chunk,
            _student_lm_head_weight,  # full weight
            _student_lm_head_bias,  # full bias
            _target_token_ids_chunk,
            _target_logprobs_chunk,
            _target_mask_chunk,
            _true_labels_chunk,
        ):
            soft_loss, ce_loss = cls._compute_loss_kl_topk(
                student_input_chunk=_student_input_chunk,
                student_weight=_student_lm_head_weight,
                target_token_ids_chunk=_target_token_ids_chunk,
                target_logprobs_chunk=_target_logprobs_chunk,
                target_mask_chunk=_target_mask_chunk,
                target_chunk=_true_labels_chunk,
                student_bias=_student_lm_head_bias,
                distillation_loss_fn=cls.distillation_loss_fn,
                ignore_index=ignore_index,
                weight_hard_loss=weight_hard_loss,
                weight_soft_loss=weight_soft_loss,
                compute_ce_loss=compute_ce_loss,
                temperature=temperature,
                beta=beta,
                normalize_topk=normalize_topk,
            )
            # With has_aux=True only the FIRST returned value is differentiated, so it
            # must be the chunk's share of the final loss (weights and temperature**2
            # included). Otherwise the saved gradients would not be the gradients of
            # the value that forward() returns.
            total_loss = (
                weight_soft_loss * (temperature**2) * soft_loss
                + weight_hard_loss * ce_loss
            )
            return total_loss, (soft_loss, ce_loss)

        # Differentiate wrt input chunk and weight, plus the bias when there is one.
        grad_argnums = (0, 1, 2) if student_lm_head_bias is not None else (0, 1)

        def accumulate_chunk_grads(
            student_input_chunk_ac,
            target_token_ids_chunk_ac,
            target_logprobs_chunk_ac,
            target_mask_chunk_ac,
            true_labels_chunk_ac,
        ):
            # student weight and bias are closed over from the outer scope (full tensors)
            chunk_grads, (_, (chunk_kd_loss, chunk_ce_loss)) = (
                torch.func.grad_and_value(
                    loss_fn_for_grad, argnums=grad_argnums, has_aux=True
                )(
                    student_input_chunk_ac,
                    student_lm_head_weight,
                    student_lm_head_bias,  # None when the LM head has no bias
                    target_token_ids_chunk_ac,
                    target_logprobs_chunk_ac,
                    target_mask_chunk_ac,
                    true_labels_chunk_ac,
                )
            )
            if student_lm_head_bias is not None:
                grad_bias_acc.add_(chunk_grads[2])

            grad_weight_acc.add_(chunk_grads[1])
            kd_loss_acc.add_(chunk_kd_loss)
            ce_loss_acc.add_(chunk_ce_loss)

            return chunk_grads[0]

        if compiled:
            accumulate_chunk_grads_compiled = torch.compile(
                accumulate_chunk_grads, dynamic=True, backend="inductor"
            )
        else:
            accumulate_chunk_grads_compiled = accumulate_chunk_grads

        B, N, D = student_input.shape  # pylint: disable=invalid-name
        K = target_token_ids.shape[-1]  # pylint: disable=invalid-name

        student_input_flat = student_input.reshape(-1, student_input.shape[-1])
        target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])
        target_logprobs_flat = target_logprobs.reshape(-1, target_logprobs.shape[-1])
        target_mask_flat = target_mask.reshape(-1, target_mask.shape[-1])
        # pad and shift for cross entropy loss
        true_labels = torch.nn.functional.pad(true_labels, (0, 1), value=ignore_index)
        true_labels_flat = true_labels[:, 1:].contiguous().view(-1)

        num_chunks = max(1, student_input_flat.shape[0] // CHUNK_SIZE)

        # torch.chunk may return FEWER chunks than requested, so iterate over what it
        # actually returned instead of indexing with range(num_chunks). All five
        # tensors have the same first dimension and therefore split identically.
        chunked_inputs = zip(
            torch.chunk(student_input_flat, chunks=num_chunks, dim=0),
            torch.chunk(target_token_ids_flat, chunks=num_chunks, dim=0),
            torch.chunk(target_logprobs_flat, chunks=num_chunks, dim=0),
            torch.chunk(target_mask_flat, chunks=num_chunks, dim=0),
            torch.chunk(true_labels_flat, chunks=num_chunks, dim=0),
        )
        for chunk_args in chunked_inputs:
            grad_inputs_list.append(accumulate_chunk_grads_compiled(*chunk_args))

        grad_inputs_combined = torch.cat(grad_inputs_list, dim=0)
        ctx.save_for_backward(grad_inputs_combined, grad_weight_acc, grad_bias_acc)

        # For matching None returns in backward for non-tensor/non-grad_requiring inputs
        ctx.hyperparams_count = 9  # number of hyperparams after student_lm_head_bias in the fwd signature
        ctx.bias_was_none = student_lm_head_bias is None
        ctx.orig_dims = (B, N, D, K)

        # since this is packed, there is simply a single batch, so batchmean reduction of kl-div is simply the accumulated sum
        # we still need to scale the kd_loss by the temp^2
        scaled_kd_loss = kd_loss_acc * (temperature**2)
        final_loss = weight_soft_loss * scaled_kd_loss + weight_hard_loss * ce_loss_acc

        return final_loss

    @staticmethod
    def backward(ctx, grad_output):
        """
        Return the gradients saved by ``forward``, scaled by ``grad_output``.

        One value is returned per ``forward`` input (after ``ctx``): gradients for
        ``student_input``, ``student_lm_head_weight`` and, when a bias was given,
        ``student_lm_head_bias``; ``None`` for everything else.
        """
        grad_input_flat, grad_weight, grad_bias_maybe = (
            ctx.saved_tensors
        )  # grad_input_flat is (B*N, D)

        # Scale gradients by grad_output if it's not 1.0
        if not torch.equal(
            grad_output,
            torch.tensor(1.0, device=grad_output.device, dtype=grad_output.dtype),
        ):
            grad_input_flat = grad_input_flat * grad_output
            grad_weight = grad_weight * grad_output
            if grad_bias_maybe is not None:
                grad_bias_maybe = grad_bias_maybe * grad_output

        # Reshape grad_input_flat to match original student_input shape (B, N, D);
        # ctx.orig_dims stores (B, N, D, K).
        batch, seq_len, dim = ctx.orig_dims[:3]
        if grad_input_flat.numel() == 0:
            # An empty flat gradient cannot be viewed reliably; build zeros with the
            # original shape instead.
            grad_input_reshaped = torch.zeros(
                batch,
                seq_len,
                dim,
                dtype=grad_input_flat.dtype,
                device=grad_input_flat.device,
            )
        else:
            grad_input_reshaped = grad_input_flat.view(batch, seq_len, dim)

        nones_for_hyperparams = [None] * ctx.hyperparams_count
        grad_bias_return = grad_bias_maybe if not ctx.bias_was_none else None

        return (
            grad_input_reshaped,  # Gradient for student_input (reshaped)
            grad_weight,  # Gradient for student_lm_head_weight
            None,  # Gradient for target_token_ids
            None,  # Gradient for target_logprobs
            None,  # Gradient for target_mask
            None,  # Gradient for true_labels
            grad_bias_return,  # Gradient for student_lm_head_bias
            *nones_for_hyperparams,  # Grads for weight_hard_loss, ..., normalize_topk
        )
|
|
||||||
|
|
||||||
|
|
||||||
class LigerFusedLinearKLTopKLogprobLoss(torch.nn.Module):
    """
    Module wrapper for the chunked, fused linear + top-k logprob KL-divergence loss.

    The constructor validates and stores the loss hyperparameters; ``forward``
    hands them, together with the per-batch tensors, to
    ``LigerFusedLinearKLTopKLogprobFunction.apply``.
    """

    def __init__(
        self,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        temperature: float = 1.0,  # This is the kd_temperature
        beta: float = 1.0,
        ignore_index: int = -100,
        compiled: bool = True,
        chunk_size: int = 1024,
        compute_ce_loss: bool = True,
        normalize_topk: bool = True,
    ):
        """
        Validate and store the loss configuration.

        Args:
            weight_hard_loss: weight of the hard-label loss, must be in [0, 1].
            weight_soft_loss: weight of the soft (KD) loss, must be in [0, 1].
            temperature: KD temperature, must be strictly positive.
            beta: forwarded unchanged to the autograd function.
            ignore_index: forwarded unchanged to the autograd function
                (presumably the label value to skip; confirm against the function).
            compiled: forwarded unchanged to the autograd function.
            chunk_size: forwarded unchanged to the autograd function.
            compute_ce_loss: forwarded unchanged; when False while
                ``weight_hard_loss > 0`` a warning is emitted.
            normalize_topk: forwarded unchanged to the autograd function.

        Raises:
            ValueError: if a loss weight is outside [0, 1] or the temperature
                is not positive.
        """
        # Local import so the block does not depend on a file-level import.
        import warnings

        super().__init__()
        if not (0.0 <= weight_hard_loss <= 1.0 and 0.0 <= weight_soft_loss <= 1.0):
            raise ValueError("Loss weights must be between 0.0 and 1.0.")
        if temperature <= 0:
            raise ValueError("Temperature must be positive.")

        self.weight_hard_loss = weight_hard_loss
        self.weight_soft_loss = weight_soft_loss
        self.temperature = temperature
        self.beta = beta
        self.ignore_index = ignore_index
        self.compiled = compiled
        self.chunk_size = chunk_size
        self.compute_ce_loss = compute_ce_loss
        self.normalize_topk = normalize_topk

        # Report suspicious configurations through the warnings machinery so
        # callers can filter, capture or escalate them (print() allowed none
        # of that). The weights are deliberately left untouched.
        if not self.compute_ce_loss and self.weight_hard_loss > 0.0:
            warnings.warn(
                f"Warning: compute_ce_loss is False, but weight_hard_loss ({self.weight_hard_loss}) > 0. Hard loss will effectively be zero.",
                UserWarning,
                stacklevel=2,
            )
        if self.weight_soft_loss == 0.0:
            warnings.warn(
                "Warning: weight_soft_loss is 0.0. Soft (KD) loss will not be computed.",
                UserWarning,
                stacklevel=2,
            )

    def forward(
        self,
        lm_head_weight: torch.Tensor,  # Weights of the linear layer in the LM head
        student_hidden_states: torch.Tensor,  # student_hidden_states before the lm_head
        target_token_ids: torch.Tensor,
        target_logprobs: torch.Tensor,
        target_mask: torch.Tensor,
        true_labels: torch.Tensor,
        student_bias: torch.Tensor = None,
    ) -> torch.Tensor:
        """
        Compute the loss by delegating to ``LigerFusedLinearKLTopKLogprobFunction``.

        Note the argument order: this method takes ``lm_head_weight`` first,
        while the autograd function expects the hidden states first, so the
        two are swapped in the call below. The stored hyperparameters are
        appended positionally after ``student_bias``.

        Returns:
            Whatever ``LigerFusedLinearKLTopKLogprobFunction.apply`` returns
            (annotated as a tensor).
        """
        return LigerFusedLinearKLTopKLogprobFunction.apply(
            student_hidden_states,
            lm_head_weight,
            target_token_ids,
            target_logprobs,
            target_mask,
            true_labels,
            student_bias,
            self.weight_hard_loss,
            self.weight_soft_loss,
            self.ignore_index,
            self.temperature,
            self.beta,
            self.compiled,
            self.chunk_size,
            self.compute_ce_loss,
            self.normalize_topk,
        )
|
|
||||||
@@ -1,98 +0,0 @@
|
|||||||
"""
|
|
||||||
model patcher for chunked top-k kl-div
|
|
||||||
"""
|
|
||||||
|
|
||||||
from types import MethodType
|
|
||||||
from typing import Optional, Union, Unpack
|
|
||||||
|
|
||||||
import torch
|
|
||||||
from transformers import Cache
|
|
||||||
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
|
|
||||||
from transformers.modeling_outputs import CausalLMOutputWithPast
|
|
||||||
from transformers.utils import LossKwargs
|
|
||||||
|
|
||||||
|
|
||||||
class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs):
    """
    Placeholder kwargs type for HF model classes.

    Adds no fields of its own; it only combines ``FlashAttentionKwargs`` and
    ``LossKwargs`` so the pair can be named as a single type (it is used as
    ``Unpack[KwargsForCausalLM]`` to annotate ``**kwargs``).
    """
|
|
||||||
|
|
||||||
|
|
||||||
def kldiv_forward_llama_like(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    target_logprobs: Optional[torch.Tensor] = None,
    target_token_ids: Optional[torch.LongTensor] = None,
    target_mask: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,  # pylint: disable=unused-argument
    **kwargs: Unpack[KwargsForCausalLM],  # type: ignore[misc]
) -> CausalLMOutputWithPast:
    """
    Replacement causal-LM forward that returns a KD loss instead of logits.

    Runs ``self.model`` on the inputs, then passes the LM-head weight and the
    final hidden states straight to ``self.loss_function`` together with the
    teacher tensors (``target_token_ids``, ``target_logprobs``,
    ``target_mask``) and ``labels``. No logits are produced here; the
    returned output carries ``logits=None``.

    If ``num_items_in_batch`` is present in ``kwargs`` and positive, the loss
    is divided by it. ``logits_to_keep`` is accepted but not used.
    """
    # pylint: disable=duplicate-code
    # Fall back to the model config when the caller did not decide.
    if output_attentions is None:
        output_attentions = self.config.output_attentions
    if output_hidden_states is None:
        output_hidden_states = self.config.output_hidden_states

    # All extra kwargs (including num_items_in_batch, if given) are forwarded
    # to the base model unchanged.
    decoder_outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )
    final_hidden = decoder_outputs.last_hidden_state

    # self.loss_function is expected to be LigerFusedLinearKLTopKLogprobLoss:
    # it takes the LM-head weight and hidden states and applies the head itself.
    # TODO: hidden states could be filtered on the sequence dimension using
    # labels != -100 before this call.
    kd_loss = self.loss_function(
        self.lm_head.weight,
        final_hidden,
        target_token_ids,
        target_logprobs,
        target_mask,
        true_labels=labels,
    )

    # -1 stands for "not provided"; only a positive count rescales the loss.
    items_in_batch = kwargs.get("num_items_in_batch", -1)
    if items_in_batch is not None and items_in_batch > 0:
        kd_loss = kd_loss / items_in_batch

    return CausalLMOutputWithPast(
        loss=kd_loss,
        logits=None,
        past_key_values=decoder_outputs.past_key_values,
        hidden_states=decoder_outputs.hidden_states,
        attentions=decoder_outputs.attentions,
    )
|
|
||||||
|
|
||||||
|
|
||||||
def apply_kernel(model_type):
    """
    Patch the ``<ModelType>ForCausalLM`` class of a transformers model family
    so that its ``forward`` is ``kldiv_forward_llama_like``.

    Args:
        model_type: transformers model type string (e.g. ``"llama"``); it is
            used to build both the module path
            ``transformers.models.<model_type>.modeling_<model_type>`` and the
            CamelCase class prefix (underscore-separated parts capitalized).

    Raises:
        ImportError: if the modeling module cannot be imported.
        AttributeError: if the module has no ``<Prefix>ForCausalLM`` class.
    """
    # Local import so the block does not depend on a file-level import.
    import importlib

    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    model_cls_prefix = "".join(part.capitalize() for part in model_type.split("_"))
    module = importlib.import_module(module_path)
    model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")

    # Assign the plain function: attribute lookup on an instance then binds it
    # to that instance. Wrapping it in MethodType(func, model_cls) pre-binds
    # it to the class object, so `self` inside the forward would be the class
    # rather than the model instance.
    model_cls.forward = kldiv_forward_llama_like
|
|
||||||
@@ -16,7 +16,40 @@
|
|||||||
loss for top_k KL divergence
|
loss for top_k KL divergence
|
||||||
"""
|
"""
|
||||||
import torch
|
import torch
|
||||||
from torch import nn
|
|
||||||
|
|
||||||
|
def zscore_standardize(
    logits: torch.Tensor,
    mask: torch.Tensor = None,
    base_temperature: float = 1.0,
    eps: float = 1e-9,
):
    """
    Standardize `logits` to zero mean / unit variance along the last dimension.

    For every slice along the last axis: z = (logits - mean) / std, and the
    result is then divided by `base_temperature`.

    When `mask` is given it is cast to the logits dtype and used as weights
    for the mean and variance (the element count is clamped to at least 1).
    Masked-out positions still receive a z value; they are only excluded from
    the statistics. When `mask` is None all elements contribute and the
    population (biased) variance is used.

    The variance is clamped to `eps` before the square root so constant rows
    do not divide by zero.
    """
    if mask is not None:
        weights = mask.to(logits.dtype)
        n_valid = weights.sum(dim=-1, keepdim=True).clamp_min(1.0)
        center = (logits * weights).sum(dim=-1, keepdim=True) / n_valid
        spread = (weights * (logits - center) ** 2).sum(dim=-1, keepdim=True) / n_valid
    else:
        # Statistics over the whole last dimension
        center = logits.mean(dim=-1, keepdim=True)
        spread = logits.var(dim=-1, unbiased=False, keepdim=True)

    scale = torch.sqrt(spread.clamp_min(eps))

    # Standardize, then apply the 1 / base_temperature scaling
    return (logits - center) / scale / base_temperature
|
||||||
|
|
||||||
|
|
||||||
@torch.jit.script
|
@torch.jit.script
|
||||||
@@ -27,6 +60,7 @@ def loss(
|
|||||||
target_mask: torch.Tensor,
|
target_mask: torch.Tensor,
|
||||||
num_items_in_batch: int = -1, # Use -1 to indicate "None"
|
num_items_in_batch: int = -1, # Use -1 to indicate "None"
|
||||||
kd_temperature: float = 1.0,
|
kd_temperature: float = 1.0,
|
||||||
|
top_k_before_softmax: int = 0,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
"""
|
"""
|
||||||
A KD loss function that is TorchScript-friendly.
|
A KD loss function that is TorchScript-friendly.
|
||||||
@@ -43,6 +77,8 @@ def loss(
|
|||||||
num_items_in_batch (int, optional): The number of items in the batch.
|
num_items_in_batch (int, optional): The number of items in the batch.
|
||||||
kd_temperature (float, optional): The temperature for KD.
|
kd_temperature (float, optional): The temperature for KD.
|
||||||
Default: 1.0
|
Default: 1.0
|
||||||
|
top_k_before_softmax (int, optional): Flag of whether to apply softmax before gathering student top-k logits
|
||||||
|
Default: 0
|
||||||
"""
|
"""
|
||||||
|
|
||||||
target_logprobs = target_logprobs.float()
|
target_logprobs = target_logprobs.float()
|
||||||
@@ -52,24 +88,46 @@ def loss(
|
|||||||
# student_logits shape: [B, student_seq_len, vocab_size]
|
# student_logits shape: [B, student_seq_len, vocab_size]
|
||||||
teacher_seq_len = target_token_ids.shape[1]
|
teacher_seq_len = target_token_ids.shape[1]
|
||||||
|
|
||||||
# Slice student logits to match teacher-provided sequence length
|
if top_k_before_softmax:
|
||||||
student_logits_for_kd = (
|
# Slice student logits to match teacher-provided sequence length
|
||||||
student_logits[:, :teacher_seq_len, :] / kd_temperature
|
student_logits_for_kd = student_logits[
|
||||||
) # [B, teacher_seq_len, vocab_size]
|
:, :teacher_seq_len, :
|
||||||
|
] # [B, teacher_seq_len, vocab_size]
|
||||||
|
|
||||||
# keep in full precision for numerical stability of loss
|
# Gather student logits for teacher's top-K tokens
|
||||||
student_logits_for_kd = student_logits_for_kd.float()
|
student_logits_topk = torch.gather(
|
||||||
|
student_logits_for_kd, dim=-1, index=target_token_ids
|
||||||
|
) # [B, teacher_seq_len, K]
|
||||||
|
|
||||||
# Gather student logits for teacher's top-K tokens
|
student_logits_topk = student_logits_topk.float()
|
||||||
student_logits_topk = torch.gather(
|
|
||||||
student_logits_for_kd, dim=-1, index=target_token_ids
|
|
||||||
) # [B, teacher_seq_len, K]
|
|
||||||
|
|
||||||
# Compute logsumexp across full vocabulary
|
# Apply KD temperature to student’s logits
|
||||||
student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)
|
if kd_temperature != 1.0:
|
||||||
|
student_logits_topk = student_logits_topk / kd_temperature
|
||||||
|
|
||||||
# Convert just the top-k logits to logprobs
|
# Convert student top-k logits to logprobs
|
||||||
student_logprobs_topk = student_logits_topk - student_lse
|
student_logprobs_topk = student_logits_topk - torch.logsumexp(
|
||||||
|
student_logits_topk, dim=-1, keepdim=True
|
||||||
|
) # [B, teacher_seq_len, K]
|
||||||
|
else:
|
||||||
|
# Slice student logits to match teacher-provided sequence length
|
||||||
|
student_logits_for_kd = (
|
||||||
|
student_logits[:, :teacher_seq_len, :] / kd_temperature
|
||||||
|
) # [B, teacher_seq_len, vocab_size]
|
||||||
|
|
||||||
|
# keep in full precision for numerical stability of loss
|
||||||
|
student_logits_for_kd = student_logits_for_kd.float()
|
||||||
|
|
||||||
|
# Gather student logits for teacher's top-K tokens
|
||||||
|
student_logits_topk = torch.gather(
|
||||||
|
student_logits_for_kd, dim=-1, index=target_token_ids
|
||||||
|
) # [B, teacher_seq_len, K]
|
||||||
|
|
||||||
|
# Compute logsumexp across full vocabulary
|
||||||
|
student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)
|
||||||
|
|
||||||
|
# Convert just the top-k logits to logprobs
|
||||||
|
student_logprobs_topk = student_logits_topk - student_lse
|
||||||
|
|
||||||
# Convert teacher_mask to boolean for indexing
|
# Convert teacher_mask to boolean for indexing
|
||||||
# In TorchScript, .bool() is sometimes unsupported, so we do:
|
# In TorchScript, .bool() is sometimes unsupported, so we do:
|
||||||
@@ -86,6 +144,10 @@ def loss(
|
|||||||
kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk)
|
kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk)
|
||||||
kd_loss = kd_loss_per_token.sum()
|
kd_loss = kd_loss_per_token.sum()
|
||||||
|
|
||||||
|
# Multiply by T^2 (classical KD scaling)
|
||||||
|
if kd_temperature != 1.0:
|
||||||
|
kd_loss = kd_loss * (kd_temperature**2)
|
||||||
|
|
||||||
# Normalize by number of items (if provided) or by valid tokens
|
# Normalize by number of items (if provided) or by valid tokens
|
||||||
if num_items_in_batch > 0:
|
if num_items_in_batch > 0:
|
||||||
kd_loss = kd_loss / float(num_items_in_batch)
|
kd_loss = kd_loss / float(num_items_in_batch)
|
||||||
@@ -96,74 +158,80 @@ def loss(
|
|||||||
return kd_loss
|
return kd_loss
|
||||||
|
|
||||||
|
|
||||||
class ChunkedTopKKDLoss(nn.Module):
|
def topk_kd_loss_with_zscore(
|
||||||
|
student_logits: torch.Tensor, # [B, seq_len, vocab_size]
|
||||||
|
target_token_ids: torch.Tensor, # [B, seq_len, K]
|
||||||
|
target_logprobs: torch.Tensor, # [B, seq_len, K], sums to 1.0 in prob space
|
||||||
|
target_mask: torch.Tensor, # [B, seq_len, K] or [B, seq_len]
|
||||||
|
kd_temperature: float = 1.0, # classic KD temperature
|
||||||
|
zscore_base_temp: float = 1.0, # from the paper
|
||||||
|
num_items_in_batch: int = -1,
|
||||||
|
):
|
||||||
"""
|
"""
|
||||||
A wrapper that chunks (splits) the student and teacher outputs along the time dimension
|
A variant of top_k KL divergence with Z-score scaling
|
||||||
to reduce peak memory usage when upcasting from bf16 to fp32, especially for large vocabularies.
|
from "Logit Standardization in Knowledge Distillation".
|
||||||
|
|
||||||
Usage is analogous to ForwardKLWithChunkedOutputLoss but adapted to top-K teacher logprobs.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, num_output_chunks: int = 8, kd_temperature: float = 1.0):
|
target_logprobs = target_logprobs.float()
|
||||||
super().__init__()
|
|
||||||
self.num_output_chunks = num_output_chunks
|
|
||||||
self.kd_temperature = kd_temperature
|
|
||||||
|
|
||||||
def forward(
|
B, teacher_seq_len, K = target_logprobs.shape # pylint: disable=invalid-name
|
||||||
self,
|
# 1) Gather the student's top-k logits to match teacher
|
||||||
student_logits: torch.Tensor, # [B, seq_len, vocab_size]
|
student_logits_for_kd = student_logits[
|
||||||
target_token_ids: torch.Tensor, # [B, seq_len, K]
|
:, :teacher_seq_len, :
|
||||||
target_logprobs: torch.Tensor, # [B, seq_len, K]
|
] # [B, seq_len, vocab]
|
||||||
target_mask: torch.Tensor, # [B, seq_len, K]
|
student_topk_logits = torch.gather(
|
||||||
num_items_in_batch: int = -1, # optional batch size for normalization
|
student_logits_for_kd, dim=-1, index=target_token_ids
|
||||||
) -> torch.Tensor:
|
) # [B, seq_len, K]
|
||||||
|
|
||||||
# 1. Split along the "token" dimension (dim=1).
|
student_topk_logits = student_topk_logits.float()
|
||||||
student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
|
|
||||||
token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
|
|
||||||
logprobs_chunks = target_logprobs.chunk(self.num_output_chunks, dim=1)
|
|
||||||
mask_chunks = target_mask.chunk(self.num_output_chunks, dim=1)
|
|
||||||
|
|
||||||
# We'll accumulate a global "sum of losses" and "sum of valid tokens"
|
# 2) If you want to keep the "classical" T scaling, apply it first
|
||||||
# so that our final average is consistent with the entire sequence/batch.
|
if kd_temperature != 1.0:
|
||||||
total_loss = 0.0
|
student_topk_logits = student_topk_logits / kd_temperature
|
||||||
total_valid_tokens = 0
|
|
||||||
|
|
||||||
# 2. Loop over each chunk and compute a chunk-specific loss.
|
# 3) Convert teacher logprobs -> treat them as “logits” for z-score
|
||||||
for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
|
# (They differ by +some_constant from real logits, but in z-score
|
||||||
student_logits_chunks, token_ids_chunks, logprobs_chunks, mask_chunks
|
# that constant is subtracted out anyway.)
|
||||||
):
|
teacher_logits_for_zscore = target_logprobs # rename variable for clarity
|
||||||
# We pass num_items_in_batch=-1 so that the kd_loss
|
|
||||||
# will average over *this chunk's* valid tokens only.
|
|
||||||
chunk_loss = loss(
|
|
||||||
student_logits=st_chunk,
|
|
||||||
target_token_ids=tid_chunk,
|
|
||||||
target_logprobs=lp_chunk,
|
|
||||||
target_mask=msk_chunk,
|
|
||||||
num_items_in_batch=-1, # ensure per-chunk averaging by valid tokens
|
|
||||||
kd_temperature=self.kd_temperature,
|
|
||||||
)
|
|
||||||
|
|
||||||
# kd_loss returns an average over the chunk's valid tokens.
|
# 4) Z-score teacher and student
|
||||||
# We want a global average in the end, so we need to re‐weight
|
# If target_mask is 2D, expand to 3D for the K dimension
|
||||||
# by the number of valid tokens in this chunk and keep track of the total.
|
if target_mask.dim() == 2 and target_mask.shape[:2] == (B, teacher_seq_len):
|
||||||
chunk_valid_mask = msk_chunk.to(torch.bool)
|
target_mask = target_mask.unsqueeze(-1).expand(-1, -1, K)
|
||||||
chunk_valid_count = chunk_valid_mask.sum() # scalar tensor
|
|
||||||
|
|
||||||
# Re-scale "chunk average" back to "chunk sum"
|
teacher_z = zscore_standardize(
|
||||||
chunk_loss_sum = chunk_loss * chunk_valid_count
|
teacher_logits_for_zscore, mask=target_mask, base_temperature=zscore_base_temp
|
||||||
|
)
|
||||||
|
student_z = zscore_standardize(
|
||||||
|
student_topk_logits, mask=target_mask, base_temperature=zscore_base_temp
|
||||||
|
)
|
||||||
|
|
||||||
total_loss += chunk_loss_sum
|
# 5) Convert to log-probs for KL
|
||||||
total_valid_tokens += chunk_valid_count
|
teacher_logprobs_z = teacher_z - torch.logsumexp(teacher_z, dim=-1, keepdim=True)
|
||||||
|
student_logprobs_z = student_z - torch.logsumexp(student_z, dim=-1, keepdim=True)
|
||||||
|
|
||||||
# 3. Normalize *once* at the end.
|
# 6) Restrict to valid tokens if needed
|
||||||
if num_items_in_batch > 0:
|
valid_mask = target_mask.bool() # shape [B, seq_len, K]
|
||||||
# If the user gave us a manual denominator (e.g. total items in batch),
|
teacher_probs_z = teacher_logprobs_z.exp()
|
||||||
# we divide by it. Typically used if each item is of different length.
|
teacher_probs_z = teacher_probs_z[valid_mask]
|
||||||
final_loss = total_loss / float(num_items_in_batch)
|
teacher_logprobs_z = teacher_logprobs_z[valid_mask]
|
||||||
else:
|
student_logprobs_z = student_logprobs_z[valid_mask]
|
||||||
# Otherwise, divide by total valid tokens across all chunks.
|
|
||||||
# to get the same result as a non-chunked approach.
|
|
||||||
final_loss = total_loss / float(total_valid_tokens)
|
|
||||||
|
|
||||||
return final_loss
|
# 7) forward KL: sum( p_teacher * [log(p_teacher) - log(p_student)] )
|
||||||
|
kd_loss_per_token = teacher_probs_z * (teacher_logprobs_z - student_logprobs_z)
|
||||||
|
kd_loss = kd_loss_per_token.sum()
|
||||||
|
|
||||||
|
# 8) If using classical KD scaling by T^2
|
||||||
|
if kd_temperature != 1.0:
|
||||||
|
kd_loss = kd_loss * (kd_temperature**2)
|
||||||
|
|
||||||
|
# Optionally scale by zscore_base_temp**2 if you want (paper might differ).
|
||||||
|
# kd_loss = kd_loss * (zscore_base_temp**2)
|
||||||
|
|
||||||
|
# 9) Normalize
|
||||||
|
if num_items_in_batch is not None and num_items_in_batch > 0:
|
||||||
|
kd_loss = kd_loss / float(num_items_in_batch)
|
||||||
|
else:
|
||||||
|
kd_loss = kd_loss / float(kd_loss_per_token.size(0))
|
||||||
|
|
||||||
|
return kd_loss
|
||||||
|
|||||||
@@ -18,7 +18,8 @@ KD trainer
|
|||||||
|
|
||||||
from axolotl.core.trainers.base import AxolotlTrainer
|
from axolotl.core.trainers.base import AxolotlTrainer
|
||||||
|
|
||||||
from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss
|
from .topk_logprob.forward_kl import loss as topk_kd_loss
|
||||||
|
from .topk_logprob.forward_kl import topk_kd_loss_with_zscore
|
||||||
|
|
||||||
|
|
||||||
class AxolotlKDTrainer(AxolotlTrainer):
|
class AxolotlKDTrainer(AxolotlTrainer):
|
||||||
@@ -26,18 +27,6 @@ class AxolotlKDTrainer(AxolotlTrainer):
|
|||||||
Custom trainer subclass for Knowledge Distillation (KD)
|
Custom trainer subclass for Knowledge Distillation (KD)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
self.model_accepts_loss_kwargs = True
|
|
||||||
self.model._loss_function = LigerFusedLinearKLTopKLogprobLoss(
|
|
||||||
self.args.kd_ce_alpha, # hard label loss
|
|
||||||
self.args.kd_alpha, # kd loss
|
|
||||||
self.args.kd_temperature,
|
|
||||||
self.args.kd_beta or 0.0,
|
|
||||||
compute_ce_loss=bool(self.args.kd_ce_alpha),
|
|
||||||
normalize_topk=self.args.kd_normalize_topk,
|
|
||||||
)
|
|
||||||
|
|
||||||
def _set_signature_columns_if_needed(self):
|
def _set_signature_columns_if_needed(self):
|
||||||
super()._set_signature_columns_if_needed()
|
super()._set_signature_columns_if_needed()
|
||||||
columns_to_add = []
|
columns_to_add = []
|
||||||
@@ -63,12 +52,12 @@ class AxolotlKDTrainer(AxolotlTrainer):
|
|||||||
|
|
||||||
Subclass and override for custom behavior.
|
Subclass and override for custom behavior.
|
||||||
"""
|
"""
|
||||||
if (
|
|
||||||
self.args.sample_packing
|
target_logprobs = inputs.pop("target_logprobs")
|
||||||
and hasattr(inputs, "attention_mask")
|
target_token_ids = inputs.pop("target_token_ids")
|
||||||
and hasattr(inputs, "position_ids")
|
target_mask = inputs.pop("target_mask")
|
||||||
):
|
|
||||||
del inputs["attention_mask"]
|
seq_len = target_token_ids.shape[1]
|
||||||
|
|
||||||
if self.model_accepts_loss_kwargs:
|
if self.model_accepts_loss_kwargs:
|
||||||
loss_kwargs = {}
|
loss_kwargs = {}
|
||||||
@@ -76,4 +65,49 @@ class AxolotlKDTrainer(AxolotlTrainer):
|
|||||||
loss_kwargs["num_items_in_batch"] = num_items_in_batch
|
loss_kwargs["num_items_in_batch"] = num_items_in_batch
|
||||||
inputs = {**inputs, **loss_kwargs}
|
inputs = {**inputs, **loss_kwargs}
|
||||||
outputs = model(**inputs)
|
outputs = model(**inputs)
|
||||||
return outputs[0]
|
|
||||||
|
# FIXME: account for tokenizer.padding_side
|
||||||
|
student_logits = outputs["logits"][:, : seq_len - 1, :].contiguous()
|
||||||
|
|
||||||
|
shift_logits = student_logits.contiguous()
|
||||||
|
target_logprobs_for_loss = target_logprobs[..., 1:, :].contiguous()
|
||||||
|
target_token_ids_for_loss = target_token_ids[..., 1:, :].contiguous()
|
||||||
|
target_mask_for_loss = target_mask[..., 1:, :].contiguous()
|
||||||
|
|
||||||
|
if self.args.kd_zscore_base_temp:
|
||||||
|
loss_kd = topk_kd_loss_with_zscore(
|
||||||
|
shift_logits,
|
||||||
|
target_token_ids_for_loss,
|
||||||
|
target_logprobs_for_loss,
|
||||||
|
target_mask_for_loss,
|
||||||
|
kd_temperature=self.args.kd_temperature,
|
||||||
|
zscore_base_temp=self.args.kd_zscore_base_temp,
|
||||||
|
num_items_in_batch=num_items_in_batch,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
loss_kd = topk_kd_loss(
|
||||||
|
shift_logits,
|
||||||
|
target_token_ids_for_loss,
|
||||||
|
target_logprobs_for_loss,
|
||||||
|
target_mask_for_loss,
|
||||||
|
num_items_in_batch=num_items_in_batch,
|
||||||
|
kd_temperature=self.args.kd_temperature,
|
||||||
|
top_k_before_softmax=1 if self.args.kd_top_k_before_softmax else 0,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self.args.kd_ce_alpha > 0:
|
||||||
|
kd_alpha = self.args.kd_alpha
|
||||||
|
loss = self.args.kd_ce_alpha * outputs["loss"] + kd_alpha * loss_kd
|
||||||
|
else:
|
||||||
|
loss = loss_kd
|
||||||
|
# Save past state if it exists
|
||||||
|
# TODO: this needs to be fixed and made cleaner later.
|
||||||
|
if self.args.past_index >= 0:
|
||||||
|
self._past = outputs[ # pylint: disable=attribute-defined-outside-init
|
||||||
|
self.args.past_index
|
||||||
|
]
|
||||||
|
|
||||||
|
if self.args.average_tokens_across_devices and self.model_accepts_loss_kwargs:
|
||||||
|
loss *= self.accelerator.num_processes
|
||||||
|
|
||||||
|
return (loss, outputs) if return_outputs else loss
|
||||||
|
|||||||
@@ -1,100 +0,0 @@
|
|||||||
"""Helper KD utils"""
|
|
||||||
|
|
||||||
import math
|
|
||||||
from typing import List, Union
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
import torch
|
|
||||||
from torch import FloatTensor, Tensor
|
|
||||||
|
|
||||||
|
|
||||||
def normalize_logprobs(logprobs: FloatTensor, topk: int) -> FloatTensor:
    """
    Re-normalize top-k raw logprobs so they form a distribution, in log space.

    If the last dimension does not have `topk` entries it is first padded
    with -inf (zero probability). The result is
    ``logprobs - logsumexp(logprobs)`` along the last dimension, so
    ``exp(result)`` sums to 1 over the top-k entries.

    The normalization stays in log space on purpose: converting to
    probabilities and taking the log again underflows strongly negative
    logprobs to -inf.

    Args:
        logprobs: raw logprobs; the last dimension holds the top-k entries.
        topk: expected size of the last dimension.

    Returns:
        Tensor with last dimension `topk` holding normalized logprobs.
        A row whose entries are all -inf yields NaN.
    """
    # Ideally the caller already padded/truncated; pad here as a fallback.
    missing = topk - logprobs.shape[-1]
    if missing != 0:
        padding_tensor = torch.full(
            (*logprobs.shape[:-1], missing),
            float("-inf"),
            dtype=logprobs.dtype,
            device=logprobs.device,
        )
        logprobs = torch.cat((logprobs, padding_tensor), dim=-1)

    # Subtracting the logsumexp is the whole normalization; padded -inf
    # entries stay -inf.
    return logprobs - torch.logsumexp(logprobs, dim=-1, keepdim=True)
|
|
||||||
|
|
||||||
|
|
||||||
def strided_chunk_views(
|
|
||||||
tensor: Union[np.ndarray, torch.Tensor],
|
|
||||||
chunks: int,
|
|
||||||
dim: int = 0,
|
|
||||||
stride: int = 1,
|
|
||||||
chunk_size: int | None = None,
|
|
||||||
) -> List[Union[np.ndarray, torch.Tensor]]:
|
|
||||||
"""
|
|
||||||
Split a tensor into chunks along a dimension with striding, prioritizing views over copies.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
tensor: Input tensor (numpy array or torch tensor)
|
|
||||||
chunks: Number of chunks to create
|
|
||||||
dim: Dimension along which to chunk (default: 0)
|
|
||||||
stride: Stride between chunk starting positions (default: 1)
|
|
||||||
chunk_size: Size of each chunk. If None, calculated automatically (default: None)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of tensor chunks (views when possible, copies when necessary)
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Get the size of the specified dimension
|
|
||||||
dim_size = tensor.shape[dim]
|
|
||||||
|
|
||||||
# Calculate chunk size if not provided
|
|
||||||
if chunk_size is None:
|
|
||||||
chunk_size = (dim_size + chunks - 1) // chunks # Ceiling division
|
|
||||||
|
|
||||||
chunks_list = []
|
|
||||||
|
|
||||||
for i in range(chunks):
|
|
||||||
start_idx = i * stride
|
|
||||||
end_idx = min(start_idx + chunk_size, dim_size)
|
|
||||||
|
|
||||||
# Break if we've gone beyond the tensor
|
|
||||||
if start_idx >= dim_size:
|
|
||||||
break
|
|
||||||
|
|
||||||
# Create slice objects for all dimensions
|
|
||||||
slices = [slice(None)] * tensor.ndim
|
|
||||||
slices[dim] = slice(start_idx, end_idx)
|
|
||||||
|
|
||||||
chunk = tensor[tuple(slices)]
|
|
||||||
chunks_list.append(chunk)
|
|
||||||
|
|
||||||
return chunks_list
|
|
||||||
|
|
||||||
|
|
||||||
def chunk_overlap(input_tensor: Tensor, chunks: int, dim: int = 0, overlap: int = 1):
    """
    Split `input_tensor` along `dim` into windows that overlap their successor.

    Window starts are ``ceil(dim_size / chunks)`` apart and every window is
    `overlap` elements longer than that step, so each window shares `overlap`
    trailing elements with the start of the next one (the last window is
    clipped at the end of the dimension). Delegates the slicing to
    ``strided_chunk_views``.
    """
    step = math.ceil(input_tensor.shape[dim] / chunks)
    window = step + overlap

    return strided_chunk_views(
        input_tensor, chunks, dim, stride=step, chunk_size=window
    )
|
|
||||||
@@ -19,6 +19,7 @@ from peft import (
|
|||||||
from transformers import PreTrainedModel
|
from transformers import PreTrainedModel
|
||||||
|
|
||||||
from axolotl.loaders.utils import get_linear_embedding_layers
|
from axolotl.loaders.utils import get_linear_embedding_layers
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
@@ -162,6 +163,7 @@ def load_lora(
|
|||||||
return model, lora_config
|
return model, lora_config
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def load_adapter(
|
def load_adapter(
|
||||||
model: PreTrainedModel,
|
model: PreTrainedModel,
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ from axolotl.loaders.utils import (
|
|||||||
load_model_config,
|
load_model_config,
|
||||||
)
|
)
|
||||||
from axolotl.models.mamba import fix_mamba_attn_for_loss
|
from axolotl.models.mamba import fix_mamba_attn_for_loss
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.bench import log_gpu_memory_usage
|
from axolotl.utils.bench import log_gpu_memory_usage
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.distributed import (
|
from axolotl.utils.distributed import (
|
||||||
@@ -145,6 +146,7 @@ class ModelLoader:
|
|||||||
"""Property that determines if FSDP with QLoRA is enabled."""
|
"""Property that determines if FSDP with QLoRA is enabled."""
|
||||||
return self.cfg.fsdp and self.cfg.adapter == "qlora"
|
return self.cfg.fsdp and self.cfg.adapter == "qlora"
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | None]:
|
def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | None]:
|
||||||
"""Load and prepare the model with all configurations and patches.
|
"""Load and prepare the model with all configurations and patches.
|
||||||
|
|
||||||
|
|||||||
@@ -166,17 +166,6 @@ class PatchManager:
|
|||||||
def _apply_self_attention_lora_patch(self):
|
def _apply_self_attention_lora_patch(self):
|
||||||
"""Apply self-attention LoRA patches if configured."""
|
"""Apply self-attention LoRA patches if configured."""
|
||||||
if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
|
if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
|
||||||
# Only patch if conditions are met
|
|
||||||
can_patch = (
|
|
||||||
self.cfg.lora_dropout == 0
|
|
||||||
if hasattr(self.cfg, "lora_dropout")
|
|
||||||
else True
|
|
||||||
) # default to True if lora_dropout is not set
|
|
||||||
|
|
||||||
if not can_patch:
|
|
||||||
LOG.warning("Cannot patch self-attention - requires no dropout")
|
|
||||||
return
|
|
||||||
|
|
||||||
from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
|
from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora
|
||||||
|
|
||||||
patch_self_attn_lora(self.cfg)
|
patch_self_attn_lora(self.cfg)
|
||||||
|
|||||||
@@ -8,12 +8,14 @@ from transformers import (
|
|||||||
PreTrainedTokenizerBase,
|
PreTrainedTokenizerBase,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
|
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
|
||||||
processor_kwargs: dict[str, Any] = {} # Do we actually need this?
|
processor_kwargs: dict[str, Any] = {} # Do we actually need this?
|
||||||
|
|
||||||
|
|||||||
@@ -7,14 +7,13 @@ import transformers
|
|||||||
from transformers import (
|
from transformers import (
|
||||||
AddedToken,
|
AddedToken,
|
||||||
AutoTokenizer,
|
AutoTokenizer,
|
||||||
PreTrainedTokenizer,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
|
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
|
||||||
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
from axolotl.utils.chat_templates import get_chat_template_from_config
|
from axolotl.utils.chat_templates import get_chat_template_from_config
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
from axolotl.utils.distributed import (
|
from axolotl.utils.distributed import (
|
||||||
barrier,
|
barrier,
|
||||||
is_local_main_process,
|
is_local_main_process,
|
||||||
@@ -119,21 +118,9 @@ def modify_tokenizer_files(
|
|||||||
return tokenizer_dir
|
return tokenizer_dir
|
||||||
|
|
||||||
|
|
||||||
def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
|
@send_errors
|
||||||
|
def load_tokenizer(cfg):
|
||||||
"""Load and configure the tokenizer based on the provided config."""
|
"""Load and configure the tokenizer based on the provided config."""
|
||||||
|
|
||||||
def _load_mistral_common_tokenizer(cfg: DictDefault):
|
|
||||||
"""Load mistral-common tokenizer"""
|
|
||||||
from axolotl.utils.mistral_tokenizer import HFMistralTokenizer
|
|
||||||
|
|
||||||
# Load the HF-compatible wrapper around MistralTokenizer
|
|
||||||
tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config)
|
|
||||||
|
|
||||||
return tokenizer
|
|
||||||
|
|
||||||
if cfg.tokenizer_use_mistral_common:
|
|
||||||
return _load_mistral_common_tokenizer(cfg)
|
|
||||||
|
|
||||||
model_config = load_model_config(cfg)
|
model_config = load_model_config(cfg)
|
||||||
tokenizer_kwargs = {}
|
tokenizer_kwargs = {}
|
||||||
use_fast = True # this is the default
|
use_fast = True # this is the default
|
||||||
@@ -222,12 +209,11 @@ def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
|
|||||||
)
|
)
|
||||||
and k != "pad_token"
|
and k != "pad_token"
|
||||||
):
|
):
|
||||||
lora_modules_to_save_str = ", ".join(
|
lora_modules_to_save = ", ".join(
|
||||||
[f"`{x}`" for x in lora_modules_to_save]
|
[f"`{x}`" for x in lora_modules_to_save]
|
||||||
)
|
)
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Please set lora_modules_to_save to [{lora_modules_to_save_str}] "
|
f"Please set lora_modules_to_save to [{lora_modules_to_save}] when using an adapter and changing the special tokens."
|
||||||
"when using an adapter and changing the special tokens."
|
|
||||||
)
|
)
|
||||||
|
|
||||||
tokenizer.add_special_tokens(
|
tokenizer.add_special_tokens(
|
||||||
|
|||||||
@@ -145,11 +145,6 @@ def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
|
|||||||
|
|
||||||
return Qwen2Attention
|
return Qwen2Attention
|
||||||
|
|
||||||
if model_type == "mllama":
|
|
||||||
from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention
|
|
||||||
|
|
||||||
return MllamaTextSelfAttention
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Dynamically import the module and attention class
|
# Dynamically import the module and attention class
|
||||||
module_path = f"transformers.models.{model_type}.modeling_{model_type}"
|
module_path = f"transformers.models.{model_type}.modeling_{model_type}"
|
||||||
@@ -274,29 +269,6 @@ def find_mlp_in_layer(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def get_layers(model: PeftModelForCausalLM) -> list[nn.Module]:
|
|
||||||
"""
|
|
||||||
Get the layers of the model. Handles text-only and multimodal models.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
model: A PEFT model.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
A list of layers.
|
|
||||||
"""
|
|
||||||
pretrained_model = model.model
|
|
||||||
|
|
||||||
# check for multimodal models first
|
|
||||||
if hasattr(pretrained_model, "language_model"):
|
|
||||||
return pretrained_model.language_model.layers
|
|
||||||
if hasattr(pretrained_model, "model"):
|
|
||||||
return pretrained_model.model.layers
|
|
||||||
|
|
||||||
raise NotImplementedError(
|
|
||||||
f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def apply_lora_kernel_patches(
|
def apply_lora_kernel_patches(
|
||||||
model: PeftModelForCausalLM, cfg: DictDefault
|
model: PeftModelForCausalLM, cfg: DictDefault
|
||||||
) -> PeftModelForCausalLM:
|
) -> PeftModelForCausalLM:
|
||||||
@@ -368,7 +340,17 @@ def apply_lora_kernel_patches(
|
|||||||
if activation not in SUPPORTED_ACTIVATIONS:
|
if activation not in SUPPORTED_ACTIVATIONS:
|
||||||
raise NotImplementedError(f"Activation {activation} is not supported")
|
raise NotImplementedError(f"Activation {activation} is not supported")
|
||||||
|
|
||||||
layers = get_layers(model)
|
layers = []
|
||||||
|
# check for multimodal models first
|
||||||
|
pretrained_model = model.model
|
||||||
|
if hasattr(pretrained_model, "language_model"):
|
||||||
|
layers = pretrained_model.language_model.layers
|
||||||
|
elif hasattr(pretrained_model, "model"):
|
||||||
|
layers = pretrained_model.model.layers
|
||||||
|
else:
|
||||||
|
raise NotImplementedError(
|
||||||
|
f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
|
||||||
|
)
|
||||||
|
|
||||||
# Patch each layer
|
# Patch each layer
|
||||||
for layer in layers:
|
for layer in layers:
|
||||||
|
|||||||
@@ -17,10 +17,7 @@ def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
|
|||||||
return messages_load(tokenizer, cfg, ds_cfg, processor=processor)
|
return messages_load(tokenizer, cfg, ds_cfg, processor=processor)
|
||||||
load_fn = "load"
|
load_fn = "load"
|
||||||
package = "axolotl.prompt_strategies"
|
package = "axolotl.prompt_strategies"
|
||||||
if (
|
if strategy.split(".")[-1].startswith("load_"):
|
||||||
strategy.split(".")[-1].startswith("load_")
|
|
||||||
or strategy.split(".")[-1] == "load"
|
|
||||||
):
|
|
||||||
load_fn = strategy.split(".")[-1]
|
load_fn = strategy.split(".")[-1]
|
||||||
strategy = ".".join(strategy.split(".")[:-1])
|
strategy = ".".join(strategy.split(".")[:-1])
|
||||||
elif len(strategy.split(".")) > 1:
|
elif len(strategy.split(".")) > 1:
|
||||||
|
|||||||
@@ -2,10 +2,8 @@
|
|||||||
HF Chat Templates prompt strategy
|
HF Chat Templates prompt strategy
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# pylint: disable=too-many-lines
|
|
||||||
|
|
||||||
from collections import defaultdict
|
from collections import defaultdict
|
||||||
from typing import TYPE_CHECKING, Any, Dict, List, Set, Union
|
from typing import Any, Dict, List, Set, Union
|
||||||
|
|
||||||
from pydantic import BaseModel
|
from pydantic import BaseModel
|
||||||
from transformers import ProcessorMixin
|
from transformers import ProcessorMixin
|
||||||
@@ -17,9 +15,6 @@ from axolotl.utils.chat_templates import get_chat_template_from_config
|
|||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
from axolotl.utils.schemas.datasets import DatasetConfig
|
from axolotl.utils.schemas.datasets import DatasetConfig
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from axolotl.utils.mistral_tokenizer import HFMistralTokenizer
|
|
||||||
|
|
||||||
# Configure the logger
|
# Configure the logger
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
LOG.setLevel("INFO")
|
LOG.setLevel("INFO")
|
||||||
@@ -39,7 +34,6 @@ class ChatTemplatePrompter(Prompter):
|
|||||||
message_field_training_detail: str | None = None,
|
message_field_training_detail: str | None = None,
|
||||||
field_messages: str = "messages",
|
field_messages: str = "messages",
|
||||||
field_system: str = "system",
|
field_system: str = "system",
|
||||||
field_tools: str = "tools",
|
|
||||||
roles: dict[str, list[str]] | None = None,
|
roles: dict[str, list[str]] | None = None,
|
||||||
chat_template_kwargs: dict[str, Any] | None = None,
|
chat_template_kwargs: dict[str, Any] | None = None,
|
||||||
drop_system_message: bool = False,
|
drop_system_message: bool = False,
|
||||||
@@ -72,7 +66,6 @@ class ChatTemplatePrompter(Prompter):
|
|||||||
self.message_field_training_detail = message_field_training_detail
|
self.message_field_training_detail = message_field_training_detail
|
||||||
self.field_messages = field_messages
|
self.field_messages = field_messages
|
||||||
self.field_system = field_system
|
self.field_system = field_system
|
||||||
self.field_tools = field_tools
|
|
||||||
self.tokenizer = tokenizer
|
self.tokenizer = tokenizer
|
||||||
self.processor: ProcessorMixin | None = processor
|
self.processor: ProcessorMixin | None = processor
|
||||||
self.chat_template = chat_template
|
self.chat_template = chat_template
|
||||||
@@ -84,38 +77,17 @@ class ChatTemplatePrompter(Prompter):
|
|||||||
def chat_template_msg_variables(self) -> Set[str]:
|
def chat_template_msg_variables(self) -> Set[str]:
|
||||||
return self._chat_template_msg_variables
|
return self._chat_template_msg_variables
|
||||||
|
|
||||||
def build_prompt(
|
def build_prompt(self, conversation, add_generation_prompt=False, images=None):
|
||||||
self,
|
|
||||||
conversation: list[dict],
|
|
||||||
add_generation_prompt=False,
|
|
||||||
images=None,
|
|
||||||
tools=None,
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Build a prompt from a conversation.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
conversation: A list of messages.
|
|
||||||
add_generation_prompt: Whether to add a generation prompt.
|
|
||||||
images: A list of images. (optional)
|
|
||||||
tools: A list of tools. (optional)
|
|
||||||
"""
|
|
||||||
chat_template_kwargs = {
|
|
||||||
"chat_template": self.chat_template,
|
|
||||||
"add_generation_prompt": add_generation_prompt,
|
|
||||||
}
|
|
||||||
|
|
||||||
if tools:
|
|
||||||
chat_template_kwargs["tools"] = tools
|
|
||||||
|
|
||||||
if self.processor:
|
if self.processor:
|
||||||
if not callable(self.processor):
|
if not callable(self.processor):
|
||||||
raise TypeError("Processor must be callable")
|
raise TypeError("Processor must be callable")
|
||||||
|
|
||||||
text = self.processor.apply_chat_template(
|
text = self.processor.apply_chat_template(
|
||||||
conversation,
|
conversation,
|
||||||
|
chat_template=self.chat_template,
|
||||||
tokenize=False,
|
tokenize=False,
|
||||||
**chat_template_kwargs,
|
add_generation_prompt=add_generation_prompt,
|
||||||
|
**self.chat_template_kwargs,
|
||||||
)
|
)
|
||||||
batch = self.processor(
|
batch = self.processor(
|
||||||
text=text,
|
text=text,
|
||||||
@@ -132,7 +104,9 @@ class ChatTemplatePrompter(Prompter):
|
|||||||
|
|
||||||
return self.tokenizer.apply_chat_template(
|
return self.tokenizer.apply_chat_template(
|
||||||
conversation,
|
conversation,
|
||||||
**chat_template_kwargs,
|
add_generation_prompt=add_generation_prompt,
|
||||||
|
chat_template=self.chat_template,
|
||||||
|
**self.chat_template_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_offsets_for_train_detail(
|
def get_offsets_for_train_detail(
|
||||||
@@ -276,15 +250,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
|
self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
|
||||||
|
|
||||||
# Default to eos_token if eot_tokens not provided
|
# Default to eos_token if eot_tokens not provided
|
||||||
self.eot_tokens = []
|
self.eot_tokens = (
|
||||||
if eot_tokens is not None:
|
eot_tokens if eot_tokens is not None else [self.tokenizer.eos_token]
|
||||||
self.eot_tokens = eot_tokens
|
)
|
||||||
elif (
|
|
||||||
hasattr(self.tokenizer, "eos_token")
|
|
||||||
and self.tokenizer.eos_token is not None
|
|
||||||
):
|
|
||||||
self.eot_tokens = [self.tokenizer.eos_token]
|
|
||||||
|
|
||||||
self.split_thinking = split_thinking
|
self.split_thinking = split_thinking
|
||||||
|
|
||||||
self.images = "images"
|
self.images = "images"
|
||||||
@@ -408,7 +376,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
and not self.prompter.message_field_training_detail # type: ignore
|
and not self.prompter.message_field_training_detail # type: ignore
|
||||||
):
|
):
|
||||||
turns = self.get_conversation_thread(prompt)
|
turns = self.get_conversation_thread(prompt)
|
||||||
images = self._get_images(prompt)
|
images = self.get_images(prompt)
|
||||||
prompt_ids = self.prompter.build_prompt( # type: ignore
|
prompt_ids = self.prompter.build_prompt( # type: ignore
|
||||||
turns[:-1],
|
turns[:-1],
|
||||||
add_generation_prompt=True,
|
add_generation_prompt=True,
|
||||||
@@ -437,8 +405,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
return tokenized_prompt
|
return tokenized_prompt
|
||||||
|
|
||||||
turns = self.get_conversation_thread(prompt)
|
turns = self.get_conversation_thread(prompt)
|
||||||
tools = self._get_tools(prompt)
|
input_ids = self.prompter.build_prompt(turns) # type: ignore
|
||||||
input_ids = self.prompter.build_prompt(turns, tools=tools) # type: ignore
|
|
||||||
labels = [IGNORE_TOKEN_ID] * len(input_ids)
|
labels = [IGNORE_TOKEN_ID] * len(input_ids)
|
||||||
|
|
||||||
last_eos_idx = -1
|
last_eos_idx = -1
|
||||||
@@ -477,9 +444,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
turn_start_idx, turn_end_idx = self.find_turn(
|
turn_start_idx, turn_end_idx = self.find_turn(turns=turns, turn_idx=index)
|
||||||
turns=turns, turn_idx=index, tools=tools
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")
|
LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")
|
||||||
|
|
||||||
@@ -581,9 +546,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
return i
|
return i
|
||||||
return -1
|
return -1
|
||||||
|
|
||||||
def find_turn(
|
def find_turn(self, turns: list[dict], turn_idx: int):
|
||||||
self, turns: list[dict], turn_idx: int, tools: list[dict] | None = None
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Locate the starting and ending indices of the specified turn in a conversation.
|
Locate the starting and ending indices of the specified turn in a conversation.
|
||||||
"""
|
"""
|
||||||
@@ -614,10 +577,10 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
turns_with_content = turns[: turn_idx + 1]
|
turns_with_content = turns[: turn_idx + 1]
|
||||||
|
|
||||||
# Generate the conversation up to the turn, with final turn replaced with dummy content
|
# Generate the conversation up to the turn, with final turn replaced with dummy content
|
||||||
dummy_ids = self.prompter.build_prompt(turns_with_empty, tools=tools) # type: ignore
|
dummy_ids = self.prompter.build_prompt(turns_with_empty) # type: ignore
|
||||||
|
|
||||||
# Generate the conversation up to the turn, with final turn included
|
# Generate the conversation up to the turn, with final turn included
|
||||||
full_ids = self.prompter.build_prompt(turns_with_content, tools=tools) # type: ignore
|
full_ids = self.prompter.build_prompt(turns_with_content) # type: ignore
|
||||||
|
|
||||||
if not full_ids or not dummy_ids:
|
if not full_ids or not dummy_ids:
|
||||||
LOG.warning(f"Empty template generated for turn {turn_idx}")
|
LOG.warning(f"Empty template generated for turn {turn_idx}")
|
||||||
@@ -670,10 +633,9 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
def get_conversation_thread(self, prompt):
|
def get_conversation_thread(self, prompt):
|
||||||
turns = []
|
turns = []
|
||||||
|
|
||||||
messages = self._get_messages(prompt)
|
possible_sys_turn = self.transform_message(
|
||||||
|
prompt[self.prompter.field_messages][0]
|
||||||
possible_sys_turn = self.transform_message(messages[0])
|
)
|
||||||
|
|
||||||
if (
|
if (
|
||||||
possible_sys_turn["role"] != "system"
|
possible_sys_turn["role"] != "system"
|
||||||
and self.prompter.field_system in prompt
|
and self.prompter.field_system in prompt
|
||||||
@@ -681,7 +643,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
turn = {"role": "system", "content": prompt[self.prompter.field_system]}
|
turn = {"role": "system", "content": prompt[self.prompter.field_system]}
|
||||||
turns.append(turn)
|
turns.append(turn)
|
||||||
|
|
||||||
for message in messages:
|
for message in prompt[self.prompter.field_messages]:
|
||||||
transformed_message = self.transform_message(message)
|
transformed_message = self.transform_message(message)
|
||||||
|
|
||||||
turn = {
|
turn = {
|
||||||
@@ -699,7 +661,7 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
|
|
||||||
return turns
|
return turns
|
||||||
|
|
||||||
def transform_message(self, message: dict) -> dict:
|
def transform_message(self, message):
|
||||||
# Build the initial transformed message from the mappings
|
# Build the initial transformed message from the mappings
|
||||||
transformed_message = {}
|
transformed_message = {}
|
||||||
for key, value in self.prompter.message_property_mappings.items():
|
for key, value in self.prompter.message_property_mappings.items():
|
||||||
@@ -776,135 +738,18 @@ class ChatTemplateStrategy(PromptTokenizingStrategy):
|
|||||||
|
|
||||||
return transformed_message
|
return transformed_message
|
||||||
|
|
||||||
def _get_images(self, prompt):
|
def get_images(self, prompt):
|
||||||
return prompt.get(self.images, None)
|
return prompt.get(self.images, None)
|
||||||
|
|
||||||
def _get_tools(self, prompt) -> list[dict] | None:
|
|
||||||
"""Get tools from prompt if available."""
|
|
||||||
tools = prompt.get(self.prompter.field_tools, None)
|
|
||||||
if tools is None:
|
|
||||||
return None
|
|
||||||
|
|
||||||
if isinstance(tools, list):
|
|
||||||
return tools
|
|
||||||
|
|
||||||
raise ValueError(
|
|
||||||
"Unknown tools format. Please convert it into a list[dict].\n"
|
|
||||||
f"Current format: {type(tools)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
def _get_messages(self, prompt):
|
|
||||||
messages = prompt.get(self.prompter.field_messages, None)
|
|
||||||
if messages is None:
|
|
||||||
raise ValueError("Messages is null. Please check `field_messages`.")
|
|
||||||
|
|
||||||
if isinstance(messages, list):
|
|
||||||
return messages
|
|
||||||
|
|
||||||
raise ValueError(
|
|
||||||
"Unknown messages format. Please convert it into a list[dict].\n"
|
|
||||||
f"Current format: {type(messages)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class MistralStrategy(ChatTemplateStrategy):
|
|
||||||
"""
|
|
||||||
Mistral strategy for chat template.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self,
|
|
||||||
prompter: "ChatTemplatePrompter",
|
|
||||||
tokenizer: "HFMistralTokenizer",
|
|
||||||
train_on_inputs: bool,
|
|
||||||
sequence_len: int,
|
|
||||||
roles_to_train: list[str] | None = None,
|
|
||||||
train_on_eos: str | None = None,
|
|
||||||
train_on_eot: str | None = None,
|
|
||||||
eot_tokens: list[str] | None = None,
|
|
||||||
split_thinking: bool | None = False,
|
|
||||||
):
|
|
||||||
# Call the parent's parent __init__ (PromptTokenizingStrategy) to skip ChatTemplateStrategy's validation
|
|
||||||
# pylint: disable=non-parent-init-called,super-init-not-called
|
|
||||||
PromptTokenizingStrategy.__init__(
|
|
||||||
self, prompter, tokenizer, train_on_inputs, sequence_len
|
|
||||||
)
|
|
||||||
self.prompter: ChatTemplatePrompter = prompter
|
|
||||||
|
|
||||||
self.roles_to_train = []
|
|
||||||
if roles_to_train:
|
|
||||||
# map roles if exist in prompter.roles else use the role as is
|
|
||||||
self.roles_to_train = [
|
|
||||||
prompter.roles.get(role, role) for role in roles_to_train
|
|
||||||
]
|
|
||||||
|
|
||||||
self.train_on_eos = train_on_eos
|
|
||||||
# Backward compatibility, load from train_on_eos
|
|
||||||
self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos
|
|
||||||
|
|
||||||
# Default to eos_token if eot_tokens not provided
|
|
||||||
self.eot_tokens = []
|
|
||||||
if eot_tokens is not None:
|
|
||||||
self.eot_tokens = eot_tokens
|
|
||||||
else:
|
|
||||||
# set eot_tokens to the eos_token
|
|
||||||
self.eot_tokens = [self.tokenizer.eos_token]
|
|
||||||
|
|
||||||
self.split_thinking = split_thinking
|
|
||||||
|
|
||||||
self.images = "images"
|
|
||||||
|
|
||||||
LOG.debug(
|
|
||||||
f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Skip the validation that ChatTemplateStrategy calls
|
|
||||||
# TODO: address this in the future with mistral-specific checks
|
|
||||||
# self._validate_eot_and_eos_tokens()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def supports_multiprocessing(self) -> bool:
|
|
||||||
"""
|
|
||||||
Whether this tokenizing strategy supports multiprocessing.
|
|
||||||
mistral_common tokenizers cannot be pickled for multiprocessing.
|
|
||||||
"""
|
|
||||||
|
|
||||||
return False
|
|
||||||
|
|
||||||
def find_first_eot_token(self, input_ids, start_idx):
|
|
||||||
"""Find the first EOT token in the input_ids starting from start_idx."""
|
|
||||||
# mistral-common tokenizer does not support eot_tokens
|
|
||||||
return self.find_first_eos_token(input_ids, start_idx)
|
|
||||||
|
|
||||||
|
|
||||||
class MistralPrompter(ChatTemplatePrompter):
|
|
||||||
"""
|
|
||||||
Mistral prompter for chat template.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
super().__init__(*args, **kwargs)
|
|
||||||
|
|
||||||
self._chat_template_msg_variables = set(["tool_call_id", "name", "tool_calls"])
|
|
||||||
|
|
||||||
|
|
||||||
class StrategyLoader:
|
class StrategyLoader:
|
||||||
"""
|
"""
|
||||||
Load chat template strategy based on configuration.
|
Load chat template strategy based on configuration.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
def _get_strategy_cls(self, cfg):
|
def _get_strategy_cls(self):
|
||||||
if cfg.tokenizer_use_mistral_common:
|
|
||||||
return MistralStrategy
|
|
||||||
|
|
||||||
return ChatTemplateStrategy
|
return ChatTemplateStrategy
|
||||||
|
|
||||||
def _get_prompter_cls(self, cfg):
|
|
||||||
if cfg.tokenizer_use_mistral_common:
|
|
||||||
return MistralPrompter
|
|
||||||
|
|
||||||
return ChatTemplatePrompter
|
|
||||||
|
|
||||||
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
|
def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
|
||||||
return {
|
return {
|
||||||
"train_on_inputs": cfg.train_on_inputs,
|
"train_on_inputs": cfg.train_on_inputs,
|
||||||
@@ -930,14 +775,9 @@ class StrategyLoader:
|
|||||||
else:
|
else:
|
||||||
dataset_config = ds_cfg
|
dataset_config = ds_cfg
|
||||||
|
|
||||||
if cfg.tokenizer_use_mistral_common:
|
chat_template_string = get_chat_template_from_config(
|
||||||
# mistral-common does not use this, so we pass an empty string
|
cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
|
||||||
chat_template_string = ""
|
)
|
||||||
else:
|
|
||||||
chat_template_string = get_chat_template_from_config(
|
|
||||||
cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
|
|
||||||
)
|
|
||||||
|
|
||||||
LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---")
|
LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---")
|
||||||
|
|
||||||
prompter_params = {
|
prompter_params = {
|
||||||
@@ -963,11 +803,10 @@ class StrategyLoader:
|
|||||||
}
|
}
|
||||||
|
|
||||||
strategy_params = self._get_strategy_params(cfg, dataset_config)
|
strategy_params = self._get_strategy_params(cfg, dataset_config)
|
||||||
strategy_cls = self._get_strategy_cls(cfg)
|
strategy_cls = self._get_strategy_cls()
|
||||||
prompter_cls = self._get_prompter_cls(cfg)
|
|
||||||
|
|
||||||
strategy = strategy_cls(
|
strategy = strategy_cls(
|
||||||
prompter_cls(**prompter_params),
|
ChatTemplatePrompter(**prompter_params),
|
||||||
tokenizer=tokenizer,
|
tokenizer=tokenizer,
|
||||||
**strategy_params,
|
**strategy_params,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -46,14 +46,6 @@ def default(
|
|||||||
)
|
)
|
||||||
|
|
||||||
messages = sample[field_messages]
|
messages = sample[field_messages]
|
||||||
if isinstance(messages, str):
|
|
||||||
messages = [
|
|
||||||
{
|
|
||||||
message_property_mappings["role"]: "user",
|
|
||||||
message_property_mappings["content"]: messages,
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
||||||
messages = [
|
messages = [
|
||||||
{
|
{
|
||||||
"role": role_map[m[message_property_mappings["role"]]],
|
"role": role_map[m[message_property_mappings["role"]]],
|
||||||
@@ -61,35 +53,13 @@ def default(
|
|||||||
}
|
}
|
||||||
for m in messages
|
for m in messages
|
||||||
]
|
]
|
||||||
|
|
||||||
chosen_raw = sample[field_chosen]
|
|
||||||
if isinstance(chosen_raw, str):
|
|
||||||
chosen_msg = {
|
|
||||||
message_property_mappings["role"]: "assistant",
|
|
||||||
message_property_mappings["content"]: chosen_raw,
|
|
||||||
}
|
|
||||||
elif isinstance(chosen_raw, dict):
|
|
||||||
chosen_msg = chosen_raw
|
|
||||||
else:
|
|
||||||
chosen_msg = chosen_raw[-1]
|
|
||||||
chosen = {
|
chosen = {
|
||||||
"role": role_map[chosen_msg[message_property_mappings["role"]]],
|
"role": role_map[sample[field_chosen][message_property_mappings["role"]]],
|
||||||
"content": chosen_msg[message_property_mappings["content"]],
|
"content": sample[field_chosen][message_property_mappings["content"]],
|
||||||
}
|
}
|
||||||
|
|
||||||
rejected_raw = sample[field_rejected]
|
|
||||||
if isinstance(rejected_raw, str):
|
|
||||||
rejected_msg = {
|
|
||||||
message_property_mappings["role"]: "assistant",
|
|
||||||
message_property_mappings["content"]: rejected_raw,
|
|
||||||
}
|
|
||||||
elif isinstance(rejected_raw, dict):
|
|
||||||
rejected_msg = rejected_raw
|
|
||||||
else:
|
|
||||||
rejected_msg = rejected_raw[-1]
|
|
||||||
rejected = {
|
rejected = {
|
||||||
"role": role_map[rejected_msg[message_property_mappings["role"]]],
|
"role": role_map[sample[field_rejected][message_property_mappings["role"]]],
|
||||||
"content": rejected_msg[message_property_mappings["content"]],
|
"content": sample[field_rejected][message_property_mappings["content"]],
|
||||||
}
|
}
|
||||||
dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}
|
dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}
|
||||||
|
|
||||||
|
|||||||
@@ -32,3 +32,4 @@ def load(tokenizer, cfg, ds_cfg, processor=None):
|
|||||||
except Exception as exc: # pylint: disable=broad-exception-caught
|
except Exception as exc: # pylint: disable=broad-exception-caught
|
||||||
LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
|
LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
|
||||||
raise exc
|
raise exc
|
||||||
|
return None
|
||||||
|
|||||||
@@ -3,7 +3,6 @@
|
|||||||
import abc
|
import abc
|
||||||
from typing import Callable, Dict, List, Optional, Tuple, Union
|
from typing import Callable, Dict, List, Optional, Tuple, Union
|
||||||
|
|
||||||
from datasets import Dataset
|
|
||||||
from transformers import BatchEncoding, PreTrainedTokenizer
|
from transformers import BatchEncoding, PreTrainedTokenizer
|
||||||
|
|
||||||
from axolotl.prompters import Prompter
|
from axolotl.prompters import Prompter
|
||||||
@@ -29,16 +28,6 @@ class DatasetWrappingStrategy(abc.ABC):
|
|||||||
Abstract class for wrapping datasets for Chat Messages
|
Abstract class for wrapping datasets for Chat Messages
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@abc.abstractmethod
|
|
||||||
def wrap_dataset(
|
|
||||||
self,
|
|
||||||
dataset,
|
|
||||||
process_count: int | None = None,
|
|
||||||
keep_in_memory: bool | None = False,
|
|
||||||
**kwargs,
|
|
||||||
) -> Dataset:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class PromptTokenizingStrategy(abc.ABC):
|
class PromptTokenizingStrategy(abc.ABC):
|
||||||
"""
|
"""
|
||||||
@@ -70,14 +59,6 @@ class PromptTokenizingStrategy(abc.ABC):
|
|||||||
def supports_batched(self):
|
def supports_batched(self):
|
||||||
return False
|
return False
|
||||||
|
|
||||||
@property
|
|
||||||
def supports_multiprocessing(self):
|
|
||||||
"""
|
|
||||||
Whether this tokenizing strategy supports multiprocessing.
|
|
||||||
Should return False if the tokenizer has unpicklable objects.
|
|
||||||
"""
|
|
||||||
return True
|
|
||||||
|
|
||||||
def _tokenize(
|
def _tokenize(
|
||||||
self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
|
self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
|
||||||
) -> BatchEncoding:
|
) -> BatchEncoding:
|
||||||
|
|||||||
0
src/axolotl/telemetry/__init__.py
Normal file
0
src/axolotl/telemetry/__init__.py
Normal file
164
src/axolotl/telemetry/callbacks.py
Normal file
164
src/axolotl/telemetry/callbacks.py
Normal file
@@ -0,0 +1,164 @@
|
|||||||
|
"""Trainer callbacks for reporting runtime metrics at regular intervals."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
|
||||||
|
from transformers import (
|
||||||
|
TrainerCallback,
|
||||||
|
TrainerControl,
|
||||||
|
TrainerState,
|
||||||
|
TrainingArguments,
|
||||||
|
)
|
||||||
|
|
||||||
|
from axolotl.telemetry.manager import TelemetryManager
|
||||||
|
from axolotl.telemetry.runtime_metrics import RuntimeMetricsTracker
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
TIME_SINCE_LAST = 30
|
||||||
|
|
||||||
|
|
||||||
|
class TelemetryCallback(TrainerCallback):
    """
    Trainer callback for tracking and reporting runtime metrics.

    The callback forwards epoch / step boundaries to a `RuntimeMetricsTracker` and
    sends `train-start`, `train-progress` and `train-end` events through the
    `TelemetryManager` singleton. Progress events are throttled by
    `report_interval_steps` and the module-level `TIME_SINCE_LAST` (seconds).
    """

    report_interval_steps: int = 100

    def __init__(self):
        """Initialize the metrics callback."""
        self.tracker = RuntimeMetricsTracker()
        self.telemetry_manager = TelemetryManager.get_instance()
        # Incremented in `on_epoch_begin`, so the first epoch is reported as 0.
        self.current_epoch = -1
        self.start_time = time.time()
        # `None` until the first progress event has been sent.
        self.last_report_time: float | None = None
        self.last_report_step = 0

    # pylint: disable=unused-argument
    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle training start by sending a `train-start` event."""
        self.telemetry_manager.send_event(event_type="train-start")

    # pylint: disable=unused-argument
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle training end by sending a `train-end` event with final metrics."""
        # Tracker metrics win on key collisions (right-hand side of the dict union).
        final_metrics = self._extract_last_metrics(state) | self.tracker.metrics.to_dict()
        self.telemetry_manager.send_event(
            event_type="train-end",
            properties=final_metrics,
        )

    # pylint: disable=unused-argument
    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle epoch start."""
        self.current_epoch += 1
        self.tracker.start_epoch(self.current_epoch)

    # pylint: disable=unused-argument
    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle epoch end."""
        self.tracker.end_epoch(self.current_epoch)

    # pylint: disable=unused-argument
    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """
        Handle step end.

        A `train-progress` event is sent when any of the following holds:
          - this is the first step,
          - at least `report_interval_steps` steps passed since the last report,
          - the step is a multiple of `report_interval_steps` and at least
            `TIME_SINCE_LAST` seconds passed since the last report.
        """
        step = state.global_step
        self.tracker.update_step(step)

        steps_since_last_report = step - self.last_report_step
        is_first_step = step == 1
        interval_elapsed = steps_since_last_report >= self.report_interval_steps
        on_interval_boundary = step % self.report_interval_steps == 0

        if not (is_first_step or interval_elapsed or on_interval_boundary):
            return

        current_time = time.time()
        # Before the first report, measure against the callback creation time.
        reference_time = (
            self.last_report_time
            if self.last_report_time is not None
            else self.start_time
        )
        time_since_last_report = current_time - reference_time

        # An interval boundary alone is not enough: it also needs the time threshold.
        if not (
            is_first_step
            or interval_elapsed
            or time_since_last_report >= TIME_SINCE_LAST
        ):
            return

        # Steps per second for this reporting interval (0 when undefined).
        if time_since_last_report > 0 and steps_since_last_report > 0:
            steps_per_second = steps_since_last_report / time_since_last_report
        else:
            steps_per_second = 0

        # Refresh peak memory before building the payload.
        self.tracker.update_memory_metrics()

        metrics = self._extract_last_metrics(state) | {
            "step": step,
            "epoch": self.current_epoch,
            "progress": state.epoch,  # Fractional epoch progress
            "steps_per_second": steps_per_second,
            "elapsed_time": current_time - self.start_time,
            "time_since_last_report": time_since_last_report,
        }
        metrics["memory"] = self.tracker.get_memory_metrics()

        self.telemetry_manager.send_event(
            event_type="train-progress", properties=metrics
        )

        self.last_report_time = current_time
        self.last_report_step = step

    def _extract_last_metrics(self, state: TrainerState) -> dict:
        """
        Extract the most recent loss and learning_rate from the log history.

        The newest log record may not carry these keys (for example when it is an
        evaluation or summary record), so the history is scanned from newest to
        oldest and each key takes the first value found.

        Args:
            state: Trainer state whose `log_history` is a list of dict records.

        Returns:
            Dict with `loss` and `learning_rate`; a key is 0 when no record has it.
        """
        metrics = {"loss": 0, "learning_rate": 0}
        missing = set(metrics)

        for record in reversed(state.log_history or []):
            # `missing & record.keys()` builds a new set, so `missing` can shrink.
            for key in missing & record.keys():
                metrics[key] = record[key]
                missing.discard(key)
            if not missing:
                break

        return metrics
|
||||||
160
src/axolotl/telemetry/errors.py
Normal file
160
src/axolotl/telemetry/errors.py
Normal file
@@ -0,0 +1,160 @@
|
|||||||
|
"""Telemetry utilities for exception and traceback information."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import traceback
|
||||||
|
from functools import wraps
|
||||||
|
from inspect import getmodule
|
||||||
|
from typing import Any, Callable
|
||||||
|
|
||||||
|
from axolotl.telemetry.manager import TelemetryManager
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
ERROR_HANDLED = False
|
||||||
|
|
||||||
|
|
||||||
|
# Captures the path inside `File "<path>"` fragments of a formatted traceback.
_TRACEBACK_FILE_RE = re.compile(r'(?:File ")(.*?)(?:")')

# Matches "site-packages/<package>" or "dist-packages/<package>" with either
# POSIX or Windows separators.
_PACKAGES_DIR_RE = re.compile(r"(?:site-packages|dist-packages)[/\\]([\w\-\.]+)")

# Splits on both separator styles so Windows paths are handled on any platform.
_PATH_SEPARATOR_RE = re.compile(r"[/\\]")


def _sanitize_path(full_path: str) -> str:
    """Return the non-identifying tail of `full_path` (may be an empty string)."""
    packages_match = _PACKAGES_DIR_RE.search(full_path)
    if packages_match:
        # Keep from "site-packages" / "dist-packages" onward.
        return full_path[packages_match.start() :]

    axolotl_idx = full_path.rfind("axolotl")
    if axolotl_idx >= 0:
        # Keep from the last "axolotl" occurrence onward.
        return full_path[axolotl_idx:]

    # Unknown location: keep only the file name.
    return _PATH_SEPARATOR_RE.split(full_path)[-1]


def sanitize_stack_trace(stack_trace: str) -> str:
    """
    Remove personal information from stack trace messages while keeping Python package codepaths.

    Each line containing a `File "<path>"` fragment has its path shortened:
      - paths inside `site-packages` / `dist-packages` are kept from that directory
        onward,
      - other paths containing "axolotl" are kept from the last "axolotl" onward,
      - any other path is reduced to its file name.
    Lines without such a fragment are returned unchanged.

    Args:
        stack_trace: The original stack trace string.

    Returns:
        A sanitized version of the stack trace with Python package paths preserved.
    """
    sanitized_lines = []

    for line in stack_trace.split("\n"):
        path_match = _TRACEBACK_FILE_RE.search(line)
        if path_match:
            full_path = path_match.group(1)
            line = line.replace(full_path, _sanitize_path(full_path))
        sanitized_lines.append(line)

    return "\n".join(sanitized_lines)
|
||||||
|
|
||||||
|
|
||||||
|
def _report_exception(telemetry_manager, func: Callable, exception: Exception) -> None:
    """Send one `<module>.<function>-error` event describing `exception`."""
    func_name = getattr(func, "__name__", type(func).__name__)
    module = getmodule(func)
    module_path = f"{module.__name__}.{func_name}" if module else func_name

    stack_trace = "".join(
        traceback.format_exception(
            type(exception), exception, exception.__traceback__
        )
    )

    telemetry_manager.send_event(
        event_type=f"{module_path}-error",
        properties={
            # NOTE(review): the exception message itself is sent unsanitized and
            # could contain paths; only the stack trace is sanitized here.
            "exception": str(exception),
            "stack_trace": sanitize_stack_trace(stack_trace),
        },
    )


def send_errors(func: Callable) -> Callable:
    """
    Decorator to send exception info in a function. If an exception is raised, we send
    telemetry containing the stack trace and error message.

    If an error occurs in a decorated function that is called by another decorated
    function, we'll only send telemetry corresponding to the lower-level function.
    This is tracked with the module-level `ERROR_HANDLED` flag, which is set on the
    first reported error and is not reset by this decorator.

    The original exception is always re-raised; a failure while building or sending
    the telemetry report is logged and never replaces it.

    Args:
        func: Function to decorate.

    Returns:
        Decorated function.
    """

    @wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        global ERROR_HANDLED  # pylint: disable=global-statement

        telemetry_manager = TelemetryManager.get_instance()

        if not telemetry_manager.enabled:
            return func(*args, **kwargs)

        try:
            return func(*args, **kwargs)
        except Exception as exception:
            # Only track if we're not already handling an error. This prevents us from
            # capturing an error more than once in nested decorated function calls.
            if not ERROR_HANDLED:
                ERROR_HANDLED = True
                try:
                    _report_exception(telemetry_manager, func, exception)
                except Exception as report_error:  # pylint: disable=broad-exception-caught
                    LOG.warning(f"Failed to report error telemetry: {report_error}")

            # Bare raise re-raises the caller's original exception.
            raise

    return wrapper
|
||||||
417
src/axolotl/telemetry/manager.py
Normal file
417
src/axolotl/telemetry/manager.py
Normal file
@@ -0,0 +1,417 @@
|
|||||||
|
"""Telemetry manager and associated utilities."""
|
||||||
|
|
||||||
|
import atexit
|
||||||
|
import importlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import platform
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import posthog
|
||||||
|
import psutil
|
||||||
|
import torch
|
||||||
|
import yaml
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
POSTHOG_HOST = "https://app.posthog.com"
|
||||||
|
POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y"
|
||||||
|
|
||||||
|
OPT_IN_WARNING_SLEEP_SECONDS = 10
|
||||||
|
OPT_IN_WARNING = (
|
||||||
|
"\nTelemetry is currently disabled by default. If you'd like to help improve "
|
||||||
|
"Axolotl, consider enabling it by setting AXOLOTL_DO_NOT_TRACK=0 in your environment.\n\n"
|
||||||
|
"Telemetry data helps us understand:\n"
|
||||||
|
"- Which features are most used\n"
|
||||||
|
"- What hardware configurations to prioritize\n"
|
||||||
|
"- Where users encounter errors\n\n"
|
||||||
|
"Personally identifiable information (PII) is not collected.\n\n"
|
||||||
|
"To remove this warning, explicitly set AXOLOTL_DO_NOT_TRACK=0 (enable telemetry) "
|
||||||
|
"or AXOLOTL_DO_NOT_TRACK=1 (explicitly disable telemetry).\n\n"
|
||||||
|
"Note: Telemetry will move to an opt-out in a later release.\n\n"
|
||||||
|
"For details, see: https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html\n\n"
|
||||||
|
f"Sleeping for {OPT_IN_WARNING_SLEEP_SECONDS}s..."
|
||||||
|
)
|
||||||
|
|
||||||
|
WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")
|
||||||
|
|
||||||
|
# NOTE: Need to keep these up to date with any config schema changes
|
||||||
|
FIELDS_TO_REDACT = {
|
||||||
|
"base_model",
|
||||||
|
"tokenizer_config",
|
||||||
|
"base_model_config",
|
||||||
|
"pretraining_dataset", # NOTE: this field may be a string or a dictionary
|
||||||
|
"resume_from_checkpoint",
|
||||||
|
"hub_model_id",
|
||||||
|
}
|
||||||
|
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_"}
|
||||||
|
PATH_INDICATORS = {"path", "dir"}
|
||||||
|
|
||||||
|
# pylint: disable=duplicate-code
|
||||||
|
RELEVANT_PACKAGES = {
|
||||||
|
"torch",
|
||||||
|
"transformers",
|
||||||
|
"trl",
|
||||||
|
"datasets",
|
||||||
|
"peft",
|
||||||
|
"bitsandbytes",
|
||||||
|
"accelerate",
|
||||||
|
"optimum",
|
||||||
|
"deepspeed",
|
||||||
|
"ray",
|
||||||
|
"axolotl",
|
||||||
|
"triton",
|
||||||
|
"mamba-ssm",
|
||||||
|
"flash-attn",
|
||||||
|
"xformers",
|
||||||
|
"autoawq",
|
||||||
|
"tokenizers",
|
||||||
|
"sentencepiece",
|
||||||
|
"torchao",
|
||||||
|
"lm_eval",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def is_main_process() -> bool:
    """
    Check whether we're running in the main process.

    Note:
        We're using this function instead of `torch.utils.distributed.is_main_process`
        since that causes issues with DeepSpeed world_size. This function avoids that
        issue by checking env vars that are set by various launchers.

    Returns:
        Whether we're running in the main process (global rank 0). If no rank
        information is available, the process is assumed to be the main one.
    """
    # If PyTorch distributed is already initialized, use it. `is_available()` is
    # checked first so builds without distributed support are handled.
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        return torch.distributed.get_rank() == 0

    # Otherwise check environment variables for global rank, in priority order.
    # NOTE: need to verify this in SLURM / OpenMPI environments
    for var_name in ("RANK", "GLOBAL_RANK", "SLURM_PROCID", "OMPI_COMM_WORLD_RANK"):
        raw_rank = os.environ.get(var_name)
        if raw_rank is None:
            continue
        try:
            return int(raw_rank) == 0
        except ValueError:
            # Empty or malformed value: fall through to the next variable rather
            # than crashing the run over telemetry.
            continue

    return True
|
||||||
|
|
||||||
|
|
||||||
|
class TelemetryManager:
|
||||||
|
"""Manages telemetry collection and transmission"""
|
||||||
|
|
||||||
|
_instance = None
|
||||||
|
_initialized = False
|
||||||
|
|
||||||
|
def __new__(cls):
|
||||||
|
"""
|
||||||
|
Telemetry manager constructor. Creates the singleton instance of this class if
|
||||||
|
it doesn't already exist.
|
||||||
|
"""
|
||||||
|
if cls._instance is None:
|
||||||
|
cls._instance = super(TelemetryManager, cls).__new__(cls)
|
||||||
|
cls._instance._initialized = False
|
||||||
|
|
||||||
|
return cls._instance
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Telemetry manager initializer"""
|
||||||
|
if self._initialized:
|
||||||
|
return
|
||||||
|
|
||||||
|
self.enabled = self._check_telemetry_enabled()
|
||||||
|
|
||||||
|
if self.enabled:
|
||||||
|
self.run_id = str(uuid.uuid4())
|
||||||
|
self.whitelist = self._load_whitelist()
|
||||||
|
|
||||||
|
try:
|
||||||
|
self.system_info = self._get_system_info()
|
||||||
|
except Exception as e: # pylint: disable=broad-exception-caught
|
||||||
|
LOG.warning(f"Error during system info collection: {e}")
|
||||||
|
self.system_info = None
|
||||||
|
|
||||||
|
self._init_posthog()
|
||||||
|
|
||||||
|
# Register shutdown method to flush posthog telemetry
|
||||||
|
atexit.register(self.shutdown)
|
||||||
|
|
||||||
|
self._initialized = True
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_instance(cls) -> "TelemetryManager":
|
||||||
|
if cls._instance is None:
|
||||||
|
cls._instance = TelemetryManager()
|
||||||
|
|
||||||
|
return cls._instance
|
||||||
|
|
||||||
|
def _check_telemetry_enabled(self) -> bool:
|
||||||
|
"""
|
||||||
|
Check if telemetry is enabled based on environment variables. We also check
|
||||||
|
whether this is the main process (for the distributed setting and to avoid
|
||||||
|
sending duplicate PostHog events per GPU).
|
||||||
|
|
||||||
|
Note: This is disabled by default on an opt-in basis. Set
|
||||||
|
`AXOLOTL_DO_NOT_TRACK=0` to enable telemetry. We plan to move to an opt-out
|
||||||
|
model in a later release. For more details, see
|
||||||
|
https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Boolean denoting whether telemetry is enabled or not.
|
||||||
|
"""
|
||||||
|
# Parse relevant env vars
|
||||||
|
axolotl_do_not_track = os.getenv("AXOLOTL_DO_NOT_TRACK")
|
||||||
|
do_not_track = os.getenv("DO_NOT_TRACK")
|
||||||
|
|
||||||
|
# Default to disabled (opt-in model for initial release)
|
||||||
|
if axolotl_do_not_track is None or axolotl_do_not_track.lower() not in (
|
||||||
|
"0",
|
||||||
|
"1",
|
||||||
|
"false",
|
||||||
|
"true",
|
||||||
|
):
|
||||||
|
# Print opt-in info message for main process only
|
||||||
|
if is_main_process():
|
||||||
|
LOG.warning(OPT_IN_WARNING)
|
||||||
|
time.sleep(OPT_IN_WARNING_SLEEP_SECONDS)
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Only rank 0 will send telemetry
|
||||||
|
if not is_main_process():
|
||||||
|
return False
|
||||||
|
|
||||||
|
if do_not_track is None:
|
||||||
|
do_not_track = "0"
|
||||||
|
|
||||||
|
# Respect AXOLOTL_DO_NOT_TRACK, DO_NOT_TRACK if enabled
|
||||||
|
enabled = axolotl_do_not_track.lower() not in (
|
||||||
|
"1",
|
||||||
|
"true",
|
||||||
|
) and do_not_track.lower() not in ("1", "true")
|
||||||
|
|
||||||
|
return enabled
|
||||||
|
|
||||||
|
def _load_whitelist(self) -> dict:
|
||||||
|
"""Load HuggingFace Hub organization whitelist"""
|
||||||
|
with open(WHITELIST_PATH, encoding="utf-8") as f:
|
||||||
|
whitelist = yaml.safe_load(f)
|
||||||
|
|
||||||
|
# Send org strings to lowercase since model names are case insensitive
|
||||||
|
whitelist["organizations"] = {
|
||||||
|
org.lower() for org in whitelist["organizations"]
|
||||||
|
}
|
||||||
|
|
||||||
|
return whitelist
|
||||||
|
|
||||||
|
def _is_whitelisted(self, value: str) -> bool:
|
||||||
|
"""
|
||||||
|
Check if model / dataset / etc. org is in whitelist.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
value: Value for one of `axolotl.telemetry.manager.FIELDS_WITH_ORGS`
|
||||||
|
("base_model", etc.).
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Boolean indicating whitelist membership.
|
||||||
|
"""
|
||||||
|
# NOTE: This membership-checking logic can be improved.
|
||||||
|
# What happens when a local model path matches a whitelisted org?
|
||||||
|
parts = value.split("/")
|
||||||
|
if len(parts) < 2:
|
||||||
|
return False
|
||||||
|
org = parts[0]
|
||||||
|
whitelisted = org.lower() in self.whitelist["organizations"]
|
||||||
|
|
||||||
|
return whitelisted
|
||||||
|
|
||||||
|
def _init_posthog(self):
|
||||||
|
"""Initialize PostHog client"""
|
||||||
|
posthog.host = POSTHOG_HOST
|
||||||
|
posthog.project_api_key = POSTHOG_WRITE_KEY
|
||||||
|
|
||||||
|
def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Redact properties to remove any paths, so as to avoid inadvertently collecting
|
||||||
|
private or personally identifiable information (PII). We also remove
|
||||||
|
information related to Wandb, MLflow, etc. configuration.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
properties: Dictionary of properties to redact.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Properties dictionary with redaction applied.
|
||||||
|
"""
|
||||||
|
if not properties:
|
||||||
|
return {}
|
||||||
|
|
||||||
|
def redact_value(value: Any, key: str = "") -> Any:
|
||||||
|
"""Recursively sanitize values, redacting those with path-like keys"""
|
||||||
|
if isinstance(key, str) and isinstance(value, str):
|
||||||
|
# Other redaction special cases
|
||||||
|
if (
|
||||||
|
key in FIELDS_TO_REDACT
|
||||||
|
or any(prefix in key for prefix in PREFIXES_TO_REDACT)
|
||||||
|
or any(indicator in key.lower() for indicator in PATH_INDICATORS)
|
||||||
|
):
|
||||||
|
# Fields with whitelisted orgs don't need to be redacted
|
||||||
|
if not self._is_whitelisted(value):
|
||||||
|
return "[REDACTED]"
|
||||||
|
|
||||||
|
# Handle nested values
|
||||||
|
if isinstance(value, dict):
|
||||||
|
return {k: redact_value(v, k) for k, v in value.items()}
|
||||||
|
if isinstance(value, list):
|
||||||
|
return [redact_value(item) for item in value]
|
||||||
|
|
||||||
|
return value
|
||||||
|
|
||||||
|
# Create new dict with redacted values
|
||||||
|
redacted = {k: redact_value(v, k) for k, v in properties.items()}
|
||||||
|
|
||||||
|
return redacted
|
||||||
|
|
||||||
|
def _get_system_info(self) -> dict[str, Any]:
|
||||||
|
"""Collect system information for various hardware accelerators"""
|
||||||
|
gpu_info = []
|
||||||
|
accelerator_type = "none"
|
||||||
|
|
||||||
|
# NVIDIA GPUs
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
accelerator_type = "cuda"
|
||||||
|
for i in range(torch.cuda.device_count()):
|
||||||
|
gpu_info.append(
|
||||||
|
{
|
||||||
|
"name": torch.cuda.get_device_name(i),
|
||||||
|
"memory": torch.cuda.get_device_properties(i).total_memory,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# AMD GPUs
|
||||||
|
elif hasattr(torch, "hip") and torch.hip.is_available():
|
||||||
|
accelerator_type = "hip"
|
||||||
|
for i in range(torch.hip.device_count()):
|
||||||
|
gpu_info.append(
|
||||||
|
{
|
||||||
|
"name": torch.hip.get_device_name(i),
|
||||||
|
"memory": (
|
||||||
|
torch.hip.get_device_properties(i).total_memory
|
||||||
|
if hasattr(torch.hip, "get_device_properties")
|
||||||
|
else None
|
||||||
|
),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apple Silicon
|
||||||
|
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
||||||
|
accelerator_type = "mps"
|
||||||
|
gpu_info.append(
|
||||||
|
{
|
||||||
|
"name": "Apple Silicon",
|
||||||
|
# NOTE: this is memory allocated to this process, not total memory
|
||||||
|
"memory": torch.mps.driver_allocated_memory(),
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Intel GPUs
|
||||||
|
elif hasattr(torch, "xpu") and torch.xpu.is_available():
|
||||||
|
accelerator_type = "xpu"
|
||||||
|
for i in range(torch.xpu.device_count()):
|
||||||
|
memory = None
|
||||||
|
if hasattr(torch.xpu, "get_device_properties"):
|
||||||
|
memory = torch.xpu.get_device_properties(i).total_memory
|
||||||
|
|
||||||
|
gpu_info.append(
|
||||||
|
{
|
||||||
|
"name": torch.xpu.get_device_name(i),
|
||||||
|
"memory": memory,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# NPUs
|
||||||
|
elif hasattr(torch, "npu") and torch.npu.is_available():
|
||||||
|
accelerator_type = "npu"
|
||||||
|
for i in range(torch.npu.device_count()):
|
||||||
|
memory = None
|
||||||
|
if hasattr(torch.npu, "get_device_properties"):
|
||||||
|
memory = torch.npu.get_device_properties(i).total_memory
|
||||||
|
|
||||||
|
gpu_info.append(
|
||||||
|
{
|
||||||
|
"name": torch.npu.get_device_name(i),
|
||||||
|
"memory": memory,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get relevant package versions
|
||||||
|
installed_packages = {}
|
||||||
|
for package in RELEVANT_PACKAGES:
|
||||||
|
try:
|
||||||
|
version = importlib.metadata.version(package)
|
||||||
|
installed_packages[f"{package}_version"] = version
|
||||||
|
except importlib.metadata.PackageNotFoundError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
return {
|
||||||
|
"os": platform.system(),
|
||||||
|
"python_version": platform.python_version(),
|
||||||
|
"cpu_count": psutil.cpu_count(),
|
||||||
|
"memory_total": psutil.virtual_memory().total,
|
||||||
|
"accelerator_type": accelerator_type,
|
||||||
|
"accelerator_count": len(gpu_info),
|
||||||
|
"accelerator_info": gpu_info,
|
||||||
|
**installed_packages,
|
||||||
|
}
|
||||||
|
|
||||||
|
def send_event(self, event_type: str, properties: dict[str, Any] | None = None):
|
||||||
|
"""Send a telemetry event"""
|
||||||
|
if not self.enabled:
|
||||||
|
return
|
||||||
|
|
||||||
|
if properties is None:
|
||||||
|
properties = {}
|
||||||
|
|
||||||
|
# Sanitize properties to remove PII
|
||||||
|
properties = self._redact_paths(properties)
|
||||||
|
|
||||||
|
# Wrap PostHog errors in try / except to not raise errors during Axolotl usage
|
||||||
|
try:
|
||||||
|
# Send event via PostHog
|
||||||
|
posthog.capture(
|
||||||
|
distinct_id=self.run_id,
|
||||||
|
event=event_type,
|
||||||
|
properties=properties,
|
||||||
|
disable_geoip=True,
|
||||||
|
)
|
||||||
|
except Exception as e: # pylint: disable=broad-exception-caught
|
||||||
|
LOG.warning(f"Failed to send telemetry event: {e}")
|
||||||
|
|
||||||
|
# Additionally, send system info telemetry when loading config.
|
||||||
|
# NOTE: Is this the best place for this?
|
||||||
|
if event_type == "config-loaded":
|
||||||
|
self.send_system_info()
|
||||||
|
|
||||||
|
def send_system_info(self):
|
||||||
|
"""Helper method for sending system info"""
|
||||||
|
if self.system_info is not None:
|
||||||
|
self.send_event(event_type="system-info", properties=self.system_info)
|
||||||
|
|
||||||
|
def shutdown(self):
|
||||||
|
"""Ensure all queued events are processed before shutdown"""
|
||||||
|
if self.enabled:
|
||||||
|
posthog.flush()
|
||||||
209
src/axolotl/telemetry/runtime_metrics.py
Normal file
209
src/axolotl/telemetry/runtime_metrics.py
Normal file
@@ -0,0 +1,209 @@
|
|||||||
|
"""Telemetry utilities for runtime and memory metrics."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import psutil
|
||||||
|
import torch
|
||||||
|
|
||||||
|
from axolotl.telemetry.manager import TelemetryManager
|
||||||
|
|
||||||
|
LOG = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class RuntimeMetrics:
|
||||||
|
"""Container for runtime metrics to be tracked throughout training."""
|
||||||
|
|
||||||
|
# Timing metrics
|
||||||
|
start_time: float
|
||||||
|
epoch_start_times: dict[int, float] = field(init=False)
|
||||||
|
epoch_end_times: dict[int, float] = field(init=False)
|
||||||
|
|
||||||
|
# Memory metrics
|
||||||
|
peak_cpu_memory: int = 0
|
||||||
|
peak_gpu_memory: dict[int, int] = field(init=False)
|
||||||
|
|
||||||
|
# Progress metrics
|
||||||
|
total_steps: int = 0
|
||||||
|
current_epoch: int = 0
|
||||||
|
current_step: int = 0
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
"""Initialize empty metric mappings."""
|
||||||
|
self.epoch_start_times = {}
|
||||||
|
self.epoch_end_times = {}
|
||||||
|
self.peak_gpu_memory = {}
|
||||||
|
|
||||||
|
@property
|
||||||
|
def elapsed_time(self) -> float:
|
||||||
|
"""Calculate total elapsed time in seconds."""
|
||||||
|
return time.time() - self.start_time
|
||||||
|
|
||||||
|
def epoch_time(self, epoch: int) -> float | None:
|
||||||
|
"""Calculate time taken for a specific epoch in seconds."""
|
||||||
|
if epoch in self.epoch_start_times and epoch in self.epoch_end_times:
|
||||||
|
return self.epoch_end_times[epoch] - self.epoch_start_times[epoch]
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
def average_epoch_time(self) -> float | None:
|
||||||
|
"""Calculate average time per epoch in seconds."""
|
||||||
|
completed_epochs = [
|
||||||
|
epoch for epoch in self.epoch_start_times if epoch in self.epoch_end_times
|
||||||
|
]
|
||||||
|
if not completed_epochs:
|
||||||
|
return None
|
||||||
|
|
||||||
|
total_time = 0.0
|
||||||
|
for epoch in completed_epochs:
|
||||||
|
epoch_time = self.epoch_time(epoch)
|
||||||
|
if epoch_time is not None: # Check to avoid mypy warning
|
||||||
|
total_time += epoch_time
|
||||||
|
|
||||||
|
return total_time / len(completed_epochs)
|
||||||
|
|
||||||
|
def steps_per_second(self) -> float | None:
|
||||||
|
"""Calculate average steps per second across all training."""
|
||||||
|
if self.total_steps == 0 or self.elapsed_time == 0:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return self.total_steps / self.elapsed_time
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
"""Convert metrics to a dictionary for telemetry reporting."""
|
||||||
|
metrics = {
|
||||||
|
"total_time_seconds": self.elapsed_time,
|
||||||
|
"total_steps": self.total_steps,
|
||||||
|
"steps_per_second": self.steps_per_second(),
|
||||||
|
"epochs_completed": len(
|
||||||
|
[
|
||||||
|
epoch
|
||||||
|
for epoch in self.epoch_start_times
|
||||||
|
if epoch in self.epoch_end_times
|
||||||
|
]
|
||||||
|
),
|
||||||
|
"peak_cpu_memory_bytes": self.peak_cpu_memory,
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add per-epoch timing if available
|
||||||
|
epoch_times: dict[str, float] = {}
|
||||||
|
for epoch in sorted(self.epoch_end_times.keys()):
|
||||||
|
time_taken = self.epoch_time(epoch)
|
||||||
|
if time_taken is not None:
|
||||||
|
epoch_times[f"epoch_{epoch}_seconds"] = time_taken
|
||||||
|
|
||||||
|
if epoch_times:
|
||||||
|
metrics["epoch_times"] = epoch_times # type: ignore
|
||||||
|
metrics["average_epoch_time_seconds"] = self.average_epoch_time()
|
||||||
|
|
||||||
|
# Add GPU memory metrics if available
|
||||||
|
if self.peak_gpu_memory:
|
||||||
|
gpu_metrics: dict[str, int] = {}
|
||||||
|
for gpu_id, memory in self.peak_gpu_memory.items():
|
||||||
|
gpu_metrics[f"gpu_{gpu_id}_peak_memory_bytes"] = memory
|
||||||
|
metrics["gpu_memory"] = gpu_metrics # type: ignore
|
||||||
|
|
||||||
|
return metrics
|
||||||
|
|
||||||
|
|
||||||
|
class RuntimeMetricsTracker:
|
||||||
|
"""Tracker for runtime metrics during training."""
|
||||||
|
|
||||||
|
update_interval = 100
|
||||||
|
|
||||||
|
def __init__(self):
|
||||||
|
"""Initialize the runtime metrics tracker."""
|
||||||
|
self.metrics = RuntimeMetrics(start_time=time.time())
|
||||||
|
self.telemetry_manager = TelemetryManager.get_instance()
|
||||||
|
|
||||||
|
def start_epoch(self, epoch: int):
|
||||||
|
"""Record the start of a new epoch."""
|
||||||
|
self.metrics.current_epoch = epoch
|
||||||
|
self.metrics.epoch_start_times[epoch] = time.time()
|
||||||
|
self.update_memory_metrics()
|
||||||
|
|
||||||
|
def end_epoch(self, epoch: int):
|
||||||
|
"""Record the end of an epoch."""
|
||||||
|
self.metrics.epoch_end_times[epoch] = time.time()
|
||||||
|
|
||||||
|
def update_step(self, step: int):
|
||||||
|
"""Update the current step count."""
|
||||||
|
self.metrics.current_step = step
|
||||||
|
self.metrics.total_steps += 1
|
||||||
|
|
||||||
|
# Periodically update memory metrics
|
||||||
|
if step % self.update_interval == 0:
|
||||||
|
self.update_memory_metrics()
|
||||||
|
|
||||||
|
def _get_allocated_memory(self) -> dict[int, int]:
|
||||||
|
"""
|
||||||
|
Helper function for getting accelerator-agnostic allocated memory.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary mapping device IDs to allocated memory in bytes
|
||||||
|
"""
|
||||||
|
memory_used: dict[int, int] = {}
|
||||||
|
|
||||||
|
# NVIDIA GPUs
|
||||||
|
if torch.cuda.is_available():
|
||||||
|
for i in range(torch.cuda.device_count()):
|
||||||
|
memory_used[i] = torch.cuda.memory_allocated(i)
|
||||||
|
|
||||||
|
# AMD GPUs
|
||||||
|
elif hasattr(torch, "hip") and torch.hip.is_available():
|
||||||
|
for i in range(torch.hip.device_count()):
|
||||||
|
if hasattr(torch.hip, "memory_allocated"):
|
||||||
|
memory_used[i] = torch.hip.memory_allocated(i)
|
||||||
|
|
||||||
|
# Apple Silicon
|
||||||
|
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
|
||||||
|
# MPS doesn't have per-device memory stats since there's only one device
|
||||||
|
if hasattr(torch.mps, "current_allocated_memory"):
|
||||||
|
memory_used[0] = torch.mps.current_allocated_memory()
|
||||||
|
|
||||||
|
# Intel GPUs
|
||||||
|
elif hasattr(torch, "xpu") and torch.xpu.is_available():
|
||||||
|
for i in range(torch.xpu.device_count()):
|
||||||
|
if hasattr(torch.xpu, "memory_allocated"):
|
||||||
|
memory_used[i] = torch.xpu.memory_allocated(i)
|
||||||
|
|
||||||
|
# NPUs
|
||||||
|
elif hasattr(torch, "npu") and torch.npu.is_available():
|
||||||
|
for i in range(torch.npu.device_count()):
|
||||||
|
if hasattr(torch.npu, "memory_allocated"):
|
||||||
|
memory_used[i] = torch.npu.memory_allocated(i)
|
||||||
|
|
||||||
|
return memory_used
|
||||||
|
|
||||||
|
def update_memory_metrics(self):
    """Raise the stored CPU and per-device peak memory figures if exceeded."""
    tracked = self.metrics

    # Resident set size of the current process, as reported by psutil.
    rss_now = psutil.Process().memory_info().rss
    tracked.peak_cpu_memory = max(tracked.peak_cpu_memory, rss_now)

    # Accelerator memory; the helper returns an empty mapping when there is
    # nothing to report, so this loop is then a no-op.
    for device_id, allocated in self._get_allocated_memory().items():
        previous_peak = self.metrics.peak_gpu_memory.get(device_id, 0)
        self.metrics.peak_gpu_memory[device_id] = max(previous_peak, allocated)
|
||||||
|
|
||||||
|
def get_memory_metrics(self) -> dict[str, Any]:
    """Build a flat snapshot of current and peak memory usage.

    Returns:
        A dictionary with the process RSS and the stored CPU peak, plus a
        current/peak pair of entries for every device reported by
        ``_get_allocated_memory``.
    """
    snapshot = {}
    snapshot["cpu_memory_bytes"] = psutil.Process().memory_info().rss
    snapshot["peak_cpu_memory_bytes"] = self.metrics.peak_cpu_memory

    # One current/peak pair per device; devices without a recorded peak
    # report 0.
    for device_id, allocated in self._get_allocated_memory().items():
        recorded_peak = self.metrics.peak_gpu_memory.get(device_id, 0)
        snapshot[f"gpu_{device_id}_memory_bytes"] = allocated
        snapshot[f"gpu_{device_id}_peak_memory_bytes"] = recorded_peak

    return snapshot
|
||||||
17
src/axolotl/telemetry/whitelist.yaml
Normal file
17
src/axolotl/telemetry/whitelist.yaml
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
organizations:
|
||||||
|
- "axolotl-ai-co"
|
||||||
|
- "meta-llama"
|
||||||
|
- "huggingface"
|
||||||
|
- "nvidia"
|
||||||
|
- "facebook"
|
||||||
|
- "google"
|
||||||
|
- "microsoft"
|
||||||
|
- "deepseek-ai"
|
||||||
|
- "HuggingFaceTB"
|
||||||
|
- "mistralai"
|
||||||
|
- "Qwen"
|
||||||
|
- "unsloth"
|
||||||
|
- "NousResearch"
|
||||||
|
- "allenai"
|
||||||
|
- "amd"
|
||||||
|
- "tiiuae"
|
||||||
@@ -1,13 +1,10 @@
|
|||||||
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
|
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import importlib
|
import importlib
|
||||||
import inspect
|
import inspect
|
||||||
import os
|
import os
|
||||||
import signal
|
import signal
|
||||||
import sys
|
import sys
|
||||||
import typing
|
|
||||||
import weakref
|
import weakref
|
||||||
from contextlib import ExitStack
|
from contextlib import ExitStack
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -28,12 +25,15 @@ from axolotl.common.datasets import TrainDatasetMeta
|
|||||||
from axolotl.contribs.lgpl import ( # pylint: disable = no-name-in-module
|
from axolotl.contribs.lgpl import ( # pylint: disable = no-name-in-module
|
||||||
fix_untrained_tokens,
|
fix_untrained_tokens,
|
||||||
)
|
)
|
||||||
|
from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
|
||||||
from axolotl.integrations.base import PluginManager
|
from axolotl.integrations.base import PluginManager
|
||||||
from axolotl.loaders import (
|
from axolotl.loaders import (
|
||||||
ModelLoader,
|
ModelLoader,
|
||||||
load_processor,
|
load_processor,
|
||||||
load_tokenizer,
|
load_tokenizer,
|
||||||
)
|
)
|
||||||
|
from axolotl.telemetry.errors import send_errors
|
||||||
|
from axolotl.telemetry.manager import TelemetryManager
|
||||||
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
|
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.distributed import cleanup_distributed
|
from axolotl.utils.distributed import cleanup_distributed
|
||||||
@@ -47,19 +47,19 @@ try:
|
|||||||
except ImportError:
|
except ImportError:
|
||||||
BetterTransformer = None
|
BetterTransformer = None
|
||||||
|
|
||||||
if typing.TYPE_CHECKING:
|
|
||||||
from axolotl.core.trainer_builder import HFCausalTrainerBuilder, HFRLTrainerBuilder
|
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
TELEMETRY_MANAGER = TelemetryManager.get_instance()
|
||||||
|
PLUGIN_MANAGER = PluginManager.get_instance()
|
||||||
|
|
||||||
|
|
||||||
def setup_model_and_tokenizer(
|
def setup_model_and_tokenizer(
|
||||||
cfg: DictDefault,
|
cfg: DictDefault,
|
||||||
) -> tuple[
|
) -> tuple[
|
||||||
PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
|
PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
|
||||||
]:
|
]:
|
||||||
"""Load the tokenizer, processor (for multimodal models), and model based on
|
"""
|
||||||
configuration.
|
Load the tokenizer, processor (for multimodal models), and model based on configuration.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
cfg: Dictionary mapping `axolotl` config keys to values.
|
cfg: Dictionary mapping `axolotl` config keys to values.
|
||||||
@@ -69,7 +69,10 @@ def setup_model_and_tokenizer(
|
|||||||
`None`), and processor (if multimodal, else `None`).
|
`None`), and processor (if multimodal, else `None`).
|
||||||
"""
|
"""
|
||||||
# Load tokenizer
|
# Load tokenizer
|
||||||
LOG.debug(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
|
LOG.debug(
|
||||||
|
f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
|
||||||
|
main_process_only=True,
|
||||||
|
)
|
||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
|
|
||||||
# Load processor for multimodal models if needed
|
# Load processor for multimodal models if needed
|
||||||
@@ -88,6 +91,14 @@ def setup_model_and_tokenizer(
|
|||||||
if model.generation_config is not None:
|
if model.generation_config is not None:
|
||||||
model.generation_config.do_sample = True
|
model.generation_config.do_sample = True
|
||||||
|
|
||||||
|
TELEMETRY_MANAGER.send_event(
|
||||||
|
event_type="model-load", properties=model.config.to_dict()
|
||||||
|
)
|
||||||
|
if peft_config:
|
||||||
|
TELEMETRY_MANAGER.send_event(
|
||||||
|
event_type="peft-config-load", properties=peft_config.to_dict()
|
||||||
|
)
|
||||||
|
|
||||||
# Apply freezing if specified
|
# Apply freezing if specified
|
||||||
if cfg.unfrozen_parameters:
|
if cfg.unfrozen_parameters:
|
||||||
freeze_layers_except(model, cfg.unfrozen_parameters)
|
freeze_layers_except(model, cfg.unfrozen_parameters)
|
||||||
@@ -477,7 +488,7 @@ def handle_untrained_tokens_fix(
|
|||||||
|
|
||||||
|
|
||||||
def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
|
def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> tuple[
|
||||||
"HFRLTrainerBuilder" | "HFCausalTrainerBuilder",
|
HFRLTrainerBuilder | HFCausalTrainerBuilder,
|
||||||
PeftModel | PreTrainedModel,
|
PeftModel | PreTrainedModel,
|
||||||
PreTrainedTokenizer,
|
PreTrainedTokenizer,
|
||||||
PeftConfig | None,
|
PeftConfig | None,
|
||||||
@@ -522,6 +533,7 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
|
|||||||
model_ref=model_ref,
|
model_ref=model_ref,
|
||||||
peft_config=peft_config,
|
peft_config=peft_config,
|
||||||
)
|
)
|
||||||
|
PLUGIN_MANAGER.post_trainer_create(cfg, trainer)
|
||||||
|
|
||||||
return (
|
return (
|
||||||
trainer,
|
trainer,
|
||||||
@@ -532,6 +544,7 @@ def setup_model_and_trainer(cfg: DictDefault, dataset_meta: TrainDatasetMeta) ->
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@send_errors
|
||||||
def train(
|
def train(
|
||||||
cfg: DictDefault, dataset_meta: TrainDatasetMeta
|
cfg: DictDefault, dataset_meta: TrainDatasetMeta
|
||||||
) -> tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]:
|
) -> tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]:
|
||||||
@@ -556,8 +569,11 @@ def train(
|
|||||||
processor,
|
processor,
|
||||||
) = setup_model_and_trainer(cfg, dataset_meta)
|
) = setup_model_and_trainer(cfg, dataset_meta)
|
||||||
|
|
||||||
plugin_manager = PluginManager.get_instance()
|
# Determine if we need to resume from a checkpoint
|
||||||
plugin_manager.post_trainer_create(cfg, trainer)
|
resume_from_checkpoint = determine_resume_checkpoint(cfg)
|
||||||
|
|
||||||
|
# Configuration for saving
|
||||||
|
safe_serialization = cfg.save_safetensors is True
|
||||||
|
|
||||||
# Handle untrained tokens if configured
|
# Handle untrained tokens if configured
|
||||||
safe_serialization = cfg.save_safetensors is True
|
safe_serialization = cfg.save_safetensors is True
|
||||||
@@ -572,7 +588,6 @@ def train(
|
|||||||
setup_model_card(cfg)
|
setup_model_card(cfg)
|
||||||
|
|
||||||
# Execute the training
|
# Execute the training
|
||||||
resume_from_checkpoint = determine_resume_checkpoint(cfg)
|
|
||||||
execute_training(cfg, trainer, resume_from_checkpoint)
|
execute_training(cfg, trainer, resume_from_checkpoint)
|
||||||
|
|
||||||
# Save the trained model and cleanup
|
# Save the trained model and cleanup
|
||||||
@@ -580,7 +595,6 @@ def train(
|
|||||||
create_model_card(cfg, trainer)
|
create_model_card(cfg, trainer)
|
||||||
if not cfg.use_ray:
|
if not cfg.use_ray:
|
||||||
cleanup_distributed()
|
cleanup_distributed()
|
||||||
|
PLUGIN_MANAGER.post_train(cfg, model)
|
||||||
plugin_manager.post_train(cfg, model)
|
|
||||||
|
|
||||||
return model, tokenizer, trainer
|
return model, tokenizer, trainer
|
||||||
|
|||||||
@@ -52,10 +52,3 @@ def patch_optimized_env():
|
|||||||
if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
|
if os.getenv("HF_HUB_ENABLE_HF_TRANSFER") is None:
|
||||||
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
|
||||||
set_pytorch_cuda_alloc_conf()
|
set_pytorch_cuda_alloc_conf()
|
||||||
|
|
||||||
|
|
||||||
def get_not_null(value, default=None):
    """
    Return ``value`` unless it is ``None``; in that case return ``default``.

    Falsy values other than ``None`` (``0``, ``""``, ``[]``, ``False``) are
    returned unchanged.
    """
    if value is None:
        return default
    return value
|
|
||||||
|
|||||||
@@ -53,6 +53,25 @@ IGNORE_INDEX = -100
|
|||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EvalFirstStepCallback(
    TrainerCallback
):  # pylint: disable=too-few-public-methods disable=unused-argument
    """
    Callback that requests an evaluation right after the first training step
    when evaluation is scheduled by steps.
    """

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # Only step-based evaluation schedules are affected.
        evaluates_by_steps = args.eval_strategy == IntervalStrategy.STEPS
        if evaluates_by_steps and state.global_step == 1:
            control.should_evaluate = True

        # The control object is handed back whether or not it was modified.
        return control
|
||||||
|
|
||||||
|
|
||||||
class SaveBetterTransformerModelCallback(
|
class SaveBetterTransformerModelCallback(
|
||||||
TrainerCallback
|
TrainerCallback
|
||||||
): # pylint: disable=too-few-public-methods
|
): # pylint: disable=too-few-public-methods
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -1,7 +1,7 @@
|
|||||||
"""Data collators for axolotl to pad labels and position_ids for packed sequences"""
|
"""Data collators for axolotl to pad labels and position_ids for packed sequences"""
|
||||||
|
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
from typing import Any, List
|
from typing import Any
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
from transformers import PreTrainedTokenizerBase
|
from transformers import PreTrainedTokenizerBase
|
||||||
@@ -81,11 +81,9 @@ class DataCollatorForSeq2Seq:
|
|||||||
|
|
||||||
padding_side = self.tokenizer.padding_side
|
padding_side = self.tokenizer.padding_side
|
||||||
for feature in features:
|
for feature in features:
|
||||||
remainder_len = max_feature_length - len(feature[feature_name])
|
remainder = [pad_token_id] * (
|
||||||
if feature_name == "position_ids":
|
max_feature_length - len(feature[feature_name])
|
||||||
remainder = list(range(remainder_len))
|
)
|
||||||
else:
|
|
||||||
remainder = [pad_token_id] * remainder_len
|
|
||||||
if isinstance(feature[feature_name], list):
|
if isinstance(feature[feature_name], list):
|
||||||
feature[feature_name] = (
|
feature[feature_name] = (
|
||||||
feature[feature_name] + remainder
|
feature[feature_name] + remainder
|
||||||
@@ -163,7 +161,7 @@ class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
|
|||||||
|
|
||||||
def __call__(self, features, return_tensors=None):
|
def __call__(self, features, return_tensors=None):
|
||||||
if not isinstance(features[0], list):
|
if not isinstance(features[0], list):
|
||||||
features: List[List[dict]] = [features]
|
features = [features]
|
||||||
out_features = [{} for _ in features]
|
out_features = [{} for _ in features]
|
||||||
for i, features_ in enumerate(features):
|
for i, features_ in enumerate(features):
|
||||||
for feature in features_[0].keys():
|
for feature in features_[0].keys():
|
||||||
|
|||||||
@@ -1,21 +1,16 @@
|
|||||||
"""Init for `axolotl.utils.data` module."""
|
"""
|
||||||
|
Data processing modules
|
||||||
|
"""
|
||||||
|
|
||||||
from axolotl.utils.data.pretraining import (
|
from axolotl.utils.data.pretraining import ( # noqa: F401
|
||||||
encode_pretraining,
|
encode_pretraining,
|
||||||
wrap_pretraining_dataset,
|
wrap_pretraining_dataset,
|
||||||
)
|
)
|
||||||
from axolotl.utils.data.rl import prepare_preference_datasets
|
from axolotl.utils.data.rl import load_prepare_preference_datasets # noqa: F401
|
||||||
from axolotl.utils.data.sft import (
|
from axolotl.utils.data.sft import ( # noqa: F401
|
||||||
get_dataset_wrapper,
|
get_dataset_wrapper,
|
||||||
prepare_datasets,
|
load_prepare_datasets,
|
||||||
|
load_tokenized_prepared_datasets,
|
||||||
|
prepare_dataset,
|
||||||
)
|
)
|
||||||
from axolotl.utils.data.utils import md5
|
from axolotl.utils.data.utils import md5 # noqa: F401
|
||||||
|
|
||||||
__all__ = [
|
|
||||||
"encode_pretraining",
|
|
||||||
"wrap_pretraining_dataset",
|
|
||||||
"prepare_preference_datasets",
|
|
||||||
"get_dataset_wrapper",
|
|
||||||
"prepare_datasets",
|
|
||||||
"md5",
|
|
||||||
]
|
|
||||||
|
|||||||
@@ -1,66 +0,0 @@
|
|||||||
"""Logic for loading / preparing a dataset once over all processes."""
|
|
||||||
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from typing import Any, Callable
|
|
||||||
|
|
||||||
from filelock import FileLock
|
|
||||||
|
|
||||||
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
LOCK_FILE_NAME = "datasets_prep.lock"
|
|
||||||
READY_FILE_NAME = "datasets_ready.flag"
|
|
||||||
PROCESS_COUNTER_FILE_NAME = "process_counter.txt"
|
|
||||||
|
|
||||||
|
|
||||||
class FileLockLoader:
    """
    Simple class for abstracting single process data loading / processing. The first
    process that takes the lock while no ready flag exists does the work; the
    remaining processes simply load the preprocessed dataset once the first process
    is done.
    """

    def __init__(self, cfg: DictDefault):
        """Derive the lock, ready-flag and counter file paths from ``cfg``.

        Args:
            cfg: Configuration; ``cfg.dataset_prepared_path`` is used as the
                directory for the coordination files, falling back to
                ``DEFAULT_DATASET_PREPARED_PATH`` when it is falsy.
        """
        self.cfg = cfg
        self.dataset_prepared_path = (
            cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
        )
        self.lock_file_path = Path(self.dataset_prepared_path) / LOCK_FILE_NAME
        self.ready_flag_path = Path(self.dataset_prepared_path) / READY_FILE_NAME
        self.counter_path = Path(self.dataset_prepared_path) / PROCESS_COUNTER_FILE_NAME

    def load(self, load_fn: Callable[[], Any]) -> Any:
        """Run ``load_fn`` under the file lock and return its result.

        The first caller to find no ready flag runs ``load_fn`` and then creates
        the flag; later callers run ``load_fn`` once the flag exists.

        Args:
            load_fn: Zero-argument callable that loads / prepares the data.

        Returns:
            Whatever ``load_fn`` returns.
        """
        with FileLock(str(self.lock_file_path)):
            self._increment_counter()

            if not self.ready_flag_path.exists():
                result = load_fn()
                self.ready_flag_path.touch()
                return result

            # NOTE(review): block nesting was reconstructed from a flattened
            # listing. With the lock held and the flag just observed above,
            # this wait is not expected to iterate; confirm before removing.
            while not self.ready_flag_path.exists():
                time.sleep(1)
            return load_fn()

    def _read_counter(self) -> int:
        """Return the stored process count, or 0 when no counter file exists."""
        try:
            return int(self.counter_path.read_text().strip())
        except FileNotFoundError:
            return 0

    def _increment_counter(self):
        """Safely increment the process counter."""
        self.counter_path.write_text(str(self._read_counter() + 1))

    def cleanup(self):
        """Clean up ready flag when last process is done."""
        with FileLock(str(self.lock_file_path)):
            # A missing counter file is treated as zero so that a repeated or
            # unmatched cleanup call does not raise.
            count = self._read_counter() - 1

            if count <= 0:
                # Last process cleans everything up
                self.ready_flag_path.unlink(missing_ok=True)
                self.counter_path.unlink(missing_ok=True)
            else:
                # Still have active processes
                self.counter_path.write_text(str(count))
|
|
||||||
@@ -250,7 +250,7 @@ def encode_packed_pretraining(
|
|||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
# tokenize all the examples
|
# tokenize all the examples
|
||||||
# rows get split with stride (overlap)
|
# rows get split with stride (overlap)
|
||||||
train_dataset = ds_wrapper(dataset=Dataset.from_dict(examples))[0]
|
train_dataset = ds_wrapper(Dataset.from_dict(examples))[0]
|
||||||
|
|
||||||
train_dataset = process_pretraining_datasets_for_packing(
|
train_dataset = process_pretraining_datasets_for_packing(
|
||||||
train_dataset,
|
train_dataset,
|
||||||
|
|||||||
@@ -1,117 +1,75 @@
|
|||||||
"""Data handling specific to RL trainers."""
|
"""data handling specific to DPO"""
|
||||||
|
|
||||||
import inspect
|
import inspect
|
||||||
from functools import partial
|
from functools import partial
|
||||||
from typing import Any, Callable, Literal
|
from pathlib import Path
|
||||||
|
from typing import Any, List, Union
|
||||||
|
|
||||||
from datasets import Dataset, DatasetDict
|
import yaml
|
||||||
from transformers import PreTrainedTokenizer
|
from datasets import Dataset, DatasetDict, concatenate_datasets, load_from_disk
|
||||||
|
|
||||||
|
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
||||||
from axolotl.loaders import load_tokenizer
|
from axolotl.loaders import load_tokenizer
|
||||||
from axolotl.prompt_strategies.dpo import load as load_dpo
|
from axolotl.prompt_strategies.dpo import load as load_dpo
|
||||||
from axolotl.prompt_strategies.kto import load as load_kto
|
from axolotl.prompt_strategies.kto import load as load_kto
|
||||||
from axolotl.prompt_strategies.orpo import load as load_orpo
|
from axolotl.prompt_strategies.orpo import load as load_orpo
|
||||||
from axolotl.utils.data.lock import FileLockLoader
|
from axolotl.utils.data.shared import datasets_w_name_generator, load_dataset_w_config
|
||||||
from axolotl.utils.data.shared import (
|
from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
|
||||||
create_train_validation_split,
|
|
||||||
datasets_with_name_generator,
|
|
||||||
generate_dataset_hash_from_config,
|
|
||||||
load_dataset_with_config,
|
|
||||||
load_preprocessed_dataset,
|
|
||||||
merge_datasets,
|
|
||||||
save_preprocessed_dataset,
|
|
||||||
try_load_from_hub,
|
|
||||||
)
|
|
||||||
from axolotl.utils.data.utils import (
|
|
||||||
deduplicate_and_log_datasets,
|
|
||||||
retry_on_request_exceptions,
|
|
||||||
)
|
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
from axolotl.utils.distributed import is_main_process, zero_first
|
||||||
from axolotl.utils.logging import get_logger
|
from axolotl.utils.logging import get_logger
|
||||||
from axolotl.utils.schemas.enums import RLType
|
from axolotl.utils.schemas.enums import RLType
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
LOG = get_logger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@retry_on_request_exceptions(max_retries=3, delay=5)
|
def _get_path(ds_hash, cfg):
|
||||||
def prepare_preference_datasets(
|
prepared_ds_path = (
|
||||||
cfg: DictDefault, tokenizer: PreTrainedTokenizer
|
Path(cfg.dataset_prepared_path) / ds_hash
|
||||||
) -> tuple[Dataset, Dataset | None]:
|
if cfg.dataset_prepared_path
|
||||||
"""Load and prepare preference datasets for RL training.
|
else Path(DEFAULT_DATASET_PREPARED_PATH) / ds_hash
|
||||||
|
)
|
||||||
|
|
||||||
Loads training and evaluation datasets, handling preprocessing, caching, and
|
return prepared_ds_path
|
||||||
deduplication as configured. Uses FileLock for distributed coordination.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Configuration object containing dataset and training settings.
|
|
||||||
tokenizer: Tokenizer to use for processing text.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (train_dataset, eval_dataset). eval_dataset may be None
|
|
||||||
if no evaluation dataset is configured.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def _load_datasets():
|
|
||||||
# Load training dataset
|
|
||||||
train_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="train")
|
|
||||||
|
|
||||||
# Load or create evaluation dataset
|
|
||||||
eval_dataset: Dataset | None = None
|
|
||||||
if cfg.test_datasets:
|
|
||||||
eval_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="test")
|
|
||||||
elif cfg.val_set_size:
|
|
||||||
# Create validation split from training data
|
|
||||||
train_dataset, eval_dataset = create_train_validation_split(
|
|
||||||
train_dataset, cfg, cfg.val_set_size
|
|
||||||
)
|
|
||||||
|
|
||||||
return train_dataset, eval_dataset
|
|
||||||
|
|
||||||
# Prepare datasets (with file locking logic for multiple ranks)
|
|
||||||
loader = FileLockLoader(cfg)
|
|
||||||
try:
|
|
||||||
train_dataset, eval_dataset = loader.load(_load_datasets)
|
|
||||||
finally:
|
|
||||||
loader.cleanup()
|
|
||||||
|
|
||||||
# Apply deduplication if configured
|
|
||||||
if cfg.dataset_exact_deduplication:
|
|
||||||
train_dataset, eval_dataset = deduplicate_and_log_datasets(
|
|
||||||
dataset=train_dataset, other_dataset=eval_dataset
|
|
||||||
)
|
|
||||||
|
|
||||||
return train_dataset, eval_dataset
|
|
||||||
|
|
||||||
|
|
||||||
def _map_dataset(
|
def _load_preprocessed_ds(cfg, sub_cfg):
|
||||||
cfg: DictDefault,
|
ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
|
||||||
dataset: Dataset | DatasetDict,
|
prepared_ds_path = _get_path(ds_hash, cfg)
|
||||||
ds_transform_fn: Callable[..., Any],
|
dataset = None
|
||||||
tokenizer: Any | None = None,
|
|
||||||
**map_kwargs: Any,
|
|
||||||
) -> Dataset:
|
|
||||||
"""Apply transformation function to dataset.
|
|
||||||
|
|
||||||
Args:
|
# pylint: disable=duplicate-code
|
||||||
cfg: Configuration object.
|
if (
|
||||||
dataset: Dataset to transform.
|
cfg.dataset_prepared_path
|
||||||
ds_transform_fn: Transformation function to apply.
|
and any(prepared_ds_path.glob("*"))
|
||||||
tokenizer: Optional tokenizer for transformation.
|
and not cfg.is_preprocess
|
||||||
**map_kwargs: Additional arguments for dataset mapping.
|
):
|
||||||
|
LOG.info(f"Loading prepared dataset from disk at {prepared_ds_path}...")
|
||||||
|
dataset = load_from_disk(str(prepared_ds_path))
|
||||||
|
|
||||||
Returns:
|
return dataset
|
||||||
Transformed dataset.
|
|
||||||
"""
|
|
||||||
|
def _save_preprocessed_ds(cfg, sub_cfg, dataset):
    """Save a prepared dataset to its hash-derived directory.

    The target directory name is the md5 of the YAML dump of ``sub_cfg``,
    resolved through ``_get_path``. Nothing is written unless
    ``cfg.is_preprocess`` is set and this is the main process.

    Args:
        cfg: Configuration object (reads ``is_preprocess``; also passed to
            ``_get_path``).
        sub_cfg: The dataset configuration section that is hashed.
        dataset: Object exposing ``save_to_disk``.
    """
    ds_hash = md5(yaml.dump(sub_cfg, Dumper=yaml.Dumper))
    prepared_ds_path = _get_path(ds_hash, cfg)

    if cfg.is_preprocess and is_main_process():
        # The message previously read "Loading ... from disk", which
        # misreported this save operation in the logs.
        LOG.info(f"Saving prepared dataset to disk at {prepared_ds_path}...")
        dataset.save_to_disk(str(prepared_ds_path))
|
||||||
|
|
||||||
|
|
||||||
|
def map_dataset(cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs):
|
||||||
sig = inspect.signature(ds_transform_fn)
|
sig = inspect.signature(ds_transform_fn)
|
||||||
if "tokenizer" in sig.parameters:
|
if "tokenizer" in sig.parameters:
|
||||||
if not tokenizer:
|
if not tokenizer:
|
||||||
tokenizer = load_tokenizer(cfg)
|
tokenizer = load_tokenizer(cfg)
|
||||||
ds_transform_fn = partial(ds_transform_fn, tokenizer=tokenizer)
|
ds_transform_fn = partial(ds_transform_fn, tokenizer=tokenizer)
|
||||||
|
|
||||||
if isinstance(dataset, DatasetDict):
|
if isinstance(data_set, DatasetDict):
|
||||||
dataset = dataset["train"]
|
data_set = data_set["train"]
|
||||||
|
|
||||||
dataset = dataset.map(
|
data_set = data_set.map(
|
||||||
ds_transform_fn,
|
ds_transform_fn,
|
||||||
num_proc=cfg.dataset_processes,
|
num_proc=cfg.dataset_processes,
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
@@ -119,27 +77,13 @@ def _map_dataset(
|
|||||||
**map_kwargs,
|
**map_kwargs,
|
||||||
)
|
)
|
||||||
|
|
||||||
return dataset
|
return data_set
|
||||||
|
|
||||||
|
|
||||||
def _drop_long_sequences(
|
def drop_long_rl_seq(
|
||||||
sample: dict[str, Any], rl: RLType, tokenizer: Any, sequence_len: int
|
sample, rl, tokenizer, sequence_len # pylint: disable=invalid-name
|
||||||
) -> bool:
|
):
|
||||||
"""Filter out samples that exceed maximum sequence length.
|
if rl in (RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO):
|
||||||
|
|
||||||
Args:
|
|
||||||
sample: Dataset sample to check.
|
|
||||||
rl: Reinforcement learning type.
|
|
||||||
tokenizer: Tokenizer for length calculation.
|
|
||||||
sequence_len: Maximum allowed sequence length.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if sample should be kept, False if it should be dropped.
|
|
||||||
|
|
||||||
Raises:
|
|
||||||
ValueError: If required keys are missing or RL type is unknown.
|
|
||||||
"""
|
|
||||||
if rl in {RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO}:
|
|
||||||
if not (
|
if not (
|
||||||
sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
|
sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
|
||||||
):
|
):
|
||||||
@@ -179,115 +123,132 @@ def _drop_long_sequences(
|
|||||||
raise ValueError("Unknown RL type")
|
raise ValueError("Unknown RL type")
|
||||||
|
|
||||||
|
|
||||||
def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
|
def load_prepare_preference_datasets(cfg):
|
||||||
"""Load and process dataset split for RL training.
|
def load_split(dataset_cfgs, _cfg):
|
||||||
|
split_datasets: List[Any] = []
|
||||||
|
use_auth_token = _cfg.hf_use_auth_token
|
||||||
|
for config_dataset in datasets_w_name_generator(dataset_cfgs):
|
||||||
|
ds: Union[Dataset, DatasetDict] = load_dataset_w_config(
|
||||||
|
config_dataset, use_auth_token, streaming=False
|
||||||
|
)
|
||||||
|
split_datasets.append(ds)
|
||||||
|
|
||||||
Args:
|
tokenizer = load_tokenizer(cfg)
|
||||||
cfg: Configuration object containing dataset settings.
|
|
||||||
split: Dataset split to load ("train" or "test").
|
|
||||||
|
|
||||||
Returns:
|
for i, data_set in enumerate(split_datasets):
|
||||||
Combined and processed dataset for the specified split.
|
_type = dataset_cfgs[i]["type"]
|
||||||
"""
|
if _type:
|
||||||
datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets
|
if isinstance(_type, DictDefault):
|
||||||
split_datasets: list[Dataset | DatasetDict] = []
|
_type = "user_defined.default"
|
||||||
|
if _cfg.rl is RLType.ORPO:
|
||||||
|
ds_transform_fn = load_orpo(_type, _cfg, dataset_idx=i)
|
||||||
|
elif _cfg.rl is RLType.KTO:
|
||||||
|
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
|
||||||
|
else:
|
||||||
|
ds_transform_fn = load_dpo(_type, _cfg, dataset_idx=i)
|
||||||
|
|
||||||
for dataset_config in datasets_with_name_generator(datasets_configs):
|
map_kwargs = {}
|
||||||
dataset: Dataset | DatasetDict = load_dataset_with_config(
|
if isinstance(ds_transform_fn, tuple):
|
||||||
dataset_config, cfg.hf_use_auth_token, streaming=False
|
ds_transform_fn, map_kwargs = ds_transform_fn
|
||||||
)
|
split_datasets[i] = map_dataset(
|
||||||
split_datasets.append(dataset)
|
cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
|
||||||
|
)
|
||||||
tokenizer = load_tokenizer(cfg)
|
elif _cfg.rl is RLType.KTO:
|
||||||
|
ds_transform_fn = load_kto(_type, _cfg, dataset_idx=i)
|
||||||
for i, dataset in enumerate(split_datasets):
|
map_kwargs = {}
|
||||||
_type = datasets_configs[i]["type"]
|
if isinstance(ds_transform_fn, tuple):
|
||||||
if _type:
|
ds_transform_fn, map_kwargs = ds_transform_fn
|
||||||
if isinstance(_type, DictDefault):
|
split_datasets[i] = map_dataset(
|
||||||
_type = "user_defined.default"
|
cfg, data_set, ds_transform_fn, tokenizer, **map_kwargs
|
||||||
if cfg.rl is RLType.ORPO:
|
)
|
||||||
ds_transform_fn = load_orpo(_type, cfg, dataset_idx=i)
|
|
||||||
elif cfg.rl is RLType.KTO:
|
|
||||||
ds_transform_fn = load_kto(_type, cfg, dataset_idx=i)
|
|
||||||
else:
|
else:
|
||||||
ds_transform_fn = load_dpo(_type, cfg, dataset_idx=i)
|
# If no `type` is provided, assume the dataset is already in the expected format with
|
||||||
|
# "prompt", "chosen" and "rejected" already preprocessed
|
||||||
|
split_datasets[i] = data_set
|
||||||
|
|
||||||
map_kwargs: dict[str, Any] = {}
|
if not cfg.skip_prepare_dataset:
|
||||||
if isinstance(ds_transform_fn, tuple):
|
drop_long = partial(
|
||||||
ds_transform_fn, map_kwargs = ds_transform_fn
|
drop_long_rl_seq,
|
||||||
split_datasets[i] = _map_dataset(
|
rl=_cfg.rl,
|
||||||
cfg, dataset, ds_transform_fn, tokenizer, **map_kwargs
|
tokenizer=tokenizer,
|
||||||
)
|
sequence_len=cfg.sequence_len,
|
||||||
|
)
|
||||||
|
|
||||||
|
prior_len = len(split_datasets[i])
|
||||||
|
split_datasets[i] = split_datasets[i].filter(
|
||||||
|
drop_long,
|
||||||
|
num_proc=cfg.dataset_processes,
|
||||||
|
load_from_cache_file=not cfg.is_preprocess,
|
||||||
|
desc="Dropping Long Sequences",
|
||||||
|
)
|
||||||
|
dropped = prior_len - len(split_datasets[i])
|
||||||
|
if dropped:
|
||||||
|
LOG.warning(
|
||||||
|
f"Dropped {dropped} long samples from dataset index {i}"
|
||||||
|
)
|
||||||
|
|
||||||
|
combined_datasets = concatenate_datasets(split_datasets)
|
||||||
|
combined_datasets = combined_datasets.shuffle(seed=cfg.seed or 42)
|
||||||
|
|
||||||
|
return combined_datasets
|
||||||
|
|
||||||
|
with zero_first(is_main_process()):
|
||||||
|
train_is_preprocessed = False
|
||||||
|
eval_is_preprocessed = False
|
||||||
|
if train_dataset := _load_preprocessed_ds(cfg, cfg.datasets):
|
||||||
|
train_is_preprocessed = True
|
||||||
else:
|
else:
|
||||||
# If no `type` is provided, assume the dataset is already in the expected format with
|
train_dataset = load_split(cfg.datasets, cfg)
|
||||||
# "prompt", "chosen", and "rejected" already preprocessed
|
|
||||||
split_datasets[i] = dataset
|
|
||||||
|
|
||||||
if not cfg.skip_prepare_dataset:
|
eval_dataset = None
|
||||||
drop_long = partial(
|
if cfg.test_datasets:
|
||||||
_drop_long_sequences,
|
if eval_dataset := _load_preprocessed_ds(cfg, cfg.test_datasets):
|
||||||
rl=cfg.rl,
|
eval_is_preprocessed = True
|
||||||
tokenizer=tokenizer,
|
else:
|
||||||
sequence_len=cfg.sequence_len,
|
eval_dataset = load_split(cfg.test_datasets, cfg)
|
||||||
)
|
if not eval_dataset:
|
||||||
|
if cfg.val_set_size:
|
||||||
|
seed = cfg.seed if cfg.seed is not None else 42
|
||||||
|
|
||||||
prior_len = len(split_datasets[i])
|
# ensure we end up with the same fingerprint by doing rank0 first and being able to cache
|
||||||
split_datasets[i] = split_datasets[i].filter(
|
to_hash_train = (
|
||||||
drop_long,
|
train_dataset._fingerprint # pylint: disable=protected-access
|
||||||
num_proc=cfg.dataset_processes,
|
+ "|"
|
||||||
load_from_cache_file=not cfg.is_preprocess,
|
+ str(cfg.val_set_size)
|
||||||
desc="Dropping Long Sequences",
|
+ "|"
|
||||||
)
|
+ "train"
|
||||||
dropped = prior_len - len(split_datasets[i])
|
+ "|"
|
||||||
if dropped:
|
+ str(cfg.seed or 42)
|
||||||
LOG.warning(f"Dropped {dropped} long samples from dataset index {i}")
|
)
|
||||||
|
to_hash_test = (
|
||||||
|
train_dataset._fingerprint # pylint: disable=protected-access
|
||||||
|
+ "|"
|
||||||
|
+ str(cfg.val_set_size)
|
||||||
|
+ "|"
|
||||||
|
+ "test"
|
||||||
|
+ "|"
|
||||||
|
+ str(cfg.seed or 42)
|
||||||
|
)
|
||||||
|
train_fingerprint = md5(to_hash_train)
|
||||||
|
test_fingerprint = md5(to_hash_test)
|
||||||
|
ds_w_test_split = train_dataset.train_test_split(
|
||||||
|
test_size=cfg.val_set_size,
|
||||||
|
seed=seed,
|
||||||
|
shuffle=False,
|
||||||
|
train_new_fingerprint=train_fingerprint,
|
||||||
|
test_new_fingerprint=test_fingerprint,
|
||||||
|
)
|
||||||
|
eval_dataset = ds_w_test_split["test"]
|
||||||
|
train_dataset = ds_w_test_split["train"]
|
||||||
|
|
||||||
# Merge datasets
|
if not train_is_preprocessed:
|
||||||
dataset = merge_datasets(split_datasets, cfg)
|
_save_preprocessed_ds(cfg, cfg.datasets, train_dataset)
|
||||||
|
if eval_dataset and not eval_is_preprocessed:
|
||||||
|
_save_preprocessed_ds(cfg, cfg.test_datasets, eval_dataset)
|
||||||
|
|
||||||
if not cfg.skip_prepare_dataset:
|
if cfg.dataset_exact_deduplication:
|
||||||
# Save preprocessed dataset
|
train_dataset, eval_dataset, _ = deduplicate_and_log_datasets(
|
||||||
dataset_hash = generate_dataset_hash_from_config(
|
train_dataset=train_dataset, eval_dataset=eval_dataset
|
||||||
cfg, datasets_configs, tokenizer.name_or_path
|
|
||||||
)
|
)
|
||||||
save_preprocessed_dataset(cfg, dataset, dataset_hash, split)
|
|
||||||
|
|
||||||
return dataset
|
return train_dataset, eval_dataset
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=duplicate-code
|
|
||||||
def _load_or_create_dataset_split(
|
|
||||||
cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"]
|
|
||||||
) -> Dataset:
|
|
||||||
"""Load preprocessed dataset or create new one for given split.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Configuration object.
|
|
||||||
tokenizer: Tokenizer to use for processing text.
|
|
||||||
split: Dataset split to load.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (dataset, is_preprocessed).
|
|
||||||
"""
|
|
||||||
# Select correct dataset configuration based on split
|
|
||||||
datasets_config = cfg.datasets if split == "train" else cfg.test_datasets
|
|
||||||
|
|
||||||
# Generate dataset hash for caching
|
|
||||||
dataset_hash = generate_dataset_hash_from_config(
|
|
||||||
cfg, datasets_config, tokenizer.name_or_path
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try loading from hub if push_dataset_to_hub is configured
|
|
||||||
dataset = None
|
|
||||||
if cfg.push_dataset_to_hub:
|
|
||||||
dataset = try_load_from_hub(cfg, dataset_hash, split)
|
|
||||||
|
|
||||||
# Attempt to load preprocessed dataset
|
|
||||||
if dataset is None:
|
|
||||||
dataset = load_preprocessed_dataset(cfg, dataset_hash)
|
|
||||||
|
|
||||||
# Otherwise, load it
|
|
||||||
if dataset is None:
|
|
||||||
dataset = _load_split(cfg, split=split)
|
|
||||||
|
|
||||||
return dataset
|
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -1,21 +1,11 @@
|
|||||||
"""Dataset loading shared utils."""
|
"""
|
||||||
|
dataset loading shared utils
|
||||||
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
|
||||||
|
|
||||||
import functools
|
|
||||||
import os
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import TYPE_CHECKING, Any, Generator
|
from typing import Optional, Union
|
||||||
|
|
||||||
from datasets import (
|
from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
|
||||||
Dataset,
|
|
||||||
DatasetDict,
|
|
||||||
IterableDataset,
|
|
||||||
IterableDatasetDict,
|
|
||||||
concatenate_datasets,
|
|
||||||
load_dataset,
|
|
||||||
load_from_disk,
|
|
||||||
)
|
|
||||||
from huggingface_hub import hf_hub_download, snapshot_download
|
from huggingface_hub import hf_hub_download, snapshot_download
|
||||||
from huggingface_hub.errors import (
|
from huggingface_hub.errors import (
|
||||||
HFValidationError,
|
HFValidationError,
|
||||||
@@ -23,141 +13,78 @@ from huggingface_hub.errors import (
|
|||||||
RevisionNotFoundError,
|
RevisionNotFoundError,
|
||||||
)
|
)
|
||||||
|
|
||||||
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
|
|
||||||
from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
|
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.logging import get_logger
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from adlfs import AzureBlobFileSystem
|
|
||||||
from gcsfs import GCSFileSystem
|
|
||||||
from ocifs import OCIFileSystem
|
|
||||||
from s3fs import S3FileSystem
|
|
||||||
|
|
||||||
LOG = get_logger(__name__)
|
|
||||||
|
|
||||||
EXTENSIONS_TO_DATASET_TYPES = {
|
|
||||||
".parquet": "parquet",
|
|
||||||
".arrow": "arrow",
|
|
||||||
".csv": "csv",
|
|
||||||
".txt": "text",
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def get_dataset_type(dataset_config: DictDefault) -> str:
|
def get_ds_type(config_dataset: DictDefault):
|
||||||
"""Get the dataset type from the path if it's not specified."""
|
"""
|
||||||
if dataset_config.ds_type:
|
Get the dataset type from the path if it's not specified
|
||||||
return dataset_config.ds_type
|
"""
|
||||||
|
ds_type = "json"
|
||||||
for extension, dataset_type in EXTENSIONS_TO_DATASET_TYPES.items():
|
if config_dataset.ds_type:
|
||||||
if extension in dataset_config.path:
|
ds_type = config_dataset.ds_type
|
||||||
return dataset_type
|
elif ".parquet" in config_dataset.path:
|
||||||
|
ds_type = "parquet"
|
||||||
return "json"
|
elif ".arrow" in config_dataset.path:
|
||||||
|
ds_type = "arrow"
|
||||||
|
elif ".csv" in config_dataset.path:
|
||||||
|
ds_type = "csv"
|
||||||
|
elif ".txt" in config_dataset.path:
|
||||||
|
ds_type = "text"
|
||||||
|
return ds_type
|
||||||
|
|
||||||
|
|
||||||
def datasets_with_name_generator(
|
def datasets_w_name_generator(dataset_configs: list[DictDefault]):
|
||||||
dataset_configs: list[DictDefault],
|
"""
|
||||||
) -> Generator[DictDefault, None, None]:
|
Yields dataset configs handling multiple names or preprocess_shards
|
||||||
"""Yields expanded dataset configurations based on multiple names or preprocessing
|
|
||||||
shards.
|
|
||||||
|
|
||||||
When a dataset config has a list of names, it yields separate configs for each
|
|
||||||
name. When a dataset config specifies preprocessing shards, it yields configs for
|
|
||||||
each shard.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dataset_configs: List of dataset configuration objects.
|
dataset_configs: list of dataset configs (equivalent to cfg.datasets)
|
||||||
|
|
||||||
Yields:
|
|
||||||
Individual dataset configurations, expanded as needed for names or shards.
|
|
||||||
"""
|
"""
|
||||||
for config in dataset_configs:
|
for dataset in dataset_configs:
|
||||||
if config.name and isinstance(config.name, list):
|
if dataset.name and isinstance(dataset.name, list):
|
||||||
for name in config.name:
|
# load_dataset doesn't properly handle multiple named configurations
|
||||||
yield DictDefault({**config, "name": name})
|
# at the same time for a given dataset
|
||||||
elif config.preprocess_shards and not config.shards:
|
for name in dataset.name:
|
||||||
for shard_idx in range(config.preprocess_shards):
|
yield DictDefault({**dataset, "name": name})
|
||||||
|
elif dataset.preprocess_shards and not dataset.shards:
|
||||||
|
for shard in range(dataset.preprocess_shards):
|
||||||
yield DictDefault(
|
yield DictDefault(
|
||||||
{
|
{
|
||||||
**config,
|
**dataset,
|
||||||
"shards": config.preprocess_shards,
|
"shards": dataset.preprocess_shards,
|
||||||
"shards_idx": shard_idx,
|
"shards_idx": shard,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
yield config
|
yield dataset
|
||||||
|
|
||||||
|
|
||||||
def load_dataset_with_config(
|
def load_dataset_w_config(
|
||||||
dataset_config: DictDefault, use_auth_token: bool, streaming=False
|
config_dataset: DictDefault, use_auth_token: bool, streaming=False
|
||||||
) -> Dataset | IterableDataset:
|
) -> Union[Dataset, DatasetDict]:
|
||||||
"""Load a dataset from a config. Handles datasets that are stored locally, in the
|
"""
|
||||||
HuggingFace Hub, in a remote filesystem (S3, GCS, Azure, OCI), a URL, or
|
Load a dataset from a config
|
||||||
`data_files`.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
dataset_config: Single dataset config.
|
config_dataset: single dataset config
|
||||||
use_auth_token: Whether to use HF auth token.
|
use_auth_token: whether to use HF auth token
|
||||||
streaming: Whether to stream the dataset.
|
streaming: whether to stream the dataset
|
||||||
|
|
||||||
Returns:
|
|
||||||
Loaded dataset.
|
|
||||||
"""
|
"""
|
||||||
# Set up common kwargs for dataset loading
|
# pylint: disable=invalid-name
|
||||||
load_dataset_kwargs = {
|
ds: Optional[Union[Dataset, DatasetDict]] = None # pylint: disable=invalid-name
|
||||||
"split": dataset_config.split if dataset_config.split else None,
|
ds_from_hub = False
|
||||||
"name": dataset_config.name,
|
|
||||||
"streaming": streaming,
|
|
||||||
"trust_remote_code": dataset_config.trust_remote_code,
|
|
||||||
}
|
|
||||||
|
|
||||||
# First check if it's a local path
|
|
||||||
if Path(dataset_config.path).exists():
|
|
||||||
return _load_from_local_path(dataset_config, load_dataset_kwargs)
|
|
||||||
|
|
||||||
# Check if it's a HuggingFace dataset
|
|
||||||
is_hub_dataset = _check_if_hub_dataset(dataset_config, use_auth_token)
|
|
||||||
|
|
||||||
# Check if it's a cloud storage path and get appropriate filesystem
|
|
||||||
remote_fs, storage_options = _get_remote_filesystem(dataset_config.path)
|
|
||||||
is_cloud_dataset = False
|
|
||||||
if remote_fs:
|
|
||||||
try:
|
|
||||||
is_cloud_dataset = remote_fs.exists(dataset_config.path)
|
|
||||||
except (FileNotFoundError, ConnectionError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Load from appropriate source
|
|
||||||
if is_hub_dataset:
|
|
||||||
return _load_from_hub(dataset_config, use_auth_token, load_dataset_kwargs)
|
|
||||||
if is_cloud_dataset:
|
|
||||||
return _load_from_cloud(
|
|
||||||
dataset_config, remote_fs, storage_options, load_dataset_kwargs
|
|
||||||
)
|
|
||||||
if dataset_config.path.startswith("https://"):
|
|
||||||
return _load_from_url(dataset_config, load_dataset_kwargs)
|
|
||||||
if dataset_config.data_files:
|
|
||||||
return _load_from_data_files(dataset_config, load_dataset_kwargs)
|
|
||||||
|
|
||||||
raise ValueError(
|
|
||||||
f"The dataset could not be loaded. This could be due to a misconfigured dataset path "
|
|
||||||
f"({dataset_config.path}). Try double-check your path / name / data_files. "
|
|
||||||
f"This is not caused by the dataset type."
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) -> bool:
|
|
||||||
"""Check if a dataset exists on the HuggingFace Hub."""
|
|
||||||
try:
|
try:
|
||||||
|
# this is just a basic check to see if the path is a
|
||||||
|
# valid HF dataset that's loadable
|
||||||
snapshot_download(
|
snapshot_download(
|
||||||
repo_id=dataset_config.path,
|
repo_id=config_dataset.path,
|
||||||
repo_type="dataset",
|
repo_type="dataset",
|
||||||
token=use_auth_token,
|
token=use_auth_token,
|
||||||
revision=dataset_config.revision,
|
revision=config_dataset.revision,
|
||||||
ignore_patterns=["*"],
|
ignore_patterns=["*"],
|
||||||
)
|
)
|
||||||
return True
|
ds_from_hub = True
|
||||||
except (
|
except (
|
||||||
RepositoryNotFoundError,
|
RepositoryNotFoundError,
|
||||||
RevisionNotFoundError,
|
RevisionNotFoundError,
|
||||||
@@ -166,373 +93,198 @@ def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) ->
|
|||||||
HFValidationError,
|
HFValidationError,
|
||||||
ValueError,
|
ValueError,
|
||||||
):
|
):
|
||||||
return False
|
pass
|
||||||
|
|
||||||
|
ds_from_cloud = False
|
||||||
def _get_remote_filesystem(
|
storage_options: dict = {}
|
||||||
path: str,
|
remote_file_system = None
|
||||||
) -> tuple[
|
if config_dataset.path.startswith("s3://"):
|
||||||
S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem | None, dict
|
|
||||||
]:
|
|
||||||
"""Get the appropriate filesystem for a remote path."""
|
|
||||||
if path.startswith("s3://"):
|
|
||||||
try:
|
try:
|
||||||
import s3fs
|
import s3fs # type: ignore
|
||||||
|
|
||||||
storage_options = {"anon": False}
|
|
||||||
return s3fs.S3FileSystem(**storage_options), storage_options
|
|
||||||
except ImportError as exc:
|
except ImportError as exc:
|
||||||
raise ImportError("s3:// paths require s3fs to be installed") from exc
|
raise ImportError("s3:// paths require s3fs to be installed") from exc
|
||||||
|
|
||||||
elif path.startswith(("gs://", "gcs://")):
|
# Reads env, credentials from ~/.aws/credentials, or IAM metadata provider
|
||||||
|
# https://s3fs.readthedocs.io/en/latest/index.html?highlight=storage_options#credentials
|
||||||
|
storage_options = {"anon": False}
|
||||||
|
remote_file_system = s3fs.S3FileSystem(**storage_options)
|
||||||
|
elif config_dataset.path.startswith("gs://") or config_dataset.path.startswith(
|
||||||
|
"gcs://"
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
import gcsfs
|
import gcsfs # type: ignore
|
||||||
|
|
||||||
storage_options = {"token": None} # type: ignore
|
|
||||||
return gcsfs.GCSFileSystem(**storage_options), storage_options
|
|
||||||
except ImportError as exc:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"gs:// or gcs:// paths require gcsfs to be installed"
|
"gs:// or gcs:// paths require gcsfs to be installed"
|
||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
elif path.startswith(("adl://", "abfs://", "az://")):
|
# gcsfs will use default credentials from the environment else anon
|
||||||
|
# https://gcsfs.readthedocs.io/en/latest/#credentials
|
||||||
|
storage_options = {"token": None}
|
||||||
|
remote_file_system = gcsfs.GCSFileSystem(**storage_options)
|
||||||
|
elif (
|
||||||
|
config_dataset.path.startswith("adl://")
|
||||||
|
or config_dataset.path.startswith("abfs://")
|
||||||
|
or config_dataset.path.startswith("az://")
|
||||||
|
):
|
||||||
try:
|
try:
|
||||||
import adlfs
|
import adlfs
|
||||||
|
|
||||||
storage_options = {"anon": False}
|
|
||||||
return adlfs.AzureBlobFileSystem(**storage_options), storage_options
|
|
||||||
except ImportError as exc:
|
except ImportError as exc:
|
||||||
raise ImportError(
|
raise ImportError(
|
||||||
"adl:// or abfs:// paths require adlfs to be installed"
|
"adl:// or abfs:// paths require adlfs to be installed"
|
||||||
) from exc
|
) from exc
|
||||||
|
|
||||||
elif path.startswith("oci://"):
|
# # Ensure you have the following environment variables set:
|
||||||
|
# # Gen 1
|
||||||
|
# storage_options = {
|
||||||
|
# "tenant_id": AZURE_STORAGE_TENANT_ID,
|
||||||
|
# "client_id": AZURE_STORAGE_CLIENT_ID,
|
||||||
|
# "client_secret": AZURE_STORAGE_CLIENT_SECRET,
|
||||||
|
# }
|
||||||
|
# # Gen 2
|
||||||
|
# storage_options = {
|
||||||
|
# "account_name": AZURE_STORAGE_ACCOUNT_NAME,
|
||||||
|
# "account_key": AZURE_STORAGE_ACCOUNT_KEY,
|
||||||
|
# }
|
||||||
|
|
||||||
|
# Reads env
|
||||||
|
# https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials
|
||||||
|
storage_options = {"anon": False}
|
||||||
|
remote_file_system = adlfs.AzureBlobFileSystem(**storage_options)
|
||||||
|
elif config_dataset.path.startswith("oci://"):
|
||||||
try:
|
try:
|
||||||
import ocifs
|
import ocifs
|
||||||
|
|
||||||
storage_options = {}
|
|
||||||
return ocifs.OCIFileSystem(**storage_options), storage_options
|
|
||||||
except ImportError as exc:
|
except ImportError as exc:
|
||||||
raise ImportError("oci:// paths require ocifs to be installed") from exc
|
raise ImportError("oci:// paths require ocifs to be installed") from exc
|
||||||
|
|
||||||
return None, {}
|
# https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables
|
||||||
|
remote_file_system = ocifs.OCIFileSystem(**storage_options)
|
||||||
|
|
||||||
|
|
||||||
def _load_from_local_path(
|
|
||||||
dataset_config: DictDefault, load_dataset_kwargs: dict
|
|
||||||
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
|
|
||||||
"""Load a dataset from a local path."""
|
|
||||||
local_path = Path(dataset_config.path)
|
|
||||||
|
|
||||||
if local_path.is_dir():
|
|
||||||
if dataset_config.data_files:
|
|
||||||
dataset_type = get_dataset_type(dataset_config)
|
|
||||||
return load_dataset(
|
|
||||||
dataset_type,
|
|
||||||
data_files=dataset_config.data_files,
|
|
||||||
**load_dataset_kwargs,
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
return load_from_disk(dataset_config.path)
|
|
||||||
except FileNotFoundError:
|
|
||||||
load_dataset_kwargs["streaming"] = False
|
|
||||||
return load_dataset(dataset_config.path, **load_dataset_kwargs)
|
|
||||||
elif local_path.is_file():
|
|
||||||
dataset_type = get_dataset_type(dataset_config)
|
|
||||||
load_dataset_kwargs["streaming"] = False
|
|
||||||
return load_dataset(
|
|
||||||
dataset_type,
|
|
||||||
data_files=dataset_config.path,
|
|
||||||
**load_dataset_kwargs,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
raise ValueError(
|
|
||||||
"Unhandled dataset load: local path exists, but is neither a directory or a file"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_from_hub(
|
|
||||||
dataset_config: DictDefault, use_auth_token: bool, load_dataset_kwargs: dict
|
|
||||||
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
|
|
||||||
"""Load a dataset from the HuggingFace Hub."""
|
|
||||||
return load_dataset(
|
|
||||||
dataset_config.path,
|
|
||||||
data_files=dataset_config.data_files,
|
|
||||||
token=use_auth_token,
|
|
||||||
revision=dataset_config.revision,
|
|
||||||
**load_dataset_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_from_cloud(
|
|
||||||
dataset_config: DictDefault,
|
|
||||||
remote_fs: S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem,
|
|
||||||
storage_options: dict,
|
|
||||||
load_dataset_kwargs: dict,
|
|
||||||
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
|
|
||||||
"""Load a dataset from cloud storage."""
|
|
||||||
if remote_fs.isdir(dataset_config.path):
|
|
||||||
return load_from_disk(
|
|
||||||
dataset_config.path,
|
|
||||||
storage_options=storage_options,
|
|
||||||
)
|
|
||||||
|
|
||||||
if remote_fs.isfile(dataset_config.path):
|
|
||||||
dataset_type = get_dataset_type(dataset_config)
|
|
||||||
return load_dataset(
|
|
||||||
dataset_type,
|
|
||||||
data_files=dataset_config.path,
|
|
||||||
storage_options=storage_options,
|
|
||||||
**load_dataset_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
raise ValueError(
|
|
||||||
f"Cloud path {dataset_config.path} is neither a directory nor a file"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_from_url(
|
|
||||||
dataset_config: DictDefault, load_dataset_kwargs: dict
|
|
||||||
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
|
|
||||||
"""Load a dataset from a URL."""
|
|
||||||
dataset_type = get_dataset_type(dataset_config)
|
|
||||||
return load_dataset(
|
|
||||||
dataset_type,
|
|
||||||
data_files=dataset_config.path,
|
|
||||||
**load_dataset_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _load_from_data_files(
|
|
||||||
dataset_config: DictDefault, load_dataset_kwargs: dict
|
|
||||||
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
|
|
||||||
"""Load a dataset from data files."""
|
|
||||||
file_path = None
|
|
||||||
|
|
||||||
if isinstance(dataset_config.data_files, str):
|
|
||||||
file_path = hf_hub_download(
|
|
||||||
repo_id=dataset_config.path,
|
|
||||||
repo_type="dataset",
|
|
||||||
filename=dataset_config.data_files,
|
|
||||||
revision=dataset_config.revision,
|
|
||||||
)
|
|
||||||
elif isinstance(dataset_config.data_files, list):
|
|
||||||
file_path = [
|
|
||||||
hf_hub_download(
|
|
||||||
repo_id=dataset_config.path,
|
|
||||||
repo_type="dataset",
|
|
||||||
filename=file,
|
|
||||||
revision=dataset_config.revision,
|
|
||||||
)
|
|
||||||
for file in dataset_config.data_files
|
|
||||||
]
|
|
||||||
else:
|
|
||||||
raise ValueError("data_files must be either a string or list of strings")
|
|
||||||
|
|
||||||
return load_dataset("json", data_files=file_path, **load_dataset_kwargs)
|
|
||||||
|
|
||||||
|
|
||||||
def generate_split_fingerprints(
|
|
||||||
dataset: Dataset, val_set_size: int | float, seed: int
|
|
||||||
) -> tuple[str, str]:
|
|
||||||
"""Generate consistent fingerprints for train/test splits."""
|
|
||||||
fingerprint = dataset._fingerprint # pylint: disable=protected-access
|
|
||||||
|
|
||||||
train_hash_input = f"{fingerprint}|{val_set_size}|train|{seed}"
|
|
||||||
test_hash_input = f"{fingerprint}|{val_set_size}|test|{seed}"
|
|
||||||
|
|
||||||
train_fingerprint = md5(train_hash_input)
|
|
||||||
test_fingerprint = md5(test_hash_input)
|
|
||||||
|
|
||||||
return train_fingerprint, test_fingerprint
|
|
||||||
|
|
||||||
|
|
||||||
def get_prepared_dataset_path(cfg: DictDefault, dataset_hash: str) -> Path:
|
|
||||||
"""Get standardized path for prepared datasets.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Configuration object.
|
|
||||||
dataset_hash: Hash identifying the specific dataset configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Path where the prepared dataset should be stored.
|
|
||||||
"""
|
|
||||||
base_path = cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
|
|
||||||
return Path(base_path) / dataset_hash
|
|
||||||
|
|
||||||
|
|
||||||
def create_train_validation_split(
|
|
||||||
dataset: Dataset, cfg: DictDefault, val_set_size: int | float
|
|
||||||
) -> tuple[Dataset, Dataset]:
|
|
||||||
"""Create train/validation split with consistent fingerprinting.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dataset: Dataset to split.
|
|
||||||
cfg: Configuration object containing seed and other settings.
|
|
||||||
val_set_size: Size of validation set (absolute number or fraction).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (train_dataset, eval_dataset).
|
|
||||||
"""
|
|
||||||
train_fingerprint, test_fingerprint = generate_split_fingerprints(
|
|
||||||
dataset, val_set_size, cfg.seed
|
|
||||||
)
|
|
||||||
|
|
||||||
# Apply deduplication before splitting if configured
|
|
||||||
if cfg.dataset_exact_deduplication:
|
|
||||||
dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
|
|
||||||
|
|
||||||
split_dataset = dataset.train_test_split(
|
|
||||||
test_size=val_set_size,
|
|
||||||
shuffle=False,
|
|
||||||
seed=cfg.seed,
|
|
||||||
train_new_fingerprint=train_fingerprint,
|
|
||||||
test_new_fingerprint=test_fingerprint,
|
|
||||||
)
|
|
||||||
|
|
||||||
return split_dataset["train"], split_dataset["test"]
|
|
||||||
|
|
||||||
|
|
||||||
def _generate_from_iterable_dataset(
|
|
||||||
dataset: IterableDataset, worker_id: list[int], num_workers: list[int]
|
|
||||||
) -> Generator[Any, None, None]:
|
|
||||||
"""Generator function to correctly split the dataset for each worker"""
|
|
||||||
for i, item in enumerate(dataset):
|
|
||||||
if i % num_workers[0] == worker_id[0]:
|
|
||||||
yield item
|
|
||||||
|
|
||||||
|
|
||||||
def save_preprocessed_dataset(
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset,
|
|
||||||
dataset_hash: str,
|
|
||||||
split: str,
|
|
||||||
) -> None:
|
|
||||||
"""Save preprocessed dataset to disk and optionally push to the HF Hub."""
|
|
||||||
prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)
|
|
||||||
if isinstance(dataset, IterableDataset):
|
|
||||||
num_workers = cfg.dataset_processes
|
|
||||||
|
|
||||||
ds_from_iter = Dataset.from_generator(
|
|
||||||
functools.partial(_generate_from_iterable_dataset, dataset),
|
|
||||||
features=dataset.features,
|
|
||||||
num_proc=num_workers,
|
|
||||||
split=split,
|
|
||||||
gen_kwargs={
|
|
||||||
"worker_id": list(range(num_workers)),
|
|
||||||
"num_workers": [num_workers] * num_workers,
|
|
||||||
},
|
|
||||||
)
|
|
||||||
ds_from_iter.save_to_disk(str(prepared_ds_path))
|
|
||||||
else:
|
|
||||||
os.makedirs(prepared_ds_path, exist_ok=True)
|
|
||||||
dataset.save_to_disk(str(prepared_ds_path))
|
|
||||||
if cfg.push_dataset_to_hub:
|
|
||||||
LOG.info(
|
|
||||||
"Pushing merged prepared dataset to Huggingface hub at "
|
|
||||||
f"{cfg.push_dataset_to_hub} (version {dataset_hash})...",
|
|
||||||
main_process_only=False,
|
|
||||||
)
|
|
||||||
dataset.push_to_hub(
|
|
||||||
cfg.push_dataset_to_hub,
|
|
||||||
dataset_hash,
|
|
||||||
private=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def load_preprocessed_dataset(cfg: DictDefault, dataset_hash: str) -> Dataset | None:
|
|
||||||
"""Load preprocessed dataset from disk if available.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Configuration object.
|
|
||||||
dataset_hash: Hash identifying the dataset configuration.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Loaded dataset if found and conditions are met, None otherwise.
|
|
||||||
"""
|
|
||||||
prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)
|
|
||||||
|
|
||||||
if (
|
|
||||||
cfg.dataset_prepared_path
|
|
||||||
and any(prepared_ds_path.glob("*"))
|
|
||||||
and not cfg.skip_prepare_dataset
|
|
||||||
and not cfg.is_preprocess
|
|
||||||
):
|
|
||||||
LOG.info(
|
|
||||||
f"Loading prepared dataset from disk at {prepared_ds_path}...",
|
|
||||||
main_process_only=False,
|
|
||||||
)
|
|
||||||
return load_from_disk(str(prepared_ds_path))
|
|
||||||
|
|
||||||
LOG.info(
|
|
||||||
f"Unable to find prepared dataset in {prepared_ds_path}",
|
|
||||||
main_process_only=False,
|
|
||||||
)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def try_load_from_hub(
|
|
||||||
cfg: DictDefault, dataset_hash: str, split: str
|
|
||||||
) -> Dataset | None:
|
|
||||||
"""Try to load the prepared dataset from HuggingFace Hub."""
|
|
||||||
try:
|
try:
|
||||||
LOG.info(
|
if remote_file_system and remote_file_system.exists(config_dataset.path):
|
||||||
"Attempting to load prepared dataset from HuggingFace Hub at "
|
ds_from_cloud = True
|
||||||
f"{cfg.push_dataset_to_hub} (version {dataset_hash})..."
|
except (FileNotFoundError, ConnectionError):
|
||||||
)
|
pass
|
||||||
dataset = load_dataset(
|
|
||||||
cfg.push_dataset_to_hub,
|
|
||||||
dataset_hash,
|
|
||||||
token=cfg.hf_use_auth_token,
|
|
||||||
)
|
|
||||||
return dataset[split]
|
|
||||||
except Exception: # pylint: disable=broad-except # nosec
|
|
||||||
LOG.info("Unable to find prepared dataset in HuggingFace Hub")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
# gather extra args from the config
|
||||||
def generate_dataset_hash_from_config(
|
load_ds_kwargs = {}
|
||||||
cfg: DictDefault, cfg_datasets: list, tokenizer_name: str
|
if config_dataset.split:
|
||||||
) -> str:
|
load_ds_kwargs["split"] = config_dataset.split
|
||||||
"""Generate a hash to uniquely identify a dataset configuration for SFT.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
cfg: Main configuration object.
|
|
||||||
cfg_datasets: List of dataset configurations.
|
|
||||||
tokenizer_name: Name of the tokenizer being used.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
MD5 hash string representing the configuration.
|
|
||||||
"""
|
|
||||||
config_str = (
|
|
||||||
f"{cfg.sequence_len}@{cfg.sample_packing}@{cfg.eval_sample_packing}@"
|
|
||||||
f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}|"
|
|
||||||
f"{'|'.join(sorted([f'{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}' for d in cfg_datasets]))}"
|
|
||||||
f"|{tokenizer_name}"
|
|
||||||
)
|
|
||||||
return str(md5(config_str))
|
|
||||||
|
|
||||||
|
|
||||||
def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
|
|
||||||
"""Merge multiple datasets into one with optional shuffling.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
datasets: List of datasets to merge.
|
|
||||||
cfg: Configuration object containing shuffle settings.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Merged dataset.
|
|
||||||
"""
|
|
||||||
if len(datasets) == 1:
|
|
||||||
return datasets[0]
|
|
||||||
|
|
||||||
LOG.info("Merging datasets...")
|
|
||||||
merged_dataset = concatenate_datasets(datasets)
|
|
||||||
|
|
||||||
if cfg.shuffle_merged_datasets:
|
|
||||||
LOG.debug("Shuffling merged datasets...")
|
|
||||||
merged_dataset = merged_dataset.shuffle(seed=cfg.seed)
|
|
||||||
else:
|
else:
|
||||||
LOG.debug("Not shuffling merged datasets.")
|
load_ds_kwargs["split"] = None
|
||||||
|
|
||||||
return merged_dataset
|
# prefer local dataset, even if hub exists
|
||||||
|
local_path = Path(config_dataset.path)
|
||||||
|
if local_path.exists():
|
||||||
|
if local_path.is_dir():
|
||||||
|
if config_dataset.data_files:
|
||||||
|
ds_type = get_ds_type(config_dataset)
|
||||||
|
ds = load_dataset( # pylint: disable=invalid-name
|
||||||
|
ds_type,
|
||||||
|
name=config_dataset.name,
|
||||||
|
data_files=config_dataset.data_files,
|
||||||
|
streaming=streaming,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
ds = load_from_disk(
|
||||||
|
config_dataset.path
|
||||||
|
) # pylint: disable=invalid-name
|
||||||
|
except FileNotFoundError:
|
||||||
|
ds = load_dataset(
|
||||||
|
config_dataset.path,
|
||||||
|
name=config_dataset.name,
|
||||||
|
streaming=False,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
elif local_path.is_file():
|
||||||
|
ds_type = get_ds_type(config_dataset)
|
||||||
|
|
||||||
|
ds = load_dataset( # pylint: disable=invalid-name
|
||||||
|
ds_type,
|
||||||
|
name=config_dataset.name,
|
||||||
|
data_files=config_dataset.path,
|
||||||
|
streaming=False,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError(
|
||||||
|
"unhandled dataset load: local path exists, but is neither a directory or a file"
|
||||||
|
)
|
||||||
|
elif ds_from_hub:
|
||||||
|
ds = load_dataset(
|
||||||
|
config_dataset.path,
|
||||||
|
name=config_dataset.name,
|
||||||
|
streaming=streaming,
|
||||||
|
data_files=config_dataset.data_files,
|
||||||
|
token=use_auth_token,
|
||||||
|
revision=config_dataset.revision,
|
||||||
|
trust_remote_code=config_dataset.trust_remote_code,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
elif ds_from_cloud and remote_file_system:
|
||||||
|
if remote_file_system.isdir(config_dataset.path):
|
||||||
|
ds = load_from_disk(
|
||||||
|
config_dataset.path,
|
||||||
|
storage_options=storage_options,
|
||||||
|
)
|
||||||
|
elif remote_file_system.isfile(config_dataset.path):
|
||||||
|
ds_type = get_ds_type(config_dataset)
|
||||||
|
ds = load_dataset(
|
||||||
|
ds_type,
|
||||||
|
name=config_dataset.name,
|
||||||
|
data_files=config_dataset.path,
|
||||||
|
streaming=streaming,
|
||||||
|
storage_options=storage_options,
|
||||||
|
trust_remote_code=config_dataset.trust_remote_code,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
elif config_dataset.path.startswith("https://"):
|
||||||
|
ds_type = get_ds_type(config_dataset)
|
||||||
|
ds = load_dataset(
|
||||||
|
ds_type,
|
||||||
|
name=config_dataset.name,
|
||||||
|
data_files=config_dataset.path,
|
||||||
|
streaming=streaming,
|
||||||
|
storage_options=storage_options,
|
||||||
|
trust_remote_code=config_dataset.trust_remote_code,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
elif config_dataset.data_files:
|
||||||
|
fp: str | list[str] | None = None
|
||||||
|
if isinstance(config_dataset.data_files, str):
|
||||||
|
fp = hf_hub_download(
|
||||||
|
repo_id=config_dataset.path,
|
||||||
|
repo_type="dataset",
|
||||||
|
filename=config_dataset.data_files,
|
||||||
|
revision=config_dataset.revision,
|
||||||
|
)
|
||||||
|
elif isinstance(config_dataset.data_files, list):
|
||||||
|
fp = []
|
||||||
|
for file in config_dataset.data_files:
|
||||||
|
fp.append(
|
||||||
|
hf_hub_download(
|
||||||
|
repo_id=config_dataset.path,
|
||||||
|
repo_type="dataset",
|
||||||
|
filename=file,
|
||||||
|
revision=config_dataset.revision,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
raise ValueError("data_files must be either a string or list of strings")
|
||||||
|
ds = load_dataset(
|
||||||
|
"json",
|
||||||
|
name=config_dataset.name,
|
||||||
|
data_files=fp,
|
||||||
|
streaming=streaming,
|
||||||
|
**load_ds_kwargs,
|
||||||
|
)
|
||||||
|
if not ds:
|
||||||
|
raise ValueError(
|
||||||
|
"The dataset could not be loaded. This could be due to a misconfigured dataset path "
|
||||||
|
f"({config_dataset.path}). Try double-check your path / name / data_files. "
|
||||||
|
"This is not caused by the dataset type."
|
||||||
|
)
|
||||||
|
|
||||||
|
return ds
|
||||||
|
|||||||
@@ -1,11 +1,9 @@
|
|||||||
"""Data handling helpers"""
|
"""data handling helpers"""
|
||||||
|
|
||||||
import contextlib
|
|
||||||
import functools
|
import functools
|
||||||
import hashlib
|
import hashlib
|
||||||
import time
|
import time
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
from typing import Callable
|
|
||||||
|
|
||||||
import huggingface_hub
|
import huggingface_hub
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -21,7 +19,9 @@ LOG = get_logger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
class RetryStrategy(Enum):
|
class RetryStrategy(Enum):
|
||||||
"""Enum for retry strategies."""
|
"""
|
||||||
|
Enum for retry strategies.
|
||||||
|
"""
|
||||||
|
|
||||||
CONSTANT = 1
|
CONSTANT = 1
|
||||||
LINEAR = 2
|
LINEAR = 2
|
||||||
@@ -30,18 +30,7 @@ class RetryStrategy(Enum):
|
|||||||
|
|
||||||
def retry_on_request_exceptions(
|
def retry_on_request_exceptions(
|
||||||
max_retries=3, delay=1, retry_strategy: RetryStrategy = RetryStrategy.LINEAR
|
max_retries=3, delay=1, retry_strategy: RetryStrategy = RetryStrategy.LINEAR
|
||||||
) -> Callable:
|
):
|
||||||
"""Decorator that retries function calls on specific request exceptions.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
max_retries: Maximum number of retry attempts.
|
|
||||||
delay: Base delay between retries in seconds.
|
|
||||||
retry_strategy: Strategy for calculating retry delays.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Decorated function with retry logic.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def decorator(func):
|
def decorator(func):
|
||||||
@functools.wraps(func)
|
@functools.wraps(func)
|
||||||
def wrapper(*args, **kwargs): # pylint: disable=inconsistent-return-statements
|
def wrapper(*args, **kwargs): # pylint: disable=inconsistent-return-statements
|
||||||
@@ -51,7 +40,6 @@ def retry_on_request_exceptions(
|
|||||||
except (
|
except (
|
||||||
requests.exceptions.ReadTimeout,
|
requests.exceptions.ReadTimeout,
|
||||||
requests.exceptions.ConnectionError,
|
requests.exceptions.ConnectionError,
|
||||||
requests.exceptions.HTTPError,
|
|
||||||
huggingface_hub.errors.HfHubHTTPError,
|
huggingface_hub.errors.HfHubHTTPError,
|
||||||
) as exc:
|
) as exc:
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
@@ -71,7 +59,6 @@ def retry_on_request_exceptions(
|
|||||||
|
|
||||||
|
|
||||||
def md5(to_hash: str, encoding: str = "utf-8") -> str:
|
def md5(to_hash: str, encoding: str = "utf-8") -> str:
|
||||||
"""Generate MD5 hash of a string."""
|
|
||||||
try:
|
try:
|
||||||
return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
|
return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
|
||||||
except TypeError:
|
except TypeError:
|
||||||
@@ -79,89 +66,102 @@ def md5(to_hash: str, encoding: str = "utf-8") -> str:
|
|||||||
|
|
||||||
|
|
||||||
def sha256(to_hash: str, encoding: str = "utf-8") -> str:
|
def sha256(to_hash: str, encoding: str = "utf-8") -> str:
|
||||||
"""Generate SHA256 hash of a string."""
|
|
||||||
return hashlib.sha256(to_hash.encode(encoding)).hexdigest()
|
return hashlib.sha256(to_hash.encode(encoding)).hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def _deduplicate_dataset(
|
def deduplicate_dataset(
|
||||||
dataset: Dataset,
|
dataset: Dataset, seen_hashes: dict[str, list[int]], other_dataset: Dataset = None
|
||||||
seen_hashes: set[str] | None = None,
|
) -> Dataset:
|
||||||
) -> tuple[Dataset, set[str]]:
|
|
||||||
"""Remove duplicate rows from a dataset using SHA256 hashes.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dataset: Dataset to deduplicate.
|
|
||||||
seen_hashes: Set of previously seen row hashes (for cross-deduplication).
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of deduplicated dataset and the set of seen hashes.
|
|
||||||
"""
|
|
||||||
if seen_hashes is None:
|
|
||||||
seen_hashes = set()
|
|
||||||
|
|
||||||
unique_indices = []
|
unique_indices = []
|
||||||
for idx, row in enumerate(dataset):
|
|
||||||
row_hash = sha256(str(row)) # Using SHA256 for collision resistance
|
|
||||||
if row_hash not in seen_hashes:
|
|
||||||
seen_hashes.add(row_hash)
|
|
||||||
unique_indices.append(idx)
|
|
||||||
|
|
||||||
return dataset.select(unique_indices), seen_hashes
|
for idx, row in enumerate(dataset):
|
||||||
|
row_hash = sha256(str(row)) # Using SHA256 for collision resistance.
|
||||||
|
if row_hash not in seen_hashes:
|
||||||
|
seen_hashes[row_hash] = [idx]
|
||||||
|
unique_indices.append(idx)
|
||||||
|
else:
|
||||||
|
# Check for collision by looking up the original dataset indices
|
||||||
|
original_indices = seen_hashes[row_hash]
|
||||||
|
is_duplicate = False
|
||||||
|
for original_idx in original_indices:
|
||||||
|
if (
|
||||||
|
not idx == original_idx
|
||||||
|
and original_idx < len(dataset)
|
||||||
|
and str(dataset[original_idx]) == str(row)
|
||||||
|
):
|
||||||
|
is_duplicate = True
|
||||||
|
break
|
||||||
|
# Check in the other dataset if provided
|
||||||
|
if other_dataset is not None:
|
||||||
|
if original_idx < len(other_dataset) and str(
|
||||||
|
other_dataset[original_idx]
|
||||||
|
) == str(row):
|
||||||
|
is_duplicate = True
|
||||||
|
break
|
||||||
|
if not is_duplicate:
|
||||||
|
seen_hashes[row_hash].append(idx)
|
||||||
|
unique_indices.append(idx)
|
||||||
|
continue
|
||||||
|
return dataset.select(unique_indices)
|
||||||
|
|
||||||
|
|
||||||
def deduplicate_and_log_datasets(
|
def deduplicate_and_log_datasets(
|
||||||
dataset: Dataset,
|
*,
|
||||||
other_dataset: Dataset | None = None,
|
train_dataset: Dataset = None,
|
||||||
dataset_name: str | None = "train",
|
eval_dataset: Dataset = None,
|
||||||
other_name: str | None = "eval",
|
dataset: Dataset = None,
|
||||||
) -> tuple[Dataset, Dataset | None]:
|
) -> tuple[Dataset, Dataset, Dataset]:
|
||||||
"""Deduplicate datasets, with optional cross-dataset deduplication.
|
"""
|
||||||
|
Deduplicates train, eval, and an optional dataset if provided, logging original and new sizes.
|
||||||
Args:
|
|
||||||
dataset: Primary dataset to deduplicate.
|
|
||||||
other_dataset: Optional second dataset to deduplicate against the first.
|
|
||||||
dataset_name: Name for the primary dataset (for logging).
|
|
||||||
other_name: Name for the second dataset (for logging).
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Tuple of (deduplicated_dataset, deduplicated_other_dataset).
|
tuple: Deduplicated train, eval, and additional datasets.
|
||||||
"""
|
"""
|
||||||
# Deduplicate primary dataset
|
seen_hashes: dict[str, list[int]] = {}
|
||||||
LOG.info(
|
|
||||||
f"Starting deduplication for {dataset_name} dataset. Original size: {len(dataset)}"
|
|
||||||
)
|
|
||||||
dataset, seen_rows = _deduplicate_dataset(dataset)
|
|
||||||
LOG.info(
|
|
||||||
f"Deduplication complete for {dataset_name} dataset. New size: {len(dataset)}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Deduplicate second dataset if provided
|
# Handle cases where datasets are None
|
||||||
if other_dataset is not None:
|
if train_dataset is not None:
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Starting deduplication for {other_name} dataset. Original size: {len(other_dataset)}"
|
f"Starting deduplication for train dataset. Original size: {len(train_dataset)}"
|
||||||
|
)
|
||||||
|
train_dataset = deduplicate_dataset(
|
||||||
|
dataset=train_dataset, seen_hashes=seen_hashes
|
||||||
)
|
)
|
||||||
other_dataset, _ = _deduplicate_dataset(other_dataset, seen_rows)
|
|
||||||
LOG.info(
|
LOG.info(
|
||||||
f"Deduplication complete for {other_name} dataset. New size: {len(other_dataset)}"
|
f"Deduplication complete for train dataset. New size: {len(train_dataset)}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
LOG.info("Train dataset is None. Skipping deduplication.")
|
||||||
|
|
||||||
|
if eval_dataset is not None:
|
||||||
|
LOG.info(
|
||||||
|
f"Starting deduplication for eval dataset. Original size: {len(eval_dataset)}"
|
||||||
|
)
|
||||||
|
eval_dataset = deduplicate_dataset(
|
||||||
|
dataset=eval_dataset, seen_hashes=seen_hashes, other_dataset=train_dataset
|
||||||
|
)
|
||||||
|
LOG.info(
|
||||||
|
f"Deduplication complete for eval dataset. New size: {len(eval_dataset)}"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
LOG.info("Eval dataset is None. Skipping deduplication.")
|
||||||
|
|
||||||
|
if dataset is not None and (eval_dataset is None and train_dataset is None):
|
||||||
|
LOG.info(
|
||||||
|
f"Starting deduplication for combined dataset. Original size: {len(dataset)}"
|
||||||
|
)
|
||||||
|
dataset = deduplicate_dataset(dataset=dataset, seen_hashes=seen_hashes)
|
||||||
|
LOG.info(
|
||||||
|
f"Deduplication complete for combined dataset. New size: {len(dataset)}"
|
||||||
)
|
)
|
||||||
|
|
||||||
return dataset, other_dataset
|
return train_dataset, eval_dataset, dataset
|
||||||
|
|
||||||
|
|
||||||
def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault) -> Dataset:
|
def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault):
|
||||||
"""Remove sequences longer than configured maximum from dataset.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dataset: Dataset to filter.
|
|
||||||
cfg: Dictionary mapping `axolotl` config keys to values.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Filtered dataset with long sequences removed.
|
|
||||||
"""
|
|
||||||
if "input_ids" not in dataset.column_names:
|
if "input_ids" not in dataset.column_names:
|
||||||
LOG.warning(
|
LOG.warning(
|
||||||
"Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
|
"Dataset does not contain 'input_ids' column. Skip drop long seq. This is expected for RewardModeling."
|
||||||
"expected for reward modeling."
|
|
||||||
)
|
)
|
||||||
return dataset
|
return dataset
|
||||||
|
|
||||||
@@ -171,14 +171,20 @@ def drop_long_seq_in_dataset(dataset: Dataset, cfg: DictDefault) -> Dataset:
|
|||||||
min_sequence_len=cfg.min_sample_len,
|
min_sequence_len=cfg.min_sample_len,
|
||||||
)
|
)
|
||||||
|
|
||||||
with contextlib.suppress(AttributeError):
|
try:
|
||||||
ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
|
ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
|
||||||
min_input_len = np.min(ds_lengths)
|
min_input_len = np.min(ds_lengths)
|
||||||
LOG.info(f"min_input_len: {min_input_len}")
|
LOG.info(f"min_input_len: {min_input_len}")
|
||||||
max_input_len = np.max(ds_lengths)
|
max_input_len = np.max(ds_lengths)
|
||||||
LOG.info(f"max_input_len: {max_input_len}")
|
LOG.info(f"max_input_len: {max_input_len}")
|
||||||
|
except AttributeError:
|
||||||
|
pass
|
||||||
|
|
||||||
prior_len = len(dataset) if hasattr(dataset, "__len__") else None
|
try:
|
||||||
|
prior_len = len(dataset)
|
||||||
|
except TypeError:
|
||||||
|
# handle iterable datasets case
|
||||||
|
prior_len = None
|
||||||
|
|
||||||
filter_map_kwargs = {}
|
filter_map_kwargs = {}
|
||||||
if not isinstance(dataset, IterableDataset):
|
if not isinstance(dataset, IterableDataset):
|
||||||
|
|||||||
@@ -1,425 +0,0 @@
|
|||||||
"""Data handling specific to SFT."""
|
|
||||||
|
|
||||||
import logging
|
|
||||||
from typing import Any, NoReturn, cast
|
|
||||||
|
|
||||||
from datasets import (
|
|
||||||
Dataset,
|
|
||||||
IterableDataset,
|
|
||||||
Sequence,
|
|
||||||
Value,
|
|
||||||
)
|
|
||||||
from transformers import PreTrainedTokenizer
|
|
||||||
from transformers.processing_utils import ProcessorMixin
|
|
||||||
|
|
||||||
from axolotl.datasets import TokenizedPromptDataset, wrap_dataset_for_tokenized_prompt
|
|
||||||
from axolotl.prompt_strategies import load
|
|
||||||
from axolotl.prompt_strategies.bradley_terry import load as bradley_terry_load
|
|
||||||
from axolotl.prompt_tokenizers import (
|
|
||||||
AlpacaMultipleChoicePromptTokenizingStrategy,
|
|
||||||
AlpacaPromptTokenizingStrategy,
|
|
||||||
AlpacaReflectionPTStrategy,
|
|
||||||
DatasetWrappingStrategy,
|
|
||||||
GPTeacherPromptTokenizingStrategy,
|
|
||||||
JeopardyPromptTokenizingStrategy,
|
|
||||||
OpenAssistantPromptTokenizingStrategy,
|
|
||||||
PromptTokenizingStrategy,
|
|
||||||
SummarizeTLDRPromptTokenizingStrategy,
|
|
||||||
)
|
|
||||||
from axolotl.prompters import (
|
|
||||||
AlpacaPrompter,
|
|
||||||
GPTeacherPrompter,
|
|
||||||
JeopardyPrompter,
|
|
||||||
MultipleChoiceConcisePrompter,
|
|
||||||
MultipleChoiceExplainPrompter,
|
|
||||||
Prompter,
|
|
||||||
ReflectAlpacaPrompter,
|
|
||||||
SummarizeTLDRPrompter,
|
|
||||||
UnsupportedPrompter,
|
|
||||||
)
|
|
||||||
from axolotl.utils.dict import DictDefault
|
|
||||||
|
|
||||||
LOG = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
def handle_unknown_dataset_strategy(dataset_config: DictDefault) -> NoReturn:
|
|
||||||
"""Raise error for unknown dataset strategy."""
|
|
||||||
ds_type = dataset_config.type
|
|
||||||
suffix = ""
|
|
||||||
if ":load_" in ds_type:
|
|
||||||
suffix = f"Did you mean {ds_type.replace(':load_', '.load_')}?"
|
|
||||||
|
|
||||||
error_message = f"unhandled prompt tokenization strategy: {ds_type}. {suffix}"
|
|
||||||
LOG.error(error_message)
|
|
||||||
raise ValueError(error_message)
|
|
||||||
|
|
||||||
|
|
||||||
# pylint: disable=too-many-return-statements
|
|
||||||
def get_dataset_wrapper(
|
|
||||||
dataset_config: DictDefault,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset_base_type: str | None,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_prompt_style: str | None = None,
|
|
||||||
processor: ProcessorMixin | None = None, # pylint: disable=unused-argument
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter | None]:
|
|
||||||
"""Create an appropriate dataset wrapper and prompter based on dataset
|
|
||||||
configuration.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
dataset_config: Configuration for the dataset.
|
|
||||||
tokenizer: Tokenizer to use for processing text.
|
|
||||||
cfg: Global configuration object.
|
|
||||||
dataset_base_type: The base type of the dataset.
|
|
||||||
dataset: The actual dataset object.
|
|
||||||
dataset_prompt_style: Optional prompt style specification.
|
|
||||||
processor: Optional processor for multimodal datasets.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
tuple of (dataset_wrapper, dataset_prompter).
|
|
||||||
"""
|
|
||||||
# Common parameters for dataset wrapping
|
|
||||||
dataset_kwargs: dict[str, Any] = {
|
|
||||||
"process_count": cfg.dataset_processes,
|
|
||||||
"keep_in_memory": cfg.dataset_keep_in_memory is True,
|
|
||||||
}
|
|
||||||
|
|
||||||
LOG.info(
|
|
||||||
f"Loading dataset: {dataset_config['path']} with base_type: "
|
|
||||||
f"{dataset_base_type} and prompt_style: {dataset_prompt_style}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Dataset is already tokenized
|
|
||||||
if _is_dataset_already_tokenized(dataset):
|
|
||||||
return dataset, UnsupportedPrompter()
|
|
||||||
|
|
||||||
# Custom dataset type definition
|
|
||||||
if isinstance(dataset_config.type, DictDefault):
|
|
||||||
return _handle_custom_dataset_type(
|
|
||||||
dataset_config, tokenizer, cfg, dataset, dataset_kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
# Skip preparation if configured
|
|
||||||
if cfg.skip_prepare_dataset:
|
|
||||||
return dataset, None
|
|
||||||
|
|
||||||
# Bradley-Terry dataset
|
|
||||||
if dataset_config.type.startswith("bradley_terry"):
|
|
||||||
return _handle_bradley_terry_dataset(
|
|
||||||
dataset_config, tokenizer, cfg, dataset, dataset_kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
# Stepwise supervised dataset
|
|
||||||
if dataset_config.type.startswith("stepwise_supervised"):
|
|
||||||
return _handle_stepwise_supervised_dataset(
|
|
||||||
dataset_config, tokenizer, cfg, dataset, dataset_kwargs
|
|
||||||
)
|
|
||||||
|
|
||||||
# Try to load prompt tokenizer / dataset wrapper strategy from registry
|
|
||||||
dataset_strategy = load(
|
|
||||||
dataset_config.type, tokenizer, cfg, dataset_config, processor=processor
|
|
||||||
)
|
|
||||||
if dataset_strategy:
|
|
||||||
return _handle_loaded_strategy(dataset_strategy, dataset, dataset_kwargs)
|
|
||||||
|
|
||||||
# Known dataset types with specific handling
|
|
||||||
if dataset_base_type in DATASET_HANDLERS:
|
|
||||||
handler = DATASET_HANDLERS[dataset_base_type]
|
|
||||||
return handler(dataset_prompt_style, tokenizer, cfg, dataset, dataset_kwargs)
|
|
||||||
|
|
||||||
# Unhandled dataset type
|
|
||||||
handle_unknown_dataset_strategy(dataset_config)
|
|
||||||
|
|
||||||
|
|
||||||
def _is_dataset_already_tokenized(dataset: Dataset | IterableDataset) -> bool:
|
|
||||||
"""Check if the dataset is already tokenized."""
|
|
||||||
return (
|
|
||||||
isinstance(dataset, Dataset)
|
|
||||||
and "input_ids" in dataset.features
|
|
||||||
and "attention_mask" in dataset.features
|
|
||||||
and "labels" in dataset.features
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_custom_dataset_type(
|
|
||||||
dataset_config: DictDefault,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a custom dataset type defined in the configuration."""
|
|
||||||
dataset_strategy = cast(
|
|
||||||
PromptTokenizingStrategy,
|
|
||||||
load("user_defined", tokenizer, cfg, dataset_config.type.to_dict()),
|
|
||||||
)
|
|
||||||
dataset_prompter = UnsupportedPrompter()
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_bradley_terry_dataset(
|
|
||||||
dataset_config: DictDefault,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter | None]:
|
|
||||||
"""Handle a Bradley-Terry dataset."""
|
|
||||||
bt_type = dataset_config.type.split(".", 1)[1]
|
|
||||||
dataset_strategy = bradley_terry_load(bt_type, tokenizer, cfg, dataset_config)
|
|
||||||
|
|
||||||
if not dataset_strategy:
|
|
||||||
handle_unknown_dataset_strategy(dataset_config)
|
|
||||||
|
|
||||||
dataset_prompter = UnsupportedPrompter()
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_stepwise_supervised_dataset(
|
|
||||||
dataset_config: DictDefault,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a stepwise supervised dataset."""
|
|
||||||
dataset_prompter = UnsupportedPrompter()
|
|
||||||
dataset_strategy = load(dataset_config.type, tokenizer, cfg, dataset_config)
|
|
||||||
|
|
||||||
# We need to explicitly cast boolean labels to int
|
|
||||||
# for compatibility with how trl's PRMTrainer works
|
|
||||||
if isinstance(dataset, Dataset):
|
|
||||||
dataset = dataset.cast_column("labels", Sequence(Value("int64")))
|
|
||||||
|
|
||||||
dataset_wrapper = TokenizedPromptDataset(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_loaded_strategy(
|
|
||||||
dataset_strategy: PromptTokenizingStrategy | DatasetWrappingStrategy,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter | None]:
|
|
||||||
"""Handle a dataset with a strategy loaded from the registry."""
|
|
||||||
if isinstance(dataset_strategy, DatasetWrappingStrategy):
|
|
||||||
return dataset_strategy.wrap_dataset(dataset, **dataset_kwargs), None
|
|
||||||
|
|
||||||
dataset_prompter = UnsupportedPrompter()
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_alpaca_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle an Alpaca dataset."""
|
|
||||||
dataset_prompter = AlpacaPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = AlpacaPromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_explainchoice_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle an ExplainChoice dataset."""
|
|
||||||
dataset_prompter = MultipleChoiceExplainPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_concisechoice_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a ConciseChoice dataset."""
|
|
||||||
dataset_prompter = MultipleChoiceConcisePrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_summarizetldr_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a SummarizeTLDR dataset."""
|
|
||||||
dataset_prompter = SummarizeTLDRPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = SummarizeTLDRPromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_jeopardy_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a Jeopardy dataset."""
|
|
||||||
dataset_prompter = JeopardyPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = JeopardyPromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_oasst_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle an OpenAssistant dataset."""
|
|
||||||
dataset_prompter = AlpacaPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = OpenAssistantPromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_gpteacher_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a GPTeacher dataset."""
|
|
||||||
dataset_prompter = GPTeacherPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = GPTeacherPromptTokenizingStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
def _handle_reflection_dataset(
|
|
||||||
dataset_prompt_style: str | None,
|
|
||||||
tokenizer: PreTrainedTokenizer,
|
|
||||||
cfg: DictDefault,
|
|
||||||
dataset: Dataset | IterableDataset,
|
|
||||||
dataset_kwargs: dict[str, Any],
|
|
||||||
) -> tuple[Dataset | IterableDataset, Prompter]:
|
|
||||||
"""Handle a Reflection dataset."""
|
|
||||||
dataset_prompter = ReflectAlpacaPrompter(dataset_prompt_style)
|
|
||||||
dataset_strategy = AlpacaReflectionPTStrategy(
|
|
||||||
dataset_prompter,
|
|
||||||
tokenizer,
|
|
||||||
cfg.train_on_inputs,
|
|
||||||
cfg.sequence_len,
|
|
||||||
)
|
|
||||||
dataset_wrapper = wrap_dataset_for_tokenized_prompt(
|
|
||||||
dataset_strategy,
|
|
||||||
dataset,
|
|
||||||
**dataset_kwargs,
|
|
||||||
)
|
|
||||||
return dataset_wrapper, dataset_prompter
|
|
||||||
|
|
||||||
|
|
||||||
DATASET_HANDLERS = {
|
|
||||||
"alpaca": _handle_alpaca_dataset,
|
|
||||||
"explainchoice": _handle_explainchoice_dataset,
|
|
||||||
"concisechoice": _handle_concisechoice_dataset,
|
|
||||||
"summarizetldr": _handle_summarizetldr_dataset,
|
|
||||||
"jeopardy": _handle_jeopardy_dataset,
|
|
||||||
"oasst": _handle_oasst_dataset,
|
|
||||||
"gpteacher": _handle_gpteacher_dataset,
|
|
||||||
"reflection": _handle_reflection_dataset,
|
|
||||||
}
|
|
||||||
@@ -1,567 +0,0 @@
|
|||||||
"""Wrapper for MistralTokenizer from mistral-common"""
|
|
||||||
|
|
||||||
import math
|
|
||||||
import os
|
|
||||||
from shutil import copyfile
|
|
||||||
from typing import TYPE_CHECKING, Optional
|
|
||||||
|
|
||||||
import numpy as np
|
|
||||||
from huggingface_hub import hf_hub_download
|
|
||||||
from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
|
|
||||||
from mistral_common.tokens.tokenizers.tekken import Tekkenizer
|
|
||||||
from torch import Tensor
|
|
||||||
from transformers.utils import PaddingStrategy
|
|
||||||
|
|
||||||
from axolotl.utils.collators.core import IGNORE_INDEX
|
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
|
||||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
|
||||||
|
|
||||||
|
|
||||||
def _get_file_path(path_or_repo_id: str, filename: str) -> str:
|
|
||||||
"""Get the file path from local or HF Hub"""
|
|
||||||
if os.path.exists(path_or_repo_id):
|
|
||||||
maybe_file_path = os.path.join(path_or_repo_id, filename)
|
|
||||||
if os.path.exists(maybe_file_path):
|
|
||||||
return maybe_file_path
|
|
||||||
|
|
||||||
raise FileNotFoundError(f"File not found at {path_or_repo_id}")
|
|
||||||
|
|
||||||
return hf_hub_download(repo_id=path_or_repo_id, filename=filename)
|
|
||||||
|
|
||||||
|
|
||||||
class HFMistralTokenizer:
|
|
||||||
"""
|
|
||||||
Wraps mistral_common.tokens.tokenizers.mistral.MistralTokenizer
|
|
||||||
and exposes HuggingFace API for special tokens.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(
|
|
||||||
self, mistral: MistralTokenizer, name_or_path: str, tokenizer_path: str
|
|
||||||
):
|
|
||||||
"""
|
|
||||||
Args:
|
|
||||||
mistral: The mistral-common tokenizer to wrap.
|
|
||||||
name_or_path: The name or path to the tokenizer files or the repo id.
|
|
||||||
"""
|
|
||||||
self._mistral = mistral
|
|
||||||
self._padding_side = "right"
|
|
||||||
self._name_or_path = name_or_path
|
|
||||||
self._tokenizer_path = tokenizer_path
|
|
||||||
|
|
||||||
# Manual set to training mode
|
|
||||||
from mistral_common.protocol.instruct.validator import (
|
|
||||||
MistralRequestValidator,
|
|
||||||
ValidationMode,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Check if MistralRequestValidator has a _mode attribute.
|
|
||||||
# This is a private API and may change in the future.
|
|
||||||
# pylint: disable=protected-access
|
|
||||||
if not (
|
|
||||||
hasattr(self._mistral, "_chat_completion_request_validator")
|
|
||||||
and isinstance(
|
|
||||||
self._mistral._chat_completion_request_validator,
|
|
||||||
MistralRequestValidator,
|
|
||||||
)
|
|
||||||
and hasattr(self._mistral._chat_completion_request_validator, "_mode")
|
|
||||||
):
|
|
||||||
raise RuntimeError(
|
|
||||||
"Unable to switch mistral tokenizer to finetuning mode – "
|
|
||||||
"private API `_chat_completion_request_validator._mode` missing."
|
|
||||||
)
|
|
||||||
|
|
||||||
self._mistral._chat_completion_request_validator._mode = (
|
|
||||||
ValidationMode.finetuning
|
|
||||||
)
|
|
||||||
|
|
||||||
def _load_system_prompt(self, path_or_repo_id: str) -> str:
|
|
||||||
"""Load system prompt from local or HF Hub.
|
|
||||||
|
|
||||||
Note: Unused for now as we don't want to explicitly set the system prompt if a user does
|
|
||||||
not provide one.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path_or_repo_id: The path to the tokenizer files or the repo id.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The system prompt.
|
|
||||||
"""
|
|
||||||
file_path = _get_file_path(path_or_repo_id, "SYSTEM_PROMPT.txt")
|
|
||||||
|
|
||||||
if not os.path.exists(file_path):
|
|
||||||
raise FileNotFoundError(f"System prompt file not found at {file_path}")
|
|
||||||
|
|
||||||
with open(file_path, "r", encoding="utf-8") as file:
|
|
||||||
return file.read()
|
|
||||||
|
|
||||||
@property
|
|
||||||
def bos_token_id(self) -> int:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.bos_id
|
|
||||||
|
|
||||||
@property
|
|
||||||
def eos_token_id(self) -> int:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.eos_id
|
|
||||||
|
|
||||||
@property
|
|
||||||
def pad_token_id(self) -> int:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.pad_id
|
|
||||||
|
|
||||||
@property
|
|
||||||
def unk_token_id(self) -> int:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.unk_id
|
|
||||||
|
|
||||||
@property
|
|
||||||
def bos_token(self) -> str:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.bos_token_id)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def eos_token(self) -> str:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.eos_token_id)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def pad_token(self) -> str:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.pad_token_id)
|
|
||||||
|
|
||||||
@property
|
|
||||||
def unk_token(self) -> str:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.id_to_piece(self.unk_token_id)
|
|
||||||
|
|
||||||
@property
def padding_side(self) -> str:
    """Side on which padding is applied, as stored on this wrapper."""
    side = self._padding_side
    return side
|
|
||||||
|
|
||||||
@property
def name_or_path(self) -> str:
    """Name or path this tokenizer was created with, as stored on this wrapper."""
    origin = self._name_or_path
    return origin
|
|
||||||
|
|
||||||
@property
|
|
||||||
def chat_template(self) -> str | None:
|
|
||||||
"""Chat template is not supported. Dummy method to satisfy HuggingFace API."""
|
|
||||||
return None
|
|
||||||
|
|
||||||
def __len__(self) -> int:
    """Return the vocabulary size (``n_words``) of the wrapped tokenizer."""
    raw_tokenizer = self._mistral.instruct_tokenizer.tokenizer
    return raw_tokenizer.n_words
|
|
||||||
|
|
||||||
@classmethod
def from_pretrained(
    cls,
    name_or_path: str,
    *,
    revision: Optional[str] = None,
    **kwargs,  # pylint: disable=unused-argument
) -> "HFMistralTokenizer":
    """
    Build a wrapper around a mistral Tekken tokenizer found locally or on the HF Hub.

    Args:
        name_or_path: The path to the tokenizer files or the repo id.
        revision: Must be left unset; pinning a revision is not supported yet.
        kwargs: Additional keyword arguments; accepted and ignored.

    Returns:
        A HFMistralTokenizer instance.

    Raises:
        NotImplementedError: If ``revision`` is given.
    """
    if revision:
        raise NotImplementedError(
            "Revision not supported yet for mistral-common tokenizer"
        )

    # Only the Tekken format is handled for now; the file is resolved from
    # ``name_or_path`` (downloaded from the HF Hub when it is not local).
    tekken_file = _get_file_path(name_or_path, "tekken.json")
    wrapped = MistralTokenizer.from_file(tekken_file)

    return cls(wrapped, name_or_path=name_or_path, tokenizer_path=tekken_file)
|
|
||||||
|
|
||||||
def save_pretrained(self, save_directory: str) -> None:
    """
    Copy the ``tekken.json`` this tokenizer was loaded from into ``save_directory``
    so that ``from_pretrained`` can pick it up again.

    Only Tekken tokenizers are supported.

    Args:
        save_directory: The directory to save the tokenizer files.

    Raises:
        RuntimeError: If the wrapped tokenizer is not a Tekkenizer, or if creating
            the directory or copying the file fails (the original error is chained).
    """
    inner = self._mistral.instruct_tokenizer.tokenizer

    if not isinstance(inner, Tekkenizer):
        raise RuntimeError(f"Unknown tokenizer type: {type(inner)}")

    try:
        os.makedirs(save_directory, exist_ok=True)

        # Confirm the target directory is really there before copying into it
        if not os.path.exists(save_directory):
            raise RuntimeError(f"Failed to create directory: {save_directory}")

        # Confirm the file we loaded from has not disappeared
        if not os.path.exists(self._tokenizer_path):
            raise FileNotFoundError(
                f"Source tokenizer file not found: {self._tokenizer_path}"
            )

        copyfile(self._tokenizer_path, os.path.join(save_directory, "tekken.json"))
    except Exception as e:
        # Re-raise with the paths involved so a failed save is easy to diagnose
        raise RuntimeError(
            f"Failed to save tokenizer to {save_directory}: {e}. "
            f"Source path: {self._tokenizer_path}, "
            f"Directory exists: {os.path.exists(save_directory)}"
        ) from e
|
|
||||||
|
|
||||||
def encode(self, text: str, add_special_tokens: bool = True) -> list[int]:
    """
    Turn a text string into token IDs using the wrapped tokenizer.

    Args:
        text: The text string to encode.
        add_special_tokens: Passed as both the ``bos`` and ``eos`` flag of the
            wrapped tokenizer's ``encode``.

    Returns:
        A list of token IDs.
    """
    raw_tokenizer = self._mistral.instruct_tokenizer.tokenizer
    return raw_tokenizer.encode(text, bos=add_special_tokens, eos=add_special_tokens)
|
|
||||||
|
|
||||||
def decode(
|
|
||||||
self, token_ids: int | list[int], skip_special_tokens: bool = False
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Decode a list of token IDs into a text string.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
token_ids: The int or list of token IDs to decode.
|
|
||||||
skip_special_tokens: Whether to skip special tokens in the decoded text.
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
The decoded text string.
|
|
||||||
"""
|
|
||||||
if isinstance(token_ids, int):
|
|
||||||
token_ids = [token_ids]
|
|
||||||
|
|
||||||
if skip_special_tokens:
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.decode(token_ids)
|
|
||||||
|
|
||||||
# to_string returns a string with special tokens
|
|
||||||
return self._mistral.instruct_tokenizer.tokenizer.to_string(token_ids)
|
|
||||||
|
|
||||||
def _create_mistral_chat_completion_request(
    self, conversation: list[dict], tools: list[dict] | None = None
) -> "ChatCompletionRequest":
    """
    Convert an OpenAI-style conversation (and optional tool definitions) into a
    mistral-common ``ChatCompletionRequest``.

    Args:
        conversation: List of turns. Each turn is a dict with a "role" of "user",
            "assistant", "tool" or "system". User and system turns must have
            "content"; assistant turns may have "content" and "tool_calls"; tool
            turns may have "content", "tool_call_id" and "name".
        tools: Optional list of tool definitions. Entries whose "type" is not
            "function" are skipped.

    Returns:
        The request holding the converted messages and tools.

    Raises:
        ValueError: If a turn has a missing or unsupported role.
    """
    from mistral_common.protocol.instruct.messages import (
        AssistantMessage,
        SystemMessage,
        ToolMessage,
        UserMessage,
    )
    from mistral_common.protocol.instruct.request import ChatCompletionRequest
    from mistral_common.protocol.instruct.tool_calls import Function, Tool

    messages: list[UserMessage | AssistantMessage | ToolMessage | SystemMessage] = []
    for turn in conversation:
        role = turn.get("role")

        if role == "user":
            messages.append(UserMessage(content=turn["content"]))
        elif role == "assistant":
            messages.append(
                AssistantMessage(
                    content=turn.get("content"),
                    tool_calls=turn.get("tool_calls"),
                )
            )
        elif role == "tool":
            messages.append(
                ToolMessage(
                    content=turn.get("content"),
                    tool_call_id=turn.get("tool_call_id"),
                    name=turn.get("name"),
                )
            )
        elif role == "system":
            messages.append(SystemMessage(content=turn["content"]))
        else:
            # Report the value already read with .get(): indexing turn["role"]
            # here would raise KeyError for a turn without a role and mask
            # this error.
            raise ValueError(
                f"Unknown role for use with mistral-common tokenizer: {role}"
            )

    mistral_tools: list[Tool] = []
    for tool in tools or []:
        if tool["type"] != "function":
            continue

        function = tool["function"]
        mistral_tools.append(
            Tool(
                function=Function(
                    name=function["name"],
                    # description is optional in tool schemas; fall back to an
                    # empty string instead of failing with KeyError
                    description=function.get("description", ""),
                    # set parameters to empty dict if not provided
                    parameters=function.get("parameters", {}),
                )
            )
        )

    return ChatCompletionRequest(messages=messages, tools=mistral_tools)
|
|
||||||
|
|
||||||
def apply_chat_template(
|
|
||||||
self,
|
|
||||||
messages: list[dict],
|
|
||||||
tokenize: bool = True,
|
|
||||||
tools: list[dict] | None = None,
|
|
||||||
chat_template: str | None = None, # pylint: disable=unused-argument
|
|
||||||
add_generation_prompt: bool = False, # pylint: disable=unused-argument
|
|
||||||
) -> list[int] | str:
|
|
||||||
if chat_template:
|
|
||||||
raise NotImplementedError("chat_template not supported yet")
|
|
||||||
|
|
||||||
if add_generation_prompt:
|
|
||||||
raise NotImplementedError("add_generation_prompt not supported yet")
|
|
||||||
|
|
||||||
chat_completion: ChatCompletionRequest = (
|
|
||||||
self._create_mistral_chat_completion_request(messages, tools)
|
|
||||||
)
|
|
||||||
|
|
||||||
tokens: list[int] = self._mistral.encode_chat_completion(chat_completion).tokens
|
|
||||||
|
|
||||||
if tokenize:
|
|
||||||
return tokens
|
|
||||||
|
|
||||||
return self.decode(tokens)
|
|
||||||
|
|
||||||
def pad(
    self,
    features: list[dict[str, list[int] | np.ndarray]],
    *,
    padding: bool | str | PaddingStrategy = True,
    max_length: int | None = None,
    pad_to_multiple_of: int | None = None,
    return_tensors: str | None = None,  # "np" (default) or "pt"
) -> dict[str, np.ndarray | Tensor]:
    """
    HF-style pad method that collates tokenized examples into one rectangular batch.

    Every example must provide 'input_ids', 'labels' and 'attention_mask'.
    'position_ids' is used when the first example has it (then every example
    must); otherwise positions 0..len(input_ids)-1 are generated per example.

    Padding is applied on ``self.padding_side`` ("left" pads on the left, anything
    else on the right) with these fill values:
    - 'input_ids': ``self.pad_token_id`` (0 when it is None)
    - 'labels': ``IGNORE_INDEX``
    - 'attention_mask': 0
    - 'position_ids': keeps counting from the last position when padding on the
      right, 0 when padding on the left

    Rows longer than the final width are cut to their first ``width`` entries.

    Args:
        features: The examples to collate.
        padding: True / "longest" pads to the longest 'input_ids'; "max_length"
            pads to ``max_length``; False / "do_not_pad" requests no target
            length, in which case rows are still padded to the longest row so
            they can be stacked.
        max_length: Target length for "max_length" padding.
        pad_to_multiple_of: Round the target length up to a multiple of this.
        return_tensors: "pt" for torch tensors; None or "np" for int64 numpy arrays.

    Returns:
        A dict with 'input_ids', 'labels', 'attention_mask' and 'position_ids',
        each of shape (len(features), width).

    Raises:
        ValueError: For empty ``features``, a 'token_type_ids' field, an unknown
            padding strategy, "max_length" padding without ``max_length``, or an
            unsupported ``return_tensors``.
        NotImplementedError: If an example has a field other than the four above.
    """
    import torch

    if not features:
        raise ValueError("features must not be empty")

    # Validate everything up front so bad input fails before any tensor work
    if any("token_type_ids" in f for f in features):
        raise ValueError("token_type_ids is not supported by this tokenizer")

    sequence_fields = {"input_ids", "labels", "attention_mask", "position_ids"}
    for feature in features:
        for key in feature:
            if key not in sequence_fields:
                raise NotImplementedError(
                    f"Non-sequence field {key} not handled yet"
                )

    if return_tensors not in (None, "np", "pt"):
        raise ValueError(f"Unsupported return_tensors='{return_tensors}'")

    # Determine desired sequence length
    if padding in (True, "longest", PaddingStrategy.LONGEST):
        target_length = max(len(f["input_ids"]) for f in features)
    elif padding in ("max_length", PaddingStrategy.MAX_LENGTH):
        if max_length is None:
            raise ValueError("max_length must be set for 'max_length' padding")
        target_length = max_length
    elif padding in (False, "do_not_pad", PaddingStrategy.DO_NOT_PAD):
        target_length = None
    else:
        raise ValueError(f"Unknown padding strategy: {padding}")

    if target_length is not None and pad_to_multiple_of is not None:
        # integer ceil-division: exact for any size, no float round-trip
        target_length = -(-target_length // pad_to_multiple_of) * pad_to_multiple_of

    def to_row(values) -> "torch.Tensor":
        return torch.as_tensor(values, dtype=torch.long)

    rows = {
        name: [to_row(f[name]) for f in features]
        for name in ("input_ids", "labels", "attention_mask")
    }
    if "position_ids" in features[0]:
        position_rows = [to_row(f["position_ids"]) for f in features]
    else:
        position_rows = [
            torch.arange(len(ids), dtype=torch.long) for ids in rows["input_ids"]
        ]

    if target_length is None:
        # No target requested: use the longest row of any field so the batch
        # can still be stacked into one tensor.
        width = max(
            len(row)
            for field_rows in (*rows.values(), position_rows)
            for row in field_rows
        )
    else:
        width = target_length

    pad_left = self.padding_side == "left"

    def fit(row: "torch.Tensor", fill: int) -> "torch.Tensor":
        """Pad ``row`` with ``fill`` on the configured side, or cut it to ``width``."""
        missing = width - len(row)
        if missing <= 0:
            return row[:width]
        filler = torch.full((missing,), fill, dtype=torch.long)
        return torch.cat([filler, row] if pad_left else [row, filler])

    def fit_positions(row: "torch.Tensor") -> "torch.Tensor":
        """Like ``fit`` but right padding continues the position count."""
        missing = width - len(row)
        if missing <= 0:
            return row[:width]
        if pad_left:
            return torch.cat([torch.zeros(missing, dtype=torch.long), row])
        start = int(row[-1]) + 1 if len(row) > 0 else 0
        return torch.cat(
            [row, torch.arange(start, start + missing, dtype=torch.long)]
        )

    pad_id = self.pad_token_id if self.pad_token_id is not None else 0
    fill_values = {"input_ids": pad_id, "labels": IGNORE_INDEX, "attention_mask": 0}

    final_batch = {
        name: torch.stack([fit(row, fill_values[name]) for row in field_rows])
        for name, field_rows in rows.items()
    }
    final_batch["position_ids"] = torch.stack(
        [fit_positions(row) for row in position_rows]
    )

    if return_tensors == "pt":
        return final_batch

    # np.int64 rather than np.long: np.long is missing on NumPy 1.24-1.26 and
    # platform-dependent on NumPy 2.x
    return {name: tensor.numpy().astype(np.int64) for name, tensor in final_batch.items()}
|
|
||||||
|
|
||||||
def convert_ids_to_tokens(self, ids: list[int]) -> list[str]:
    """
    Map each token ID to its string piece via the wrapped tokenizer.

    Args:
        ids: The list of token IDs to convert.

    Returns:
        The list of tokens, in the same order as ``ids``.
    """
    piece_of = self._mistral.instruct_tokenizer.tokenizer.id_to_piece
    return [piece_of(token_id) for token_id in ids]
|
|
||||||
@@ -3,7 +3,6 @@ Multipack Batch Sampler - An efficient batch sampler for packing variable-length
|
|||||||
into fixed-capacity batches to optimize memory usage and training throughput.
|
into fixed-capacity batches to optimize memory usage and training throughput.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import gc
|
|
||||||
import math
|
import math
|
||||||
from concurrent.futures import ProcessPoolExecutor
|
from concurrent.futures import ProcessPoolExecutor
|
||||||
from multiprocessing import cpu_count, get_context
|
from multiprocessing import cpu_count, get_context
|
||||||
@@ -146,7 +145,7 @@ def pack_parallel(
|
|||||||
"""
|
"""
|
||||||
num_items = len(sequence_lengths)
|
num_items = len(sequence_lengths)
|
||||||
if num_processes is None:
|
if num_processes is None:
|
||||||
num_processes = max(1, min(num_items // group_size, cpu_count(), 16))
|
num_processes = max(1, min(num_items // group_size, cpu_count()))
|
||||||
|
|
||||||
# Create tasks for parallel processing
|
# Create tasks for parallel processing
|
||||||
tasks = []
|
tasks = []
|
||||||
@@ -259,8 +258,8 @@ class MultipackBatchSampler(BatchSampler):
|
|||||||
batch_max_len: int, # Maximum sequence length (bin capacity)
|
batch_max_len: int, # Maximum sequence length (bin capacity)
|
||||||
lengths: np.ndarray, # Sequence lengths
|
lengths: np.ndarray, # Sequence lengths
|
||||||
packing_efficiency_estimate: float = 1.0, # Initial efficiency estimate
|
packing_efficiency_estimate: float = 1.0, # Initial efficiency estimate
|
||||||
drop_last: bool = True, # Whether to drop final batches (might be incomplete)
|
drop_last: bool = False, # Whether to drop final batches (might be incomplete)
|
||||||
num_count_samples: int = 8, # Number of times to estimate batch count
|
num_count_samples: int = 16, # Number of times to estimate batch count
|
||||||
sequential: bool = False, # Whether to use sequential packing
|
sequential: bool = False, # Whether to use sequential packing
|
||||||
group_size: int = 100_000, # Size of groups for parallel packing
|
group_size: int = 100_000, # Size of groups for parallel packing
|
||||||
bin_size: int = 200, # The max number of samples that can be packed in a single bin
|
bin_size: int = 200, # The max number of samples that can be packed in a single bin
|
||||||
@@ -350,7 +349,6 @@ class MultipackBatchSampler(BatchSampler):
|
|||||||
# Calculate efficiency statistics
|
# Calculate efficiency statistics
|
||||||
total_used = lengths.sum()
|
total_used = lengths.sum()
|
||||||
total_slots = len(all_bins) * self.batch_max_len
|
total_slots = len(all_bins) * self.batch_max_len
|
||||||
del all_bins
|
|
||||||
|
|
||||||
# Group bins into batches (each batch contains batch_size bins)
|
# Group bins into batches (each batch contains batch_size bins)
|
||||||
batches = [
|
batches = [
|
||||||
@@ -370,7 +368,6 @@ class MultipackBatchSampler(BatchSampler):
|
|||||||
self.total_token_slots += total_slots
|
self.total_token_slots += total_slots
|
||||||
|
|
||||||
self._batches = batches
|
self._batches = batches
|
||||||
gc.collect()
|
|
||||||
return batches
|
return batches
|
||||||
|
|
||||||
def __iter__(self) -> Iterator[list[list[int]]]:
|
def __iter__(self) -> Iterator[list[list[int]]]:
|
||||||
@@ -446,18 +443,10 @@ class MultipackBatchSampler(BatchSampler):
|
|||||||
|
|
||||||
if self._len_across_ranks is None:
|
if self._len_across_ranks is None:
|
||||||
# Sample multiple times to get stable estimate
|
# Sample multiple times to get stable estimate
|
||||||
_sampled_lens = []
|
len_batches = min( # pylint: disable=consider-using-generator
|
||||||
for _ in range(self.num_count_samples):
|
[len(self._batches) for _ in range(self.num_count_samples)]
|
||||||
self._batches = None # Reset cached batches
|
)
|
||||||
_sampled_lens.append(len(self.generate_batches(set_stats=False)))
|
|
||||||
len_batches = min(_sampled_lens)
|
|
||||||
|
|
||||||
# Gather minimum across all ranks
|
# Gather minimum across all ranks
|
||||||
if self._len_across_ranks is None:
|
self._len_across_ranks = self.gather_len_batches(len_batches)
|
||||||
self._len_across_ranks = self.gather_len_batches(len_batches)
|
|
||||||
else:
|
|
||||||
self._len_across_ranks = min(
|
|
||||||
self._len_across_ranks, self.gather_len_batches(len_batches)
|
|
||||||
)
|
|
||||||
|
|
||||||
return self._len_across_ranks
|
return self._len_across_ranks
|
||||||
|
|||||||
@@ -102,8 +102,6 @@ class AxolotlInputConfig(
|
|||||||
dpo_use_weighting: bool | None = None
|
dpo_use_weighting: bool | None = None
|
||||||
dpo_use_logits_to_keep: bool | None = None
|
dpo_use_logits_to_keep: bool | None = None
|
||||||
dpo_label_smoothing: float | None = None
|
dpo_label_smoothing: float | None = None
|
||||||
dpo_norm_loss: bool | None = None
|
|
||||||
dpo_padding_free: bool | None = None
|
|
||||||
|
|
||||||
datasets: (
|
datasets: (
|
||||||
Annotated[
|
Annotated[
|
||||||
@@ -338,14 +336,6 @@ class AxolotlInputConfig(
|
|||||||
|
|
||||||
plugins: list[str] | None = Field(default=None)
|
plugins: list[str] | None = Field(default=None)
|
||||||
|
|
||||||
@field_validator("seed", mode="after")
@classmethod
def set_default_seed(cls, seed):
    """Return ``seed`` unchanged, or 42 (with an info log) when it is None."""
    if seed is not None:
        return seed

    LOG.info("`seed` not set in config; setting to 42")
    return 42
|
|
||||||
|
|
||||||
@field_validator("datasets", mode="before")
|
@field_validator("datasets", mode="before")
|
||||||
@classmethod
|
@classmethod
|
||||||
def deprecate_sharegpt_datasets(cls, datasets):
|
def deprecate_sharegpt_datasets(cls, datasets):
|
||||||
@@ -1209,7 +1199,7 @@ class AxolotlInputConfig(
|
|||||||
"flash_attention: true must be set with sequence_parallel_degree > 1"
|
"flash_attention: true must be set with sequence_parallel_degree > 1"
|
||||||
)
|
)
|
||||||
|
|
||||||
if self.sample_packing and getattr(self, "micro_batch_size", 1) > 1:
|
if self.sample_packing and self.micro_batch_size > 1:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"micro_batch_size must be set to 1 when sample_packing is enabled "
|
"micro_batch_size must be set to 1 when sample_packing is enabled "
|
||||||
"due to a `ring-flash-attn` requirement"
|
"due to a `ring-flash-attn` requirement"
|
||||||
@@ -1267,71 +1257,9 @@ class AxolotlInputConfig(
|
|||||||
)
|
)
|
||||||
return data
|
return data
|
||||||
|
|
||||||
@model_validator(mode="before")
@classmethod
def check_tokenizer_use_mistral_common(cls, data):
    """Default ``tokenizer_use_mistral_common`` to True for Magistral models.

    When the option is unset (missing or None) and "magistral" appears,
    case-insensitively, in ``base_model``, ``base_model_config`` or
    ``tokenizer_config``, a warning is logged and the option is set to True.
    An explicit True/False is left untouched.

    Args:
        data: The raw config mapping being validated.

    Returns:
        The same mapping, possibly with ``tokenizer_use_mistral_common`` set.
    """
    if data.get("tokenizer_use_mistral_common") is not None:
        return data

    candidates = (
        data.get("base_model"),
        data.get("base_model_config"),
        data.get("tokenizer_config"),
    )
    # A key that is present but null yields None (the .get() default only
    # applies to missing keys), so skip empty values instead of calling
    # .lower() on them.
    if any("magistral" in str(name).lower() for name in candidates if name):
        LOG.warning(
            "tokenizer_use_mistral_common auto inferred to True for Magistral models. Please set it to True explicitly if you want to use mistral-common tokenizer."
        )
        data["tokenizer_use_mistral_common"] = True

    return data
|
|
||||||
|
|
||||||
@field_validator("tokenizer_use_mistral_common", mode="after")
@classmethod
def check_mistral_common_import(cls, tokenizer_use_mistral_common):
    """Fail early if the mistral-common tokenizer is requested but not importable.

    Returns the value unchanged. Raises ImportError (chained to the original
    error) when the value is truthy and ``mistral_common`` cannot be imported.
    """
    if not tokenizer_use_mistral_common:
        return tokenizer_use_mistral_common

    try:
        import mistral_common  # noqa: F401 # pylint:disable=unused-import
    except ImportError as exception:
        raise ImportError(
            "mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`."
        ) from exception

    return tokenizer_use_mistral_common
|
|
||||||
|
|
||||||
@model_validator(mode="before")
@classmethod
def check_mistral_common_incompatible_options(cls, data):
    """Reject options that edit the tokenizer when mistral-common is in use.

    Does nothing unless ``tokenizer_use_mistral_common`` is truthy. Otherwise
    raises ValueError for the first of ``added_tokens_overrides``,
    ``special_tokens``, ``tokens`` or ``chat_template`` that is set.
    """
    if not data.get("tokenizer_use_mistral_common"):
        return data

    # NOTE: mistral-common tokenizer is not compatible with editing tokenizer at the moment
    unsupported = (
        (
            "added_tokens_overrides",
            "added_tokens_overrides is not supported with mistral-common tokenizer",
        ),
        (
            "special_tokens",
            "special_tokens override is not supported with mistral-common tokenizer",
        ),
        (
            "tokens",
            "tokens override is not supported with mistral-common tokenizer",
        ),
        (
            "chat_template",
            "Setting chat_template is not supported with mistral-common tokenizer",
        ),
    )
    for option, message in unsupported:
        if data.get(option):
            raise ValueError(message)

    return data
|
|
||||||
|
|
||||||
|
|
||||||
class AxolotlConfigWCapabilities(AxolotlInputConfig):
|
class AxolotlConfigWCapabilities(AxolotlInputConfig):
|
||||||
"""wrapper to valdiate gpu capabilities with the configured options"""
|
"""Wrapper to validate GPU capabilities with the config options"""
|
||||||
|
|
||||||
capabilities: GPUCapabilities
|
capabilities: GPUCapabilities
|
||||||
env_capabilities: EnvCapabilities
|
env_capabilities: EnvCapabilities
|
||||||
|
|||||||
@@ -43,7 +43,6 @@ class SFTDataset(BaseModel):
|
|||||||
field_human: str | None = None
|
field_human: str | None = None
|
||||||
field_model: str | None = None
|
field_model: str | None = None
|
||||||
field_messages: str | None = None
|
field_messages: str | None = None
|
||||||
field_tools: str | None = None
|
|
||||||
# deprecated, use message_property_mappings
|
# deprecated, use message_property_mappings
|
||||||
message_field_role: str | None = None
|
message_field_role: str | None = None
|
||||||
# deprecated, use message_property_mappings
|
# deprecated, use message_property_mappings
|
||||||
|
|||||||
@@ -18,7 +18,6 @@ class ModelInputConfig(BaseModel):
|
|||||||
tokenizer_config: str | None = None
|
tokenizer_config: str | None = None
|
||||||
tokenizer_use_fast: bool | None = None
|
tokenizer_use_fast: bool | None = None
|
||||||
tokenizer_legacy: bool | None = None
|
tokenizer_legacy: bool | None = None
|
||||||
tokenizer_use_mistral_common: bool | None = None
|
|
||||||
tokenizer_type: str | None = Field(
|
tokenizer_type: str | None = Field(
|
||||||
default=None, json_schema_extra={"description": "transformers tokenizer class"}
|
default=None, json_schema_extra={"description": "transformers tokenizer class"}
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ from datasets import IterableDataset, disable_caching, enable_caching
|
|||||||
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
|
||||||
from transformers.utils import is_torch_bf16_gpu_available
|
from transformers.utils import is_torch_bf16_gpu_available
|
||||||
|
|
||||||
|
from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
|
||||||
from axolotl.monkeypatch.trainer_eval_guard import patch_evaluation_loop_for_fsdp2
|
from axolotl.monkeypatch.trainer_eval_guard import patch_evaluation_loop_for_fsdp2
|
||||||
from axolotl.utils.distributed import reduce_and_broadcast
|
from axolotl.utils.distributed import reduce_and_broadcast
|
||||||
from axolotl.utils.environment import check_cuda_p2p_ib_support
|
from axolotl.utils.environment import check_cuda_p2p_ib_support
|
||||||
@@ -466,7 +467,6 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
|
|||||||
bin_size=cfg.sample_packing_bin_size,
|
bin_size=cfg.sample_packing_bin_size,
|
||||||
sequential=cfg.sample_packing_sequentially,
|
sequential=cfg.sample_packing_sequentially,
|
||||||
drop_last=True,
|
drop_last=True,
|
||||||
num_processes=cfg.dataset_processes,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
data_loader = DataLoader(
|
data_loader = DataLoader(
|
||||||
@@ -482,9 +482,6 @@ def calculate_total_num_steps(cfg, train_dataset, update=True):
|
|||||||
data_loader_len * cfg.num_epochs * cfg.sequence_parallel_degree
|
data_loader_len * cfg.num_epochs * cfg.sequence_parallel_degree
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if cfg.dataloader_drop_last:
|
|
||||||
# drop the last batch for each epoch
|
|
||||||
total_num_steps -= int(math.ceil(cfg.num_epochs))
|
|
||||||
|
|
||||||
def calc_sample_packing_eff_est(estimates: List[float]):
|
def calc_sample_packing_eff_est(estimates: List[float]):
|
||||||
LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}")
|
LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}")
|
||||||
@@ -632,8 +629,6 @@ def setup_trainer(
|
|||||||
A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based
|
A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based
|
||||||
on the provided parameters.
|
on the provided parameters.
|
||||||
"""
|
"""
|
||||||
from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
|
|
||||||
|
|
||||||
if (
|
if (
|
||||||
cfg.torch_compile
|
cfg.torch_compile
|
||||||
and cfg.fsdp_config
|
and cfg.fsdp_config
|
||||||
|
|||||||
@@ -1,6 +1,4 @@
|
|||||||
"""
|
"""Shared pytest fixtures"""
|
||||||
shared pytest fixtures
|
|
||||||
"""
|
|
||||||
|
|
||||||
import functools
|
import functools
|
||||||
import importlib
|
import importlib
|
||||||
@@ -559,3 +557,9 @@ def test_load_fixtures(
|
|||||||
download_llama2_model_fixture,
|
download_llama2_model_fixture,
|
||||||
):
|
):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
def disable_telemetry(monkeypatch):
    """Set ``AXOLOTL_DO_NOT_TRACK=1`` in the environment around every test."""
    # monkeypatch undoes the variable after the test finishes
    monkeypatch.setenv("AXOLOTL_DO_NOT_TRACK", "1")
    yield
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ from axolotl.common.datasets import load_datasets
|
|||||||
from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
|
from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
|
||||||
from axolotl.loaders import ModelLoader, load_tokenizer
|
from axolotl.loaders import ModelLoader, load_tokenizer
|
||||||
from axolotl.utils.config import normalize_config
|
from axolotl.utils.config import normalize_config
|
||||||
from axolotl.utils.data import prepare_preference_datasets
|
from axolotl.utils.data.rl import load_prepare_preference_datasets
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
from axolotl.utils.schemas.enums import RLType
|
from axolotl.utils.schemas.enums import RLType
|
||||||
|
|
||||||
@@ -451,19 +451,15 @@ def rand_reward_func(prompts, completions) -> list[float]:
|
|||||||
# Only use mock for the commented out configs
|
# Only use mock for the commented out configs
|
||||||
if dataset_name is not None:
|
if dataset_name is not None:
|
||||||
with patch(
|
with patch(
|
||||||
"axolotl.utils.data.rl.load_dataset_with_config"
|
"axolotl.utils.data.rl.load_dataset_w_config"
|
||||||
) as mock_load_dataset:
|
) as mock_load_dataset:
|
||||||
mock_load_dataset.return_value = request.getfixturevalue(
|
mock_load_dataset.return_value = request.getfixturevalue(
|
||||||
dataset_name
|
dataset_name
|
||||||
)
|
)
|
||||||
train_dataset, eval_dataset = prepare_preference_datasets(
|
train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
|
||||||
cfg, tokenizer
|
|
||||||
)
|
|
||||||
else:
|
else:
|
||||||
# Load actual datasets for orpo_cfg and kto_cfg
|
# Load actual datasets for orpo_cfg and kto_cfg
|
||||||
train_dataset, eval_dataset = prepare_preference_datasets(
|
train_dataset, eval_dataset = load_prepare_preference_datasets(cfg)
|
||||||
cfg, tokenizer
|
|
||||||
)
|
|
||||||
|
|
||||||
builder.train_dataset = train_dataset
|
builder.train_dataset = train_dataset
|
||||||
builder.eval_dataset = eval_dataset
|
builder.eval_dataset = eval_dataset
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ Simple end-to-end test for Cut Cross Entropy integration
|
|||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
|
from axolotl.cli.args import TrainerCliArgs
|
||||||
from axolotl.common.datasets import load_datasets
|
from axolotl.common.datasets import load_datasets
|
||||||
from axolotl.train import train
|
from axolotl.train import train
|
||||||
from axolotl.utils import get_pytorch_version
|
from axolotl.utils import get_pytorch_version
|
||||||
@@ -58,7 +59,8 @@ class TestCutCrossEntropyIntegration:
|
|||||||
cfg = validate_config(cfg)
|
cfg = validate_config(cfg)
|
||||||
prepare_plugins(cfg)
|
prepare_plugins(cfg)
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
dataset_meta = load_datasets(cfg=cfg)
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
major, minor, _ = get_pytorch_version()
|
major, minor, _ = get_pytorch_version()
|
||||||
if (major, minor) < (2, 4):
|
if (major, minor) < (2, 4):
|
||||||
@@ -103,7 +105,8 @@ class TestCutCrossEntropyIntegration:
|
|||||||
cfg = validate_config(cfg)
|
cfg = validate_config(cfg)
|
||||||
prepare_plugins(cfg)
|
prepare_plugins(cfg)
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
dataset_meta = load_datasets(cfg=cfg)
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
major, minor, _ = get_pytorch_version()
|
major, minor, _ = get_pytorch_version()
|
||||||
if (major, minor) < (2, 4):
|
if (major, minor) < (2, 4):
|
||||||
@@ -131,7 +134,8 @@ class TestCutCrossEntropyIntegration:
|
|||||||
cfg = validate_config(cfg)
|
cfg = validate_config(cfg)
|
||||||
prepare_plugins(cfg)
|
prepare_plugins(cfg)
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
dataset_meta = load_datasets(cfg=cfg)
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
major, minor, _ = get_pytorch_version()
|
major, minor, _ = get_pytorch_version()
|
||||||
if (major, minor) < (2, 4):
|
if (major, minor) < (2, 4):
|
||||||
|
|||||||
@@ -5,6 +5,7 @@ e2e tests to make sure all the hooks are fired on the plugin
|
|||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
from axolotl.cli.args import TrainerCliArgs
|
||||||
from axolotl.common.datasets import load_datasets
|
from axolotl.common.datasets import load_datasets
|
||||||
from axolotl.integrations.base import BasePlugin
|
from axolotl.integrations.base import BasePlugin
|
||||||
from axolotl.train import train
|
from axolotl.train import train
|
||||||
@@ -159,7 +160,8 @@ class TestPluginHooks:
|
|||||||
cfg = validate_config(cfg)
|
cfg = validate_config(cfg)
|
||||||
prepare_plugins(cfg)
|
prepare_plugins(cfg)
|
||||||
normalize_config(cfg)
|
normalize_config(cfg)
|
||||||
dataset_meta = load_datasets(cfg=cfg)
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
train(cfg=cfg, dataset_meta=dataset_meta)
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
check_model_output_exists(temp_dir, cfg)
|
check_model_output_exists(temp_dir, cfg)
|
||||||
|
|||||||
@@ -5,9 +5,11 @@ e2e tests for kd trainer support in Axolotl
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import yaml
|
|
||||||
from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port
|
|
||||||
|
|
||||||
|
from axolotl.cli.args import TrainerCliArgs
|
||||||
|
from axolotl.common.datasets import load_datasets
|
||||||
|
from axolotl.train import train
|
||||||
|
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
|
||||||
from axolotl.utils.dict import DictDefault
|
from axolotl.utils.dict import DictDefault
|
||||||
|
|
||||||
from tests.e2e.utils import check_tensorboard, require_torch_2_5_1
|
from tests.e2e.utils import check_tensorboard, require_torch_2_5_1
|
||||||
@@ -16,8 +18,8 @@ from tests.e2e.utils import check_tensorboard, require_torch_2_5_1
|
|||||||
@pytest.fixture(name="kd_min_cfg")
|
@pytest.fixture(name="kd_min_cfg")
|
||||||
def min_cfg(temp_dir):
|
def min_cfg(temp_dir):
|
||||||
return {
|
return {
|
||||||
"base_model": "Qwen/Qwen3-0.6B",
|
"base_model": "osllmai-community/Llama-3.2-1B",
|
||||||
"tokenizer_config": "winglian/qwen3-14b-math",
|
"tokenizer_config": "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer",
|
||||||
"plugins": [
|
"plugins": [
|
||||||
"axolotl.integrations.kd.KDPlugin",
|
"axolotl.integrations.kd.KDPlugin",
|
||||||
"axolotl.integrations.liger.LigerPlugin",
|
"axolotl.integrations.liger.LigerPlugin",
|
||||||
@@ -30,22 +32,20 @@ def min_cfg(temp_dir):
|
|||||||
"kd_ce_alpha": 0.1,
|
"kd_ce_alpha": 0.1,
|
||||||
"kd_alpha": 0.9,
|
"kd_alpha": 0.9,
|
||||||
"kd_temperature": 1.0,
|
"kd_temperature": 1.0,
|
||||||
"kd_beta": 0.0,
|
|
||||||
"kd_normalize_topk": True,
|
|
||||||
"dataloader_prefetch_factor": 8,
|
"dataloader_prefetch_factor": 8,
|
||||||
"dataloader_num_workers": 4,
|
"dataloader_num_workers": 4,
|
||||||
"dataloader_pin_memory": True,
|
"dataloader_pin_memory": True,
|
||||||
"datasets": [
|
"datasets": [
|
||||||
{
|
{
|
||||||
"path": "winglian/OpenThoughts-114k-math-correct-qwen3-14b-math-prepared-topk128-normalized",
|
"path": "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample",
|
||||||
"type": "chat_template",
|
"type": "axolotl.integrations.kd.chat_template",
|
||||||
|
"field_messages": "messages_combined",
|
||||||
"split": "train",
|
"split": "train",
|
||||||
"split_thinking": True,
|
"logprobs_field": "llm_text_generation_vllm_logprobs",
|
||||||
"eot_tokens": ["<|im_end|>"],
|
"temperature": 1.0,
|
||||||
"data_files": ["train/batch-000000.parquet"],
|
"preprocess_shards": 2,
|
||||||
},
|
},
|
||||||
],
|
],
|
||||||
"skip_prepare_dataset": True,
|
|
||||||
"val_set_size": 0.0,
|
"val_set_size": 0.0,
|
||||||
"sequence_len": 2048,
|
"sequence_len": 2048,
|
||||||
"sample_packing": True,
|
"sample_packing": True,
|
||||||
@@ -81,29 +81,18 @@ class TestKnowledgeDistillation:
|
|||||||
def test_llama_kd(self, temp_dir, kd_min_cfg):
|
def test_llama_kd(self, temp_dir, kd_min_cfg):
|
||||||
cfg = DictDefault(kd_min_cfg)
|
cfg = DictDefault(kd_min_cfg)
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
# write cfg to yaml file
|
cfg = validate_config(cfg)
|
||||||
Path(temp_dir).mkdir(parents=True, exist_ok=True)
|
prepare_plugins(cfg)
|
||||||
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
|
normalize_config(cfg)
|
||||||
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
execute_subprocess_async(
|
|
||||||
[
|
|
||||||
"axolotl",
|
|
||||||
"train",
|
|
||||||
str(Path(temp_dir) / "config.yaml"),
|
|
||||||
"--num-processes",
|
|
||||||
"1",
|
|
||||||
"--main-process-port",
|
|
||||||
f"{get_torch_dist_unique_port()}",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
|
|
||||||
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
assert (Path(temp_dir) / "model.safetensors").exists()
|
assert (Path(temp_dir) / "model.safetensors").exists()
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high"
|
||||||
)
|
)
|
||||||
|
|
||||||
@pytest.mark.skip(reason="Chunked KD loss doesn't support PEFT/LoRA")
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"load_in_8bit",
|
"load_in_8bit",
|
||||||
[True, False],
|
[True, False],
|
||||||
@@ -123,22 +112,13 @@ class TestKnowledgeDistillation:
|
|||||||
| kd_min_cfg
|
| kd_min_cfg
|
||||||
)
|
)
|
||||||
# pylint: disable=duplicate-code
|
# pylint: disable=duplicate-code
|
||||||
# write cfg to yaml file
|
cfg = validate_config(cfg)
|
||||||
Path(temp_dir).mkdir(parents=True, exist_ok=True)
|
prepare_plugins(cfg)
|
||||||
with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
|
normalize_config(cfg)
|
||||||
fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
|
cli_args = TrainerCliArgs()
|
||||||
|
dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
|
||||||
|
|
||||||
execute_subprocess_async(
|
train(cfg=cfg, dataset_meta=dataset_meta)
|
||||||
[
|
|
||||||
"axolotl",
|
|
||||||
"train",
|
|
||||||
str(Path(temp_dir) / "config.yaml"),
|
|
||||||
"--num-processes",
|
|
||||||
"1",
|
|
||||||
"--main-process-port",
|
|
||||||
f"{get_torch_dist_unique_port()}",
|
|
||||||
]
|
|
||||||
)
|
|
||||||
assert (Path(temp_dir) / "adapter_model.safetensors").exists()
|
assert (Path(temp_dir) / "adapter_model.safetensors").exists()
|
||||||
check_tensorboard(
|
check_tensorboard(
|
||||||
temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high"
|
temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high"
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user