From 3c0d96db45133a1f519ba44356a4fb501359c962 Mon Sep 17 00:00:00 2001
From: Quarto GHA Workflow Runner
Date: Thu, 25 Sep 2025 05:12:06 +0000
Subject: [PATCH] Built site for gh-pages

---
 .nojekyll                              |    2 +-
 docs/optimizations.html                | 1086 +++
 docs/dataset-formats/conversation.html |  128 +-
 docs/dataset-formats/index.html        |   10 +-
 docs/faq.html                          |   12 +-
 docs/qat.html                          |   22 +-
 search.json                            | 7046 ++--
 sitemap.xml                            | 1552 +--
 site_libs/bootstrap/bootstrap-*.min.css |   2 +-
 [~194 other regenerated HTML pages, 8 +- each]
 203 files changed, 6888 insertions(+), 4524 deletions(-)
 create mode 100644 docs/optimizations.html
 rename site_libs/bootstrap/{bootstrap-f9d679a32da2b248d4ca48a0e58e089e.min.css => bootstrap-08d9eb451d58809f35fda8b852d737d8.min.css} (85%)

diff --git a/.nojekyll b/.nojekyll
index e80cac93b..2d2e6c912 100644
--- a/.nojekyll
+++ b/.nojekyll
@@ -1 +1 @@
-b20662cb
\ No newline at end of file
+8780fa26
\ No newline at end of file

[Hunks for FAQS.html and the other regenerated HTML pages omitted: the extracted fragments contained only stripped <link>/<script> tag markup from the site rebuild.]
diff --git a/docs/optimizations.html b/docs/optimizations.html
new file mode 100644
--- /dev/null
+++ b/docs/optimizations.html
[Content of the new page follows, extracted as plain text:]
Optimizations Guide

A guide to the performance and memory optimizations available in Axolotl.

Axolotl includes numerous optimizations to speed up training, reduce memory usage, and handle large models.


This guide provides a high-level overview and directs you to the detailed documentation for each feature.


Speed Optimizations


These optimizations focus on increasing training throughput and reducing total training time.


Sample Packing


Improves GPU utilization by combining multiple short sequences into a single packed sequence for training. This requires enabling one of the attention implementations below.
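For example, packing is switched on with a single flag in the YAML config, alongside an attention backend. A minimal sketch; sample_packing and pad_to_sequence_len are the documented option names, but verify them against the config reference for your Axolotl version:

    sample_packing: true        # pack multiple short samples into each sequence
    pad_to_sequence_len: true   # commonly recommended alongside packing
    flash_attention: true       # packing requires one of the attention backends below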


Attention Implementations


Using an optimized attention implementation is critical for training speed.

  • Flash Attention 2: flash_attention: true. (Recommended) The industry standard for fast attention on modern GPUs; requires Ampere or newer. For AMD GPUs, see AMD Support.
  • Flex Attention: flex_attention: true. PyTorch’s FlexAttention API.
  • SDP Attention: sdp_attention: true. PyTorch’s native scaled dot-product implementation.
  • Xformers: xformers_attention: true. Memory-efficient attention from the xFormers library; works with FP16.

Note: You should only enable one attention backend.


LoRA Optimizations


Leverages optimized kernels to accelerate LoRA training and reduce memory usage.
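A sketch of enabling the custom Triton kernels during a LoRA run. These flag names follow the LoRA optimizations docs, but treat them as assumptions and confirm them for your installed version:

    adapter: lora
    lora_mlp_kernel: true   # fused MLP kernel
    lora_qkv_kernel: true   # fused attention QKV kernel
    lora_o_kernel: true     # fused attention output kernel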


Memory Optimizations


These techniques help you fit larger models or use bigger batch sizes on your existing hardware.


Parameter Efficient Finetuning (LoRA & QLoRA)


Drastically reduces memory by training a small set of “adapter” parameters instead of the full model. This is the most common and effective memory-saving technique.
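A minimal QLoRA sketch; the option names are standard Axolotl config keys, but the values are illustrative defaults rather than tuned recommendations:

    adapter: qlora
    load_in_4bit: true        # quantize the frozen base model to 4-bit
    lora_r: 16                # adapter rank
    lora_alpha: 32
    lora_dropout: 0.05
    lora_target_linear: true  # attach adapters to all linear layers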


Gradient Checkpointing & Activation Offloading


These techniques save VRAM by changing how activations are handled.

  • Gradient Checkpointing: re-computes activations during the backward pass, trading compute time for VRAM.
  • Activation Offloading: moves activations to CPU RAM or disk, trading I/O overhead for VRAM. A config sketch follows below.
  • Learn more: Gradient Checkpointing and Offloading Docs
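A sketch of the two options above; gradient_checkpointing is a standard key, while activation_offloading is an assumption to verify in the linked docs:

    gradient_checkpointing: true  # recompute activations in the backward pass
    activation_offloading: true   # additionally offload saved activations to CPU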

Cut Cross Entropy (CCE)


Reduces VRAM usage by using an optimized cross-entropy loss calculation.
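CCE ships as an Axolotl plugin. A sketch following the integration docs; the plugin path and flag are assumptions to verify against your version:

    plugins:
      - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
    cut_cross_entropy: true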


Liger Kernels


Provides efficient Triton kernels to improve training speed and reduce memory usage.
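Liger is likewise enabled via a plugin plus per-kernel flags. A sketch under the assumption that these flag names match your installed version:

    plugins:
      - axolotl.integrations.liger.LigerPlugin
    liger_rope: true
    liger_rms_norm: true
    liger_glu_activation: true
    liger_fused_linear_cross_entropy: true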


Long Context Models


Techniques to train models on sequences longer than their original context window.


RoPE Scaling


Extends a model’s context window by interpolating its Rotary Position Embeddings.

  • Config: pass a rope_scaling entry under overrides_of_model_config, as sketched below. To learn which RoPE scaling parameters a given model accepts, check that model’s configuration.
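For instance, a hypothetical Llama-style override doubling the context window via linear interpolation. The fields inside rope_scaling are model-specific assumptions; copy them from the base model’s config.json:

    overrides_of_model_config:
      rope_scaling:
        type: linear   # newer model configs may use `rope_type` instead
        factor: 2.0    # 2x the original context length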

Sequence Parallelism


Splits long sequences across multiple GPUs, enabling training with sequence lengths that would not fit on a single device.
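A sketch splitting each sequence across 4 GPUs; sequence_parallel_degree follows the sequence parallelism docs but should be verified for your version:

    sequence_parallel_degree: 4  # number of GPUs each sequence is sharded across
    flash_attention: true        # sequence parallelism is typically paired with flash attention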


Arctic Long Sequence Training (ALST)


ALST is a recipe that combines several techniques to train long-context models efficiently. It typically involves:

  • TiledMLP to reduce memory usage in MLP layers.
  • Tiled loss functions (such as CCE).
  • Activation offloading to CPU.
  • Example: ALST Example Configuration (see also the sketch after this list)
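A rough sketch combining the pieces above. Every option name here is an assumption, so defer to the linked ALST example configuration for the authoritative keys:

    tiled_mlp: true               # assumed flag for TiledMLP
    plugins:
      - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
    cut_cross_entropy: true       # tiled loss
    activation_offloading: true   # offload activations to CPU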

Large Models (Distributed Training)


To train models that don’t fit on a single GPU, you’ll need to use a distributed training strategy like FSDP or DeepSpeed. These frameworks shard the model weights, gradients, and optimizer states across multiple GPUs and nodes.
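Either framework is selected from the YAML config. A sketch showing both styles; the FSDP keys follow the multi-GPU docs and may differ by version, and the DeepSpeed path refers to the sample JSON configs shipped in the Axolotl repo:

    # Option 1: FSDP (v2)
    fsdp_version: 2
    fsdp_config:
      offload_params: false
      state_dict_type: FULL_STATE_DICT
      auto_wrap_policy: TRANSFORMER_BASED_WRAP

    # Option 2: DeepSpeed ZeRO-3
    deepspeed: deepspeed_configs/zero3_bf16.json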


N-D Parallelism (Beta)


For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence Parallelism). This is a powerful approach to train an extremely large model by overcoming multiple bottlenecks at once.
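A sketch composing sharded data, tensor, and context parallelism across 8 GPUs (2 x 2 x 2). These keys follow the N-D parallelism docs but, this being a beta feature, treat them as assumptions:

    dp_shard_size: 2          # sharded data-parallel groups
    tensor_parallel_size: 2   # shard weights within each group
    context_parallel_size: 2  # shard sequences within each group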


Quantization


Techniques to reduce the precision of model weights for memory savings.


4-bit Training (QLoRA)


The recommended approach for quantization-based training. It loads the base model in 4-bit using bitsandbytes and then trains QLoRA adapters. See Adapter Finetuning for details.


FP8 Training


Enables training with 8-bit floating point precision on supported hardware (e.g., NVIDIA Hopper series GPUs) for significant speed and memory gains.
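A sketch based on the FP8 docs; fp8 is the assumed top-level flag, and torch.compile is usually recommended alongside it to recover kernel-fusion benefits:

    fp8: true            # cast matmuls to FP8 on supported hardware
    torch_compile: true  # recommended for best FP8 throughput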


Quantization Aware Training (QAT)


Simulates quantization effects during training, helping the model adapt and potentially improving the final accuracy of the quantized model.
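A sketch of a QAT block; the field names mirror the QAT docs and torchao conventions, but treat them as assumptions to verify:

    qat:
      weight_dtype: int4      # fake-quantized weight precision (int8 also supported)
      activation_dtype: int8  # fake-quantized activation precision
      group_size: 32          # quantization group size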


GPTQ


Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.
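A sketch of pairing a pre-quantized GPTQ checkpoint with a LoRA adapter. Both the gptq flag and the model name are assumptions for illustration; verify the exact switch in the config reference:

    base_model: TheBloke/Llama-2-7B-GPTQ  # hypothetical pre-quantized checkpoint
    gptq: true                            # assumed flag for GPTQ-quantized bases
    adapter: lora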

diff --git a/docs/optimizers.html b/docs/optimizers.html
index b840b7b2c..6bf12db6f 100644
[This and all remaining per-page hunks contained only stripped <link>/<script> markup from the rebuild and are omitted.]