From 81cb0968fedf0a858aafc13cde935111c54f655d Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Tue, 2 Sep 2025 16:14:17 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- FAQS.html | 6 + docs/amd_hpc.html | 6 + docs/api/cli.args.html | 8 +- docs/api/cli.art.html | 6 + docs/api/cli.checks.html | 6 + docs/api/cli.cloud.base.html | 6 + docs/api/cli.cloud.modal_.html | 6 + docs/api/cli.config.html | 6 + docs/api/cli.delinearize_llama4.html | 6 + docs/api/cli.evaluate.html | 6 + docs/api/cli.inference.html | 6 + docs/api/cli.main.html | 6 + docs/api/cli.merge_lora.html | 6 + docs/api/cli.merge_sharded_fsdp_weights.html | 6 + docs/api/cli.preprocess.html | 6 + docs/api/cli.quantize.html | 6 + docs/api/cli.train.html | 6 + docs/api/cli.utils.args.html | 6 + docs/api/cli.utils.fetch.html | 6 + docs/api/cli.utils.html | 6 + docs/api/cli.utils.load.html | 6 + docs/api/cli.utils.sweeps.html | 6 + docs/api/cli.utils.train.html | 6 + docs/api/cli.vllm_serve.html | 6 + docs/api/common.architectures.html | 6 + docs/api/common.const.html | 6 + docs/api/common.datasets.html | 6 + docs/api/convert.html | 6 + docs/api/core.builders.base.html | 6 + docs/api/core.builders.causal.html | 6 + docs/api/core.builders.rl.html | 6 + docs/api/core.chat.format.chatml.html | 6 + docs/api/core.chat.format.llama3x.html | 6 + docs/api/core.chat.format.shared.html | 6 + docs/api/core.chat.messages.html | 6 + docs/api/core.datasets.chat.html | 6 + ...core.datasets.transforms.chat_builder.html | 6 + docs/api/core.trainers.base.html | 6 + docs/api/core.trainers.dpo.trainer.html | 6 + docs/api/core.trainers.grpo.sampler.html | 6 + docs/api/core.trainers.grpo.trainer.html | 6 + docs/api/core.trainers.mamba.html | 6 + docs/api/core.trainers.mixins.optimizer.html | 6 + ...core.trainers.mixins.rng_state_loader.html | 6 + docs/api/core.trainers.mixins.scheduler.html | 6 + docs/api/core.trainers.trl.html | 6 + docs/api/core.trainers.utils.html | 6 + 
docs/api/core.training_args.html | 6 + docs/api/datasets.html | 79 +- docs/api/evaluate.html | 6 + docs/api/index.html | 12 +- docs/api/integrations.base.html | 6 + .../integrations.cut_cross_entropy.args.html | 6 + docs/api/integrations.grokfast.optimizer.html | 6 + docs/api/integrations.kd.trainer.html | 6 + docs/api/integrations.liger.args.html | 6 + docs/api/integrations.lm_eval.args.html | 6 + docs/api/integrations.spectrum.args.html | 6 + docs/api/kernels.geglu.html | 6 + docs/api/kernels.lora.html | 6 + docs/api/kernels.quantize.html | 6 + docs/api/kernels.swiglu.html | 6 + docs/api/kernels.utils.html | 6 + docs/api/loaders.adapter.html | 6 + docs/api/loaders.constants.html | 6 + docs/api/loaders.model.html | 6 + docs/api/loaders.patch_manager.html | 6 + docs/api/loaders.processor.html | 6 + docs/api/loaders.tokenizer.html | 6 + docs/api/logging_config.html | 6 + docs/api/models.mamba.modeling_mamba.html | 6 + .../monkeypatch.btlm_attn_hijack_flash.html | 6 + ...onkeypatch.data.batch_dataset_fetcher.html | 6 + ...ch.gradient_checkpointing.offload_cpu.html | 6 + ...h.gradient_checkpointing.offload_disk.html | 6 + .../monkeypatch.llama_attn_hijack_flash.html | 6 + ...onkeypatch.llama_attn_hijack_xformers.html | 6 + docs/api/monkeypatch.llama_expand_mask.html | 6 + .../monkeypatch.llama_patch_multipack.html | 6 + docs/api/monkeypatch.lora_kernels.html | 6 + ...monkeypatch.mistral_attn_hijack_flash.html | 6 + docs/api/monkeypatch.mixtral.html | 6 + docs/api/monkeypatch.multipack.html | 6 + docs/api/monkeypatch.relora.html | 6 + ...onkeypatch.stablelm_attn_hijack_flash.html | 6 + docs/api/monkeypatch.trainer_fsdp_optim.html | 6 + .../monkeypatch.transformers_fa_utils.html | 6 + docs/api/monkeypatch.unsloth_.html | 6 + docs/api/monkeypatch.utils.html | 6 + docs/api/prompt_strategies.alpaca_chat.html | 6 + .../prompt_strategies.alpaca_instruct.html | 6 + .../prompt_strategies.alpaca_w_system.html | 6 + docs/api/prompt_strategies.base.html | 6 + 
...rompt_strategies.bradley_terry.llama3.html | 6 + docs/api/prompt_strategies.chat_template.html | 6 + docs/api/prompt_strategies.completion.html | 6 + .../prompt_strategies.dpo.chat_template.html | 6 + docs/api/prompt_strategies.dpo.chatml.html | 6 + docs/api/prompt_strategies.dpo.llama3.html | 6 + .../prompt_strategies.dpo.passthrough.html | 6 + .../prompt_strategies.dpo.user_defined.html | 6 + docs/api/prompt_strategies.dpo.zephyr.html | 6 + docs/api/prompt_strategies.input_output.html | 6 + docs/api/prompt_strategies.kto.chatml.html | 6 + docs/api/prompt_strategies.kto.llama3.html | 6 + .../prompt_strategies.kto.user_defined.html | 6 + docs/api/prompt_strategies.llama2_chat.html | 6 + docs/api/prompt_strategies.messages.chat.html | 6 + docs/api/prompt_strategies.metharme.html | 6 + docs/api/prompt_strategies.orcamini.html | 6 + .../prompt_strategies.orpo.chat_template.html | 6 + docs/api/prompt_strategies.pygmalion.html | 6 + ...prompt_strategies.stepwise_supervised.html | 6 + docs/api/prompt_strategies.user_defined.html | 6 + docs/api/prompt_tokenizers.html | 6 + docs/api/train.html | 6 + docs/api/utils.bench.html | 6 + docs/api/utils.callbacks.comet_.html | 6 + docs/api/utils.callbacks.lisa.html | 6 + docs/api/utils.callbacks.mlflow_.html | 6 + docs/api/utils.callbacks.perplexity.html | 6 + docs/api/utils.callbacks.profiler.html | 6 + docs/api/utils.callbacks.qat.html | 6 + docs/api/utils.chat_templates.html | 6 + docs/api/utils.collators.batching.html | 6 + docs/api/utils.collators.core.html | 6 + docs/api/utils.collators.mamba.html | 6 + docs/api/utils.collators.mm_chat.html | 6 + .../utils.ctx_managers.sequence_parallel.html | 6 + docs/api/utils.data.sft.html | 27 +- ...raining.html => utils.data.streaming.html} | 18 +- docs/api/utils.dict.html | 6 + docs/api/utils.distributed.html | 6 + docs/api/utils.freeze.html | 6 + docs/api/utils.lora.html | 6 + docs/api/utils.model_shard_quant.html | 6 + docs/api/utils.optimizers.adopt.html | 6 + 
docs/api/utils.quantization.html | 6 + docs/api/utils.samplers.multipack.html | 6 + docs/api/utils.schedulers.html | 6 + docs/api/utils.schemas.config.html | 6 + docs/api/utils.schemas.datasets.html | 6 + docs/api/utils.schemas.enums.html | 6 + docs/api/utils.schemas.integrations.html | 6 + docs/api/utils.schemas.model.html | 6 + docs/api/utils.schemas.multimodal.html | 6 + docs/api/utils.schemas.peft.html | 6 + docs/api/utils.schemas.training.html | 6 + docs/api/utils.schemas.trl.html | 6 + docs/api/utils.schemas.utils.html | 6 + docs/api/utils.tokenization.html | 6 + docs/api/utils.trainer.html | 6 + docs/batch_vs_grad.html | 6 + docs/cli.html | 6 + docs/config-reference.html | 1143 ++-- docs/custom_integrations.html | 6 + docs/dataset-formats/conversation.html | 6 + docs/dataset-formats/index.html | 6 + docs/dataset-formats/inst_tune.html | 6 + docs/dataset-formats/pretraining.html | 6 + docs/dataset-formats/stepwise_supervised.html | 6 + docs/dataset-formats/template_free.html | 6 + docs/dataset-formats/tokenized.html | 6 + docs/dataset_loading.html | 6 + docs/dataset_preprocessing.html | 6 + docs/debugging.html | 6 + docs/docker.html | 6 + docs/faq.html | 6 + docs/fsdp_qlora.html | 6 + docs/getting-started.html | 6 + docs/gradient_checkpointing.html | 6 + docs/inference.html | 6 + docs/input_output.html | 6 + docs/installation.html | 6 + docs/lora_optims.html | 6 + docs/lr_groups.html | 6 + docs/mac.html | 6 + docs/mixed_precision.html | 6 + docs/multi-gpu.html | 6 + docs/multi-node.html | 6 + docs/multimodal.html | 6 + docs/multipack.html | 6 + docs/nccl.html | 6 + docs/nd_parallelism.html | 6 + docs/optimizers.html | 6 + docs/qat.html | 6 + docs/quantize.html | 6 + docs/ray-integration.html | 6 + docs/reward_modelling.html | 6 + docs/rlhf.html | 6 + docs/sequence_parallelism.html | 6 + docs/streaming.html | 1073 ++++ docs/torchao.html | 6 + docs/unsloth.html | 6 + .../colab-axolotl-example.html | 6 + index.html | 6 + search.json | 4746 +++++++++-------- 
sitemap.xml | 1486 +++--- src/axolotl/integrations/LICENSE.html | 6 + .../cut_cross_entropy/ACKNOWLEDGEMENTS.html | 6 + 201 files changed, 6006 insertions(+), 3734 deletions(-) rename docs/api/{utils.data.pretraining.html => utils.data.streaming.html} (98%) create mode 100644 docs/streaming.html diff --git a/.nojekyll b/.nojekyll index f4b6a59e4..b6837083c 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -f4b50a99 \ No newline at end of file +c8f78714 \ No newline at end of file diff --git a/FAQS.html b/FAQS.html index eadebd0da..3aa04ba1c 100644 --- a/FAQS.html +++ b/FAQS.html @@ -343,6 +343,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -511,7 +516,10 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});

datasets

datasets

-

Module containing Dataset functionality

+

Module containing dataset functionality.

+

We want this to be a wrapper for an existing dataset that we have loaded. Let’s use the +concept of middleware to wrap each dataset. We’ll use the collators later on to pad the +datasets.

Classes

@@ -523,72 +531,23 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); - - - -
ConstantLengthDatasetIterable dataset that returns constant length chunks of tokens from stream of
TokenizedPromptDataset Dataset that returns tokenized prompts from a stream of text files.
-
-

ConstantLengthDataset

-
datasets.ConstantLengthDataset(tokenizer, datasets, seq_length=2048)
-

Iterable dataset that returns constant length chunks of tokens from stream of -text files.

-
-

Parameters

- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
NameTypeDescriptionDefault
tokenizerThe processor used for processing the data.required
datasetDataset with text files.required
seq_lengthLength of token sequences to return.2048
-
-

TokenizedPromptDataset

-
datasets.TokenizedPromptDataset(
-    prompt_tokenizer,
-    dataset,
-    process_count=None,
-    keep_in_memory=False,
-    **kwargs,
-)
+
datasets.TokenizedPromptDataset(
+    prompt_tokenizer,
+    dataset,
+    process_count=None,
+    keep_in_memory=False,
+    **kwargs,
+)

Dataset that returns tokenized prompts from a stream of text files.

-
-

Parameters

+
+

Parameters

diff --git a/docs/api/evaluate.html b/docs/api/evaluate.html index dd22b9949..cc662120b 100644 --- a/docs/api/evaluate.html +++ b/docs/api/evaluate.html @@ -378,6 +378,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + - + @@ -1020,8 +1026,8 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); - - + + diff --git a/docs/api/integrations.base.html b/docs/api/integrations.base.html index b040a4d67..c412e1f99 100644 --- a/docs/api/integrations.base.html +++ b/docs/api/integrations.base.html @@ -378,6 +378,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
datasetsModule containing Dataset functionalityModule containing dataset functionality.
convertCopied from https://github.com/iShohei220/adopt
utils.data.pretrainingdata handling specific to pretrainingutils.data.streamingData handling specific to streaming datasets.
utils.data.sft

prepare_datasets

-
utils.data.sft.prepare_datasets(
-    cfg,
-    tokenizer,
-    processor=None,
-    preprocess_iterable=False,
-)
+
utils.data.sft.prepare_datasets(cfg, tokenizer, processor=None)

Prepare training and evaluation datasets based on configuration.

Parameters

----++++ @@ -572,12 +573,6 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); - - - - - -
Optional processor for multimodal datasets. None
preprocess_iterableboolWhether to use iterable preprocessing.False
diff --git a/docs/api/utils.data.pretraining.html b/docs/api/utils.data.streaming.html similarity index 98% rename from docs/api/utils.data.pretraining.html rename to docs/api/utils.data.streaming.html index 27de29126..159f22b7a 100644 --- a/docs/api/utils.data.pretraining.html +++ b/docs/api/utils.data.streaming.html @@ -7,7 +7,7 @@ -utils.data.pretraining – Axolotl +utils.data.streaming – Axolotl + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ +
+
+

Streaming Datasets

+
+ +
+
+ How to use streaming mode for large-scale datasets and memory-efficient training +
+
+ + +
+ + + + +
+ + + +
+ + +

Streaming enables memory-efficient training with large datasets by loading data +incrementally rather than loading the entire dataset into memory at once.

+

Use streaming when:

+
    +
  • Your dataset is too large to fit in memory (e.g. when you’re doing pretraining with massive text corpora)
  • +
  • You want to start training immediately without preprocessing the entire dataset
  • +
+

Streaming works with both remote and locally stored datasets!

+
+
+
+ +
+
+Note +
+
+
+

Streaming currently only supports a single dataset. Multi-dataset support will be added soon.

+
+
+
+

Configuration

+
+

Basic Streaming

+

Enable streaming mode by setting the streaming flag:

+
streaming: true
+
+
+

Pretraining with Streaming

+

For pretraining tasks, streaming is automatically enabled when using pretraining_dataset:

+
pretraining_dataset:
+  - path: HuggingFaceFW/fineweb-edu
+    type: pretrain
+    text_column: text
+    split: train
+
+# Optionally, enable sample packing
+streaming_multipack_buffer_size: 10000
+sample_packing: true
+
+
+

SFT with Streaming

+

For supervised fine-tuning with streaming:

+
streaming: true
+datasets:
+  - path: tatsu-lab/alpaca
+    type: alpaca
+    split: train
+
+# Optionally, enable sample packing
+streaming_multipack_buffer_size: 10000
+sample_packing: true
+
+
+
+

Configuration Options

+
+

streaming_multipack_buffer_size

+

Controls the buffer size for multipack streaming (default: 10,000). This determines how +many samples are buffered before packing. Larger buffers can improve packing efficiency +but use more memory.

+
+
+

shuffle_merged_datasets

+

When enabled, shuffles the streaming dataset using the buffer. This requires additional +memory for the shuffle buffer.

+
+
+
+

Sample Packing with Streaming

+

Sample packing is supported for streaming datasets. When enabled, multiple samples are +packed into a single sequence to maximize GPU utilization:

+
sample_packing: true
+streaming_multipack_buffer_size: 10000
+
+# For SFT: attention is automatically isolated between packed samples
+# For pretraining: control with pretrain_multipack_attn
+pretrain_multipack_attn: true  # prevent cross-attention between packed samples
+

For more information, see our documentation on multipacking.

+
+
+

Important Considerations

+
+

Memory Usage

+

While streaming reduces memory usage compared to loading entire datasets, you still need +to consider:

+
    +
  • The memory used by the multipack buffer, which you can control by adjusting streaming_multipack_buffer_size
  • +
  • Sample packing requires buffering multiple samples
  • +
  • Shuffling requires additional memory for the shuffle buffer
  • +
+
+
+

Performance

+
    +
  • Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on the fly
  • +
  • Network speed is important when streaming from remote sources, and disk read speed is important when streaming a local dataset
  • +
  • Consider using axolotl preprocess for smaller or more frequently used datasets
  • +
+
+
+

Evaluation Datasets

+

Evaluation datasets are not streamed to ensure consistent evaluation metrics. They’re +loaded normally even when training uses streaming.

+
+
+
+

Examples

+

See the examples/streaming/ directory for complete configuration examples:

+
    +
  • pretrain.yaml: Pretraining with streaming dataset
  • +
  • sft.yaml: Supervised fine-tuning with streaming
  • +
+ + +
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/torchao.html b/docs/torchao.html index 673e65d98..c6ab20338 100644 --- a/docs/torchao.html +++ b/docs/torchao.html @@ -379,6 +379,12 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); Dataset Preprocessing + + + + + + +