diff --git a/.nojekyll b/.nojekyll index 215cc2dcb..559d19c5a 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -f6922c8f \ No newline at end of file +d83980d7 \ No newline at end of file diff --git a/FAQS.html b/FAQS.html index c9a9e4e0f..575c4e1f2 100644 --- a/FAQS.html +++ b/FAQS.html @@ -35,10 +35,10 @@ ul.task-list li input[type="checkbox"] { - + - + - + - + - + - + - + - + - + - + - + - + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + +
+ +
+ + +
+ + + +
+ +
+
+

Custom Integrations

+
+ + + +
+ + + + +
+ + + +
+ + +

Axolotl adds custom features through integrations. They are located within the src/axolotl/integrations directory.

+

To enable them, please check their respective documentation.

+
+

Cut Cross Entropy

+

Cut Cross Entropy reduces VRAM usage by optimizing the cross-entropy operation during loss calculation.

+

See https://github.com/apple/ml-cross-entropy

+
+

Usage

+
plugins:
+  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
+
+cut_cross_entropy: true
+
+
+

Citation

+
@article{wijmans2024cut,
+  author       = {Erik Wijmans and
+                  Brody Huval and
+                  Alexander Hertzberg and
+                  Vladlen Koltun and
+                  Philipp Kr\"ahenb\"uhl},
+  title        = {Cut Your Losses in Large-Vocabulary Language Models},
+  journal      = {arXiv},
+  year         = {2024},
+  url          = {https://arxiv.org/abs/2411.09009},
+}
+

Please see reference here

+
+
+
+

Grokfast

+

See https://github.com/ironjr/grokfast

+
+

Usage

+
plugins:
+  - axolotl.integrations.grokfast.GrokfastPlugin
+
+grokfast_alpha: 2.0
+grokfast_lamb: 0.98
+
+
+

Citation

+
@article{lee2024grokfast,
+    title={{Grokfast}: Accelerated Grokking by Amplifying Slow Gradients},
+    author={Lee, Jaerin and Kang, Bong Gyun and Kim, Kihoon and Lee, Kyoung Mu},
+    journal={arXiv preprint arXiv:2405.20233},
+    year={2024}
+}
+

Please see reference here

+
+
+
+

Knowledge Distillation (KD)

+
+

Usage

+
plugins:
+  - "axolotl.integrations.kd.KDPlugin"
+
+kd_trainer: True
+kd_ce_alpha: 0.1
+kd_alpha: 0.9
+kd_temperature: 1.0
+
+torch_compile: True  # torch>=2.5.1, recommended to reduce vram
+
+datasets:
+  - path: ...
+    type: "axolotl.integrations.kd.chat_template"
+    field_messages: "messages_combined"
+    logprobs_field: "llm_text_generation_vllm_logprobs"  # for kd only, field of logprobs
+

An example dataset can be found at axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample

+

Please see reference here

+
+
+
+

Liger Kernels

+

Liger Kernel provides efficient Triton kernels for LLM training, offering:

+
    +
  • 20% increase in multi-GPU training throughput
  • 60% reduction in memory usage
  • Compatibility with both FSDP and DeepSpeed
+

See https://github.com/linkedin/Liger-Kernel

+
+

Usage

+
plugins:
+  - axolotl.integrations.liger.LigerPlugin
+liger_rope: true
+liger_rms_norm: true
+liger_glu_activation: true
+liger_layer_norm: true
+liger_fused_linear_cross_entropy: true
+
+
+

Citation

+
@article{hsu2024ligerkernelefficienttriton,
+      title={Liger Kernel: Efficient Triton Kernels for LLM Training},
+      author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
+      year={2024},
+      eprint={2410.10989},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2410.10989},
+      journal={arXiv preprint arXiv:2410.10989},
+}
+

Please see reference here

+
+
+
+

Language Model Evaluation Harness (LM Eval)

+

Run evaluation on a model using the popular lm-evaluation-harness library.

+

See https://github.com/EleutherAI/lm-evaluation-harness

+
+

Usage

+
plugins:
+  - axolotl.integrations.lm_eval.LMEvalPlugin
+
+lm_eval_tasks:
+  - gsm8k
+  - hellaswag
+  - arc_easy
+
+lm_eval_batch_size: # Batch size for evaluation
+output_dir: # Directory to save evaluation results
+
+
+

Citation

+
@misc{eval-harness,
+  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
+  title        = {A framework for few-shot language model evaluation},
+  month        = 07,
+  year         = 2024,
+  publisher    = {Zenodo},
+  version      = {v0.4.3},
+  doi          = {10.5281/zenodo.12608602},
+  url          = {https://zenodo.org/records/12608602}
+}
+

Please see reference here

+
+
+
+

Spectrum

+

by Eric Hartford, Lucas Atkins, Fernando Fernandes Neto, David Golchinfar

+

This plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).

+

See https://github.com/cognitivecomputations/spectrum

+
+

Overview

+

Spectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models. By identifying the top n% of layers with the highest SNR, you can optimize training efficiency.

+
+
+

Usage

+
plugins:
+  - axolotl.integrations.spectrum.SpectrumPlugin
+
+spectrum_top_fraction: 0.5
+spectrum_model_name: meta-llama/Meta-Llama-3.1-8B
+
+
+

Citation

+
@misc{hartford2024spectrumtargetedtrainingsignal,
+      title={Spectrum: Targeted Training on Signal to Noise Ratio},
+      author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},
+      year={2024},
+      eprint={2406.06623},
+      archivePrefix={arXiv},
+      primaryClass={cs.LG},
+      url={https://arxiv.org/abs/2406.06623},
+}
+

Please see reference here

+ + +
+
+ +
+ +
+ + + + + \ No newline at end of file diff --git a/docs/dataset-formats/conversation.html b/docs/dataset-formats/conversation.html index b10a6d2b0..134637aa4 100644 --- a/docs/dataset-formats/conversation.html +++ b/docs/dataset-formats/conversation.html @@ -70,10 +70,10 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + + diff --git a/docs/input_output.html b/docs/input_output.html index d989ed573..307accd2a 100644 --- a/docs/input_output.html +++ b/docs/input_output.html @@ -21,40 +21,6 @@ ul.task-list li input[type="checkbox"] { margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ vertical-align: middle; } -/* CSS for syntax highlighting */ -pre > code.sourceCode { white-space: pre; position: relative; } -pre > code.sourceCode > span { line-height: 1.25; } -pre > code.sourceCode > span:empty { height: 1.2em; } -.sourceCode { overflow: visible; } -code.sourceCode > span { color: inherit; text-decoration: inherit; } -div.sourceCode { margin: 1em 0; } -pre.sourceCode { margin: 0; } -@media screen { -div.sourceCode { overflow: auto; } -} -@media print { -pre > code.sourceCode { white-space: pre-wrap; } -pre > code.sourceCode > span { display: inline-block; text-indent: -5em; padding-left: 5em; } -} -pre.numberSource code - { counter-reset: source-line 0; } -pre.numberSource code > span - { position: relative; left: -4em; counter-increment: source-line; } -pre.numberSource code > span > a:first-child::before - { content: counter(source-line); - position: relative; left: -1em; text-align: right; vertical-align: baseline; - border: none; display: inline-block; - -webkit-touch-callout: none; -webkit-user-select: none; - -khtml-user-select: none; -moz-user-select: none; - -ms-user-select: none; user-select: none; - padding: 0 4px; width: 4em; - } -pre.numberSource { margin-left: 
3em; padding-left: 4px; } -div.sourceCode - { } -@media screen { -pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } -} @@ -70,10 +36,10 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin - + - + - + - + + diff --git a/docs/lora_optims.html b/docs/lora_optims.html index 1de81d066..858c1e694 100644 --- a/docs/lora_optims.html +++ b/docs/lora_optims.html @@ -70,10 +70,10 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin - + - + - + - + - + - + - + - + diff --git a/docs/multi-node.html b/docs/multi-node.html index 8b27412aa..928548c93 100644 --- a/docs/multi-node.html +++ b/docs/multi-node.html @@ -70,10 +70,10 @@ pre > code.sourceCode > span > a:first-child::before { text-decoration: underlin - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - + - +