From 333ca134a290cc8eb8595c9a8eee08631a54b78d Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Fri, 17 Oct 2025 10:06:27 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- .../prompt_strategies.dpo.chat_template.html | 145 +++++- docs/rlhf.html | 442 +++++++++--------- search.json | 11 +- sitemap.xml | 398 ++++++++-------- 5 files changed, 581 insertions(+), 417 deletions(-) diff --git a/.nojekyll b/.nojekyll index 33021b61e..b46f4f982 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -4d136695 \ No newline at end of file +525f6d2a \ No newline at end of file diff --git a/docs/api/prompt_strategies.dpo.chat_template.html b/docs/api/prompt_strategies.dpo.chat_template.html index 895cc84a2..d4b93c442 100644 --- a/docs/api/prompt_strategies.dpo.chat_template.html +++ b/docs/api/prompt_strategies.dpo.chat_template.html @@ -20,6 +20,41 @@ ul.task-list li input[type="checkbox"] { margin: 0 0.8em 0.2em -1em; /* quarto-specific, see https://github.com/quarto-dev/quarto-cli/issues/4556 */ vertical-align: middle; } +/* CSS for syntax highlighting */ +html { -webkit-text-size-adjust: 100%; } +pre > code.sourceCode { white-space: pre; position: relative; } +pre > code.sourceCode > span { display: inline-block; line-height: 1.25; } +pre > code.sourceCode > span:empty { height: 1.2em; } +.sourceCode { overflow: visible; } +code.sourceCode > span { color: inherit; text-decoration: inherit; } +div.sourceCode { margin: 1em 0; } +pre.sourceCode { margin: 0; } +@media screen { +div.sourceCode { overflow: auto; } +} +@media print { +pre > code.sourceCode { white-space: pre-wrap; } +pre > code.sourceCode > span { text-indent: -5em; padding-left: 5em; } +} +pre.numberSource code + { counter-reset: source-line 0; } +pre.numberSource code > span + { position: relative; left: -4em; counter-increment: source-line; } +pre.numberSource code > span > a:first-child::before + { content: counter(source-line); + position: relative; left: -1em; text-align: right; 
vertical-align: baseline; + border: none; display: inline-block; + -webkit-touch-callout: none; -webkit-user-select: none; + -khtml-user-select: none; -moz-user-select: none; + -ms-user-select: none; user-select: none; + padding: 0 4px; width: 4em; + } +pre.numberSource { margin-left: 3em; padding-left: 4px; } +div.sourceCode + { } +@media screen { +pre > code.sourceCode > span > a:first-child::before { text-decoration: underline; } +} @@ -474,7 +509,13 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});

On this page

@@ -488,8 +529,110 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});

prompt_strategies.dpo.chat_template

prompt_strategies.dpo.chat_template

DPO prompt strategies for using tokenizer chat templates.

+
+

Functions

+ + + + + + + + + + + + + +
NameDescription
argilla_chatDPO chat template strategy for argilla-style datasets.
+
+

argilla_chat

+
prompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)
+

DPO chat template strategy for argilla-style datasets.

+

For argilla-style datasets where chosen/rejected contain full conversations +instead of single response messages. Extracts the conversation history from +the chosen field and formats both chosen/rejected responses using the +configured chat template.

+
+

Parameters

+ ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
NameTypeDescriptionDefault
cfgConfiguration object containing chat_template and dataset settingsrequired
dataset_idxIndex of the dataset in the config (default: 0)0
**kwargsAdditional keyword arguments (unused){}
+
+
+

Returns

+ +++++ + + + + + + + + + + + + + + +
NameTypeDescription
tuple(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop
+
+
+

Dataset format

+

{ +"chosen": [ +{"role": "user", "content": "..."}, +{"role": "assistant", "content": "..."} +], +"rejected": [ +{"role": "user", "content": "..."}, +{"role": "assistant", "content": "..."} +] +}

+
+
+
diff --git a/docs/rlhf.html b/docs/rlhf.html index 7a0cfc747..fe92247ef 100644 --- a/docs/rlhf.html +++ b/docs/rlhf.html @@ -528,6 +528,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
  • llama3.prompt_pairs
  • llama3.ultra
  • zephyr.nectar
  • +
  • chat_template.argilla_chat
  • chat_template.default
  • user_defined.default
  • @@ -787,228 +788,241 @@ Tip ] } +
    +

    chat_template.argilla_chat

    +
    {
    +    "chosen": [
    +        {"role": "user", "content": "..."},
    +        {"role": "assistant", "content": "..."}
    +    ],
    +    "rejected": [
    +        {"role": "user", "content": "..."},
    +        {"role": "assistant", "content": "..."}
    +    ]
    +}
    +

    chat_template.default

    -
    rl: dpo
    -datasets:
    -  - path: ...
    -    split: train
    -    type: chat_template.default
    -    field_messages: "messages"
    -    field_chosen: "chosen"
    -    field_rejected: "rejected"
    -    message_property_mappings:
    -      role: role
    -      content: content
    -    roles:
    -      user: ["user"]
    -      assistant: ["assistant"]
    -      system: ["system"]
    +
    rl: dpo
    +datasets:
    +  - path: ...
    +    split: train
    +    type: chat_template.default
    +    field_messages: "messages"
    +    field_chosen: "chosen"
    +    field_rejected: "rejected"
    +    message_property_mappings:
    +      role: role
    +      content: content
    +    roles:
    +      user: ["user"]
    +      assistant: ["assistant"]
    +      system: ["system"]

    Sample input format:

    -
    {
    -    "messages": [
    -        {
    -            "role": "system",
    -            "content": "..."
    -        },
    -        {
    -            "role": "user",
    -            "content": "..."
    -        },
    -        // ... more messages
    -    ],
    -    "chosen": {
    -        "role": "assistant",
    -        "content": "..."
    -    },
    -    "rejected": {
    -        "role": "assistant",
    -        "content": "..."
    -    }
    -}
    +
    {
    +    "messages": [
    +        {
    +            "role": "system",
    +            "content": "..."
    +        },
    +        {
    +            "role": "user",
    +            "content": "..."
    +        },
    +        // ... more messages
    +    ],
    +    "chosen": {
    +        "role": "assistant",
    +        "content": "..."
    +    },
    +    "rejected": {
    +        "role": "assistant",
    +        "content": "..."
    +    }
    +}

    user_defined.default

    For custom behaviors,

    -
    rl: dpo
    -datasets:
    -  - path: ...
    -    split: train
    -    type:
    -      field_prompt: "prompt"
    -      field_system: "system"
    -      field_chosen: "chosen"
    -      field_rejected: "rejected"
    -      prompt_format: "{prompt}"
    -      chosen_format: "{chosen}"
    -      rejected_format: "{rejected}"
    +
    rl: dpo
    +datasets:
    +  - path: ...
    +    split: train
    +    type:
    +      field_prompt: "prompt"
    +      field_system: "system"
    +      field_chosen: "chosen"
    +      field_rejected: "rejected"
    +      prompt_format: "{prompt}"
    +      chosen_format: "{chosen}"
    +      rejected_format: "{rejected}"

    The input format is a simple JSON input with customizable fields based on the above config.

    -
    {
    -    "system": "...",  // optional
    -    "prompt": "...",
    -    "chosen": "...",
    -    "rejected": "..."
    -}
    +
    {
    +    "system": "...",  // optional
    +    "prompt": "...",
    +    "chosen": "...",
    +    "rejected": "..."
    +}

    IPO

    As IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.

    -
    rl: ipo
    +
    rl: ipo

    ORPO

    Paper: https://arxiv.org/abs/2403.07691

    -
    rl: orpo
    -orpo_alpha: 0.1
    -remove_unused_columns: false
    -
    -chat_template: chatml
    -datasets:
    -  - path: argilla/ultrafeedback-binarized-preferences-cleaned
    -    type: chat_template.argilla
    +
    rl: orpo
    +orpo_alpha: 0.1
    +remove_unused_columns: false
    +
    +chat_template: chatml
    +datasets:
    +  - path: argilla/ultrafeedback-binarized-preferences-cleaned
    +    type: chat_template.argilla

    ORPO supports the following types with the following dataset format:

    chat_template.argilla

    -
    {
    -    "system": "...",  // optional
    -    "prompt": "...",  // if available, will be taken as user message for single-turn instead of from list below
    -
    -    // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns
    -    "chosen": [
    -        {"role": "user", "content": "..."},
    -        {"role": "assistant", "content": "..."}
    -    ],
    -    "rejected": [
    -        {"role": "user", "content": "..."},
    -        {"role": "assistant", "content": "..."}
    -    ]
    -}
    +
    {
    +    "system": "...",  // optional
    +    "prompt": "...",  // if available, will be taken as user message for single-turn instead of from list below
    +
    +    // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns
    +    "chosen": [
    +        {"role": "user", "content": "..."},
    +        {"role": "assistant", "content": "..."}
    +    ],
    +    "rejected": [
    +        {"role": "user", "content": "..."},
    +        {"role": "assistant", "content": "..."}
    +    ]
    +}

    KTO

    -
    rl: kto
    -rl_beta: 0.1  # default
    -kto_desirable_weight: 1.0  # default
    -kto_undesirable_weight: 1.0  # default
    -
    -remove_unused_columns: false
    -
    -datasets:
    -  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    -    type: llama3.ultra
    -    split: train
    -
    -gradient_checkpointing: true
    -gradient_checkpointing_kwargs:
    -  use_reentrant: true
    +
    rl: kto
    +rl_beta: 0.1  # default
    +kto_desirable_weight: 1.0  # default
    +kto_undesirable_weight: 1.0  # default
    +
    +remove_unused_columns: false
    +
    +datasets:
    +  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    +    type: llama3.ultra
    +    split: train
    +
    +gradient_checkpointing: true
    +gradient_checkpointing_kwargs:
    +  use_reentrant: true

    KTO supports the following types with the following dataset format:

    chatml.argilla

    -
    {
    -    "system": "...", // optional
    -    "instruction": "...",
    -    "completion": "..."
    -}
    +
    {
    +    "system": "...", // optional
    +    "instruction": "...",
    +    "completion": "..."
    +}

    chatml.argilla_chat

    -
    {
    -    "chosen": [
    -        {"role": "user", "content": "..."}
    -    ],
    -    "completion": [
    -        {"role": "assistant", "content": "..."}
    -    ]
    -}
    +
    {
    +    "chosen": [
    +        {"role": "user", "content": "..."}
    +    ],
    +    "completion": [
    +        {"role": "assistant", "content": "..."}
    +    ]
    +}

    chatml.intel

    -
    {
    -    "system": "...", // optional
    -    "question": "...",
    -    "completion": "..."
    -}
    -
    -
    -

    chatml.prompt_pairs

    {
         "system": "...", // optional
    -    "prompt": "...",
    +    "question": "...",
         "completion": "..."
     }
    -
    -

    chatml.ultra

    +
    +

    chatml.prompt_pairs

    {
         "system": "...", // optional
         "prompt": "...",
         "completion": "..."
     }
    -
    -

    llama3.argilla

    +
    +

    chatml.ultra

    {
         "system": "...", // optional
    -    "instruction": "...",
    +    "prompt": "...",
         "completion": "..."
     }
    +
    +

    llama3.argilla

    +
    {
    +    "system": "...", // optional
    +    "instruction": "...",
    +    "completion": "..."
    +}
    +

    llama3.argilla_chat

    -
    {
    -    "completion": [
    -        {"role": "user", "content": "..."},
    -        {"role": "assistant", "content": "..."}
    -    ]
    -}
    +
    {
    +    "completion": [
    +        {"role": "user", "content": "..."},
    +        {"role": "assistant", "content": "..."}
    +    ]
    +}

    llama3.intel

    -
    {
    -    "system": "...", // optional
    -    "question": "...",
    -    "completion": "..."
    -}
    -
    -
    -

    llama3.prompt_pairs

    {
         "system": "...", // optional
    -    "prompt": "...",
    +    "question": "...",
         "completion": "..."
     }
    -
    -

    llama3.ultra

    +
    +

    llama3.prompt_pairs

    {
         "system": "...", // optional
         "prompt": "...",
         "completion": "..."
     }
    +
    +

    llama3.ultra

    +
    {
    +    "system": "...", // optional
    +    "prompt": "...",
    +    "completion": "..."
    +}
    +

    user_defined.default

    For custom behaviors,

    -
    rl: kto
    -datasets:
    -  - path: ...
    -    split: train
    -    type:
    -      field_prompt: "prompt"
    -      field_system: "system"
    -      field_completion: "completion"
    -      field_label: "label"
    -      prompt_format: "{prompt}"
    -      completion_format: "{completion}"
    +
    rl: kto
    +datasets:
    +  - path: ...
    +    split: train
    +    type:
    +      field_prompt: "prompt"
    +      field_system: "system"
    +      field_completion: "completion"
    +      field_label: "label"
    +      prompt_format: "{prompt}"
    +      completion_format: "{completion}"

    The input format is a simple JSON input with customizable fields based on the above config.

    -
    {
    -    "system": "...",  // optional
    -    "prompt": "...",
    -    "completion": "...",
    -    "label": "..."
    -}
    +
    {
    +    "system": "...",  // optional
    +    "prompt": "...",
    +    "completion": "...",
    +    "label": "..."
    +}
    @@ -1040,25 +1054,25 @@ Important

    Make sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].

    -
    base_model: Qwen/Qwen2.5-1.5B-Instruct
    -
    -vllm:
    -    host: 0.0.0.0
    -    port: 8000
    -    tensor_parallel_size: 2
    -    gpu_memory_utilization: 0.85
    -    dtype: auto
    -    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand
    -
    -rl: grpo
    -trl:
    -    use_vllm: true
    -    vllm_server_host: 0.0.0.0
    -    vllm_server_port: 8000
    -    vllm_server_timeout: 300
    -
    CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
    +
    base_model: Qwen/Qwen2.5-1.5B-Instruct
    +
    +vllm:
    +    host: 0.0.0.0
    +    port: 8000
    +    tensor_parallel_size: 2
    +    gpu_memory_utilization: 0.85
    +    dtype: auto
    +    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand
    +
    +rl: grpo
    +trl:
    +    use_vllm: true
    +    vllm_server_host: 0.0.0.0
    +    vllm_server_port: 8000
    +    vllm_server_timeout: 300
    +
    CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml

    Your vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. In another terminal, execute:

    -
    CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
    +
    CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
    @@ -1076,69 +1090,69 @@ Note

    Reward functions

    GRPO uses custom reward functions and transformations. Please have them ready locally.

    For example, to load OpenAI’s GSM8K and use a random reward for completions:

    -
    # rewards.py
    -import random
    -
    -def rand_reward_func(completions, **kwargs) -> list[float]:
    -    return [random.uniform(0, 1) for _ in completions]
    -
    -def oai_gsm8k_transform(cfg, *args, **kwargs):
    -    def transform_fn(example, tokenizer=None):
    -        label = example["answer"].split("####")[-1].strip().replace(",", "")
    -        return {
    -            "prompt": [{"role": "user", "content": example["question"]},],
    -            "answer": label,
    -        }
    -    return transform_fn, {"remove_columns": ["question"]}
    -
    rl: grpo
    -
    -trl:
    -    beta: 0.001
    -    max_completion_length: 256
    -    use_vllm: True
    -    num_generations: 4
    -    reward_funcs: ["rewards.rand_reward_func"]    # format: '{file_name}.{fn_name}'
    -    reward_weights: [1.0]
    -datasets:
    -  - path: openai/gsm8k
    -    name: main
    -    type: rewards.oai_gsm8k_transform  # format: '{file_name}.{fn_name}'
    +
    # rewards.py
    +import random
    +
    +def rand_reward_func(completions, **kwargs) -> list[float]:
    +    return [random.uniform(0, 1) for _ in completions]
    +
    +def oai_gsm8k_transform(cfg, *args, **kwargs):
    +    def transform_fn(example, tokenizer=None):
    +        label = example["answer"].split("####")[-1].strip().replace(",", "")
    +        return {
    +            "prompt": [{"role": "user", "content": example["question"]},],
    +            "answer": label,
    +        }
    +    return transform_fn, {"remove_columns": ["question"]}
    +
    rl: grpo
    +
    +trl:
    +    beta: 0.001
    +    max_completion_length: 256
    +    use_vllm: True
    +    num_generations: 4
    +    reward_funcs: ["rewards.rand_reward_func"]    # format: '{file_name}.{fn_name}'
    +    reward_weights: [1.0]
    +datasets:
    +  - path: openai/gsm8k
    +    name: main
    +    type: rewards.oai_gsm8k_transform  # format: '{file_name}.{fn_name}'

    To see other examples of custom reward functions, please see TRL GRPO Docs.

    To see all configs, please see TRLConfig.

    GRPO with DAPO/Dr. GRPO loss

    The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.

    -
    trl:
    -  loss_type: dr_grpo
    -  # Normalizes loss based on max completion length (default: 256)
    -  max_completion_length:
    +
    trl:
    +  loss_type: dr_grpo
    +  # Normalizes loss based on max completion length (default: 256)
    +  max_completion_length:

    For more information, see GRPO docs.

    SimPO

    SimPO uses CPOTrainer but with an alternative loss function.

    -
    rl: simpo
    -rl_beta: 0.1  # default in CPOTrainer
    -cpo_alpha: 1.0  # default in CPOTrainer
    -simpo_gamma: 0.5  # default in CPOTrainer
    +
    rl: simpo
    +rl_beta: 0.1  # default in CPOTrainer
    +cpo_alpha: 1.0  # default in CPOTrainer
    +simpo_gamma: 0.5  # default in CPOTrainer

    This method uses the same dataset format as DPO.

    Using local dataset files

    -
    datasets:
    -  - ds_type: json
    -    data_files:
    -      - orca_rlhf.jsonl
    -    split: train
    -    type: chatml.intel
    +
    datasets:
    +  - ds_type: json
    +    data_files:
    +      - orca_rlhf.jsonl
    +    split: train
    +    type: chatml.intel

    TRL auto-unwrapping for PEFT

    TRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional reference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:

    -
    # load ref model when adapter training.
    -rl_adapter_ref_model: true
    +
    # load ref model when adapter training.
    +rl_adapter_ref_model: true
    diff --git a/search.json b/search.json index f74149433..d5ee176e5 100644 --- a/search.json +++ b/search.json @@ -1553,7 +1553,14 @@ "href": "docs/api/prompt_strategies.dpo.chat_template.html", "title": "prompt_strategies.dpo.chat_template", "section": "", - "text": "prompt_strategies.dpo.chat_template\nprompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates." + "text": "prompt_strategies.dpo.chat_template\nDPO prompt strategies for using tokenizer chat templates.\n\n\n\n\n\nName\nDescription\n\n\n\n\nargilla_chat\nDPO chat template strategy for argilla-style datasets.\n\n\n\n\n\nprompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)\nDPO chat template strategy for argilla-style datasets.\nFor argilla-style datasets where chosen/rejected contain full conversations\ninstead of single response messages. Extracts the conversation history from\nthe chosen field and formats both chosen/rejected responses using the\nconfigured chat template.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nConfiguration object containing chat_template and dataset settings\nrequired\n\n\ndataset_idx\n\nIndex of the dataset in the config (default: 0)\n0\n\n\n**kwargs\n\nAdditional keyword arguments (unused)\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ntuple\n\n(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop\n\n\n\n\n\n\n{\n“chosen”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n],\n“rejected”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n]\n}" + }, + { + "objectID": "docs/api/prompt_strategies.dpo.chat_template.html#functions", + "href": "docs/api/prompt_strategies.dpo.chat_template.html#functions", + "title": "prompt_strategies.dpo.chat_template", + "section": "", + "text": 
"Name\nDescription\n\n\n\n\nargilla_chat\nDPO chat template strategy for argilla-style datasets.\n\n\n\n\n\nprompt_strategies.dpo.chat_template.argilla_chat(cfg, dataset_idx=0, **kwargs)\nDPO chat template strategy for argilla-style datasets.\nFor argilla-style datasets where chosen/rejected contain full conversations\ninstead of single response messages. Extracts the conversation history from\nthe chosen field and formats both chosen/rejected responses using the\nconfigured chat template.\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\nDefault\n\n\n\n\ncfg\n\nConfiguration object containing chat_template and dataset settings\nrequired\n\n\ndataset_idx\n\nIndex of the dataset in the config (default: 0)\n0\n\n\n**kwargs\n\nAdditional keyword arguments (unused)\n{}\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nName\nType\nDescription\n\n\n\n\ntuple\n\n(transform_fn, dataset_kwargs) where: - transform_fn: Function to transform dataset samples - dataset_kwargs: Dict with ‘remove_columns’ specifying columns to drop\n\n\n\n\n\n\n{\n“chosen”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n],\n“rejected”: [\n{“role”: “user”, “content”: “…”},\n{“role”: “assistant”, “content”: “…”}\n]\n}" }, { "objectID": "docs/api/core.trainers.grpo.trainer.html", @@ -3636,7 +3643,7 @@ "href": "docs/rlhf.html#rlhf-using-axolotl", "title": "RLHF (Beta)", "section": "RLHF using Axolotl", - "text": "RLHF using Axolotl\n\n\n\n\n\n\nImportant\n\n\n\nThis is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.\n\n\nWe rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.\n\n\n\n\n\n\nTip\n\n\n\nYou can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. 
The type: can be retrieved from {method}.{function_name}.\n\n\n\nDPO\nExample config:\nrl: dpo\ndatasets:\n - path: Intel/orca_dpo_pairs\n split: train\n type: chatml.intel\n - path: argilla/ultrafeedback-binarized-preferences\n split: train\n type: chatml\nDPO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": 
\"...\"\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nzephyr.nectar\n{\n \"prompt\": \"...\",\n \"answers\": [\n {\n \"answer\": \"...\",\n \"rank\": 1\n },\n {\n \"answer\": \"...\",\n \"rank\": 2\n }\n // ... more answers with ranks\n ]\n}\n\n\nchat_template.default\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type: chat_template.default\n field_messages: \"messages\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n message_property_mappings:\n role: role\n content: content\n roles:\n user: [\"user\"]\n assistant: [\"assistant\"]\n system: [\"system\"]\nSample input format:\n{\n \"messages\": [\n {\n \"role\": \"system\",\n \"content\": \"...\"\n },\n {\n \"role\": \"user\",\n \"content\": \"...\"\n },\n // ... 
more messages\n ],\n \"chosen\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n },\n \"rejected\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n }\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type:\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n prompt_format: \"{prompt}\"\n chosen_format: \"{chosen}\"\n rejected_format: \"{rejected}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\n\nIPO\nAs IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.\nrl: ipo\n\n\nORPO\nPaper: https://arxiv.org/abs/2403.07691\nrl: orpo\norpo_alpha: 0.1\nremove_unused_columns: false\n\nchat_template: chatml\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned\n type: chat_template.argilla\nORPO supports the following types with the following dataset format:\n\nchat_template.argilla\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\", // if available, will be taken as user message for single-turn instead of from list below\n\n // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\n\nKTO\nrl: kto\nrl_beta: 0.1 # default\nkto_desirable_weight: 1.0 # default\nkto_undesirable_weight: 1.0 # default\n\nremove_unused_columns: false\n\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto\n type: llama3.ultra\n split: train\n\ngradient_checkpointing: true\ngradient_checkpointing_kwargs:\n use_reentrant: 
true\nKTO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"}\n ],\n \"completion\": [\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"completion\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: kto\ndatasets:\n - path: ...\n split: train\n type:\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_completion: \"completion\"\n field_label: \"label\"\n prompt_format: \"{prompt}\"\n completion_format: \"{completion}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\",\n \"label\": \"...\"\n}\n\n\n\nGRPO\n\n\n\n\n\n\nTip\n\n\n\nCheck out our GRPO cookbook.\n\n\nIn the latest GRPO implementation, vLLM is used to significantly speedup trajectory generation during training. 
In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:\n\n\n\n\n\n\nImportant\n\n\n\nMake sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].\n\n\nbase_model: Qwen/Qwen2.5-1.5B-Instruct\n\nvllm:\n host: 0.0.0.0\n port: 8000\n tensor_parallel_size: 2\n gpu_memory_utilization: 0.85\n dtype: auto\n # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand\n\nrl: grpo\ntrl:\n use_vllm: true\n vllm_server_host: 0.0.0.0\n vllm_server_port: 8000\n vllm_server_timeout: 300\nCUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml\nYour vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. In another terminal, execute:\nCUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2\n\n\n\n\n\n\nNote\n\n\n\nDue to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.\n\n\n\nReward functions\nGRPO uses custom reward functions and transformations. 
Please have them ready locally.\nFor example, to load OpenAI’s GSM8K and use a random reward for completions:\n# rewards.py\nimport random\n\ndef rand_reward_func(completions, **kwargs) -> list[float]:\n return [random.uniform(0, 1) for _ in completions]\n\ndef oai_gsm8k_transform(cfg, *args, **kwargs):\n def transform_fn(example, tokenizer=None):\n label = example[\"answer\"].split(\"####\")[-1].strip().replace(\",\", \"\")\n return {\n \"prompt\": [{\"role\": \"user\", \"content\": example[\"question\"]},],\n \"answer\": label,\n }\n return transform_fn, {\"remove_columns\": [\"question\"]}\nrl: grpo\n\ntrl:\n beta: 0.001\n max_completion_length: 256\n use_vllm: True\n num_generations: 4\n reward_funcs: [\"rewards.rand_reward_func\"] # format: '{file_name}.{fn_name}'\n reward_weights: [1.0]\ndatasets:\n - path: openai/gsm8k\n name: main\n type: rewards.oai_gsm8k_transform # format: '{file_name}.{fn_name}'\nTo see other examples of custom reward functions, please see TRL GRPO Docs.\nTo see all configs, please see TRLConfig.\n\n\nGRPO with DAPO/Dr. GRPO loss\nThe DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.\ntrl:\n loss_type: dr_grpo\n # Normalizes loss based on max completion length (default: 256)\n max_completion_length:\nFor more information, see GRPO docs.\n\n\n\nSimPO\nSimPO uses CPOTrainer but with alternative loss function.\nrl: simpo\nrl_beta: 0.1 # default in CPOTrainer\ncpo_alpha: 1.0 # default in CPOTrainer\nsimpo_gamma: 0.5 # default in CPOTrainer\nThis method uses the same dataset format as DPO.\n\n\nUsing local dataset files\ndatasets:\n - ds_type: json\n data_files:\n - orca_rlhf.jsonl\n split: train\n type: chatml.intel\n\n\nTRL auto-unwrapping for PEFT\nTRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. 
This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:\n# load ref model when adapter training.\nrl_adapter_ref_model: true", + "text": "RLHF using Axolotl\n\n\n\n\n\n\nImportant\n\n\n\nThis is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.\n\n\nWe rely on the TRL library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.\n\n\n\n\n\n\nTip\n\n\n\nYou can find what each method supports by going into src/axolotl/prompt_strategies/{method} where {method} is one of our supported methods. The type: can be retrieved from {method}.{function_name}.\n\n\n\nDPO\nExample config:\nrl: dpo\ndatasets:\n - path: Intel/orca_dpo_pairs\n split: train\n type: chatml.intel\n - path: argilla/ultrafeedback-binarized-preferences\n split: train\n type: chatml\nDPO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": 
\"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"chosen_response\": \"...\",\n \"rejected_response\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.icr\n{\n \"system\": \"...\", // optional\n \"input\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nzephyr.nectar\n{\n \"prompt\": \"...\",\n \"answers\": [\n {\n \"answer\": \"...\",\n \"rank\": 1\n },\n {\n \"answer\": \"...\",\n \"rank\": 2\n }\n // ... 
more answers with ranks\n ]\n}\n\n\nchat_template.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchat_template.default\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type: chat_template.default\n field_messages: \"messages\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n message_property_mappings:\n role: role\n content: content\n roles:\n user: [\"user\"]\n assistant: [\"assistant\"]\n system: [\"system\"]\nSample input format:\n{\n \"messages\": [\n {\n \"role\": \"system\",\n \"content\": \"...\"\n },\n {\n \"role\": \"user\",\n \"content\": \"...\"\n },\n // ... more messages\n ],\n \"chosen\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n },\n \"rejected\": {\n \"role\": \"assistant\",\n \"content\": \"...\"\n }\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: dpo\ndatasets:\n - path: ...\n split: train\n type:\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_chosen: \"chosen\"\n field_rejected: \"rejected\"\n prompt_format: \"{prompt}\"\n chosen_format: \"{chosen}\"\n rejected_format: \"{rejected}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"chosen\": \"...\",\n \"rejected\": \"...\"\n}\n\n\n\nIPO\nAs IPO is just DPO with a different loss function, all supported dataset formats for DPO are also supported for IPO.\nrl: ipo\n\n\nORPO\nPaper: https://arxiv.org/abs/2403.07691\nrl: orpo\norpo_alpha: 0.1\nremove_unused_columns: false\n\nchat_template: chatml\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned\n type: chat_template.argilla\nORPO supports the following types with the following dataset format:\n\nchat_template.argilla\n{\n \"system\": \"...\", // optional\n 
\"prompt\": \"...\", // if available, will be taken as user message for single-turn instead of from list below\n\n // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ],\n \"rejected\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\n\nKTO\nrl: kto\nrl_beta: 0.1 # default\nkto_desirable_weight: 1.0 # default\nkto_undesirable_weight: 1.0 # default\n\nremove_unused_columns: false\n\ndatasets:\n - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto\n type: llama3.ultra\n split: train\n\ngradient_checkpointing: true\ngradient_checkpointing_kwargs:\n use_reentrant: true\nKTO supports the following types with the following dataset format:\n\nchatml.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.argilla_chat\n{\n \"chosen\": [\n {\"role\": \"user\", \"content\": \"...\"}\n ],\n \"completion\": [\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nchatml.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nchatml.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla\n{\n \"system\": \"...\", // optional\n \"instruction\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.argilla_chat\n{\n \"completion\": [\n {\"role\": \"user\", \"content\": \"...\"},\n {\"role\": \"assistant\", \"content\": \"...\"}\n ]\n}\n\n\nllama3.intel\n{\n \"system\": \"...\", // optional\n \"question\": \"...\",\n \"completion\": \"...\"\n}\n\n\nllama3.prompt_pairs\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": 
\"...\"\n}\n\n\nllama3.ultra\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\"\n}\n\n\nuser_defined.default\nFor custom behaviors,\nrl: kto\ndatasets:\n - path: ...\n split: train\n type:\n field_prompt: \"prompt\"\n field_system: \"system\"\n field_completion: \"completion\"\n field_label: \"label\"\n prompt_format: \"{prompt}\"\n completion_format: \"{completion}\"\nThe input format is a simple JSON input with customizable fields based on the above config.\n{\n \"system\": \"...\", // optional\n \"prompt\": \"...\",\n \"completion\": \"...\",\n \"label\": \"...\"\n}\n\n\n\nGRPO\n\n\n\n\n\n\nTip\n\n\n\nCheck out our GRPO cookbook.\n\n\nIn the latest GRPO implementation, vLLM is used to significantly speedup trajectory generation during training. In this example, we’re using 4 GPUs - 2 for training, and 2 for vLLM:\n\n\n\n\n\n\nImportant\n\n\n\nMake sure you’ve installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. pip install axolotl[vllm].\n\n\nbase_model: Qwen/Qwen2.5-1.5B-Instruct\n\nvllm:\n host: 0.0.0.0\n port: 8000\n tensor_parallel_size: 2\n gpu_memory_utilization: 0.85\n dtype: auto\n # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand\n\nrl: grpo\ntrl:\n use_vllm: true\n vllm_server_host: 0.0.0.0\n vllm_server_port: 8000\n vllm_server_timeout: 300\nCUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml\nYour vLLM instance will now attempt to spin up, and it’s time to kick off training utilizing our remaining two GPUs. In another terminal, execute:\nCUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2\n\n\n\n\n\n\nNote\n\n\n\nDue to TRL’s implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use CUDA_VISIBLE_DEVICES=2,3 for the vLLM instance.\n\n\n\nReward functions\nGRPO uses custom reward functions and transformations. 
Please have them ready locally.\nFor example, to load OpenAI’s GSM8K and use a random reward for completions:\n# rewards.py\nimport random\n\ndef rand_reward_func(completions, **kwargs) -> list[float]:\n return [random.uniform(0, 1) for _ in completions]\n\ndef oai_gsm8k_transform(cfg, *args, **kwargs):\n def transform_fn(example, tokenizer=None):\n label = example[\"answer\"].split(\"####\")[-1].strip().replace(\",\", \"\")\n return {\n \"prompt\": [{\"role\": \"user\", \"content\": example[\"question\"]},],\n \"answer\": label,\n }\n return transform_fn, {\"remove_columns\": [\"question\"]}\nrl: grpo\n\ntrl:\n beta: 0.001\n max_completion_length: 256\n use_vllm: True\n num_generations: 4\n reward_funcs: [\"rewards.rand_reward_func\"] # format: '{file_name}.{fn_name}'\n reward_weights: [1.0]\ndatasets:\n - path: openai/gsm8k\n name: main\n type: rewards.oai_gsm8k_transform # format: '{file_name}.{fn_name}'\nTo see other examples of custom reward functions, please see TRL GRPO Docs.\nTo see all configs, please see TRLConfig.\n\n\nGRPO with DAPO/Dr. GRPO loss\nThe DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.\ntrl:\n loss_type: dr_grpo\n # Normalizes loss based on max completion length (default: 256)\n max_completion_length:\nFor more information, see GRPO docs.\n\n\n\nSimPO\nSimPO uses CPOTrainer but with alternative loss function.\nrl: simpo\nrl_beta: 0.1 # default in CPOTrainer\ncpo_alpha: 1.0 # default in CPOTrainer\nsimpo_gamma: 0.5 # default in CPOTrainer\nThis method uses the same dataset format as DPO.\n\n\nUsing local dataset files\ndatasets:\n - ds_type: json\n data_files:\n - orca_rlhf.jsonl\n split: train\n type: chatml.intel\n\n\nTRL auto-unwrapping for PEFT\nTRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. 
This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:\n# load ref model when adapter training.\nrl_adapter_ref_model: true", "crumbs": [ "How To Guides", "RLHF (Beta)" diff --git a/sitemap.xml b/sitemap.xml index 623531834..cf95feb06 100644 --- a/sitemap.xml +++ b/sitemap.xml @@ -2,798 +2,798 @@ https://docs.axolotl.ai/src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.html - 2025-10-17T03:35:11.499Z + 2025-10-17T10:00:35.737Z https://docs.axolotl.ai/docs/mac.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/docs/cli.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/nccl.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/getting-started.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.712Z https://docs.axolotl.ai/docs/lr_groups.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/docs/qat.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/multipack.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/streaming.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/lora_optims.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/docs/amd_hpc.html - 2025-10-17T03:35:11.472Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/debugging.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset-formats/conversation.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset-formats/inst_tune.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset-formats/index.html - 
2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/config-reference.html - 2025-10-17T03:38:56.163Z + 2025-10-17T10:04:28.425Z https://docs.axolotl.ai/docs/multimodal.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/ray-integration.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/faq.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset_preprocessing.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/torchao.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/optimizers.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/api/utils.schedulers.html - 2025-10-17T03:38:40.077Z + 2025-10-17T10:04:12.283Z https://docs.axolotl.ai/docs/api/cli.utils.sweeps.html - 2025-10-17T03:38:39.279Z + 2025-10-17T10:04:11.484Z https://docs.axolotl.ai/docs/api/datasets.html - 2025-10-17T03:38:38.870Z + 2025-10-17T10:04:11.081Z https://docs.axolotl.ai/docs/api/utils.tokenization.html - 2025-10-17T03:38:39.993Z + 2025-10-17T10:04:12.199Z https://docs.axolotl.ai/docs/api/loaders.tokenizer.html - 2025-10-17T03:38:39.397Z + 2025-10-17T10:04:11.601Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_expand_mask.html - 2025-10-17T03:38:39.866Z + 2025-10-17T10:04:12.071Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_cpu.html - 2025-10-17T03:38:39.952Z + 2025-10-17T10:04:12.158Z https://docs.axolotl.ai/docs/api/utils.data.sft.html - 2025-10-17T03:38:40.128Z + 2025-10-17T10:04:12.333Z https://docs.axolotl.ai/docs/api/monkeypatch.transformers_fa_utils.html - 2025-10-17T03:38:39.933Z + 2025-10-17T10:04:12.139Z https://docs.axolotl.ai/docs/api/loaders.patch_manager.html - 2025-10-17T03:38:39.418Z + 2025-10-17T10:04:11.622Z https://docs.axolotl.ai/docs/api/integrations.liger.args.html - 2025-10-17T03:38:40.452Z + 
2025-10-17T10:04:12.654Z https://docs.axolotl.ai/docs/api/utils.schemas.peft.html - 2025-10-17T03:38:40.218Z + 2025-10-17T10:04:12.423Z https://docs.axolotl.ai/docs/api/prompt_strategies.pygmalion.html - 2025-10-17T03:38:39.612Z + 2025-10-17T10:04:11.813Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_instruct.html - 2025-10-17T03:38:39.530Z + 2025-10-17T10:04:11.732Z https://docs.axolotl.ai/docs/api/cli.cloud.base.html - 2025-10-17T03:38:39.235Z + 2025-10-17T10:04:11.440Z https://docs.axolotl.ai/docs/api/monkeypatch.gradient_checkpointing.offload_disk.html - 2025-10-17T03:38:39.985Z + 2025-10-17T10:04:12.190Z https://docs.axolotl.ai/docs/api/kernels.swiglu.html - 2025-10-17T03:38:39.836Z + 2025-10-17T10:04:12.042Z https://docs.axolotl.ai/docs/api/integrations.cut_cross_entropy.args.html - 2025-10-17T03:38:40.437Z + 2025-10-17T10:04:12.639Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.user_defined.html - 2025-10-17T03:38:39.673Z + 2025-10-17T10:04:11.880Z https://docs.axolotl.ai/docs/api/monkeypatch.utils.html - 2025-10-17T03:38:39.911Z + 2025-10-17T10:04:12.116Z https://docs.axolotl.ai/docs/api/core.builders.rl.html - 2025-10-17T03:38:38.970Z + 2025-10-17T10:04:11.180Z https://docs.axolotl.ai/docs/api/loaders.processor.html - 2025-10-17T03:38:39.399Z + 2025-10-17T10:04:11.603Z https://docs.axolotl.ai/docs/api/utils.callbacks.lisa.html - 2025-10-17T03:38:40.584Z + 2025-10-17T10:04:12.785Z https://docs.axolotl.ai/docs/api/core.training_args.html - 2025-10-17T03:38:38.986Z + 2025-10-17T10:04:11.195Z https://docs.axolotl.ai/docs/api/loaders.adapter.html - 2025-10-17T03:38:39.406Z + 2025-10-17T10:04:11.609Z https://docs.axolotl.ai/docs/api/cli.merge_sharded_fsdp_weights.html - 2025-10-17T03:38:39.206Z + 2025-10-17T10:04:11.411Z https://docs.axolotl.ai/docs/api/cli.train.html - 2025-10-17T03:38:39.089Z + 2025-10-17T10:04:11.296Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.rng_state_loader.html - 2025-10-17T03:38:39.431Z + 
2025-10-17T10:04:11.634Z https://docs.axolotl.ai/docs/api/prompt_strategies.completion.html - 2025-10-17T03:38:39.578Z + 2025-10-17T10:04:11.779Z https://docs.axolotl.ai/docs/api/prompt_strategies.stepwise_supervised.html - 2025-10-17T03:38:39.591Z + 2025-10-17T10:04:11.792Z https://docs.axolotl.ai/docs/api/monkeypatch.lora_kernels.html - 2025-10-17T03:38:39.901Z + 2025-10-17T10:04:12.106Z https://docs.axolotl.ai/docs/api/prompt_strategies.messages.chat.html - 2025-10-17T03:38:39.617Z + 2025-10-17T10:04:11.819Z https://docs.axolotl.ai/docs/api/prompt_strategies.user_defined.html - 2025-10-17T03:38:39.555Z + 2025-10-17T10:04:11.757Z https://docs.axolotl.ai/docs/api/core.chat.messages.html - 2025-10-17T03:38:39.015Z + 2025-10-17T10:04:11.224Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.scheduler.html - 2025-10-17T03:38:39.439Z + 2025-10-17T10:04:11.642Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.user_defined.html - 2025-10-17T03:38:39.649Z + 2025-10-17T10:04:11.856Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.llama3.html - 2025-10-17T03:38:39.661Z + 2025-10-17T10:04:11.868Z https://docs.axolotl.ai/docs/api/utils.schemas.integrations.html - 2025-10-17T03:38:40.244Z + 2025-10-17T10:04:12.449Z https://docs.axolotl.ai/docs/api/convert.html - 2025-10-17T03:38:38.887Z + 2025-10-17T10:04:11.098Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.passthrough.html - 2025-10-17T03:38:39.651Z + 2025-10-17T10:04:11.858Z https://docs.axolotl.ai/docs/api/utils.schemas.config.html - 2025-10-17T03:38:40.167Z + 2025-10-17T10:04:12.373Z https://docs.axolotl.ai/docs/api/utils.schemas.enums.html - 2025-10-17T03:38:40.254Z + 2025-10-17T10:04:12.459Z https://docs.axolotl.ai/docs/api/monkeypatch.btlm_attn_hijack_flash.html - 2025-10-17T03:38:39.912Z + 2025-10-17T10:04:12.118Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chat_template.html - 2025-10-17T03:38:39.619Z + 2025-10-17T10:04:11.827Z 
https://docs.axolotl.ai/docs/api/core.trainers.grpo.trainer.html - 2025-10-17T03:38:39.358Z + 2025-10-17T10:04:11.562Z https://docs.axolotl.ai/docs/api/integrations.lm_eval.args.html - 2025-10-17T03:38:40.456Z + 2025-10-17T10:04:12.658Z https://docs.axolotl.ai/docs/api/utils.collators.core.html - 2025-10-17T03:38:40.486Z + 2025-10-17T10:04:12.687Z https://docs.axolotl.ai/docs/api/core.chat.format.shared.html - 2025-10-17T03:38:39.020Z + 2025-10-17T10:04:11.229Z https://docs.axolotl.ai/docs/api/prompt_strategies.orpo.chat_template.html - 2025-10-17T03:38:39.699Z + 2025-10-17T10:04:11.905Z https://docs.axolotl.ai/docs/api/utils.samplers.multipack.html - 2025-10-17T03:38:40.570Z + 2025-10-17T10:04:12.770Z https://docs.axolotl.ai/docs/api/utils.callbacks.qat.html - 2025-10-17T03:38:40.601Z + 2025-10-17T10:04:12.802Z https://docs.axolotl.ai/docs/api/prompt_strategies.chat_template.html - 2025-10-17T03:38:39.511Z + 2025-10-17T10:04:11.714Z https://docs.axolotl.ai/docs/api/utils.schemas.multimodal.html - 2025-10-17T03:38:40.228Z + 2025-10-17T10:04:12.433Z https://docs.axolotl.ai/docs/api/utils.callbacks.comet_.html - 2025-10-17T03:38:40.593Z + 2025-10-17T10:04:12.794Z https://docs.axolotl.ai/docs/api/prompt_strategies.base.html - 2025-10-17T03:38:39.470Z + 2025-10-17T10:04:11.673Z https://docs.axolotl.ai/docs/api/kernels.utils.html - 2025-10-17T03:38:39.847Z + 2025-10-17T10:04:12.053Z https://docs.axolotl.ai/docs/api/cli.merge_lora.html - 2025-10-17T03:38:39.192Z + 2025-10-17T10:04:11.397Z https://docs.axolotl.ai/docs/api/cli.utils.html - 2025-10-17T03:38:39.245Z + 2025-10-17T10:04:11.449Z https://docs.axolotl.ai/docs/api/utils.ctx_managers.sequence_parallel.html - 2025-10-17T03:38:39.468Z + 2025-10-17T10:04:11.671Z https://docs.axolotl.ai/docs/api/index.html - 2025-10-17T03:38:38.772Z + 2025-10-17T10:04:10.982Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.llama3.html - 2025-10-17T03:38:39.632Z + 2025-10-17T10:04:11.840Z 
https://docs.axolotl.ai/docs/api/monkeypatch.mixtral.html - 2025-10-17T03:38:39.948Z + 2025-10-17T10:04:12.154Z https://docs.axolotl.ai/docs/api/prompt_strategies.orcamini.html - 2025-10-17T03:38:39.604Z + 2025-10-17T10:04:11.805Z https://docs.axolotl.ai/docs/api/core.trainers.grpo.sampler.html - 2025-10-17T03:38:39.373Z + 2025-10-17T10:04:11.577Z https://docs.axolotl.ai/docs/api/utils.lora.html - 2025-10-17T03:38:40.001Z + 2025-10-17T10:04:12.207Z https://docs.axolotl.ai/docs/api/core.trainers.mixins.optimizer.html - 2025-10-17T03:38:39.427Z + 2025-10-17T10:04:11.630Z https://docs.axolotl.ai/docs/api/cli.config.html - 2025-10-17T03:38:39.158Z + 2025-10-17T10:04:11.364Z https://docs.axolotl.ai/docs/api/monkeypatch.multipack.html - 2025-10-17T03:38:39.860Z + 2025-10-17T10:04:12.065Z https://docs.axolotl.ai/docs/api/utils.collators.batching.html - 2025-10-17T03:38:40.510Z + 2025-10-17T10:04:12.710Z https://docs.axolotl.ai/docs/api/utils.quantization.html - 2025-10-17T03:38:40.152Z + 2025-10-17T10:04:12.357Z https://docs.axolotl.ai/docs/api/utils.dict.html - 2025-10-17T03:38:40.108Z + 2025-10-17T10:04:12.314Z https://docs.axolotl.ai/docs/api/kernels.quantize.html - 2025-10-17T03:38:39.845Z + 2025-10-17T10:04:12.051Z https://docs.axolotl.ai/docs/api/utils.schemas.training.html - 2025-10-17T03:38:40.185Z + 2025-10-17T10:04:12.390Z https://docs.axolotl.ai/docs/api/train.html - 2025-10-17T03:38:38.850Z + 2025-10-17T10:04:11.061Z https://docs.axolotl.ai/docs/api/core.datasets.transforms.chat_builder.html - 2025-10-17T03:38:39.036Z + 2025-10-17T10:04:11.245Z https://docs.axolotl.ai/docs/inference.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/FAQS.html - 2025-10-17T03:35:11.471Z + 2025-10-17T10:00:35.709Z https://docs.axolotl.ai/examples/colab-notebooks/colab-axolotl-example.html - 2025-10-17T03:35:11.481Z + 2025-10-17T10:00:35.719Z https://docs.axolotl.ai/index.html - 2025-10-17T03:35:11.494Z + 2025-10-17T10:00:35.732Z 
https://docs.axolotl.ai/docs/custom_integrations.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/api/utils.schemas.utils.html - 2025-10-17T03:38:40.262Z + 2025-10-17T10:04:12.466Z https://docs.axolotl.ai/docs/api/kernels.geglu.html - 2025-10-17T03:38:39.823Z + 2025-10-17T10:04:12.029Z https://docs.axolotl.ai/docs/api/core.builders.causal.html - 2025-10-17T03:38:38.964Z + 2025-10-17T10:04:11.174Z https://docs.axolotl.ai/docs/api/core.trainers.mamba.html - 2025-10-17T03:38:39.337Z + 2025-10-17T10:04:11.541Z https://docs.axolotl.ai/docs/api/prompt_strategies.bradley_terry.llama3.html - 2025-10-17T03:38:39.703Z + 2025-10-17T10:04:11.910Z https://docs.axolotl.ai/docs/api/core.datasets.chat.html - 2025-10-17T03:38:39.027Z + 2025-10-17T10:04:11.236Z https://docs.axolotl.ai/docs/api/utils.collators.mm_chat.html - 2025-10-17T03:38:40.520Z + 2025-10-17T10:04:12.720Z https://docs.axolotl.ai/docs/api/prompt_strategies.llama2_chat.html - 2025-10-17T03:38:39.571Z + 2025-10-17T10:04:11.772Z https://docs.axolotl.ai/docs/api/common.const.html - 2025-10-17T03:38:40.464Z + 2025-10-17T10:04:12.666Z https://docs.axolotl.ai/docs/api/cli.quantize.html - 2025-10-17T03:38:39.223Z + 2025-10-17T10:04:11.427Z https://docs.axolotl.ai/docs/api/utils.trainer.html - 2025-10-17T03:38:40.043Z + 2025-10-17T10:04:12.249Z https://docs.axolotl.ai/docs/api/cli.delinearize_llama4.html - 2025-10-17T03:38:39.164Z + 2025-10-17T10:04:11.369Z https://docs.axolotl.ai/docs/api/evaluate.html - 2025-10-17T03:38:38.863Z + 2025-10-17T10:04:11.074Z https://docs.axolotl.ai/docs/api/monkeypatch.mistral_attn_hijack_flash.html - 2025-10-17T03:38:39.858Z + 2025-10-17T10:04:12.063Z https://docs.axolotl.ai/docs/api/loaders.model.html - 2025-10-17T03:38:39.387Z + 2025-10-17T10:04:11.591Z https://docs.axolotl.ai/docs/api/utils.distributed.html - 2025-10-17T03:38:40.102Z + 2025-10-17T10:04:12.308Z https://docs.axolotl.ai/docs/api/utils.model_shard_quant.html - 2025-10-17T03:38:40.008Z + 
2025-10-17T10:04:12.213Z https://docs.axolotl.ai/docs/api/kernels.lora.html - 2025-10-17T03:38:39.810Z + 2025-10-17T10:04:12.017Z https://docs.axolotl.ai/docs/api/cli.main.html - 2025-10-17T03:38:39.078Z + 2025-10-17T10:04:11.286Z https://docs.axolotl.ai/docs/api/integrations.spectrum.args.html - 2025-10-17T03:38:40.461Z + 2025-10-17T10:04:12.662Z https://docs.axolotl.ai/docs/api/utils.optimizers.adopt.html - 2025-10-17T03:38:40.118Z + 2025-10-17T10:04:12.324Z https://docs.axolotl.ai/docs/api/cli.cloud.modal_.html - 2025-10-17T03:38:39.243Z + 2025-10-17T10:04:11.448Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_flash.html - 2025-10-17T03:38:39.854Z + 2025-10-17T10:04:12.060Z https://docs.axolotl.ai/docs/api/core.builders.base.html - 2025-10-17T03:38:38.958Z + 2025-10-17T10:04:11.169Z https://docs.axolotl.ai/docs/api/utils.schemas.trl.html - 2025-10-17T03:38:40.222Z + 2025-10-17T10:04:12.427Z https://docs.axolotl.ai/docs/api/cli.utils.args.html - 2025-10-17T03:38:39.259Z + 2025-10-17T10:04:11.463Z https://docs.axolotl.ai/docs/api/core.trainers.base.html - 2025-10-17T03:38:39.312Z + 2025-10-17T10:04:11.516Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_patch_multipack.html - 2025-10-17T03:38:39.914Z + 2025-10-17T10:04:12.120Z https://docs.axolotl.ai/docs/api/monkeypatch.llama_attn_hijack_xformers.html - 2025-10-17T03:38:39.856Z + 2025-10-17T10:04:12.061Z https://docs.axolotl.ai/docs/api/utils.schemas.model.html - 2025-10-17T03:38:40.176Z + 2025-10-17T10:04:12.381Z https://docs.axolotl.ai/docs/api/prompt_strategies.kto.chatml.html - 2025-10-17T03:38:39.671Z + 2025-10-17T10:04:11.878Z https://docs.axolotl.ai/docs/api/utils.callbacks.mlflow_.html - 2025-10-17T03:38:40.589Z + 2025-10-17T10:04:12.789Z https://docs.axolotl.ai/docs/api/common.datasets.html - 2025-10-17T03:38:40.483Z + 2025-10-17T10:04:12.684Z https://docs.axolotl.ai/docs/api/utils.schemas.datasets.html - 2025-10-17T03:38:40.207Z + 2025-10-17T10:04:12.412Z 
https://docs.axolotl.ai/docs/api/cli.utils.fetch.html - 2025-10-17T03:38:39.265Z + 2025-10-17T10:04:11.470Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.chatml.html - 2025-10-17T03:38:39.645Z + 2025-10-17T10:04:11.853Z https://docs.axolotl.ai/docs/api/monkeypatch.relora.html - 2025-10-17T03:38:39.864Z + 2025-10-17T10:04:12.070Z https://docs.axolotl.ai/docs/api/cli.evaluate.html - 2025-10-17T03:38:39.099Z + 2025-10-17T10:04:11.306Z https://docs.axolotl.ai/docs/api/prompt_strategies.dpo.zephyr.html - 2025-10-17T03:38:39.647Z + 2025-10-17T10:04:11.855Z https://docs.axolotl.ai/docs/api/core.trainers.utils.html - 2025-10-17T03:38:39.375Z + 2025-10-17T10:04:11.579Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_w_system.html - 2025-10-17T03:38:39.544Z + 2025-10-17T10:04:11.747Z https://docs.axolotl.ai/docs/api/utils.chat_templates.html - 2025-10-17T03:38:39.995Z + 2025-10-17T10:04:12.200Z https://docs.axolotl.ai/docs/api/utils.data.streaming.html - 2025-10-17T03:38:40.120Z + 2025-10-17T10:04:12.326Z https://docs.axolotl.ai/docs/api/utils.bench.html - 2025-10-17T03:38:40.012Z + 2025-10-17T10:04:12.218Z https://docs.axolotl.ai/docs/api/common.architectures.html - 2025-10-17T03:38:40.463Z + 2025-10-17T10:04:12.664Z https://docs.axolotl.ai/docs/api/cli.checks.html - 2025-10-17T03:38:39.137Z + 2025-10-17T10:04:11.342Z https://docs.axolotl.ai/docs/api/core.trainers.dpo.trainer.html - 2025-10-17T03:38:39.345Z + 2025-10-17T10:04:11.549Z https://docs.axolotl.ai/docs/api/integrations.base.html - 2025-10-17T03:38:40.433Z + 2025-10-17T10:04:12.635Z https://docs.axolotl.ai/docs/api/cli.utils.train.html - 2025-10-17T03:38:39.294Z + 2025-10-17T10:04:11.498Z https://docs.axolotl.ai/docs/api/utils.collators.mamba.html - 2025-10-17T03:38:40.514Z + 2025-10-17T10:04:12.715Z https://docs.axolotl.ai/docs/api/cli.art.html - 2025-10-17T03:38:39.129Z + 2025-10-17T10:04:11.334Z https://docs.axolotl.ai/docs/api/monkeypatch.trainer_fsdp_optim.html - 2025-10-17T03:38:39.925Z + 
2025-10-17T10:04:12.131Z https://docs.axolotl.ai/docs/api/logging_config.html - 2025-10-17T03:38:38.950Z + 2025-10-17T10:04:11.161Z https://docs.axolotl.ai/docs/api/utils.freeze.html - 2025-10-17T03:38:40.022Z + 2025-10-17T10:04:12.227Z https://docs.axolotl.ai/docs/api/prompt_strategies.metharme.html - 2025-10-17T03:38:39.599Z + 2025-10-17T10:04:11.801Z https://docs.axolotl.ai/docs/api/prompt_strategies.alpaca_chat.html - 2025-10-17T03:38:39.528Z + 2025-10-17T10:04:11.730Z https://docs.axolotl.ai/docs/api/monkeypatch.stablelm_attn_hijack_flash.html - 2025-10-17T03:38:39.921Z + 2025-10-17T10:04:12.127Z https://docs.axolotl.ai/docs/api/models.mamba.modeling_mamba.html - 2025-10-17T03:38:40.484Z + 2025-10-17T10:04:12.685Z https://docs.axolotl.ai/docs/api/core.trainers.trl.html - 2025-10-17T03:38:39.330Z + 2025-10-17T10:04:11.534Z https://docs.axolotl.ai/docs/api/prompt_strategies.input_output.html - 2025-10-17T03:38:39.585Z + 2025-10-17T10:04:11.787Z https://docs.axolotl.ai/docs/api/loaders.constants.html - 2025-10-17T03:38:39.420Z + 2025-10-17T10:04:11.623Z https://docs.axolotl.ai/docs/api/monkeypatch.data.batch_dataset_fetcher.html - 2025-10-17T03:38:39.946Z + 2025-10-17T10:04:12.152Z https://docs.axolotl.ai/docs/api/cli.vllm_serve.html - 2025-10-17T03:38:39.231Z + 2025-10-17T10:04:11.436Z https://docs.axolotl.ai/docs/api/prompt_tokenizers.html - 2025-10-17T03:38:38.938Z + 2025-10-17T10:04:11.149Z https://docs.axolotl.ai/docs/api/cli.args.html - 2025-10-17T03:38:39.124Z + 2025-10-17T10:04:11.330Z https://docs.axolotl.ai/docs/api/cli.inference.html - 2025-10-17T03:38:39.182Z + 2025-10-17T10:04:11.387Z https://docs.axolotl.ai/docs/api/cli.utils.load.html - 2025-10-17T03:38:39.272Z + 2025-10-17T10:04:11.477Z https://docs.axolotl.ai/docs/api/cli.preprocess.html - 2025-10-17T03:38:39.217Z + 2025-10-17T10:04:11.421Z https://docs.axolotl.ai/docs/api/utils.callbacks.profiler.html - 2025-10-17T03:38:40.582Z + 2025-10-17T10:04:12.783Z 
https://docs.axolotl.ai/docs/api/utils.callbacks.perplexity.html - 2025-10-17T03:38:40.578Z + 2025-10-17T10:04:12.778Z https://docs.axolotl.ai/docs/api/core.chat.format.chatml.html - 2025-10-17T03:38:39.017Z + 2025-10-17T10:04:11.226Z https://docs.axolotl.ai/docs/api/integrations.grokfast.optimizer.html - 2025-10-17T03:38:40.439Z + 2025-10-17T10:04:12.640Z https://docs.axolotl.ai/docs/api/integrations.kd.trainer.html - 2025-10-17T03:38:40.448Z + 2025-10-17T10:04:12.650Z https://docs.axolotl.ai/docs/api/monkeypatch.unsloth_.html - 2025-10-17T03:38:39.935Z + 2025-10-17T10:04:12.141Z https://docs.axolotl.ai/docs/api/core.chat.format.llama3x.html - 2025-10-17T03:38:39.019Z + 2025-10-17T10:04:11.228Z https://docs.axolotl.ai/docs/reward_modelling.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/quantize.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/fsdp_qlora.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/nd_parallelism.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/batch_vs_grad.html - 2025-10-17T03:35:11.472Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/multi-node.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/rlhf.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/dataset-formats/stepwise_supervised.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset-formats/pretraining.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset-formats/tokenized.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/dataset-formats/template_free.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/multi-gpu.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z 
https://docs.axolotl.ai/docs/input_output.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/docs/docker.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/gradient_checkpointing.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.712Z https://docs.axolotl.ai/docs/optimizations.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/sequence_parallelism.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/docs/dataset_loading.html - 2025-10-17T03:35:11.473Z + 2025-10-17T10:00:35.711Z https://docs.axolotl.ai/docs/installation.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/docs/mixed_precision.html - 2025-10-17T03:35:11.476Z + 2025-10-17T10:00:35.714Z https://docs.axolotl.ai/docs/unsloth.html - 2025-10-17T03:35:11.477Z + 2025-10-17T10:00:35.715Z https://docs.axolotl.ai/src/axolotl/integrations/LICENSE.html - 2025-10-17T03:35:11.499Z + 2025-10-17T10:00:35.736Z