From b9fe1393d645c7373f7e92545f7fb995852a1aad Mon Sep 17 00:00:00 2001 From: Quarto GHA Workflow Runner Date: Wed, 25 Mar 2026 12:49:03 +0000 Subject: [PATCH] Built site for gh-pages --- .nojekyll | 2 +- docs/api/cli.merge_lora.html | 5 +- docs/config-reference.html | 413 +++++------ docs/custom_integrations.html | 1281 +++++++++++++++++++++++---------- docs/rlhf.html | 463 +++++++++++- search.json | 23 +- sitemap.xml | 470 ++++++------ 7 files changed, 1836 insertions(+), 821 deletions(-) diff --git a/.nojekyll b/.nojekyll index c2fbc0263..899e73099 100644 --- a/.nojekyll +++ b/.nojekyll @@ -1 +1 @@ -c438397e \ No newline at end of file +fc32723f \ No newline at end of file diff --git a/docs/api/cli.merge_lora.html b/docs/api/cli.merge_lora.html index db77bd6f5..b34fccc6d 100644 --- a/docs/api/cli.merge_lora.html +++ b/docs/api/cli.merge_lora.html @@ -793,7 +793,7 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); do_merge_lora -Calls transformersmerge_and_unload on the model given in the axolotl config +Merges LoRA adapters with base model using either memory-efficient or legacy approach. @@ -864,8 +864,7 @@ config values will be overwritten to allow the LoRA merge logic to work as expec

do_merge_lora

cli.merge_lora.do_merge_lora(cfg)
-

Calls transformers' merge_and_unload on the model given in the axolotl config -along with the LoRA adapters to combine them into a single base model.

+

Merges LoRA adapters with the base model using either the memory-efficient or the legacy approach.

Parameters

diff --git a/docs/config-reference.html b/docs/config-reference.html index 0a29c0f89..85dc118ab 100644 --- a/docs/config-reference.html +++ b/docs/config-reference.html @@ -2191,211 +2191,214 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true}); loraplus_lr_embedding: float | None = 1e-06merge_lora: bool | None - -# Whether to use ReLoRA. Use with jagged_restart_*steps options. -relora: bool | None -# threshold for optimizer magnitude when pruning -relora_prune_ratio: float | None -# True to perform lora weight merges on cpu during restarts, for modest gpu memory -# savings -relora_cpu_offload: bool | None - -# how often to reset for jagged restarts -jagged_restart_steps: int | None -# how many warmup steps to take after reset for jagged restarts -jagged_restart_warmup_steps: int | None -# how many anneal steps to take before reset for jagged restarts -jagged_restart_anneal_steps: int | None - -# If greater than 1, backpropagation will be skipped and the gradients will be -# accumulated for the given number of steps. -gradient_accumulation_steps: int | None = 1 -# The number of samples to include in each batch. This is the number of samples sent to -# each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps -micro_batch_size: int | None = 1 -# Total batch size, we do not recommended setting this manually -batch_size: int | None -# per gpu micro batch size for evals, defaults to value of micro_batch_size -eval_batch_size: int | None - -# whether to find batch size that fits in memory. Passed to underlying transformers -# Trainer -auto_find_batch_size: bool | None - -# Whether to mask out or include the human's prompt from the training labels -train_on_inputs: bool | None = False -# Group similarly sized data to minimize padding. May be slower to start, as it must -# download and sort the entire dataset. Note that training loss may have an oscillating -# pattern with this enabled. 
-group_by_length: bool | None - -learning_rate: str | float (required) -embedding_lr: float | None -embedding_lr_scale: float | None -# Specify weight decay -weight_decay: float | None = 0.0 -# Specify optimizer -optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED -# Dictionary of arguments to pass to the optimizer -optim_args: str | dict[str, Any] | None -# The target modules to optimize, i.e. the module names that you would like to train, -# right now this is used only for GaLore algorithm -optim_target_modules: list[str] | Literal['all_linear'] | None -# Path to torch distx for optim 'adamw_anyprecision' -torchdistx_path: str | None -lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE -# Specify a scheduler and kwargs to use with the optimizer -lr_scheduler_kwargs: dict[str, Any] | None -lr_quadratic_warmup: bool | None -# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of -# peak lr -cosine_min_lr_ratio: float | None -# freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means -# start cosine_min_lr at 80% of training step -cosine_constant_lr_ratio: float | None -# Learning rate div factor -lr_div_factor: float | None - -lr_groups: list[LrGroup] | None - # For LrGroup: - name: str (required) - modules: list[str] (required) - lr: float (required) - -# adamw hyperparams -adam_epsilon: float | None -# only used for CAME Optimizer -adam_epsilon2: float | None -# adamw hyperparams -adam_beta1: float | None -# adamw hyperparams -adam_beta2: float | None -# only used for CAME Optimizer -adam_beta3: float | None - -# Dion Optimizer learning rate -dion_lr: float | None -# Dion Optimizer momentum -dion_momentum: float | None -# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank -# dimension. 
-dion_rank_fraction: float | None = 1.0 -# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may -# be useful to ensure even sharding. -dion_rank_multiple_of: int | None = 1 - -# Gradient clipping max norm -max_grad_norm: float | None -num_epochs: float = 1.0 - -use_wandb: bool | None -# Set the name of your wandb run -wandb_name: str | None -# Set the ID of your wandb run -wandb_run_id: str | None -# "offline" to save run metadata locally and not sync to the server, "disabled" to turn -# off wandb -wandb_mode: str | None -# Your wandb project name -wandb_project: str | None -# A wandb Team name if using a Team -wandb_entity: str | None -wandb_watch: str | None -# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only -# at the end of training -wandb_log_model: str | None - -use_mlflow: bool | None -# URI to mlflow -mlflow_tracking_uri: str | None -# Your experiment name -mlflow_experiment_name: str | None -# Your run name -mlflow_run_name: str | None -# set to true to copy each saved checkpoint on each save to mlflow artifact registry -hf_mlflow_log_artifacts: bool | None - -# Enable or disable Comet integration. -use_comet: bool | None -# API key for Comet. Recommended to set via `comet login`. -comet_api_key: str | None -# Workspace name in Comet. Defaults to the user's default workspace. -comet_workspace: str | None -# Project name in Comet. Defaults to Uncategorized. -comet_project_name: str | None -# Identifier for the experiment. Used to append data to an existing experiment or -# control the key of new experiments. Default to a random key. -comet_experiment_key: str | None -# Create a new experiment ("create") or log to an existing one ("get"). Default -# ("get_or_create") auto-selects based on configuration. -comet_mode: str | None -# Set to True to log data to Comet server, or False for offline storage. Default is -# True. 
-comet_online: bool | None -# Dictionary for additional configuration settings, see the doc for more details. -comet_experiment_config: dict[str, Any] | None - -use_trackio: bool | None -# Your trackio project name -trackio_project_name: str | None -# Set the name of your trackio run -trackio_run_name: str | None -# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided) -trackio_space_id: str | None - -# Enable OpenTelemetry metrics collection and Prometheus export -use_otel_metrics: bool | None = False -# Host to bind the OpenTelemetry metrics server to -otel_metrics_host: str | None = localhost -# Port for the Prometheus metrics HTTP server -otel_metrics_port: int | None = 8000 - -# the number of activate layers in LISA -lisa_n_layers: int | None -# how often to switch layers in LISA -lisa_step_interval: int | None -# path under the model to access the layers -lisa_layers_attribute: str | None = model.layers - -gradio_title: str | None -gradio_share: bool | None -gradio_server_name: str | None -gradio_server_port: int | None -gradio_max_new_tokens: int | None -gradio_temperature: float | None - -use_ray: bool = False -ray_run_name: str | None -ray_num_workers: int = 1 -resources_per_worker: dict - -# The size of the image to resize to. It can be an integer (resized into padded-square -# image) or a tuple (width, height).If not provided, we will attempt to load from -# preprocessor.size, otherwise, images won't be resized. -image_size: int | tuple[int, int] | None -# The resampling algorithm to use for image resizing. Default is bilinear. Please refer -# to PIL.Image.Resampling for more details. 
-image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None - -# optional overrides to the base model configuration -overrides_of_model_config: dict[str, Any] | None -# optional overrides the base model loading from_pretrained -overrides_of_model_kwargs: dict[str, Any] | None -# If you want to specify the type of model to load, AutoModelForCausalLM is a good -# choice too -type_of_model: str | None -# You can specify to choose a specific model revision from huggingface hub -revision_of_model: str | None - -max_packed_sequence_len: int | None -rope_scaling: Any | None -noisy_embedding_alpha: float | None -dpo_beta: float | None -evaluation_strategy: str | None -eval_table_size: int | None -eval_max_new_tokens: int | None -dpo_use_logits_to_keep: bool | None -dpo_generate_during_eval: bool | None +# Method to use for LoRA merging. 'memory_efficient' (default) processes shards +# individually to reduce memory usage, 'legacy' loads the full model into memory. +merge_method: Literal['legacy', 'memory_efficient'] | None = memory_efficient + +# Whether to use ReLoRA. Use with jagged_restart_*steps options. +relora: bool | None +# threshold for optimizer magnitude when pruning +relora_prune_ratio: float | None +# True to perform lora weight merges on cpu during restarts, for modest gpu memory +# savings +relora_cpu_offload: bool | None + +# how often to reset for jagged restarts +jagged_restart_steps: int | None +# how many warmup steps to take after reset for jagged restarts +jagged_restart_warmup_steps: int | None +# how many anneal steps to take before reset for jagged restarts +jagged_restart_anneal_steps: int | None + +# If greater than 1, backpropagation will be skipped and the gradients will be +# accumulated for the given number of steps. +gradient_accumulation_steps: int | None = 1 +# The number of samples to include in each batch. This is the number of samples sent to +# each GPU. 
Batch size per gpu = micro_batch_size * gradient_accumulation_steps +micro_batch_size: int | None = 1 +# Total batch size, we do not recommended setting this manually +batch_size: int | None +# per gpu micro batch size for evals, defaults to value of micro_batch_size +eval_batch_size: int | None + +# whether to find batch size that fits in memory. Passed to underlying transformers +# Trainer +auto_find_batch_size: bool | None + +# Whether to mask out or include the human's prompt from the training labels +train_on_inputs: bool | None = False +# Group similarly sized data to minimize padding. May be slower to start, as it must +# download and sort the entire dataset. Note that training loss may have an oscillating +# pattern with this enabled. +group_by_length: bool | None + +learning_rate: str | float (required) +embedding_lr: float | None +embedding_lr_scale: float | None +# Specify weight decay +weight_decay: float | None = 0.0 +# Specify optimizer +optimizer: OptimizerNames | CustomSupportedOptimizers | None = OptimizerNames.ADAMW_TORCH_FUSED +# Dictionary of arguments to pass to the optimizer +optim_args: str | dict[str, Any] | None +# The target modules to optimize, i.e. the module names that you would like to train, +# right now this is used only for GaLore algorithm +optim_target_modules: list[str] | Literal['all_linear'] | None +# Path to torch distx for optim 'adamw_anyprecision' +torchdistx_path: str | None +lr_scheduler: SchedulerType | Literal['one_cycle'] | Literal['rex'] | None = SchedulerType.COSINE +# Specify a scheduler and kwargs to use with the optimizer +lr_scheduler_kwargs: dict[str, Any] | None +lr_quadratic_warmup: bool | None +# decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of +# peak lr +cosine_min_lr_ratio: float | None +# freeze lr at some percentage of the step, e.g. 
cosine_constant_lr_ratio=0.8 means +# start cosine_min_lr at 80% of training step +cosine_constant_lr_ratio: float | None +# Learning rate div factor +lr_div_factor: float | None + +lr_groups: list[LrGroup] | None + # For LrGroup: + name: str (required) + modules: list[str] (required) + lr: float (required) + +# adamw hyperparams +adam_epsilon: float | None +# only used for CAME Optimizer +adam_epsilon2: float | None +# adamw hyperparams +adam_beta1: float | None +# adamw hyperparams +adam_beta2: float | None +# only used for CAME Optimizer +adam_beta3: float | None + +# Dion Optimizer learning rate +dion_lr: float | None +# Dion Optimizer momentum +dion_momentum: float | None +# Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank +# dimension. +dion_rank_fraction: float | None = 1.0 +# Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may +# be useful to ensure even sharding. +dion_rank_multiple_of: int | None = 1 + +# Gradient clipping max norm +max_grad_norm: float | None +num_epochs: float = 1.0 + +use_wandb: bool | None +# Set the name of your wandb run +wandb_name: str | None +# Set the ID of your wandb run +wandb_run_id: str | None +# "offline" to save run metadata locally and not sync to the server, "disabled" to turn +# off wandb +wandb_mode: str | None +# Your wandb project name +wandb_project: str | None +# A wandb Team name if using a Team +wandb_entity: str | None +wandb_watch: str | None +# "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only +# at the end of training +wandb_log_model: str | None + +use_mlflow: bool | None +# URI to mlflow +mlflow_tracking_uri: str | None +# Your experiment name +mlflow_experiment_name: str | None +# Your run name +mlflow_run_name: str | None +# set to true to copy each saved checkpoint on each save to mlflow artifact registry +hf_mlflow_log_artifacts: bool | None + +# Enable or disable Comet integration. 
+use_comet: bool | None +# API key for Comet. Recommended to set via `comet login`. +comet_api_key: str | None +# Workspace name in Comet. Defaults to the user's default workspace. +comet_workspace: str | None +# Project name in Comet. Defaults to Uncategorized. +comet_project_name: str | None +# Identifier for the experiment. Used to append data to an existing experiment or +# control the key of new experiments. Default to a random key. +comet_experiment_key: str | None +# Create a new experiment ("create") or log to an existing one ("get"). Default +# ("get_or_create") auto-selects based on configuration. +comet_mode: str | None +# Set to True to log data to Comet server, or False for offline storage. Default is +# True. +comet_online: bool | None +# Dictionary for additional configuration settings, see the doc for more details. +comet_experiment_config: dict[str, Any] | None + +use_trackio: bool | None +# Your trackio project name +trackio_project_name: str | None +# Set the name of your trackio run +trackio_run_name: str | None +# Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided) +trackio_space_id: str | None + +# Enable OpenTelemetry metrics collection and Prometheus export +use_otel_metrics: bool | None = False +# Host to bind the OpenTelemetry metrics server to +otel_metrics_host: str | None = localhost +# Port for the Prometheus metrics HTTP server +otel_metrics_port: int | None = 8000 + +# the number of activate layers in LISA +lisa_n_layers: int | None +# how often to switch layers in LISA +lisa_step_interval: int | None +# path under the model to access the layers +lisa_layers_attribute: str | None = model.layers + +gradio_title: str | None +gradio_share: bool | None +gradio_server_name: str | None +gradio_server_port: int | None +gradio_max_new_tokens: int | None +gradio_temperature: float | None + +use_ray: bool = False +ray_run_name: str | None +ray_num_workers: int = 1 +resources_per_worker: dict + +# The size of the 
image to resize to. It can be an integer (resized into padded-square +# image) or a tuple (width, height).If not provided, we will attempt to load from +# preprocessor.size, otherwise, images won't be resized. +image_size: int | tuple[int, int] | None +# The resampling algorithm to use for image resizing. Default is bilinear. Please refer +# to PIL.Image.Resampling for more details. +image_resize_algorithm: Literal['bilinear', 'bicubic', 'lanczos'] | Resampling | None + +# optional overrides to the base model configuration +overrides_of_model_config: dict[str, Any] | None +# optional overrides the base model loading from_pretrained +overrides_of_model_kwargs: dict[str, Any] | None +# If you want to specify the type of model to load, AutoModelForCausalLM is a good +# choice too +type_of_model: str | None +# You can specify to choose a specific model revision from huggingface hub +revision_of_model: str | None + +max_packed_sequence_len: int | None +rope_scaling: Any | None +noisy_embedding_alpha: float | None +dpo_beta: float | None +evaluation_strategy: str | None +eval_table_size: int | None +eval_max_new_tokens: int | None +dpo_use_logits_to_keep: bool | None +dpo_generate_during_eval: bool | None diff --git a/docs/custom_integrations.html b/docs/custom_integrations.html index f5d9c2208..d603aff04 100644 --- a/docs/custom_integrations.html +++ b/docs/custom_integrations.html @@ -823,6 +823,30 @@ gtag('config', 'G-9KYCVJBNMQ', { 'anonymize_ip': true});
  • Supported Models
  • Citation
  • +
  • NeMo Gym Integration for Axolotl +
  • Spectrum
  • ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    PathSpeedMulti-turnArchitecture
    Async GRPO + Data ProducerFastest (3x)YesNemoGymDataProducer replaces vLLM generation
    Standard GRPO + Data ProducerBaselineYesSame producer, no async prefetch
    Standard GRPO + /verifySimplestNoReward function calls /verify directly
    FSDP2 + /verify (2 GPU)DistributedNofsdp_version: 2
    +

    Multi-turn uses nemo_gym_multi_turn: true, which auto-enables the async trainer’s +data producer protocol. The plugin’s NemoGymDataProducer calls NeMo Gym agent /run +endpoints and returns RolloutDataset with proper importance-sampling (IS) correction, env_mask, and rewards.

    +

    All paths tested end-to-end with Qwen3-0.6B + LoRA, logged to wandb project nemo-gym-rl.

    +
    +
    +

    Quick Start

    +
    +
    +

    Prerequisites

    +
      +
    • uv package manager (for NeMo Gym’s venv)
    • +
    • Two GPUs recommended (one for vLLM server, one for training)
    • +
    +
    +
    +

    1. Set Up NeMo Gym

    +
    git clone https://github.com/NVIDIA-NeMo/Gym.git ~/Gym
    +cd ~/Gym
    +uv venv --python 3.12 && source .venv/bin/activate && uv sync
    +
    +CFLAGS="" uv pip install pycosat --python .venv/bin/python --no-build-isolation
    +
    +for dir in resources_servers/reasoning_gym resources_servers/example_single_tool_call responses_api_models/vllm_model responses_api_agents/simple_agent; do
    +    uv venv --seed --allow-existing --python 3.12 $dir/.venv
    +    CFLAGS="" uv pip install --python $dir/.venv/bin/python pycosat --no-build-isolation 2>/dev/null
    +    uv pip install --python $dir/.venv/bin/python -e . "ray[default]==2.52.1"
    +done
    +
    +uv pip install --python resources_servers/reasoning_gym/.venv/bin/python \
    +    reasoning-gym matplotlib pillow cycler contourpy kiwisolver
    +
    + +
    +

    3. Single-Turn Training (Simplest — No Agent Server Needed)

    +

    For environments that only need single-turn verify (math, coding challenges), you don’t need +an agent server. The plugin’s reward function calls /verify directly.

    +
    base_model: Qwen/Qwen2.5-0.5B-Instruct
    +rl: grpo
    +chat_template: tokenizer_default
    +
    +trl:
    +  use_vllm: true
    +  vllm_mode: colocate
    +  vllm_enable_sleep_mode: false
    +  num_generations: 8
    +  max_completion_length: 128
    +  temperature: 0.9
    +  reward_funcs:
    +    - axolotl.integrations.nemo_gym.rewards.reward_nemo_gym_verify
    +
    +plugins:
    +  - axolotl.integrations.nemo_gym.NemoGymPlugin
    +
    +nemo_gym_enabled: true
    +nemo_gym_auto_start: false
    +nemo_gym_head_port: 11000
    +nemo_gym_datasets:
    +  - path: ~/Gym/resources_servers/reasoning_gym/data/train_basic_arithmetic.jsonl
    +    server_name: reasoning_gym
    +
    +datasets:
    +  - path: ~/Gym/resources_servers/reasoning_gym/data/train_basic_arithmetic.jsonl
    +    type: chat_template
    +    field_messages: responses_create_params.input
    +    message_field_content: content
    +    message_field_role: role
    +
    +vllm:
    +  gpu_memory_utilization: 0.3
    +  max_model_len: 512
    +  tensor_parallel_size: 1
    +
    +learning_rate: 1e-5
    +micro_batch_size: 4
    +gradient_accumulation_steps: 2
    +max_steps: 50
    +output_dir: ./outputs/nemo_gym_arithmetic
    +

    Only needs ng_run with resource servers (no agent config):

    +
    cd ~/Gym && ng_run "+config_paths=[resources_servers/reasoning_gym/configs/resources_only.yaml]" "+skip_venv_if_present=true"
    +
    +
    +

    How It Works

    +
    +
    +

    Single-Turn

    +
    axolotl train → GRPO Trainer generates completions
    +  → NeMo Gym plugin reward_fn calls POST /verify on resource server
    +  → reward flows back to GRPO for advantage computation
    +
    +
    +

    Multi-Turn (Agent /run)

    +
    ┌─────────────┐     ┌──────────────┐     ┌──────────────────┐
    +│  axolotl    │     │  NeMo Gym    │────▶│  vLLM OpenAI     │
    +│  train      │────▶│  Agent /run  │◀────│  Server (GPU 0)  │
    +│  (GPU 1)    │     │              │     │  /v1/completions  │
    +└─────────────┘     └──────┬───────┘     └──────────────────┘
    +                           │
    +                           ▼
    +                    ┌──────────────┐
    +                    │  Resource    │
    +                    │  Server     │
    +                    │  (tools +   │
    +                    │   verify)   │
    +                    └─────────────┘
    +

    The agent server orchestrates the entire multi-turn loop: +1. Calls our vLLM server for model generation +2. Parses tool calls from model output +3. Executes tools against resource servers +4. Feeds tool results back to the model +5. Repeats until done, then calls /verify for reward +6. Returns token IDs + logprobs + reward to our rollout_func

    +
    +
    +

    Data Producer Architecture (Multi-Turn)

    +

    When nemo_gym_multi_turn: true, the plugin automatically forces use_data_producer: true +which selects the AxolotlAsyncGRPOTrainer. The plugin then swaps the trainer’s data +producer with NemoGymDataProducer, which:

    +
      +
    1. Gets a prompt batch from the dataset iterator
    2. +
    3. Expands by num_generations (one agent call per rollout)
    4. +
    5. Calls NeMo Gym agents via async HTTP (aiohttp requests run concurrently with asyncio.gather)
    6. +
    7. Parses responses into padded tensors (RolloutDataset)
    8. +
    9. Returns with _pending_policy_logps=True for deferred scoring
    10. +
    +

    The main thread then runs _compute_deferred_scores() which: +- Computes policy logprobs on the training model (GPU forward pass) +- Computes IS correction using agent’s sampling logprobs vs training model logprobs +- Computes advantages with group-level normalization +- All downstream features work: replay buffer, re-roll, streaming, zero-adv skip

    +

    With async_prefetch: true, the data producer runs in a background thread — giving ~3x +speedup as generation and training overlap. With async_prefetch: false, it runs +synchronously on the main thread (still uses the data producer protocol).

    +
    +
    +

    Weight Sync (LoRA Mode)

    +

    With vllm_lora_sync: true, the plugin (or async trainer) replaces NCCL-based weight +sync with filesystem + HTTP:

    +
      +
    1. accelerator.get_state_dict() gathers LoRA weights from all ranks
    2. +
    3. Rank 0 saves adapter to /tmp/lora_sync_*/vN/
    4. +
    5. Rank 0 POSTs to /set_lora_adapter/ on vLLM server
    6. +
    7. vLLM loads adapter natively via Punica kernels
    8. +
    9. Only ~40MB transferred (vs multiple GBs for full model weights)
    10. +
    +
    +
    +

    Multi-Environment Support

    +

    Datasets support per-row environment routing via agent_ref:

    +
    {"agent_ref": {"name": "reasoning_gym"}, "responses_create_params": {...}}
    +{"agent_ref": {"name": "instruction_following"}, "responses_create_params": {...}}
    +

    Or use the simpler per-dataset routing:

    +
    nemo_gym_datasets:
    +  - path: reasoning_data.jsonl
    +    server_name: reasoning_gym
    +  - path: tool_data.jsonl
    +    server_name: example_single_tool_call
    +
    +
    +

    Configuration Reference

    + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    ParameterTypeDefaultDescription
    nemo_gym_enabledboolnullEnable the NeMo Gym integration
    nemo_gym_dirstr~/GymPath to NeMo Gym repo
    nemo_gym_auto_clonebooltrueAuto-clone NeMo Gym repo if missing
    nemo_gym_auto_startbooltrueAuto-start resource servers
    nemo_gym_config_pathslist[str]Server config YAMLs (relative to gym_dir)
    nemo_gym_datasetslist[dict]requiredDataset configs with path and optional server_name
    nemo_gym_head_portint11000Head server port
    nemo_gym_server_timeoutint360Server startup timeout (seconds)
    nemo_gym_verify_timeoutint30Per-request timeout (seconds)
    nemo_gym_multi_turnboolfalseEnable multi-turn via agent /run
    +
    +
    +

    Dataset JSONL Format

    +

    Each line must have responses_create_params with input messages:

    +
    {
    +  "responses_create_params": {
    +    "input": [{"role": "user", "content": "What's the weather in SF?"}],
    +    "tools": [{"name": "get_weather", "type": "function", "strict": true, "parameters": {...}}]
    +  }
    +}
    +

    For multi-turn agent routing, include agent_ref:

    +
    {"agent_ref": {"name": "my_agent"}, "responses_create_params": {...}}
    +

    Note: Tool definitions MUST include "strict": true and "additionalProperties": false for NeMo Gym agent compatibility.

    +
    +
    +

    Reward Functions

    +

    The plugin provides two built-in reward functions — no user code needed:

    +
    trl:
    +  reward_funcs:
    +    # Multi-turn (nemo_gym_multi_turn: true):
    +    # Passthrough — agent /run already computed the reward
    +    - axolotl.integrations.nemo_gym.rewards.reward_env
    +
    +    # Single-turn (nemo_gym_multi_turn: false):
    +    # Calls /verify endpoints on NeMo Gym resource servers
    +    - axolotl.integrations.nemo_gym.rewards.reward_nemo_gym_verify
    +

    Both are also importable from Python:

    +
    from axolotl.integrations.nemo_gym import reward_env, reward_nemo_gym_verify
    +
    +
    +

    Known Issues / Troubleshooting

    +
    +
    +

    NeMo Gym Server Setup

    +
      +
    • pycosat build failure: CFLAGS="" uv pip install pycosat --no-build-isolation
    • +
    • Ray version mismatch: Pin ray[default]==2.52.1 in all server venvs
    • +
    • Pre-build venvs: ng_run creates per-server venvs via Ray. Pre-build them and use +skip_venv_if_present=true
    • +
    • Tool strict field required: The agent server’s validation requires tool definitions to include strict: true
    • +
    +
    +
    +

    vLLM / Weight Sync

    +
      +
    • Start vLLM with LoRA + tool calling + runtime loading:

      +
      VLLM_ALLOW_RUNTIME_LORA_UPDATING=1 \
      +CUDA_VISIBLE_DEVICES=0 python -m vllm.entrypoints.openai.api_server \
      +  --model Qwen/Qwen3-4B-Instruct-2507 \
      +  --max-model-len 4096 \
      +  --gpu-memory-utilization 0.7 \
      +  --enable-lora --max-lora-rank 64 \
      +  --enable-auto-tool-choice --tool-call-parser hermes
    • +
    • VLLM_ALLOW_RUNTIME_LORA_UPDATING=1: Required for vllm_lora_sync: true. Without it, vLLM won’t expose the /v1/load_lora_adapter endpoint and weight sync will fail silently. The plugin warns if this endpoint is missing.

    • +
    • --enable-lora: Enables LoRA adapter support in vLLM

    • +
    • --enable-auto-tool-choice --tool-call-parser hermes: Required for Qwen3 tool calling

    • +
    • max_model_len must be > max_completion_length: Leave room for prompt tokens (~200). If equal, the NeMo Gym model proxy gets a 400 error and returns empty completions.

    • +
    • CUDA_HOME required: DeepSpeed import needs it for the nvcc shim

    • +
    • NCCL weight sync broken with vLLM 0.17: Use vllm_lora_sync: true (filesystem + HTTP via /v1/load_lora_adapter)

    • +
    +
    +
    +

    Multi-Turn

    +
      +
    • Agent server required: Multi-turn delegates to NeMo Gym’s agent server /run endpoint. Without an agent, the plugin falls back to single-turn /verify
    • +
    • Model server proxy: NeMo Gym needs a responses_api_models server that proxies to your vLLM. See the agent config example above
    • +
    +
    +
    +

    FSDP2

    +
      +
    • Validated on 2 GPUs with single-turn + LoRA
    • +
    • Async field filtering: The builder automatically filters async-only config fields when using the standard GRPO trainer
    • +
    +
    +
    +

    Comparison with Other Integrations

    + ++++++ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
    FeatureAxolotl + NeMo GymUnsloth + NeMo GymNeMo RL (native)
    Server managementAutomaticManual (notebook)Built-in
    Multi-environmentPer-row routingManual codeYAML config
    Multi-turn / tool useAgent /run delegationNoAgent /run (Ray)
    Async GRPO (3x speedup)YesNoYes
    LoRA syncFilesystem + HTTPN/ANCCL
    Multi-GPU (FSDP2)YesNoYes (Ray)
    Config-drivenYesNo (code)Yes
    +

    Please see reference here

    +
    +

    Spectrum

    by Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar

    @@ -1547,23 +2102,23 @@ By identifying the top n% of layers with the highest SNR, you can optimize train

    Usage

    -
    plugins:
    -  - axolotl.integrations.spectrum.SpectrumPlugin
    -
    -spectrum_top_fraction: 0.5
    -spectrum_model_name: meta-llama/Meta-Llama-3.1-8B
    +
    plugins:
    +  - axolotl.integrations.spectrum.SpectrumPlugin
    +
    +spectrum_top_fraction: 0.5
    +spectrum_model_name: meta-llama/Meta-Llama-3.1-8B

    Citation

    -
    @misc{hartford2024spectrumtargetedtrainingsignal,
    -      title={Spectrum: Targeted Training on Signal to Noise Ratio},
    -      author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},
    -      year={2024},
    -      eprint={2406.06623},
    -      archivePrefix={arXiv},
    -      primaryClass={cs.LG},
    -      url={https://arxiv.org/abs/2406.06623},
    -}
    +
    @misc{hartford2024spectrumtargetedtrainingsignal,
    +      title={Spectrum: Targeted Training on Signal to Noise Ratio},
    +      author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},
    +      year={2024},
    +      eprint={2406.06623},
    +      archivePrefix={arXiv},
    +      primaryClass={cs.LG},
    +      url={https://arxiv.org/abs/2406.06623},
    +}

    Please see reference here

    @@ -1587,10 +2142,10 @@ By identifying the top n% of layers with the highest SNR, you can optimize train

    Installation

    -
    pip install swanlab
    +
    pip install swanlab
    -
    -

    Quick Start

    +
    +

    Quick Start

    1. Register for SwanLab (Optional for cloud mode)

    @@ -1599,23 +2154,23 @@ By identifying the top n% of layers with the highest SNR, you can optimize train

    2. Configure Axolotl Config File

    Add SwanLab configuration to your Axolotl YAML config:

    -
    plugins:
    -  - axolotl.integrations.swanlab.SwanLabPlugin
    -
    -use_swanlab: true
    -swanlab_project: my-llm-project
    -swanlab_experiment_name: qwen-finetune-v1
    -swanlab_mode: cloud  # Options: cloud, local, offline, disabled
    -swanlab_workspace: my-team  # Optional: organization name
    -swanlab_api_key: YOUR_API_KEY  # Optional: can also use env var SWANLAB_API_KEY
    +
    plugins:
    +  - axolotl.integrations.swanlab.SwanLabPlugin
    +
    +use_swanlab: true
    +swanlab_project: my-llm-project
    +swanlab_experiment_name: qwen-finetune-v1
    +swanlab_mode: cloud  # Options: cloud, local, offline, disabled
    +swanlab_workspace: my-team  # Optional: organization name
    +swanlab_api_key: YOUR_API_KEY  # Optional: can also use env var SWANLAB_API_KEY

    3. Run Training

    -
    export SWANLAB_API_KEY=your-api-key-here
    -
    -swanlab login
    -
    -accelerate launch -m axolotl.cli.train your-config.yaml
    +
    export SWANLAB_API_KEY=your-api-key-here
    +
    +swanlab login
    +
    +accelerate launch -m axolotl.cli.train your-config.yaml

    Configuration Options

    @@ -1757,46 +2312,46 @@ By identifying the top n% of layers with the highest SNR, you can optimize train

    Example 1: Basic Cloud Sync

    -
    plugins:
    -  - axolotl.integrations.swanlab.SwanLabPlugin
    -
    -use_swanlab: true
    -swanlab_project: llama-finetune
    -swanlab_experiment_name: llama-3-8b-instruct-v1
    -swanlab_mode: cloud
    +
    plugins:
    +  - axolotl.integrations.swanlab.SwanLabPlugin
    +
    +use_swanlab: true
    +swanlab_project: llama-finetune
    +swanlab_experiment_name: llama-3-8b-instruct-v1
    +swanlab_mode: cloud

    Example 2: Offline/Local Mode

    -
    plugins:
    -  - axolotl.integrations.swanlab.SwanLabPlugin
    -
    -use_swanlab: true
    -swanlab_project: local-experiments
    -swanlab_experiment_name: test-run-1
    -swanlab_mode: local  # or 'offline'
    +
    plugins:
    +  - axolotl.integrations.swanlab.SwanLabPlugin
    +
    +use_swanlab: true
    +swanlab_project: local-experiments
    +swanlab_experiment_name: test-run-1
    +swanlab_mode: local  # or 'offline'

    Example 3: Team Workspace

    -
    plugins:
    -  - axolotl.integrations.swanlab.SwanLabPlugin
    -
    -use_swanlab: true
    -swanlab_project: research-project
    -swanlab_experiment_name: experiment-42
    -swanlab_workspace: my-research-team
    -swanlab_mode: cloud
    +
    plugins:
    +  - axolotl.integrations.swanlab.SwanLabPlugin
    +
    +use_swanlab: true
    +swanlab_project: research-project
    +swanlab_experiment_name: experiment-42
    +swanlab_workspace: my-research-team
    +swanlab_mode: cloud

    Example 4: Private Deployment

    -
    plugins:
    -  - axolotl.integrations.swanlab.SwanLabPlugin
    -
    -use_swanlab: true
    -swanlab_project: internal-project
    -swanlab_experiment_name: secure-training
    -swanlab_mode: cloud
    -swanlab_web_host: https://swanlab.yourcompany.com
    -swanlab_api_host: https://api.swanlab.yourcompany.com
    +
    plugins:
    +  - axolotl.integrations.swanlab.SwanLabPlugin
    +
    +use_swanlab: true
    +swanlab_project: internal-project
    +swanlab_experiment_name: secure-training
    +swanlab_mode: cloud
    +swanlab_web_host: https://swanlab.yourcompany.com
    +swanlab_api_host: https://api.swanlab.yourcompany.com

    Team Notifications with Lark (Feishu)

    @@ -1805,8 +2360,8 @@ By identifying the top n% of layers with the highest SNR, you can optimize train
    - Team collaboration: Keep your ML team informed about long-running experiments
    - Multi-timezone teams: Team members can check training progress without being online

    -
    -

    Prerequisites

    +
    +

    Prerequisites

    1. Lark Bot Setup: Create a custom bot in your Lark group chat
    2. Webhook URL: Get the webhook URL from your Lark bot settings
    3. @@ -1817,30 +2372,30 @@ By identifying the top n% of layers with the highest SNR, you can optimize train

      Example 5: Basic Lark Notifications

      Send training notifications to a Lark group chat:

      -
      plugins:
      -  - axolotl.integrations.swanlab.SwanLabPlugin
      -
      -use_swanlab: true
      -swanlab_project: production-training
      -swanlab_experiment_name: llama-3-finetune-v2
      -swanlab_mode: cloud
      -
      -swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
      +
      plugins:
      +  - axolotl.integrations.swanlab.SwanLabPlugin
      +
      +use_swanlab: true
      +swanlab_project: production-training
      +swanlab_experiment_name: llama-3-finetune-v2
      +swanlab_mode: cloud
      +
      +swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx

      Note: This configuration will work, but you’ll see a security warning recommending HMAC secret configuration.