chore: update title

restore dockerfile
fix: trim allowed cuda versions
2025-04-26 16:21:31 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00 · 2025-04-26 16:21:30 -04:00
30 changed files with 1579 additions and 703 deletions
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -24,7 +24,7 @@ jobs:
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.1
-            axolotl_extras:
+            axolotl_extras: vllm
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
--- a/.github/workflows/multi-gpu-e2e.yml
+++ b/.github/workflows/multi-gpu-e2e.yml
@@ -43,7 +43,7 @@ jobs:
            cuda_version: 12.4.1
            python_version: "3.11"
            pytorch: 2.5.1
-            axolotl_extras:
+            axolotl_extras: vllm
            num_gpus: 2
            nightly_build: "true"
          - cuda: 126
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -258,12 +258,6 @@ jobs:
      fail-fast: false
      matrix:
        include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.6.0
-            num_gpus: 1
-            axolotl_extras: llmcompressor
          - cuda: 124
            cuda_version: 12.4.1
            python_version: "3.11"
@@ -275,7 +269,7 @@ jobs:
            python_version: "3.11"
            pytorch: 2.5.1
            num_gpus: 1
-            axolotl_extras:
+            axolotl_extras: vllm
          - cuda: 126
            cuda_version: 12.6.3
            python_version: "3.11"
--- a/.runpod/.gitignore
+++ b/.runpod/.gitignore
@@ -0,0 +1,161 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+pod/scripts/config.yaml
--- a/.runpod/Dockerfile
+++ b/.runpod/Dockerfile
@@ -0,0 +1,18 @@
+FROM runpod/pytorch:3.10-2.0.0-117
+
+COPY .runpod/requirements.txt /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip && \
+    python3 -m pip install --upgrade -r /requirements.txt
+
+
+# Environment settings
+ARG BASE_VOLUME="/runpod-volume"
+ENV BASE_VOLUME=$BASE_VOLUME
+ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
+
+COPY .runpod/src /src
+
+CMD ["python3", "/src/handler.py"]
--- a/.runpod/README.md
+++ b/.runpod/README.md
@@ -0,0 +1,335 @@
+<h1>LLM Post-Training: Full Fine-Tune, LoRA, QLoRA, etc. for Llama/Mistral/Gemma and more</h1>
+
+# Configuration Options
+
+This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
+
+## Usage
+
+You can provide these configuration options:
+
+1. As a JSON request body:
+
+```json
+{
+  "input": {
+    "user_id": "user",
+    "model_id": "model-name",
+    "run_id": "run-id",
+    "credentials": {
+      "wandb_api_key": "", // Add your Weights & Biases API key. TODO: this will be configurable via environment variables.
+      "hf_token": "" // Add your Hugging Face token. TODO: this will be configurable via environment variables.
+    },
+    "args": {
+      "base_model": "NousResearch/Llama-3.2-1B",
+      // ... other options
+    }
+  }
+}
+```
+
+## Configuration Options
+
+### Model Configuration
+
+| Option              | Description                                                                                   | Default              |
+| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
+| `base_model`        | Path to the base model (local or HuggingFace)                                                 | Required             |
+| `base_model_config` | Configuration path for the base model                                                         | Same as base_model   |
+| `revision_of_model` | Specific model revision from HuggingFace hub                                                  | Latest               |
+| `tokenizer_config`  | Custom tokenizer configuration path                                                           | Optional             |
+| `model_type`        | Type of model to load                                                                         | AutoModelForCausalLM |
+| `tokenizer_type`    | Type of tokenizer to use                                                                      | AutoTokenizer        |
+| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |
+
+## Model Family Identification
+
+| Option                     | Default | Description                    |
+| -------------------------- | ------- | ------------------------------ |
+| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
+| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
+| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
+| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
+
+## Model Configuration Overrides
+
+| Option                                          | Default    | Description                        |
+| ----------------------------------------------- | ---------- | ---------------------------------- |
+| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
+| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |
+
+### Model Loading Options
+
+| Option         | Description                   | Default |
+| -------------- | ----------------------------- | ------- |
+| `load_in_8bit` | Load model in 8-bit precision | false   |
+| `load_in_4bit` | Load model in 4-bit precision | false   |
+| `bf16`         | Use bfloat16 precision        | false   |
+| `fp16`         | Use float16 precision         | false   |
+| `tf32`         | Use tensor float 32 precision | false   |
+
+## Memory and Device Settings
+
+| Option             | Default   | Description             |
+| ------------------ | --------- | ----------------------- |
+| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
+| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
+| `device_map`       | `"auto"`  | Device mapping strategy |
+| `max_memory`       | `null`    | Max memory per device   |
+
+## Training Hyperparameters
+
+| Option                        | Default   | Description                 |
+| ----------------------------- | --------- | --------------------------- |
+| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
+| `micro_batch_size`            | `2`       | Batch size per GPU          |
+| `eval_batch_size`             | `null`    | Evaluation batch size       |
+| `num_epochs`                  | `4`       | Number of training epochs   |
+| `warmup_steps`                | `100`     | Warmup steps                |
+| `warmup_ratio`                | `0.05`    | Warmup ratio                |
+| `learning_rate`               | `0.00003` | Learning rate               |
+| `lr_quadratic_warmup`         | `false`   | Quadratic warmup            |
+| `logging_steps`               | `null`    | Logging frequency           |
+| `eval_steps`                  | `null`    | Evaluation frequency        |
+| `evals_per_epoch`             | `null`    | Evaluations per epoch       |
+| `save_strategy`               | `"epoch"` | Checkpoint saving strategy  |
+| `save_steps`                  | `null`    | Saving frequency            |
+| `saves_per_epoch`             | `null`    | Saves per epoch             |
+| `save_total_limit`            | `null`    | Maximum checkpoints to keep |
+| `max_steps`                   | `null`    | Maximum training steps      |
+
+### Dataset Configuration
+
+```yaml
+datasets:
+  - path: vicgalle/alpaca-gpt4 # HuggingFace dataset (TODO: local paths will be supported in the future)
+    type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
+    ds_type: json # Dataset type
+    data_files: path/to/data # Source data files
+    train_on_split: train # Dataset split to use
+```
+
+## Chat Template Settings
+
+| Option                   | Default                          | Description            |
+| ------------------------ | -------------------------------- | ---------------------- |
+| `chat_template`          | `"tokenizer_default"`            | Chat template type     |
+| `chat_template_jinja`    | `null`                           | Custom Jinja template  |
+| `default_system_message` | `"You are a helpful assistant."` | Default system message |
+
+## Dataset Processing
+
+| Option                        | Default                    | Description                       |
+| ----------------------------- | -------------------------- | --------------------------------- |
+| `dataset_prepared_path`       | `"data/last_run_prepared"` | Path for prepared dataset         |
+| `push_dataset_to_hub`         | `""`                       | Push dataset to HF hub            |
+| `dataset_processes`           | `4`                        | Number of preprocessing processes |
+| `dataset_keep_in_memory`      | `false`                    | Keep dataset in memory            |
+| `shuffle_merged_datasets`     | `true`                     | Shuffle merged datasets           |
+| `dataset_exact_deduplication` | `true`                     | Deduplicate datasets              |
+
+## LoRA Configuration
+
+| Option                     | Default                | Description                    |
+| -------------------------- | ---------------------- | ------------------------------ |
+| `adapter`                  | `"lora"`               | Adapter type (lora/qlora)      |
+| `lora_model_dir`           | `""`                   | Directory with pretrained LoRA |
+| `lora_r`                   | `8`                    | LoRA attention dimension       |
+| `lora_alpha`               | `16`                   | LoRA alpha parameter           |
+| `lora_dropout`             | `0.05`                 | LoRA dropout                   |
+| `lora_target_modules`      | `["q_proj", "v_proj"]` | Modules to apply LoRA          |
+| `lora_target_linear`       | `false`                | Target all linear modules      |
+| `peft_layers_to_transform` | `[]`                   | Layers to transform            |
+| `lora_modules_to_save`     | `[]`                   | Modules to save                |
+| `lora_fan_in_fan_out`      | `false`                | Fan in/out structure           |
+
+## Optimization Settings
+
+| Option                    | Default | Description                |
+| ------------------------- | ------- | -------------------------- |
+| `train_on_inputs`         | `false` | Train on input prompts     |
+| `group_by_length`         | `false` | Group by sequence length   |
+| `gradient_checkpointing`  | `false` | Use gradient checkpointing |
+| `early_stopping_patience` | `3`     | Early stopping patience    |
+
+## Learning Rate Scheduling
+
+| Option                     | Default    | Description          |
+| -------------------------- | ---------- | -------------------- |
+| `lr_scheduler`             | `"cosine"` | Scheduler type       |
+| `lr_scheduler_kwargs`      | `{}`       | Scheduler parameters |
+| `cosine_min_lr_ratio`      | `null`     | Minimum LR ratio     |
+| `cosine_constant_lr_ratio` | `null`     | Constant LR ratio    |
+| `lr_div_factor`            | `null`     | LR division factor   |
+
+## Optimizer Settings
+
+| Option                 | Default      | Description         |
+| ---------------------- | ------------ | ------------------- |
+| `optimizer`            | `"adamw_hf"` | Optimizer choice    |
+| `optim_args`           | `{}`         | Optimizer arguments |
+| `optim_target_modules` | `[]`         | Target modules      |
+| `weight_decay`         | `null`       | Weight decay        |
+| `adam_beta1`           | `null`       | Adam beta1          |
+| `adam_beta2`           | `null`       | Adam beta2          |
+| `adam_epsilon`         | `null`       | Adam epsilon        |
+| `max_grad_norm`        | `null`       | Gradient clipping   |
+
+## Attention Implementations
+
+| Option                     | Default | Description                   |
+| -------------------------- | ------- | ----------------------------- |
+| `flash_optimum`            | `false` | Use better transformers       |
+| `xformers_attention`       | `false` | Use xformers                  |
+| `flash_attention`          | `false` | Use flash attention           |
+| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
+| `flash_attn_rms_norm`      | `false` | Flash attention RMS norm      |
+| `flash_attn_fuse_qkv`      | `false` | Fuse QKV operations           |
+| `flash_attn_fuse_mlp`      | `false` | Fuse MLP operations           |
+| `sdp_attention`            | `false` | Use scaled dot product        |
+| `s2_attention`             | `false` | Use shifted sparse attention  |
+
+## Tokenizer Modifications
+
+| Option           | Default | Description                  |
+| ---------------- | ------- | ---------------------------- |
+| `special_tokens` | -       | Special tokens to add/modify |
+| `tokens`         | `[]`    | Additional tokens            |
+
+## Distributed Training
+
+| Option                  | Default | Description           |
+| ----------------------- | ------- | --------------------- |
+| `fsdp`                  | `null`  | FSDP configuration    |
+| `fsdp_config`           | `null`  | FSDP config options   |
+| `deepspeed`             | `null`  | Deepspeed config path |
+| `ddp_timeout`           | `null`  | DDP timeout           |
+| `ddp_bucket_cap_mb`     | `null`  | DDP bucket capacity   |
+| `ddp_broadcast_buffers` | `null`  | DDP broadcast buffers |
+
+<details>
+<summary><h3>Example Configuration Request:</h3></summary>
+
+Here's a complete example for fine-tuning a LLaMA model using LoRA:
+
+```json
+{
+  "input": {
+    "user_id": "user",
+    "model_id": "llama-test",
+    "run_id": "test-run",
+    "credentials": {
+      "wandb_api_key": "",
+      "hf_token": ""
+    },
+    "args": {
+      "base_model": "NousResearch/Llama-3.2-1B",
+      "load_in_8bit": false,
+      "load_in_4bit": false,
+      "strict": false,
+      "datasets": [
+        {
+          "path": "teknium/GPT4-LLM-Cleaned",
+          "type": "alpaca"
+        }
+      ],
+      "dataset_prepared_path": "last_run_prepared",
+      "val_set_size": 0.1,
+      "output_dir": "./outputs/lora-out",
+      "adapter": "lora",
+      "sequence_len": 2048,
+      "sample_packing": true,
+      "eval_sample_packing": true,
+      "pad_to_sequence_len": true,
+      "lora_r": 16,
+      "lora_alpha": 32,
+      "lora_dropout": 0.05,
+      "lora_target_modules": [
+        "gate_proj",
+        "down_proj",
+        "up_proj",
+        "q_proj",
+        "v_proj",
+        "k_proj",
+        "o_proj"
+      ],
+      "gradient_accumulation_steps": 2,
+      "micro_batch_size": 2,
+      "num_epochs": 1,
+      "optimizer": "adamw_8bit",
+      "lr_scheduler": "cosine",
+      "learning_rate": 0.0002,
+      "train_on_inputs": false,
+      "group_by_length": false,
+      "bf16": "auto",
+      "tf32": false,
+      "gradient_checkpointing": true,
+      "logging_steps": 1,
+      "flash_attention": true,
+      "loss_watchdog_threshold": 5,
+      "loss_watchdog_patience": 3,
+      "warmup_steps": 10,
+      "evals_per_epoch": 4,
+      "saves_per_epoch": 1,
+      "weight_decay": 0,
+      "hub_model_id": "runpod/llama-fr-lora",
+      "wandb_name": "test-run-1",
+      "wandb_project": "test-run-1",
+      "wandb_entity": "axo-test",
+      "special_tokens": {
+        "pad_token": "<|end_of_text|>"
+      }
+    }
+  }
+}
+```
+
+</details>
+
+### Advanced Features
+
+#### Wandb Integration
+
+- `wandb_project`: Project name for Weights & Biases
+- `wandb_entity`: Team name in W&B
+- `wandb_watch`: Monitor model with W&B
+- `wandb_name`: Name of the W&B run
+- `wandb_run_id`: ID for the W&B run
+
+#### Performance Optimization
+
+- `sample_packing`: Enable efficient sequence packing
+- `eval_sample_packing`: Use sequence packing during evaluation
+- `torch_compile`: Enable PyTorch 2.0 compilation
+- `flash_attention`: Use Flash Attention implementation
+- `xformers_attention`: Use xFormers attention implementation
+
+### Available Optimizers
+
+The following optimizers are supported:
+
+- `adamw_hf`: HuggingFace's AdamW implementation
+- `adamw_torch`: PyTorch's AdamW
+- `adamw_torch_fused`: Fused AdamW implementation
+- `adamw_torch_xla`: XLA-optimized AdamW
+- `adamw_apex_fused`: NVIDIA Apex fused AdamW
+- `adafactor`: Adafactor optimizer
+- `adamw_anyprecision`: Anyprecision AdamW
+- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
+- `lion_8bit`: 8-bit Lion optimizer
+- `lion_32bit`: 32-bit Lion optimizer
+- `sgd`: Stochastic Gradient Descent
+- `adagrad`: Adagrad optimizer
+
+## Notes
+
+- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
+- Enable `flash_attention: true` for faster training on modern GPUs
+- Use `gradient_checkpointing: true` to reduce memory usage
+- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
+
+For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).
+
+### Errors:
+
+- If you face any issues with Flash Attention 2, delete your worker and restart it.
--- a/.runpod/hub.json
+++ b/.runpod/hub.json
@@ -0,0 +1,93 @@
+{
+  "title": "Axolotl Fine-Tuning",
+  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
+  "type": "serverless",
+  "category": "language",
+  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
+  "config": {
+    "runsOn": "GPU",
+    "containerDiskInGb": 200,
+    "gpuCount": 1,
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ],
+    "presets": [],
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "input": {
+          "name": "Tokenizer",
+          "type": "string",
+          "description": "Name or path of the Hugging Face tokenizer to use.",
+          "default": "",
+          "advanced": true
+        }
+      },
+      {
+        "key": "MAX_NUM_SEQS",
+        "input": {
+          "name": "Max Num Seqs",
+          "type": "number",
+          "description": "Maximum number of sequences per iteration.",
+          "default": 256,
+          "advanced": true
+        }
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "input": {
+          "name": "Disable Log Stats",
+          "type": "boolean",
+          "description": "Disable logging statistics.",
+          "default": false,
+          "trueValue": "true",
+          "falseValue": "false"
+        }
+      },
+      {
+        "key": "LOAD_FORMAT",
+        "input": {
+          "name": "Load Format",
+          "type": "string",
+          "description": "The format of the model weights to load.",
+          "default": "auto",
+          "options": [
+            {
+              "label": "auto",
+              "value": "auto"
+            },
+            {
+              "label": "pt",
+              "value": "pt"
+            },
+            {
+              "label": "safetensors",
+              "value": "safetensors"
+            },
+            {
+              "label": "npcache",
+              "value": "npcache"
+            },
+            {
+              "label": "dummy",
+              "value": "dummy"
+            },
+            {
+              "label": "tensorizer",
+              "value": "tensorizer"
+            },
+            {
+              "label": "bitsandbytes",
+              "value": "bitsandbytes"
+            }
+          ],
+          "advanced": true
+        }
+      }
+    ]
+  }
+}
--- a/.runpod/requirements.txt
+++ b/.runpod/requirements.txt
@@ -0,0 +1,15 @@
+# Required Python packages get listed here, one per line.
+# It is recommended to lock the version numbers to avoid unexpected changes.
+
+# You can also install packages from a git repository, e.g.:
+# git+https://github.com/runpod/runpod-python.git
+# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
+runpod~=1.7.0
+huggingface_hub
+typing-extensions
+pydantic
+pydantic-settings
+hf-transfer
+setuptools
+numpy==2.0.0
+axolotl[flash-attn,deepspeed]
--- a/.runpod/src/config/config.yaml
+++ b/.runpod/src/config/config.yaml
@@ -0,0 +1,577 @@
+# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# # This can also be a relative path to a model on disk
+# base_model: ./llama-7b-hf
+# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+# base_model_ignore_patterns:
+# # If the base_model repo on hf hub doesn't include configuration .json files,
+# # You can set that here, or leave this empty to default to base_model
+# base_model_config: ./llama-7b-hf
+# # You can specify to choose a specific model revision from huggingface hub
+# model_revision:
+# # Optional tokenizer configuration override in case you want to use a different tokenizer
+# # than the one defined in the base model
+# tokenizer_config:
+# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
+# model_type: AutoModelForCausalLM
+# # Corresponding tokenizer for the model AutoTokenizer is a good choice
+# tokenizer_type: AutoTokenizer
+# # Trust remote code for untrusted source
+# trust_remote_code:
+# # use_fast option for tokenizer loading from_pretrained, default to True
+# tokenizer_use_fast:
+# # Whether to use the legacy tokenizer setting, defaults to True
+# tokenizer_legacy:
+# # Resize the model embeddings when new tokens are added to multiples of 32
+# # This is reported to improve training speed on some models
+# resize_token_embeddings_to_32x:
+
+# # Used to identify which architecture the model is based on
+# is_falcon_derived_model:
+# is_llama_derived_model:
+# # Please note that if you set this to true, `padding_side` will be set to "left" by default
+# is_mistral_derived_model:
+# is_qwen_derived_model:
+
+# # optional overrides to the base model configuration
+# model_config:
+#   # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+#   rope_scaling:
+#     type: # linear | dynamic
+#     factor: # float
+
+
+# # Whether you are training a 4-bit GPTQ quantized model
+# gptq: true
+# gptq_groupsize: 128 # group size
+# gptq_model_v1: false # v1 or v2
+
+# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+# load_in_8bit: true
+# # Use bitsandbytes 4 bit
+# load_in_4bit:
+
+# # Use CUDA bf16
+# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
+# # Use CUDA fp16
+# fp16: true
+# # Use CUDA tf32
+# tf32: true # require >=ampere
+
+# # No AMP (automatic mixed precision)
+# bfloat16: true # require >=ampere
+# float16: true
+
+# # A list of one or more datasets to finetune the model with
+# datasets:
+#   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+#   - path: vicgalle/alpaca-gpt4
+#   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+#     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
+#     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+#     data_files: # Optional[str] path to source data files
+#     shards: # Optional[int] number of shards to split data into
+#     name: # Optional[str] name of dataset configuration to load
+#     train_on_split: train # Optional[str] name of dataset split to load from
+
+#     # Optional[str] fastchat conversation type, only used with type: sharegpt
+#     conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+#     field_human: # Optional[str]. Human key to use for conversation.
+#     field_model: # Optional[str]. Assistant key to use for conversation.
+
+#   # Custom user prompt
+#   - path: repo
+#     type:
+#       # The below are defaults. only set what's needed.
+#       system_prompt: ""
+#       system_format: "{system}"
+#       field_system: system
+#       field_instruction: instruction
+#       field_input: input
+#       field_output: output
+
+#       # Customizable to be single line or multi-line
+#       # 'format' can include {input}
+#       format: |-
+#         User: {instruction} {input}
+#         Assistant:
+#       # 'no_input_format' cannot include {input}
+#       no_input_format: "{instruction} "
+
+#       # For `completion` datasets only, uses the provided field instead of `text` column
+#       field:
+
+# # Axolotl attempts to save the dataset as an arrow after packing the data together so
+# # subsequent training attempts load faster, relative path
+# dataset_prepared_path: data/last_run_prepared
+# # Push prepared dataset to hub
+# push_dataset_to_hub: # repo path
+# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
+# # if not set.
+# dataset_processes: # defaults to os.cpu_count() if not set
+# # push checkpoints to hub
+# hub_model_id: # repo path to push finetuned model
+# # how to push checkpoints to hub
+# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
+# hub_strategy:
+# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
+# # Required to be true when used in combination with `push_dataset_to_hub`
+# hf_use_auth_token: # boolean
+# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
+# val_set_size: 0.04
+# # Num shards for whole dataset
+# dataset_shard_num:
+# # Index of shard to use for whole dataset
+# dataset_shard_idx:
+
+# # The maximum length of an input to train with, this should typically be less than 2048
+# # as most models have a token/context limit of 2048
+# sequence_len: 2048
+# # Pad inputs so each step uses constant sized buffers
+# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+# pad_to_sequence_len:
+# # Max sequence length to concatenate training samples together up to
+# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
+# # FutureWarning: This will soon be DEPRECATED
+# max_packed_sequence_len: 1024
+# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
+# sample_packing:
+# # Set to 'false' if getting errors during eval with sample_packing on.
+# eval_sample_packing:
+# # You can set these packing optimizations AFTER starting a training at least once.
+# # The trainer will provide recommended values for these values.
+# sample_packing_eff_est:
+# total_num_tokens:
+
+# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
+# adapter: lora
+# # If you already have a lora model trained that you want to load, put that here.
+# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
+# lora_model_dir:
+
+# # LoRA hyperparameters
+# # For more details about the following options, see:
+# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
+# lora_r: 8
+# lora_alpha: 16
+# lora_dropout: 0.05
+# lora_target_modules:
+#   - q_proj
+#   - v_proj
+# #  - k_proj
+# #  - o_proj
+# #  - gate_proj
+# #  - down_proj
+# #  - up_proj
+# lora_target_linear: # If true, will target all linear layers
+
+# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
+# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
+# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
+# lora_modules_to_save:
+# #  - embed_tokens
+# #  - lm_head
+
+# # Once you complete training, the model will be saved to the following directory.
+# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
+# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
+# lora_out_dir:
+# lora_fan_in_fan_out: false
+
+# # ReLoRA configuration
+# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
+# relora_steps: # Number of steps per ReLoRA restart
+# relora_warmup_steps: # Number of per-restart warmup steps
+# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
+
+# # wandb configuration if you're using it
+# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
+# wandb_project: # Your wandb project name
+# wandb_entity: # A wandb Team name if using a Team
+# wandb_watch:
+# wandb_run_id: # Set the name of your wandb run
+# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
+
+# # Where to save the full-finetuned model to
+# output_dir: ./completed-model
+
+# # Whether to use torch.compile and which backend to use
+# torch_compile:  # bool
+# torch_compile_backend:  # Optional[str]
+
+# # Training hyperparameters
+
+# # If greater than 1, the optimizer step will be skipped and the gradients will be accumulated for the given number of steps.
+# gradient_accumulation_steps: 1
+# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
+# micro_batch_size: 2
+# eval_batch_size:
+# num_epochs: 4
+# warmup_steps: 100  # cannot use with warmup_ratio
+# warmup_ratio: 0.05  # cannot use with warmup_steps
+# learning_rate: 0.00003
+# lr_quadratic_warmup:
+# logging_steps:
+# save_strategy: # Set to `no` to skip checkpoint saves
+# save_steps: # Leave empty to save at each epoch
+# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+# save_total_limit: # Checkpoints saved at a time
+# # Maximum number of iterations to train for. It precedes num_epochs which means that
+# # if both are set, num_epochs will not be guaranteed.
+# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+# max_steps:
+
+# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
+# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
+
+# # Save model as safetensors (require safetensors package)
+# save_safetensors:
+
+# # Whether to mask out or include the human's prompt from the training labels
+# train_on_inputs: false
+# # Group similarly sized data to minimize padding.
+# # May be slower to start, as it must download and sort the entire dataset.
+# # Note that training loss may have an oscillating pattern with this enabled.
+# group_by_length: false
+
+# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+# gradient_checkpointing: false
+
+# # Stop training after this many evaluation losses have increased in a row
+# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+# early_stopping_patience: 3
+
+# # Specify a scheduler and kwargs to use with the optimizer
+# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+# lr_scheduler_kwargs:
+
+# # For one_cycle optim
+# lr_div_factor: # Learning rate div factor
+
+# # For log_sweep optim
+# log_sweep_min_lr:
+# log_sweep_max_lr:
+
+# # Specify optimizer
+# # Valid values are driven by the Transformers OptimizerNames class, see:
+# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
+# #
+# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
+# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
+# # in the examples/ for your model and fine-tuning use case.
+# #
+# # Valid values for 'optimizer' include:
+# # - adamw_hf
+# # - adamw_torch
+# # - adamw_torch_fused
+# # - adamw_torch_xla
+# # - adamw_apex_fused
+# # - adafactor
+# # - adamw_anyprecision
+# # - sgd
+# # - adagrad
+# # - adamw_bnb_8bit
+# # - lion_8bit
+# # - lion_32bit
+# # - paged_adamw_32bit
+# # - paged_adamw_8bit
+# # - paged_lion_32bit
+# # - paged_lion_8bit
+# optimizer:
+# # Specify weight decay
+# weight_decay:
+# # adamw hyperparams
+# adam_beta1:
+# adam_beta2:
+# adam_epsilon:
+# # Gradient clipping max norm
+# max_grad_norm:
+
+# # Augmentation techniques
+# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
+# # currently only supported on Llama and Mistral
+# noisy_embedding_alpha:
+
+# # Whether to use BetterTransformer
+# flash_optimum:
+# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+# xformers_attention:
+# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+# flash_attention:
+# flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
+# flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
+# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# # Whether to use scaled-dot-product attention
+# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+# sdp_attention:
+# # Landmark attention (only llama)
+# landmark_attention:
+# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
+# # LLaMA only
+# xpos_rope:
+
+# # Resume from a specific checkpoint dir
+# resume_from_checkpoint:
+# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# # Be careful with this being turned on between different models.
+# auto_resume_from_checkpoints: false
+
+# # Don't mess with this, it's here for accelerate and torchrun
+# local_rank:
+
+# # Add or change special tokens.
+# # If you add tokens here, you don't need to add them to the `tokens` list.
+# special_tokens:
+#   # bos_token: "<s>"
+#   # eos_token: "</s>"
+#   # unk_token: "<unk>"
+
+# # Add extra tokens.
+# tokens:
+
+# # FSDP
+# fsdp:
+# fsdp_config:
+
+# # Deepspeed config path. e.g., deepspeed/zero3.json
+# deepspeed:
+
+# # Advanced DDP Arguments
+# ddp_timeout:
+# ddp_bucket_cap_mb:
+# ddp_broadcast_buffers:
+
+# # Path to torch distx for optim 'adamw_anyprecision'
+# torchdistx_path:
+
+# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
+# pretraining_dataset:
+
+# # Debug mode
+# debug:
+
+# # Seed
+# seed:
+
+# # Allow overwriting the yml config from the CLI
+# strict:
+
+
+
+base_model: ${BASE_MODEL}
+base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
+base_model_config: ${BASE_MODEL_CONFIG}
+revision_of_model: ${REVISION_OF_MODEL}
+tokenizer_config: ${TOKENIZER_CONFIG}
+model_type: ${MODEL_TYPE}
+tokenizer_type: ${TOKENIZER_TYPE}
+trust_remote_code: ${TRUST_REMOTE_CODE}
+tokenizer_use_fast: ${TOKENIZER_USE_FAST}
+tokenizer_legacy: ${TOKENIZER_LEGACY}
+resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
+
+is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
+is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
+is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
+is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
+
+overrides_of_model_config:
+  rope_scaling:
+    type: ${ROPE_SCALING_TYPE}
+    factor: ${ROPE_SCALING_FACTOR}
+
+bnb_config_kwargs:
+  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
+  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
+  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
+
+gptq: ${GPTQ}
+load_in_8bit: ${LOAD_IN_8BIT}
+load_in_4bit: ${LOAD_IN_4BIT}
+bf16: ${BF16}
+fp16: ${FP16}
+tf32: ${TF32}
+bfloat16: ${BFLOAT16}
+float16: ${FLOAT16}
+
+gpu_memory_limit: ${GPU_MEMORY_LIMIT}
+lora_on_cpu: ${LORA_ON_CPU}
+
+datasets:
+  - path: ${DATASET_PATH}
+    type: ${DATASET_TYPE}
+    ds_type: ${DATASET_DS_TYPE}
+    data_files: ${DATASET_DATA_FILES}
+    shards: ${DATASET_SHARDS}
+    name: ${DATASET_NAME}
+    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
+    revision: ${DATASET_REVISION}
+    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
+
+rl: ${RL}
+dpo_use_weighting: ${DPO_USE_WEIGHTING}
+
+chat_template: ${CHAT_TEMPLATE}
+chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
+default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
+dataset_prepared_path: ${DATASET_PREPARED_PATH}
+push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
+dataset_processes: ${DATASET_PROCESSES}
+dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
+hub_model_id: ${HUB_MODEL_ID}
+hub_strategy: ${HUB_STRATEGY}
+hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
+val_set_size: ${VAL_SET_SIZE}
+dataset_shard_num: ${DATASET_SHARD_NUM}
+dataset_shard_idx: ${DATASET_SHARD_IDX}
+
+sequence_len: ${SEQUENCE_LEN}
+pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
+sample_packing: ${SAMPLE_PACKING}
+eval_sample_packing: ${EVAL_SAMPLE_PACKING}
+sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
+total_num_tokens: ${TOTAL_NUM_TOKENS}
+sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
+sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
+
+batch_flattening: ${BATCH_FLATTENING}
+device_map: ${DEVICE_MAP}
+max_memory: ${MAX_MEMORY}
+
+adapter: ${ADAPTER}
+lora_model_dir: ${LORA_MODEL_DIR}
+
+lora_r: ${LORA_R}
+lora_alpha: ${LORA_ALPHA}
+lora_dropout: ${LORA_DROPOUT}
+lora_target_modules:
+  - ${LORA_TARGET_MODULES}
+lora_target_linear: ${LORA_TARGET_LINEAR}
+peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
+lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
+lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
+
+loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
+loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
+
+peft:
+  loftq_config:
+    loftq_bits: ${LOFTQ_BITS}
+
+relora_steps: ${RELORA_STEPS}
+relora_warmup_steps: ${RELORA_WARMUP_STEPS}
+relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
+relora_prune_ratio: ${RELORA_PRUNE_RATIO}
+relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
+
+wandb_mode: ${WANDB_MODE}
+wandb_project: ${WANDB_PROJECT}
+wandb_entity: ${WANDB_ENTITY}
+wandb_watch: ${WANDB_WATCH}
+wandb_name: ${WANDB_NAME}
+wandb_run_id: ${WANDB_RUN_ID}
+wandb_log_model: ${WANDB_LOG_MODEL}
+
+mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
+mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
+mlflow_run_name: ${MLFLOW_RUN_NAME}
+hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
+
+use_comet: ${USE_COMET}
+comet_api_key: ${COMET_API_KEY}
+comet_workspace: ${COMET_WORKSPACE}
+comet_project_name: ${COMET_PROJECT_NAME}
+comet_experiment_key: ${COMET_EXPERIMENT_KEY}
+comet_mode: ${COMET_MODE}
+comet_online: ${COMET_ONLINE}
+comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
+
+output_dir: ${OUTPUT_DIR}
+
+torch_compile: ${TORCH_COMPILE}
+torch_compile_backend: ${TORCH_COMPILE_BACKEND}
+
+gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
+micro_batch_size: ${MICRO_BATCH_SIZE}
+eval_batch_size: ${EVAL_BATCH_SIZE}
+num_epochs: ${NUM_EPOCHS}
+warmup_steps: ${WARMUP_STEPS}
+warmup_ratio: ${WARMUP_RATIO}
+learning_rate: ${LEARNING_RATE}
+lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
+logging_steps: ${LOGGING_STEPS}
+eval_steps: ${EVAL_STEPS}
+evals_per_epoch: ${EVALS_PER_EPOCH}
+save_strategy: ${SAVE_STRATEGY}
+save_steps: ${SAVE_STEPS}
+saves_per_epoch: ${SAVES_PER_EPOCH}
+save_total_limit: ${SAVE_TOTAL_LIMIT}
+max_steps: ${MAX_STEPS}
+
+eval_table_size: ${EVAL_TABLE_SIZE}
+eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
+eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
+
+profiler_steps: ${PROFILER_STEPS}
+loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
+loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
+
+save_safetensors: ${SAVE_SAFETENSORS}
+train_on_inputs: ${TRAIN_ON_INPUTS}
+group_by_length: ${GROUP_BY_LENGTH}
+gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
+early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
+
+lr_scheduler: ${LR_SCHEDULER}
+lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
+cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
+cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
+lr_div_factor: ${LR_DIV_FACTOR}
+
+optimizer: ${OPTIMIZER}
+optim_args: ${OPTIM_ARGS}
+optim_target_modules: ${OPTIM_TARGET_MODULES}
+weight_decay: ${WEIGHT_DECAY}
+adam_beta1: ${ADAM_BETA1}
+adam_beta2: ${ADAM_BETA2}
+adam_epsilon: ${ADAM_EPSILON}
+max_grad_norm: ${MAX_GRAD_NORM}
+
+neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
+
+flash_optimum: ${FLASH_OPTIMUM}
+xformers_attention: ${XFORMERS_ATTENTION}
+flash_attention: ${FLASH_ATTENTION}
+flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
+flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
+flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
+flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
+sdp_attention: ${SDP_ATTENTION}
+s2_attention: ${S2_ATTENTION}
+resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
+auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
+
+local_rank: ${LOCAL_RANK}
+
+special_tokens:
+  bos_token: ${SPECIAL_TOKEN_BOS}
+  eos_token: ${SPECIAL_TOKEN_EOS}
+  unk_token: ${SPECIAL_TOKEN_UNK}
+  pad_token: ${SPECIAL_TOKEN_PAD}
+
+tokens: ${TOKENS}
+
+fsdp: ${FSDP}
+fsdp_config: ${FSDP_CONFIG}
+deepspeed: ${DEEPSPEED}
+
+ddp_timeout: ${DDP_TIMEOUT}
+ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
+ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
+
+torchdistx_path: ${TORCHDISTX_PATH}
+pretraining_dataset: ${PRETRAINING_DATASET}
+debug: ${DEBUG}
+seed: ${SEED}
+strict: ${STRICT}
--- a/.runpod/src/handler.py
+++ b/.runpod/src/handler.py
@@ -0,0 +1,64 @@
+"""
+Runpod serverless entrypoint handler
+"""
+
+import os
+
+import runpod
+import yaml
+from huggingface_hub._login import login
+from train import train
+from utils import get_output_dir
+
+BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
+# exist_ok: no check-then-create race if another process makes the directory first
+os.makedirs(BASE_VOLUME, exist_ok=True)
+
+logger = runpod.RunPodLogger()
+
+
+async def handler(job):
+    """Write the job's args to a YAML config, run training, and log its output.
+
+    Per-job credentials are exported as env vars and restored once the run ends.
+    """
+    inputs = job["input"]
+    run_id = inputs.get("run_id", "default_run_id")
+    args = inputs.get("args", {})
+
+    # Force outputs under the volume and tag the run before saving the config
+    args["output_dir"] = os.path.join(BASE_VOLUME, get_output_dir(run_id))
+    args["run_name"] = run_id
+    args["runpod_job_id"] = job["id"]
+
+    config_path = "/workspace/test_config.yaml"
+    with open(config_path, "w", encoding="utf-8") as file:
+        file.write(yaml.dump(args, default_flow_style=False))
+
+    # Export per-job credentials, remembering the values they replace
+    credentials = inputs.get("credentials", {})
+    env_names = {"wandb_api_key": "WANDB_API_KEY", "hf_token": "HF_TOKEN"}
+    replaced = {}
+    for cred_key, env_key in env_names.items():
+        if cred_key in credentials:
+            replaced[env_key] = os.environ.get(env_key)
+            os.environ[env_key] = credentials[cred_key]
+
+    try:
+        if os.environ.get("HF_TOKEN"):
+            login(token=os.environ["HF_TOKEN"])
+        else:
+            logger.info("No HF_TOKEN provided. Skipping login.")
+        logger.info("Starting Training.")
+        async for result in train(config_path):  # Pass the config path instead of args
+            logger.info(result)
+        logger.info("Training Complete.")
+    finally:
+        # Cleanup: a bare `del os.environ[...]` raised KeyError for unset variables
+        for env_key, old_value in replaced.items():
+            os.environ.pop(env_key, None)
+            if old_value is not None:
+                os.environ[env_key] = old_value
+
+
+runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
--- a/.runpod/src/test_input.json
+++ b/.runpod/src/test_input.json
@@ -0,0 +1,61 @@
+{
+  "input": {
+    "user_id": "user",
+    "model_id": "llama-test",
+    "run_id": "llama-test",
+    "credentials": {
+      "wandb_api_key": "",
+      "hf_token": ""
+    },
+    "args": {
+      "base_model": "NousResearch/Meta-Llama-3-8B",
+      "model_type": "LlamaForCausalLM",
+      "tokenizer_type": "AutoTokenizer",
+      "load_in_8bit": true,
+      "load_in_4bit": false,
+      "strict": false,
+      "datasets": [
+        {
+          "path": "mhenrichsen/alpaca_2k_test",
+          "type": "alpaca"
+        }
+      ],
+      "val_set_size": 0.05,
+      "output_dir": "./outputs/lora-out",
+      "sequence_len": 4096,
+      "sample_packing": true,
+      "eval_sample_packing": false,
+      "pad_to_sequence_len": true,
+      "adapter": "lora",
+      "lora_r": 32,
+      "lora_alpha": 16,
+      "lora_dropout": 0.05,
+      "lora_target_linear": true,
+      "lora_modules_to_save": [
+        "embed_tokens",
+        "lm_head"
+      ],
+      "gradient_accumulation_steps": 4,
+      "micro_batch_size": 2,
+      "num_epochs": 1,
+      "optimizer": "adamw_bnb_8bit",
+      "lr_scheduler": "cosine",
+      "learning_rate": 0.0002,
+      "train_on_inputs": false,
+      "group_by_length": false,
+      "bf16": "auto",
+      "tf32": false,
+      "gradient_checkpointing": true,
+      "logging_steps": 1,
+      "flash_attention": true,
+      "warmup_steps": 1,
+      "evals_per_epoch": 1,
+      "eval_max_new_tokens": 128,
+      "saves_per_epoch": 1,
+      "weight_decay": 0.0,
+      "special_tokens": {
+        "pad_token": "<|end_of_text|>"
+      }
+    }
+  }
+}
--- a/.runpod/src/train.py
+++ b/.runpod/src/train.py
@@ -0,0 +1,45 @@
+"""
+Runpod train entrypoint
+"""
+
+import asyncio
+
+
+async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
+    """
+    Run preprocessing (if enabled) and training with the given config file,
+    yielding each line of subprocess output as it is produced.
+    :param config_path: Path to the YAML config file
+    :param gpu_id: CUDA_VISIBLE_DEVICES value for preprocessing only (default: "0")
+    :param preprocess: Whether to run preprocessing (default: True)
+    :raises RuntimeError: if a subprocess exits with a non-zero status
+    """
+
+    async def stream(cmd: str, label: str):
+        # stderr is merged into stdout so all output arrives on one stream
+        process = await asyncio.create_subprocess_shell(
+            cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
+        )
+        if process.stdout is not None:
+            async for line in process.stdout:
+                # errors="replace": stray non-UTF-8 bytes must not abort the job
+                yield f"{label}: {line.decode(errors='replace').strip()}"
+        # The exit status used to be ignored, so failures looked like success
+        if await process.wait() != 0:
+            raise RuntimeError(f"{label} failed with exit code {process.returncode}")
+
+    if preprocess:
+        # Preprocess command
+        preprocess_cmd = (
+            f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
+        )
+        async for message in stream(preprocess_cmd, "Preprocessing"):
+            yield message
+        yield "Preprocessing completed."
+    else:
+        yield "Skipping preprocessing step."
+
+    # Training command
+    train_cmd = f"axolotl train {config_path}"
+    async for message in stream(train_cmd, "Training"):
+        yield message
--- a/.runpod/src/utils.py
+++ b/.runpod/src/utils.py
@@ -0,0 +1,89 @@
+"""
+Runpod launcher utils
+"""
+
+import os
+
+import yaml
+
+
+def get_output_dir(run_id):
+    """Return the relative output path for a run: ``fine-tuning/<run_id>``."""
+    return f"fine-tuning/{run_id}"
+
+
+def make_valid_config(input_args):
+    """
+    Merge the caller's args over the default config and write the result out.
+
+    :param input_args: dict of overrides; when falsy the defaults are kept as-is
+    :return: str, path of the merged config file that was written
+    """
+    destination = "config/updated_config.yaml"
+
+    # Start from the defaults in config/config.yaml
+    with open("config/config.yaml", "r", encoding="utf-8") as defaults_file:
+        merged = yaml.safe_load(defaults_file)
+
+    # Top-level keys from the request replace the defaults (shallow update)
+    if input_args:
+        merged.update(input_args)
+    else:
+        print("No args provided, using defaults")
+
+    with open(destination, "w", encoding="utf-8") as out_file:
+        yaml.dump(merged, out_file)
+
+    return destination
+
+
+def set_config_env_vars(args: dict):
+    """
+    Export API arguments as upper-cased environment variables.
+
+    Nested dicts extend the variable name (``PARENT_CHILD``), lists of dicts
+    add the item index (``PARENT_0_CHILD``), and any other value is stored as
+    a string on ``os.environ``.
+
+    Args:
+        args (dict): The arguments dictionary from the API request
+    """
+    stale_prefixes = (
+        "BASE_MODEL",
+        "MODEL_TYPE",
+        "TOKENIZER_TYPE",
+        "DATASET",
+        "LORA_",
+        "WANDB_",
+    )
+
+    def process_value(value):
+        """Render a value as env-var text: None -> "", bools lower-cased."""
+        if value is None:
+            return ""
+        text = str(value)
+        return text.lower() if isinstance(value, bool) else text
+
+    def set_env_vars(data, prefix=""):
+        """Walk ``data`` depth-first, writing one variable per leaf value."""
+        for key, value in data.items():
+            env_key = prefix + key.upper()
+            holds_dicts = (
+                isinstance(value, list) and value and isinstance(value[0], dict)
+            )
+
+            if isinstance(value, dict):
+                # e.g. special_tokens -> SPECIAL_TOKENS_PAD_TOKEN
+                set_env_vars(value, f"{env_key}_")
+            elif holds_dicts:
+                # e.g. datasets -> DATASETS_0_PATH
+                for i, item in enumerate(value):
+                    set_env_vars(item, f"{env_key}_{i}_")
+            else:
+                # Scalars and simple lists (like lora_target_modules)
+                os.environ[env_key] = process_value(value)
+
+    # Drop related variables left over from an earlier call first
+    for name in [k for k in os.environ if k.startswith(stale_prefixes)]:
+        del os.environ[name]
+    set_env_vars(args)
--- a/.runpod/tests.json
+++ b/.runpod/tests.json
@@ -0,0 +1,89 @@
+{
+  "tests": [
+    {
+      "name": "quick_smoke_test_sft",
+      "input": {
+        "user_id": "user",
+        "model_id": "llama-test",
+        "run_id": "llama-test",
+        "credentials": {
+          "wandb_api_key": "",
+          "hf_token": ""
+        },
+        "args": {
+          "base_model": "NousResearch/Meta-Llama-3-8B",
+          "model_type": "LlamaForCausalLM",
+          "tokenizer_type": "AutoTokenizer",
+          "load_in_8bit": true,
+          "load_in_4bit": false,
+          "strict": false,
+          "datasets": [
+            {
+              "path": "mhenrichsen/alpaca_2k_test",
+              "type": "alpaca"
+            }
+          ],
+          "val_set_size": 0.05,
+          "output_dir": "./outputs/lora-out",
+          "sequence_len": 4096,
+          "sample_packing": true,
+          "eval_sample_packing": false,
+          "pad_to_sequence_len": true,
+          "adapter": "lora",
+          "lora_r": 32,
+          "lora_alpha": 16,
+          "lora_dropout": 0.05,
+          "lora_target_linear": true,
+          "lora_modules_to_save": [
+            "embed_tokens",
+            "lm_head"
+          ],
+          "gradient_accumulation_steps": 4,
+          "micro_batch_size": 2,
+          "num_epochs": 1,
+          "optimizer": "adamw_bnb_8bit",
+          "lr_scheduler": "cosine",
+          "learning_rate": 0.0002,
+          "train_on_inputs": false,
+          "group_by_length": false,
+          "bf16": "auto",
+          "tf32": false,
+          "gradient_checkpointing": true,
+          "logging_steps": 1,
+          "flash_attention": true,
+          "warmup_steps": 1,
+          "evals_per_epoch": 1,
+          "eval_max_new_tokens": 128,
+          "saves_per_epoch": 1,
+          "weight_decay": 0.0,
+          "special_tokens": {
+            "pad_token": "<|end_of_text|>"
+          }
+        }
+      },
+      "timeout": 100000
+    }
+  ],
+  "config": {
+    "gpuTypeId": "NVIDIA GeForce RTX 4090",
+    "gpuCount": 1,
+    "containerDiskInGb": 200,
+    "env": [
+      {
+        "key": "TOKENIZER",
+        "value": ""
+      },
+      {
+        "key": "DISABLE_LOG_STATS",
+        "value": "true"
+      }
+    ],
+    "allowedCudaVersions": [
+      "12.8",
+      "12.7",
+      "12.6",
+      "12.5",
+      "12.4"
+    ]
+  }
+}
--- a/cicd/multigpu.sh
+++ b/cicd/multigpu.sh
@@ -20,4 +20,4 @@ pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov-report=xml:multigpu-coverage.xml

 # Upload coverage to Codecov
-codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
+codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F "multigpu,docker-tests,pytorch-${PYTORCH_VERSION}" || true  # best-effort: an upload failure must not fail the test run
--- a/docs/custom_integrations.qmd
+++ b/docs/custom_integrations.qmd
@@ -49,8 +49,7 @@ sections = [
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
-    ("Spectrum", "spectrum"),
-    ("LLMCompressor", "llm_compressor")
+    ("Spectrum", "spectrum")
 ]

 for section_name, folder_name in sections:
--- a/examples/llama-3/sparse-finetuning.yaml
+++ b/examples/llama-3/sparse-finetuning.yaml
@@ -1,77 +0,0 @@
-base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4
-
-plugins:
-  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
-
-load_in_8bit: false
-load_in_4bit: false
-strict: false
-
-datasets:
-  - path: tatsu-lab/alpaca
-    type: alpaca
-dataset_prepared_path: last_run_prepared
-val_set_size: 0.05
-output_dir: ./outputs/out
-
-sequence_len: 4096
-sample_packing: true
-pad_to_sequence_len: true
-eval_sample_packing: false
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 1
-num_epochs: 1
-optimizer: paged_adamw_8bit
-lr_scheduler: cosine
-learning_rate: 2e-5
-
-train_on_inputs: false
-group_by_length: false
-bf16: auto
-fp16:
-tf32: false
-
-gradient_checkpointing: true
-gradient_checkpointing_kwargs:
-  use_reentrant: false
-early_stopping_patience:
-resume_from_checkpoint:
-logging_steps: 1
-xformers_attention:
-flash_attention: true
-
-warmup_steps: 100
-evals_per_epoch: 2
-eval_table_size:
-saves_per_epoch: 1
-debug:
-deepspeed:
-weight_decay: 0.0
-fsdp:
-fsdp_config:
-special_tokens:
-  pad_token: <|end_of_text|>
-
-llmcompressor:
-  recipe:
-    finetuning_stage:
-      finetuning_modifiers:
-        ConstantPruningModifier:
-          targets: [
-            're:.*q_proj.weight',
-            're:.*k_proj.weight',
-            're:.*v_proj.weight',
-            're:.*o_proj.weight',
-            're:.*gate_proj.weight',
-            're:.*up_proj.weight',
-            're:.*down_proj.weight',
-          ]
-          start: 0
-  save_compressed: true
--- a/requirements.txt
+++ b/requirements.txt
@@ -11,13 +11,13 @@ liger-kernel==0.5.8

 packaging==23.2

-peft==0.15.2
+peft==0.15.1
 transformers==4.51.3
 tokenizers>=0.21.1
 accelerate==1.6.0
 datasets==3.5.0
 deepspeed>=0.15.4
-trl==0.17.0
+trl==0.16.1
 hf_xet==1.0.0
 hqq==0.2.5

--- a/setup.py
+++ b/setup.py
@@ -67,13 +67,13 @@ def parse_requirements(extras_require_map):
            if (major, minor) >= (2, 7):
                _install_requires.pop(_install_requires.index(xformers_version))
                # _install_requires.append("xformers==0.0.29.post3")  # xformers seems to be hard pinned to 2.6.0
-                extras_require_map["vllm"] = ["vllm==0.8.4"]
+                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append(
                    "xformers==0.0.29.post2"
                )  # vllm needs post2 w torch 2.6
-                extras_require_map["vllm"] = ["vllm==0.8.4"]
+                extras_require_map["vllm"] = ["vllm==0.8.3"]
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
@@ -149,9 +149,6 @@ extras_require = {
    "vllm": [
        "vllm==0.7.2",
    ],
-    "llmcompressor": [
-        "llmcompressor==0.5.1",
-    ],
 }

 install_requires, dependency_links, extras_require_build = parse_requirements(
--- a/src/axolotl/core/trainers/grpo/init.py
+++ b/src/axolotl/core/trainers/grpo/init.py
@@ -135,9 +135,7 @@ class GRPOStrategy:
        try:
            # use importlib to dynamically load the reward function from the module
            reward_func_module_name = reward_func_fqn.split(".")[-1]
-            reward_func_module = importlib.import_module(
-                ".".join(reward_func_fqn.split(".")[:-1])
-            )
+            reward_func_module = importlib.import_module(reward_func_fqn.rpartition(".")[0])
            reward_func = getattr(reward_func_module, reward_func_module_name)
            if not len(inspect.signature(reward_func).parameters) >= 2:
                raise ValueError(
--- a/src/axolotl/integrations/llm_compressor/README.md
+++ b/src/axolotl/integrations/llm_compressor/README.md
@@ -1,108 +0,0 @@
-# LLMCompressor Integration
-
-Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).
-
-This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.
-
-It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.
-
---
-
-## Requirements
-
- Axolotl with `llmcompressor` extras:
-
-  ```bash
-  pip install "axolotl[llmcompressor]"
-  ```
-
- Requires `llmcompressor >= 0.5.1`
-
-This will install all necessary dependencies to fine-tune sparsified models using the integration.
-
---
-
-## Usage
-
-To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:
-
-```yaml
-plugins:
-  - axolotl.integrations.llm_compressor.LLMCompressorPlugin
-
-llmcompressor:
-  recipe:
-    finetuning_stage:
-      finetuning_modifiers:
-        ConstantPruningModifier:
-          targets: [
-            're:.*q_proj.weight',
-            're:.*k_proj.weight',
-            're:.*v_proj.weight',
-            're:.*o_proj.weight',
-            're:.*gate_proj.weight',
-            're:.*up_proj.weight',
-            're:.*down_proj.weight',
-          ]
-          start: 0
-  save_compressed: true
-# ... (other training arguments)
-```
-
-This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.
-
-Pre-sparsified checkpoints can be:
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
- Any custom LLM with compatible sparsity patterns that you've created yourself
-
-To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
-[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)
-
-### Storage Optimization with save_compressed
-
-Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
- Reduces disk space usage by approximately 40%
- Maintains compatibility with vLLM for accelerated inference
- Maintains compatibility with llmcompressor for further optimization (example: quantization)
-
-This option is highly recommended when working with sparse models to maximize the benefits of model compression.
-
-### Example Config
-
-See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.
-
---
-
-## Inference with vLLM
-
-After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
-You can also use LLMCompressor to apply additional quantization to your fine-tuned
-sparse model before inference for even greater performance benefits.:
-
-```python
-from vllm import LLM, SamplingParams
-
-prompts = [
-    "Hello, my name is",
-    "The president of the United States is",
-    "The capital of France is",
-    "The future of AI is",
-]
-sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
-llm = LLM("path/to/your/sparse/model")
-outputs = llm.generate(prompts, sampling_params)
-
-for output in outputs:
-    prompt = output.prompt
-    generated_text = output.outputs[0].text
-    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
-```
-
-For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).
-
-## Learn More
-
-For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:
-
-[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)
--- a/src/axolotl/integrations/llm_compressor/init.py
+++ b/src/axolotl/integrations/llm_compressor/init.py
@@ -1,5 +0,0 @@
-"""Integration entry point for the LLMCompressor plugin."""
-
-from .plugin import LLMCompressorPlugin
-
-__all__ = ["LLMCompressorPlugin"]
--- a/src/axolotl/integrations/llm_compressor/args.py
+++ b/src/axolotl/integrations/llm_compressor/args.py
@@ -1,40 +0,0 @@
-"""
-LLMCompressor and Sparse Finetuning config models.
-"""
-
-from typing import Any
-
-from pydantic import BaseModel, Field
-from typing_extensions import Annotated
-
-
-class CompressionArgs(BaseModel):
-    """Sparse Finetuning config for LLMCompressor."""
-
-    # Typing for recipe is set to Any due to:
-    # https://github.com/vllm-project/llm-compressor/issues/1319
-    recipe: Annotated[
-        Any,
-        Field(
-            description="The recipe containing the compression algorithms and hyperparameters to apply."
-        ),
-    ]
-
-    save_compressed: Annotated[
-        bool,
-        Field(
-            default=False,
-            description="Whether to save the compressed model after training.",
-        ),
-    ]
-
-
-class LLMCompressorArgs(BaseModel):
-    """LLMCompressor configuration BaseModel."""
-
-    llmcompressor: Annotated[
-        CompressionArgs,
-        Field(
-            description="Arguments enabling compression pathways through the LLM Compressor plugins"
-        ),
-    ]
--- a/src/axolotl/integrations/llm_compressor/plugin.py
+++ b/src/axolotl/integrations/llm_compressor/plugin.py
@@ -1,171 +0,0 @@
-"""
-Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
-by maintaining masks for zero weights during training.
-"""
-
-import logging
-from functools import wraps
-from typing import Any, Callable, Concatenate, ParamSpec, TypeVar
-
-from llmcompressor import active_session, create_session
-from llmcompressor.core import callbacks as session_callbacks
-from llmcompressor.recipe import Recipe
-from torch.nn import Module
-from transformers.trainer import Trainer
-from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
-from transformers.training_args import TrainingArguments
-
-from axolotl.integrations.base import BasePlugin
-
-P = ParamSpec("P")  # Params for generic function signatures
-R = TypeVar("R")  # Return type for generic function signatures
-
-LOG = logging.getLogger("axolotl.integrations.llm_compressor")
-
-
-class LLMCompressorCallbackHandler(TrainerCallback):
-    """
-    Trainer callback for Sparse Finetuning.
-    Maintains sparsity patterns during training by applying masks after optimization steps,
-    ensuring zero-weight updates are canceled out.
-    """
-
-    def __init__(self, trainer: Trainer, recipe: Any):
-        """
-        Initialize the Sparse Finetuning callback handler.
-
-        Args:
-            trainer (Trainer): Huggingface Trainer instance.
-            recipe (Recipe | dict): Sparse finetuning recipe to apply.
-        """
-        super().__init__()
-        self.trainer = trainer
-        self.recipe = (
-            Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
-        )
-        self.original_compute_loss = trainer.compute_loss
-        self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
-        create_session()
-
-    def on_train_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ) -> None:
-        """
-        Called at the beginning of training. Initializes the compression session.
-
-        Args:
-            args (TrainingArguments): Training arguments.
-            state (TrainerState): Trainer state.
-            control (TrainerControl): Trainer control.
-        """
-        super().on_train_begin(args, state, control, **kwargs)
-        self.trainer.accelerator.wait_for_everyone()
-        active_session().initialize(
-            model=self.trainer.model,
-            optimizer=self.trainer.optimizer,
-            start=state.epoch,
-            recipe=self.recipe,
-        )
-        self.trainer.accelerator.wait_for_everyone()
-
-    def on_step_begin(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ) -> None:
-        """
-        Called at the beginning of a training step. Triggers batch_start callback.
-        """
-        super().on_step_begin(args, state, control, **kwargs)
-        session_callbacks.batch_start()
-
-    def on_step_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ) -> None:
-        """
-        Called at the end of a training step. Triggers optimizer and batch_end callbacks.
-        """
-        super().on_step_end(args, state, control, **kwargs)
-        session_callbacks.optim_pre_step()
-        session_callbacks.optim_post_step()
-        session_callbacks.batch_end()
-
-    def on_train_end(
-        self,
-        args: TrainingArguments,
-        state: TrainerState,
-        control: TrainerControl,
-        **kwargs,
-    ) -> None:
-        """
-        Called at the end of training. Finalizes the compression session.
-        """
-        super().on_train_end(args, state, control, **kwargs)
-        active_session().finalize()
-        self.trainer.compute_loss_func = self.original_compute_loss
-
-
-class LLMCompressorPlugin(BasePlugin):
-    """
-    Sparse Finetuning plugin for Axolotl integration.
-    """
-
-    def get_input_args(self) -> str:
-        """
-        Returns the path to the plugin's argument definition.
-
-        Returns:
-            str: Dotted path to the LLMCompressorArgs class.
-        """
-        return "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"
-
-    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
-        """
-        Adds Sparse Finetuning callback to the Trainer instance.
-
-        Args:
-            cfg (Any): Configuration object containing the sparse recipe.
-            trainer (Trainer): Huggingface Trainer instance.
-
-        Returns:
-            list: List containing the configured callback instances.
-        """
-        LOG.info("Adding Sparse Finetuning callback to the trainer")
-        callback = LLMCompressorCallbackHandler(
-            trainer=trainer,
-            recipe=cfg.llmcompressor.recipe,
-        )
-        return [callback]
-
-
-def compute_loss_wrapper(
-    compute_loss_func: Callable[Concatenate[Module, P], R],
-) -> Callable[Concatenate[Module, P], R]:
-    """
-    Wraps the loss computation function to trigger the loss_calculated callback.
-
-    Args:
-        compute_loss_func (Callable): Original loss computation function.
-
-    Returns:
-        Callable: Wrapped function that also invokes the loss_calculated callback.
-    """
-
-    @wraps(compute_loss_func)
-    def compute_and_notify(model: Module, *args: P.args, **kwargs: P.kwargs) -> R:
-        loss = compute_loss_func(model, *args, **kwargs)
-        if active_session().lifecycle.initialized_ and model.training:
-            session_callbacks.loss_calculated(loss=loss)
-        return loss
-
-    return compute_and_notify
--- a/src/axolotl/integrations/llm_compressor/utils.py
+++ b/src/axolotl/integrations/llm_compressor/utils.py
@@ -1,40 +0,0 @@
-"""Utilities for llmcompressor integration with axolotl."""
-
-from typing import Union
-
-from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
-    modify_save_pretrained,
-)
-from transformers import PreTrainedModel, Trainer
-
-
-def save_compressed_model(
-    model: PreTrainedModel,
-    output_dir: Union[str, bytes],
-    trainer: Trainer,
-    safe_serialization: bool = False,
-    save_compressed: bool = False,
-) -> None:
-    """
-    Synchronize processes, apply compression hooks, and save the model.
-
-    Args:
-        model (PreTrainedModel): The model to be saved.
-        output_dir (str or bytes): Path where the model files will be written.
-        trainer (Trainer): Hugging Face Trainer for process synchronization.
-        safe_serialization (bool): Use safe serialization if True.
-        save_compressed (bool): Write compressed tensors if True.
-    """
-    trainer.accelerator.wait_for_everyone()
-
-    # Only the main process writes the files
-    if not trainer.accelerator.is_main_process:
-        return
-
-    modify_save_pretrained(model)
-    model.save_pretrained(
-        output_dir,
-        safe_serialization=safe_serialization,
-        save_compressed=save_compressed,
-        skip_sparsity_compression_stats=not save_compressed,
-    )
--- a/src/axolotl/train.py
+++ b/src/axolotl/train.py
@@ -295,23 +295,8 @@ def save_trained_model(
            trainer.model.save_pretrained(
                cfg.output_dir, safe_serialization=safe_serialization
            )
-
        model.save_pretrained(cfg.output_dir, safe_serialization=safe_serialization)

-    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
-        # TODO: add integration support so this can be implemented completely within the plugin
-        from axolotl.integrations.llm_compressor.utils import (
-            save_compressed_model,
-        )
-
-        save_compressed_model(
-            model=model,
-            output_dir=cfg.output_dir,
-            trainer=trainer,
-            safe_serialization=safe_serialization,
-            save_compressed=cfg.llmcompressor.save_compressed,
-        )
-

 def create_model_card(cfg: DictDefault, trainer: Trainer):
    """
--- a/src/axolotl/utils/models.py
+++ b/src/axolotl/utils/models.py
@@ -139,22 +139,6 @@ def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
        hasattr(model_config, "quantization_config")
        and model_config.quantization_config
    )
-
-    # Detect compressed-tensors config
-    is_compressed_tensors_config = (
-        quant_config_exists
-        and model_config.quantization_config.get("quant_method") == "compressed-tensors"
-    )
-
-    if is_compressed_tensors_config:
-        if model_config.quantization_config.get("config_groups"):
-            LOG.warning(
-                "Found `config_groups` in a compressed-tensors config. "
-                "QAT integration with llmcompressor is not tested."
-            )
-        # Skip further quant checks for compressed-tensors
-        return
-
    quant_config_method_is_gptq = (
        quant_config_exists
        and "quant_method" in model_config.quantization_config
--- a/tests/e2e/integrations/test_llm_compressor.py
+++ b/tests/e2e/integrations/test_llm_compressor.py
@@ -1,106 +0,0 @@
-"""
-E2E smoke tests for LLMCompressorPlugin integration
-"""
-
-from pathlib import Path
-
-import pytest
-
-from axolotl.cli.args import TrainerCliArgs
-from axolotl.common.datasets import load_datasets
-from axolotl.train import train
-from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
-from axolotl.utils.dict import DictDefault
-
-from tests.e2e.utils import (
-    check_model_output_exists,
-    require_llmcompressor,
-    require_torch_2_4_1,
-)
-
-MODELS = [
-    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
-    "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
-]
-
-
-@pytest.mark.parametrize(
-    "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
-)
-@pytest.mark.parametrize(
-    "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
-)
-@require_llmcompressor
-class TestLLMCompressorIntegration:
-    """
-    e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
-    """
-
-    @require_torch_2_4_1
-    def test_llmcompressor_plugin(
-        self, temp_dir, base_model: str, save_compressed: bool
-    ):
-        # core cfg
-        cfg = DictDefault(
-            {
-                "base_model": base_model,
-                "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {"pad_token": "<|endoftext|>"},
-                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
-                "num_epochs": 1,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                "output_dir": temp_dir,
-                "learning_rate": 1e-5,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "save_safetensors": True,
-                "bf16": "auto",
-                "max_steps": 5,
-                "llmcompressor": {
-                    "recipe": {
-                        "finetuning_stage": {
-                            "finetuning_modifiers": {
-                                "ConstantPruningModifier": {
-                                    "targets": [
-                                        "re:.*q_proj.weight",
-                                        "re:.*k_proj.weight",
-                                        "re:.*v_proj.weight",
-                                        "re:.*o_proj.weight",
-                                        "re:.*gate_proj.weight",
-                                        "re:.*up_proj.weight",
-                                        "re:.*down_proj.weight",
-                                    ],
-                                    "start": 0,
-                                },
-                            },
-                        },
-                    },
-                    "save_compressed": save_compressed,
-                },
-            }
-        )
-
-        prepare_plugins(cfg)
-        cfg = validate_config(cfg)
-        normalize_config(cfg)
-        cli_args = TrainerCliArgs()
-        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)
-
-        train(cfg=cfg, dataset_meta=dataset_meta)
-        check_model_output_exists(temp_dir, cfg)
-        _check_llmcompressor_model_outputs(temp_dir, save_compressed)
-
-
-def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
-    if save_compressed:
-        assert (Path(temp_dir) / "recipe.yaml").exists()
-
-        from compressed_tensors import ModelCompressor
-        from compressed_tensors.config import Sparse24BitMaskConfig
-
-        compressor = ModelCompressor.from_pretrained(temp_dir)
-        assert compressor is not None
-        assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)
--- a/tests/e2e/multigpu/solo/test_grpo.py
+++ b/tests/e2e/multigpu/solo/test_grpo.py
@@ -4,14 +4,11 @@ GRPO test suite

 import os
 import random
-import shutil
 import subprocess  # nosec B404
 import sys
-import tempfile
 import time
 from pathlib import Path

-import psutil
 import pytest
 import requests
 import yaml
@@ -24,8 +21,8 @@ from tests.e2e.utils import require_vllm


 def start_vllm(
-    model: str, env: dict, wait: int | None = None, quiet=False, **kwargs
-) -> subprocess.Popen:
+    model: str, env: dict | None = None, wait: int | None = None, quiet=False, **kwargs
+) -> int:
    """
    helper function to start the VLLM server in the background, mostly for testing purposes
    """
@@ -49,41 +46,10 @@ def start_vllm(
    # print out the command to be executed
    print(" ".join(cmd))

-    vllm_logging_json = Path(tempfile.mkdtemp()) / "vllm_logging.json"
-    with open(vllm_logging_json, "w", encoding="utf-8") as temp_file:
-        temp_file.write(
-            """{
-  "formatters": {
-    "json": {
-      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
-    }
-  },
-  "handlers": {
-    "file": {
-      "class": "logging.FileHandler",
-      "formatter": "json",
-      "level": "DEBUG",
-      "filename": "/tmp/vllm.log",
-      "mode": "a"
-    }
-  },
-  "loggers": {
-    "vllm": {
-      "handlers": ["file"],
-      "level": "DEBUG",
-      "propagate": false
-    }
-  },
-  "version": 1
-}"""
-        )
-
-    cmd_env = env.copy()
-    cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json})
    # start `trl vllm-serve` command in the background and capture the process id
    process = subprocess.Popen(  # pylint: disable=consider-using-with
        cmd,
-        env=cmd_env,
+        env=env,
        stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
        stderr=subprocess.DEVNULL if quiet else subprocess.PIPE,
    )  # nosec B603
@@ -92,51 +58,32 @@ def start_vllm(
    print(f"VLLM server process started (PID: {process.pid})")

    # wait until the http server is ready, even if it 404s, but timeout after 60 seconds
-    period_seconds = 5
    started = False
    if wait and host and port:
-        for i in range(0, int(wait), period_seconds):
+        for _ in range(int(wait)):
            try:
                response = requests.get(f"http://{host}:{port}", timeout=1)
-                print(f"{i}: VLLM server (status: {response.status_code})")
                if int(response.status_code) in [200, 404]:
                    started = True
                    break
-            except requests.exceptions.RequestException as exc:
-                print(f"{i}: VLLM server failed to start: {str(exc)}")
+            except requests.exceptions.RequestException:
+                pass

            # also check if the process.pid is still running
            if not process.poll() is None:
                break

-            time.sleep(period_seconds)
+            time.sleep(1)

    if wait and not started:
        print(
            f"VLLM server process did not start within {wait} seconds. Please check your server logs."
        )
-        recursive_kill(process)
-        with open("/tmp/vllm.log", "r", encoding="utf-8") as log_file:
-            print(log_file.read())
-        shutil.rmtree("/tmp/vllm.log")
+        process.kill()
        raise RuntimeError(f"VLLM server process did not start within {wait} seconds.")

-    # return the process
-    return process
-
-
-def recursive_kill(process: subprocess.Popen):
-    """
-    Recursively kill a process and its children
-    """
-    process = psutil.Process(process.pid)
-    for child in psutil.Process(process.pid).children(recursive=True):
-        child.terminate()
-        child.kill()
-        os.kill(child.pid, 9)
-    process.terminate()
-    process.kill()
-    os.kill(process.pid, 9)
+    # return the process id
+    return process.pid


 class TestGRPO:
@@ -227,17 +174,16 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):

        current_env = os.environ.copy()
        env = {
-            "NCCL_P2P_LEVEL": "NVL",
+            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
-            "VLLM_DISABLE_COMPILE_CACHE": "1",
-            # "VLLM_USE_V1": "0",
+            "VLLM_USE_V1": "0",
        }
-        vllm_process = start_vllm(
+        vllm_process_id = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
-            wait=300,
+            wait=120,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
@@ -256,14 +202,10 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
-                env={
-                    "NCCL_P2P_LEVEL": "NVL",
-                    "NCCL_DEBUG": "INFO",
-                    **current_env,
-                },
+                env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
            )
        finally:
-            recursive_kill(vllm_process)
+            os.kill(vllm_process_id, 9)

    @pytest.mark.parametrize(
        "num_gpus",
@@ -320,17 +262,16 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):

        current_env = os.environ.copy()
        env = {
-            "NCCL_P2P_LEVEL": "NVL",  # nccl can be brittle, assume P2P isn't reliable
+            "NCCL_P2P_LEVEL": "LOC",  # nccl can be brittle, assume P2P isn't reliable
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
-            "VLLM_DISABLE_COMPILE_CACHE": "1",
-            # "VLLM_USE_V1": "0",
+            "VLLM_USE_V1": "0",
        }
-        vllm_process = start_vllm(
+        vllm_process_id = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
-            wait=300,
+            wait=120,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
@@ -349,11 +290,7 @@ def oai_gsm8k_transform(cfg, *args, **kwargs):
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
-                env={
-                    "NCCL_P2P_LEVEL": "NVL",
-                    "NCCL_DEBUG": "INFO",
-                    **current_env,
-                },
+                env={"NCCL_P2P_LEVEL": "LOC", "NCCL_DEBUG": "INFO", **current_env},
            )
        finally:
-            recursive_kill(vllm_process)
+            os.kill(vllm_process_id, 9)
--- a/tests/e2e/utils.py
+++ b/tests/e2e/utils.py
@@ -109,24 +109,6 @@ def require_vllm(test_case):
    )(test_case)


-def require_llmcompressor(test_case):
-    """
-    Decorator marking a test that requires a llmcompressor to be installed
-    """
-
-    def is_llmcompressor_installed():
-        try:
-            import llmcompressor  # pylint: disable=unused-import  # noqa: F401
-
-            return True
-        except ImportError:
-            return False
-
-    return unittest.skipUnless(
-        is_llmcompressor_installed(), "test requires a llmcompressor to be installed"
-    )(test_case)
-
-
 def is_hopper():
    compute_capability = torch.cuda.get_device_capability()
    return compute_capability == (9, 0)
Author	SHA1	Message	Date
NanoCode012	2b9a2dde4b	chore: update title	2025-04-26 16:21:31 -04:00
Wing Lian	388e950016	restore dockerfile	2025-04-26 16:21:30 -04:00
NanoCode012	fb4adbb311	fix: trim allowed cuda versions	2025-04-26 16:21:30 -04:00
Wing Lian	5e8abca54f	use axolotl cloud image as base and various fixes	2025-04-26 16:21:30 -04:00
Wing Lian	168ec339e5	chore: lint	2025-04-26 16:21:30 -04:00
zeke	cb7185998b	remove LICENSE and fix README	2025-04-26 16:21:30 -04:00
zeke	c2fc35f520	Add runpod sls handler	2025-04-26 16:21:30 -04:00