From 5d182a105660a48b5e2cca679a9d82f3c23c87cc Mon Sep 17 00:00:00 2001 From: Ezekiel Wotring <40004347+KAJdev@users.noreply.github.com> Date: Mon, 28 Apr 2025 06:08:32 -0800 Subject: [PATCH] Add runpod sls handler (#2530) [skip ci] * Add runpod sls handler * remove LICENSE and fix README * chore: lint * use axolotl cloud image as base and various fixes * fix: trim allowed cuda versions * restore dockerfile * chore: update title * use axolotl cloud image --------- Co-authored-by: Wing Lian Co-authored-by: NanoCode012 --- .runpod/.gitignore | 161 +++++++++ .runpod/Dockerfile | 18 + .runpod/README.md | 335 +++++++++++++++++++ .runpod/hub.json | 93 ++++++ .runpod/requirements.txt | 7 + .runpod/src/config/config.yaml | 577 +++++++++++++++++++++++++++++++++ .runpod/src/handler.py | 64 ++++ .runpod/src/test_input.json | 61 ++++ .runpod/src/train.py | 45 +++ .runpod/src/utils.py | 89 +++++ .runpod/tests.json | 85 +++++ 11 files changed, 1535 insertions(+) create mode 100644 .runpod/.gitignore create mode 100644 .runpod/Dockerfile create mode 100644 .runpod/README.md create mode 100644 .runpod/hub.json create mode 100644 .runpod/requirements.txt create mode 100644 .runpod/src/config/config.yaml create mode 100644 .runpod/src/handler.py create mode 100644 .runpod/src/test_input.json create mode 100644 .runpod/src/train.py create mode 100644 .runpod/src/utils.py create mode 100644 .runpod/tests.json diff --git a/.runpod/.gitignore b/.runpod/.gitignore new file mode 100644 index 000000000..383570cfc --- /dev/null +++ b/.runpod/.gitignore @@ -0,0 +1,161 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +pod/scripts/config.yaml diff --git a/.runpod/Dockerfile b/.runpod/Dockerfile new file mode 100644 index 000000000..107caf5f3 --- /dev/null +++ b/.runpod/Dockerfile @@ -0,0 +1,18 @@ +FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0 + +COPY .runpod/requirements.txt /requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --upgrade pip && \ + python3 -m pip install --upgrade -r /requirements.txt + +# Environment settings +ARG BASE_VOLUME="/runpod-volume" +ENV BASE_VOLUME=$BASE_VOLUME +ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets" +ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub" +ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub" + +COPY .runpod/src /src + +WORKDIR /src +CMD ["python3", "/src/handler.py"] diff --git a/.runpod/README.md b/.runpod/README.md new file mode 100644 index 000000000..a631c3937 --- /dev/null +++ b/.runpod/README.md @@ -0,0 +1,335 @@ +

LLM post-training: full fine-tuning, LoRA, QLoRA, and more for Llama, Mistral, Gemma, and other models

+

# Configuration Options

This document outlines all available configuration options for training models. The configuration is provided as a JSON request.

## Usage

Pass the configuration options in the JSON request body:

```json
{
  "input": {
    "user_id": "user",
    "model_id": "model-name",
    "run_id": "run-id",
    "credentials": {
      "wandb_api_key": "", // your Weights & Biases API key; TODO: setting this via environment variables is planned
      "hf_token": "" // your Hugging Face token; TODO: setting this via environment variables is planned
    },
    "args": {
      "base_model": "NousResearch/Llama-3.2-1B"
      // ... other options
    }
  }
}
```

## Model Configuration

| Option              | Description                                                                                    | Default              |
| ------------------- | ---------------------------------------------------------------------------------------------- | -------------------- |
| `base_model`        | Path to the base model (local or HuggingFace)                                                  | Required             |
| `base_model_config` | Configuration path for the base model                                                          | Same as base_model   |
| `revision_of_model` | Specific model revision from HuggingFace hub                                                   | Latest               |
| `tokenizer_config`  | Custom tokenizer configuration path                                                            | Optional             |
| `model_type`        | Type of model to load                                                                          | AutoModelForCausalLM |
| `tokenizer_type`    | Type of tokenizer to use                                                                       | AutoTokenizer        |
| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |

## Model Family Identification

| Option                     | Default | Description                    |
| -------------------------- | ------- | ------------------------------ |
| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |

## Model Configuration Overrides

| Option                                           | Default    | Description                        |
| ------------------------------------------------ | ---------- | ---------------------------------- |
| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |

## Model Loading Options

| Option         | Description                   | Default |
| -------------- | ----------------------------- | ------- |
| `load_in_8bit` | Load model in 8-bit precision | false   |
| `load_in_4bit` | Load model in 4-bit precision | false   |
| `bf16`         | Use bfloat16 precision        | false   |
| `fp16`         | Use float16 precision         | false   |
| `tf32`         | Use TensorFloat-32 precision  | false   |

## Memory and Device Settings

| Option             | Default   | Description             |
| ------------------ | --------- | ----------------------- |
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
| `device_map`       | `"auto"`  | Device mapping strategy |
| `max_memory`       | `null`    | Max memory per device   |

## Training Hyperparameters

| Option                        | Default   | Description                 |
| ----------------------------- | --------- | --------------------------- |
| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
| `micro_batch_size`            | `2`       | Batch size per GPU          |
| `eval_batch_size`             | `null`    | Evaluation batch size       |
| `num_epochs`                  | `4`       | Number of training epochs   |
| `warmup_steps`                | `100`     | Warmup steps                |
| `warmup_ratio`                | `0.05`    | Warmup ratio                |
| `learning_rate`               | `0.00003` | Learning rate               |
| 
`lr_quadratic_warmup` | `false` | Quadratic warmup | +| `logging_steps` | `null` | Logging frequency | +| `eval_steps` | `null` | Evaluation frequency | +| `evals_per_epoch` | `null` | Evaluations per epoch | +| `save_strategy` | `"epoch"` | Checkpoint saving strategy | +| `save_steps` | `null` | Saving frequency | +| `saves_per_epoch` | `null` | Saves per epoch | +| `save_total_limit` | `null` | Maximum checkpoints to keep | +| `max_steps` | `null` | Maximum training steps | + +### Dataset Configuration + +```yaml +datasets: + - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path. + type: alpaca # Format type (alpaca, gpteacher, oasst, etc.) + ds_type: json # Dataset type + data_files: path/to/data # Source data files + train_on_split: train # Dataset split to use +``` + +## Chat Template Settings + +| Option | Default | Description | +| ------------------------ | -------------------------------- | ---------------------- | +| `chat_template` | `"tokenizer_default"` | Chat template type | +| `chat_template_jinja` | `null` | Custom Jinja template | +| `default_system_message` | `"You are a helpful assistant."` | Default system message | + +## Dataset Processing + +| Option | Default | Description | +| ----------------------------- | -------------------------- | --------------------------------- | +| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | +| `push_dataset_to_hub` | `""` | Push dataset to HF hub | +| `dataset_processes` | `4` | Number of preprocessing processes | +| `dataset_keep_in_memory` | `false` | Keep dataset in memory | +| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | +| `dataset_exact_deduplication` | `true` | Deduplicate datasets | + +## LoRA Configuration + +| Option | Default | Description | +| -------------------------- | ---------------------- | ------------------------------ | +| `adapter` | `"lora"` | Adapter type (lora/qlora) | +| `lora_model_dir` | `""` | Directory with pretrained LoRA | +| `lora_r` | `8` | LoRA attention dimension | +| `lora_alpha` | `16` | LoRA alpha parameter | +| `lora_dropout` | `0.05` | LoRA dropout | +| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA | +| `lora_target_linear` | `false` | Target all linear modules | +| `peft_layers_to_transform` | `[]` | Layers to transform | +| `lora_modules_to_save` | `[]` | Modules to save | +| `lora_fan_in_fan_out` | `false` | Fan in/out structure | + +## Optimization Settings + +| Option | Default | Description | +| ------------------------- | ------- | -------------------------- | +| `train_on_inputs` | `false` | Train on input prompts | +| `group_by_length` | `false` | Group by sequence length | +| `gradient_checkpointing` | `false` | Use gradient checkpointing | +| `early_stopping_patience` | `3` | Early stopping patience | + +## Learning Rate Scheduling + +| Option | Default | Description | +| -------------------------- | ---------- | -------------------- | +| `lr_scheduler` | `"cosine"` | Scheduler type | +| `lr_scheduler_kwargs` | `{}` | Scheduler parameters | +| `cosine_min_lr_ratio` | `null` | Minimum LR ratio | +| `cosine_constant_lr_ratio` | `null` | Constant LR ratio | +| `lr_div_factor` | `null` | LR division factor | + +## Optimizer Settings + +| Option | Default | Description | +| ---------------------- | ------------ | ------------------- | +| `optimizer` | `"adamw_hf"` | Optimizer choice | +| `optim_args` | `{}` | Optimizer arguments | +| `optim_target_modules` | `[]` | 
Target modules | +| `weight_decay` | `null` | Weight decay | +| `adam_beta1` | `null` | Adam beta1 | +| `adam_beta2` | `null` | Adam beta2 | +| `adam_epsilon` | `null` | Adam epsilon | +| `max_grad_norm` | `null` | Gradient clipping | + +## Attention Implementations + +| Option | Default | Description | +| -------------------------- | ------- | ----------------------------- | +| `flash_optimum` | `false` | Use better transformers | +| `xformers_attention` | `false` | Use xformers | +| `flash_attention` | `false` | Use flash attention | +| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy | +| `flash_attn_rms_norm` | `false` | Flash attention RMS norm | +| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations | +| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations | +| `sdp_attention` | `false` | Use scaled dot product | +| `s2_attention` | `false` | Use shifted sparse attention | + +## Tokenizer Modifications + +| Option | Default | Description | +| ---------------- | ------- | ---------------------------- | +| `special_tokens` | - | Special tokens to add/modify | +| `tokens` | `[]` | Additional tokens | + +## Distributed Training + +| Option | Default | Description | +| ----------------------- | ------- | --------------------- | +| `fsdp` | `null` | FSDP configuration | +| `fsdp_config` | `null` | FSDP config options | +| `deepspeed` | `null` | Deepspeed config path | +| `ddp_timeout` | `null` | DDP timeout | +| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity | +| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers | + +
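Under the hood, the serverless handler simply serializes the request's `args` object into an axolotl YAML config file before launching training (see `src/handler.py` in this PR). A minimal sketch of that mapping, using a handful of the options documented above:

```python
import yaml

# A request's "args" object, built from options in the tables above
args = {
    "base_model": "NousResearch/Llama-3.2-1B",
    "adapter": "lora",
    "lora_r": 8,
    "lora_alpha": 16,
    "micro_batch_size": 2,
    "gradient_accumulation_steps": 1,
    "num_epochs": 4,
    "learning_rate": 0.00003,
}

# The handler dumps args as-is into a YAML file, which is then passed to
# `axolotl preprocess <config>` and `axolotl train <config>`.
with open("config.yaml", "w", encoding="utf-8") as f:
    yaml.dump(args, f, default_flow_style=False)
```

Because the mapping is a straight dump, any key accepted by axolotl's YAML config can be supplied through `args`, not only the options listed in the tables above.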
+

## Example Configuration Request

+ +Here's a complete example for fine-tuning a LLaMA model using LoRA: + +```json +{ + "input": { + "user_id": "user", + "model_id": "llama-test", + "run_id": "test-run", + "credentials": { + "wandb_api_key": "", + "hf_token": "" + }, + "args": { + "base_model": "NousResearch/Llama-3.2-1B", + "load_in_8bit": false, + "load_in_4bit": false, + "strict": false, + "datasets": [ + { + "path": "teknium/GPT4-LLM-Cleaned", + "type": "alpaca" + } + ], + "dataset_prepared_path": "last_run_prepared", + "val_set_size": 0.1, + "output_dir": "./outputs/lora-out", + "adapter": "lora", + "sequence_len": 2048, + "sample_packing": true, + "eval_sample_packing": true, + "pad_to_sequence_len": true, + "lora_r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "q_proj", + "v_proj", + "k_proj", + "o_proj" + ], + "gradient_accumulation_steps": 2, + "micro_batch_size": 2, + "num_epochs": 1, + "optimizer": "adamw_8bit", + "lr_scheduler": "cosine", + "learning_rate": 0.0002, + "train_on_inputs": false, + "group_by_length": false, + "bf16": "auto", + "tf32": false, + "gradient_checkpointing": true, + "logging_steps": 1, + "flash_attention": true, + "loss_watchdog_threshold": 5, + "loss_watchdog_patience": 3, + "warmup_steps": 10, + "evals_per_epoch": 4, + "saves_per_epoch": 1, + "weight_decay": 0, + "hub_model_id": "runpod/llama-fr-lora", + "wandb_name": "test-run-1", + "wandb_project": "test-run-1", + "wandb_entity": "axo-test", + "special_tokens": { + "pad_token": "<|end_of_text|>" + } + } + } +} +``` + +
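To submit this request to a deployed endpoint, one option is the `runpod` Python SDK. This is a minimal sketch, not part of this worker: the endpoint ID is a hypothetical placeholder, and the exact payload/polling helpers can differ between SDK versions, so check the SDK docs for the version you install.

```python
import json
import os

import runpod

runpod.api_key = os.environ["RUNPOD_API_KEY"]
endpoint = runpod.Endpoint("YOUR_ENDPOINT_ID")  # hypothetical endpoint ID

# Load the example request body above from a local file
with open("request.json", "r", encoding="utf-8") as f:
    request = json.load(f)

run = endpoint.run(request)
print(run.status())  # e.g. "IN_QUEUE", then "IN_PROGRESS"
print(run.output())  # waits for completion and returns the job output
```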
+

### Advanced Features

#### Wandb Integration

- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run

#### Performance Optimization

- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use Flash Attention implementation
- `xformers_attention`: Use xFormers attention implementation

### Available Optimizers

The following optimizers are supported:

- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer

## Notes

- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` to fit your GPU memory; the effective batch size is `micro_batch_size * gradient_accumulation_steps * GPU count`

For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).

### Troubleshooting

- If you run into issues with Flash Attention 2, delete your worker and restart it.

diff --git a/.runpod/hub.json b/.runpod/hub.json new file mode 100644 index 000000000..a243a27d8 --- /dev/null +++ b/.runpod/hub.json @@ -0,0 +1,93 @@
{
  "title": "Axolotl Fine-Tuning",
  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
  "type": "serverless",
  "category": "language",
  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
  "config": {
    "runsOn": "GPU",
    "containerDiskInGb": 200,
    "gpuCount": 1,
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ],
    "presets": [],
    "env": [
      {
        "key": "TOKENIZER",
        "input": {
          "name": "Tokenizer",
          "type": "string",
          "description": "Name or path of the Hugging Face tokenizer to use.",
          "default": "",
          "advanced": true
        }
      },
      {
        "key": "MAX_NUM_SEQS",
        "input": {
          "name": "Max Num Seqs",
          "type": "number",
          "description": "Maximum number of sequences per iteration.",
          "default": 256,
          "advanced": true
        }
      },
      {
        "key": "DISABLE_LOG_STATS",
        "input": {
          "name": "Disable Log Stats",
          "type": "boolean",
          "description": "Disable logging statistics.",
          "default": false,
          "trueValue": "true",
          "falseValue": "false"
        }
      },
      {
        "key": "LOAD_FORMAT",
        "input": {
          "name": "Load Format",
          "type": "string",
          "description": "The format of the model weights to load.",
          "default": "auto",
          "options": [
            { "label": "auto", "value": "auto" },
            { "label": "pt", "value": "pt" },
            { "label": "safetensors", "value": "safetensors" },
            { "label": "npcache", "value": "npcache" },
            { "label": "dummy", "value": "dummy" },
            { "label": "tensorizer", "value": "tensorizer" },
            { "label": "bitsandbytes", "value": "bitsandbytes" }
          ],
          "advanced": true
        }
      }
    ]
  }
}
diff --git a/.runpod/requirements.txt b/.runpod/requirements.txt new file mode 100644 index 000000000..345bdda35 --- /dev/null +++ b/.runpod/requirements.txt @@ -0,0 +1,7 @@
# Required Python packages get listed here, one per line.
# Recommended to lock the version number to avoid unexpected changes.
+ +# You can also install packages from a git repository, e.g.: +# git+https://github.com/runpod/runpod-python.git +# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/ +runpod~=1.7.0 diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml new file mode 100644 index 000000000..4dff37cae --- /dev/null +++ b/.runpod/src/config/config.yaml @@ -0,0 +1,577 @@ +# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files +# # This can also be a relative path to a model on disk +# base_model: ./llama-7b-hf +# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc) +# base_model_ignore_patterns: +# # If the base_model repo on hf hub doesn't include configuration .json files, +# # You can set that here, or leave this empty to default to base_model +# base_model_config: ./llama-7b-hf +# # You can specify to choose a specific model revision from huggingface hub +# model_revision: +# # Optional tokenizer configuration override in case you want to use a different tokenizer +# # than the one defined in the base model +# tokenizer_config: +# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too +# model_type: AutoModelForCausalLM +# # Corresponding tokenizer for the model AutoTokenizer is a good choice +# tokenizer_type: AutoTokenizer +# # Trust remote code for untrusted source +# trust_remote_code: +# # use_fast option for tokenizer loading from_pretrained, default to True +# tokenizer_use_fast: +# # Whether to use the legacy tokenizer setting, defaults to True +# tokenizer_legacy: +# # Resize the model embeddings when new tokens are added to multiples of 32 +# # This is reported to improve training speed on some models +# resize_token_embeddings_to_32x: + +# # Used to identify which the model is based on +# is_falcon_derived_model: +# is_llama_derived_model: +# # Please note that if you set this to true, `padding_side` will be set to "left" by default +# is_mistral_derived_model: +# is_qwen_derived_model: + +# # optional overrides to the base model configuration +# model_config: +# # RoPE Scaling https://github.com/huggingface/transformers/pull/24653 +# rope_scaling: +# type: # linear | dynamic +# factor: # float + + +# # Whether you are training a 4-bit GPTQ quantized model +# gptq: true +# gptq_groupsize: 128 # group size +# gptq_model_v1: false # v1 or v2 + +# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer +# load_in_8bit: true +# # Use bitsandbytes 4 bit +# load_in_4bit: + +# # Use CUDA bf16 +# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere +# # Use CUDA fp16 +# fp16: true +# # Use CUDA tf32 +# tf32: true # require >=ampere + +# # No AMP (automatic mixed precision) +# bfloat16: true # require >=ampere +# float16: true + +# # A list of one or more datasets to finetune the model with +# datasets: +# # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files +# - path: vicgalle/alpaca-gpt4 +# # The type of prompt to use for training. 
[alpaca, sharegpt, gpteacher, oasst, reflection] +# type: alpaca # format | format: (chat/instruct) | .load_ +# ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file +# data_files: # Optional[str] path to source data files +# shards: # Optional[int] number of shards to split data into +# name: # Optional[str] name of dataset configuration to load +# train_on_split: train # Optional[str] name of dataset split to load from + +# # Optional[str] fastchat conversation type, only used with type: sharegpt +# conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py +# field_human: # Optional[str]. Human key to use for conversation. +# field_model: # Optional[str]. Assistant key to use for conversation. + +# # Custom user prompt +# - path: repo +# type: +# # The below are defaults. only set what's needed. +# system_prompt: "" +# system_format: "{system}" +# field_system: system +# field_instruction: instruction +# field_input: input +# field_output: output + +# # Customizable to be single line or multi-line +# # 'format' can include {input} +# format: |- +# User: {instruction} {input} +# Assistant: +# # 'no_input_format' cannot include {input} +# no_input_format: "{instruction} " + +# # For `completion` datsets only, uses the provided field instead of `text` column +# field: + +# # Axolotl attempts to save the dataset as an arrow after packing the data together so +# # subsequent training attempts load faster, relative path +# dataset_prepared_path: data/last_run_prepared +# # Push prepared dataset to hub +# push_dataset_to_hub: # repo path +# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` +# # if not set. +# dataset_processes: # defaults to os.cpu_count() if not set +# # push checkpoints to hub +# hub_model_id: # repo path to push finetuned model +# # how to push checkpoints to hub +# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy +# hub_strategy: +# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets +# # Required to be true when used in combination with `push_dataset_to_hub` +# hf_use_auth_token: # boolean +# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval. +# val_set_size: 0.04 +# # Num shards for whole dataset +# dataset_shard_num: +# # Index of shard to use for whole dataset +# dataset_shard_idx: + +# # The maximum length of an input to train with, this should typically be less than 2048 +# # as most models have a token/context limit of 2048 +# sequence_len: 2048 +# # Pad inputs so each step uses constant sized buffers +# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently +# pad_to_sequence_len: +# # Max sequence length to concatenate training samples together up to +# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning +# # FutureWarning: This will soon be DEPRECATED +# max_packed_sequence_len: 1024 +# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true' +# sample_packing: +# # Set to 'false' if getting errors during eval with sample_packing on. +# eval_sample_packing: +# # You can set these packing optimizations AFTER starting a training at least once. 
+# # The trainer will provide recommended values for these values. +# sample_packing_eff_est: +# total_num_tokens: + +# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model +# adapter: lora +# # If you already have a lora model trained that you want to load, put that here. +# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`. +# lora_model_dir: + +# # LoRA hyperparameters +# # For more details about the following options, see: +# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 +# lora_r: 8 +# lora_alpha: 16 +# lora_dropout: 0.05 +# lora_target_modules: +# - q_proj +# - v_proj +# # - k_proj +# # - o_proj +# # - gate_proj +# # - down_proj +# # - up_proj +# lora_target_linear: # If true, will target all linear layers + +# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. +# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. +# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities. +# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994 +# lora_modules_to_save: +# # - embed_tokens +# # - lm_head + +# # Once you complete training, the model will be saved to the following directory. +# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory. +# # Make sure `lora_model_dir` points to this directory if you want to use the trained model. +# lora_out_dir: +# lora_fan_in_fan_out: false + +# # ReLoRA configuration +# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed +# relora_steps: # Number of steps per ReLoRA restart +# relora_warmup_steps: # Number of per-restart warmup steps +# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings + +# # wandb configuration if you're using it +# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb +# wandb_project: # Your wandb project name +# wandb_entity: # A wandb Team name if using a Team +# wandb_watch: +# wandb_run_id: # Set the name of your wandb run +# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training + +# # Where to save the full-finetuned model to +# output_dir: ./completed-model + +# # Whether to use torch.compile and which backend to use +# torch_compile: # bool +# torch_compile_backend: # Optional[str] + +# # Training hyperparameters + +# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps. +# gradient_accumulation_steps: 1 +# # The number of samples to include in each batch. This is the number of samples sent to each GPU. +# micro_batch_size: 2 +# eval_batch_size: +# num_epochs: 4 +# warmup_steps: 100 # cannot use with warmup_ratio +# warmup_ratio: 0.05 # cannot use with warmup_steps +# learning_rate: 0.00003 +# lr_quadratic_warmup: +# logging_steps: +# save_strategy: # Set to `no` to skip checkpoint saves +# save_steps: # Leave empty to save at each epoch +# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. 
decimal for fraction of total steps +# save_total_limit: # Checkpoints saved at a time +# # Maximum number of iterations to train for. It precedes num_epochs which means that +# # if both are set, num_epochs will not be guaranteed. +# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps +# max_steps: + +# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0 +# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128 + +# # Save model as safetensors (require safetensors package) +# save_safetensors: + +# # Whether to mask out or include the human's prompt from the training labels +# train_on_inputs: false +# # Group similarly sized data to minimize padding. +# # May be slower to start, as it must download and sort the entire dataset. +# # Note that training loss may have an oscillating pattern with this enabled. +# group_by_length: false + +# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing +# gradient_checkpointing: false + +# # Stop training after this many evaluation losses have increased in a row +# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback +# early_stopping_patience: 3 + +# # Specify a scheduler and kwargs to use with the optimizer +# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine +# lr_scheduler_kwargs: + +# # For one_cycle optim +# lr_div_factor: # Learning rate div factor + +# # For log_sweep optim +# log_sweep_min_lr: +# log_sweep_max_lr: + +# # Specify optimizer +# # Valid values are driven by the Transformers OptimizerNames class, see: +# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134 +# # +# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of +# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used +# # in the examples/ for your model and fine-tuning use case. 
+# # +# # Valid values for 'optimizer' include: +# # - adamw_hf +# # - adamw_torch +# # - adamw_torch_fused +# # - adamw_torch_xla +# # - adamw_apex_fused +# # - adafactor +# # - adamw_anyprecision +# # - sgd +# # - adagrad +# # - adamw_bnb_8bit +# # - lion_8bit +# # - lion_32bit +# # - paged_adamw_32bit +# # - paged_adamw_8bit +# # - paged_lion_32bit +# # - paged_lion_8bit +# optimizer: +# # Specify weight decay +# weight_decay: +# # adamw hyperparams +# adam_beta1: +# adam_beta2: +# adam_epsilon: +# # Gradient clipping max norm +# max_grad_norm: + +# # Augmentation techniques +# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings +# # currently only supported on Llama and Mistral +# noisy_embedding_alpha: + +# # Whether to bettertransformers +# flash_optimum: +# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers: +# xformers_attention: +# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention: +# flash_attention: +# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only +# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only +# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation +# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation +# # Whether to use scaled-dot-product attention +# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html +# sdp_attention: +# # Landmark attention (only llama) +# landmark_attention: +# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py +# # LLaMA only +# xpos_rope: + +# # Resume from a specific checkpoint dir +# resume_from_checkpoint: +# # If resume_from_checkpoint isn't set and you simply want it to start where it left off. +# # Be careful with this being turned on between different models. +# auto_resume_from_checkpoints: false + +# # Don't mess with this, it's here for accelerate and torchrun +# local_rank: + +# # Add or change special tokens. +# # If you add tokens here, you don't need to add them to the `tokens` list. +# special_tokens: +# # bos_token: "" +# # eos_token: "" +# # unk_token: "" + +# # Add extra tokens. +# tokens: + +# # FSDP +# fsdp: +# fsdp_config: + +# # Deepspeed config path. 
e.g., deepspeed/zero3.json +# deepspeed: + +# # Advanced DDP Arguments +# ddp_timeout: +# ddp_bucket_cap_mb: +# ddp_broadcast_buffers: + +# # Path to torch distx for optim 'adamw_anyprecision' +# torchdistx_path: + +# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize +# pretraining_dataset: + +# # Debug mode +# debug: + +# # Seed +# seed: + +# # Allow overwrite yml config using from cli +# strict: + + + +base_model: ${BASE_MODEL} +base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS} +base_model_config: ${BASE_MODEL_CONFIG} +revision_of_model: ${REVISION_OF_MODEL} +tokenizer_config: ${TOKENIZER_CONFIG} +model_type: ${MODEL_TYPE} +tokenizer_type: ${TOKENIZER_TYPE} +trust_remote_code: ${TRUST_REMOTE_CODE} +tokenizer_use_fast: ${TOKENIZER_USE_FAST} +tokenizer_legacy: ${TOKENIZER_LEGACY} +resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X} + +is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL} +is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL} +is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL} +is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL} + +overrides_of_model_config: + rope_scaling: + type: ${ROPE_SCALING_TYPE} + factor: ${ROPE_SCALING_FACTOR} + +bnb_config_kwargs: + llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT} + bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE} + bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT} + +gptq: ${GPTQ} +load_in_8bit: ${LOAD_IN_8BIT} +load_in_4bit: ${LOAD_IN_4BIT} +bf16: ${BF16} +fp16: ${FP16} +tf32: ${TF32} +bfloat16: ${BFLOAT16} +float16: ${FLOAT16} + +gpu_memory_limit: ${GPU_MEMORY_LIMIT} +lora_on_cpu: ${LORA_ON_CPU} + +datasets: + - path: ${DATASET_PATH} + type: ${DATASET_TYPE} + ds_type: ${DATASET_DS_TYPE} + data_files: ${DATASET_DATA_FILES} + shards: ${DATASET_SHARDS} + name: ${DATASET_NAME} + train_on_split: ${DATASET_TRAIN_ON_SPLIT} + revision: ${DATASET_REVISION} + trust_remote_code: ${DATASET_TRUST_REMOTE_CODE} + +rl: ${RL} +dpo_use_weighting: ${DPO_USE_WEIGHTING} + +chat_template: ${CHAT_TEMPLATE} +chat_template_jinja: ${CHAT_TEMPLATE_JINJA} +default_system_message: ${DEFAULT_SYSTEM_MESSAGE} +dataset_prepared_path: ${DATASET_PREPARED_PATH} +push_dataset_to_hub: ${PUSH_DATASET_TO_HUB} +dataset_processes: ${DATASET_PROCESSES} +dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY} +hub_model_id: ${HUB_MODEL_ID} +hub_strategy: ${HUB_STRATEGY} +hf_use_auth_token: ${HF_USE_AUTH_TOKEN} +val_set_size: ${VAL_SET_SIZE} +dataset_shard_num: ${DATASET_SHARD_NUM} +dataset_shard_idx: ${DATASET_SHARD_IDX} + +sequence_len: ${SEQUENCE_LEN} +pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN} +sample_packing: ${SAMPLE_PACKING} +eval_sample_packing: ${EVAL_SAMPLE_PACKING} +sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST} +total_num_tokens: ${TOTAL_NUM_TOKENS} +sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE} +sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE} + +batch_flattening: ${BATCH_FLATTENING} +device_map: ${DEVICE_MAP} +max_memory: ${MAX_MEMORY} + +adapter: ${ADAPTER} +lora_model_dir: ${LORA_MODEL_DIR} + +lora_r: ${LORA_R} +lora_alpha: ${LORA_ALPHA} +lora_dropout: ${LORA_DROPOUT} +lora_target_modules: + - ${LORA_TARGET_MODULES} +lora_target_linear: ${LORA_TARGET_LINEAR} +peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM} +lora_modules_to_save: ${LORA_MODULES_TO_SAVE} +lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT} + +loraplus_lr_ratio: ${LORAPLUS_LR_RATIO} +loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING} + +peft: + loftq_config: + loftq_bits: ${LOFTQ_BITS} + +relora_steps: ${RELORA_STEPS} 
+relora_warmup_steps: ${RELORA_WARMUP_STEPS} +relora_anneal_steps: ${RELORA_ANNEAL_STEPS} +relora_prune_ratio: ${RELORA_PRUNE_RATIO} +relora_cpu_offload: ${RELORA_CPU_OFFLOAD} + +wandb_mode: ${WANDB_MODE} +wandb_project: ${WANDB_PROJECT} +wandb_entity: ${WANDB_ENTITY} +wandb_watch: ${WANDB_WATCH} +wandb_name: ${WANDB_NAME} +wandb_run_id: ${WANDB_RUN_ID} +wandb_log_model: ${WANDB_LOG_MODEL} + +mlflow_tracking_uri: ${MLFLOW_TRACKING_URI} +mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME} +mlflow_run_name: ${MLFLOW_RUN_NAME} +hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS} + +use_comet: ${USE_COMET} +comet_api_key: ${COMET_API_KEY} +comet_workspace: ${COMET_WORKSPACE} +comet_project_name: ${COMET_PROJECT_NAME} +comet_experiment_key: ${COMET_EXPERIMENT_KEY} +comet_mode: ${COMET_MODE} +comet_online: ${COMET_ONLINE} +comet_experiment_config: ${COMET_EXPERIMENT_CONFIG} + +output_dir: ${OUTPUT_DIR} + +torch_compile: ${TORCH_COMPILE} +torch_compile_backend: ${TORCH_COMPILE_BACKEND} + +gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS} +micro_batch_size: ${MICRO_BATCH_SIZE} +eval_batch_size: ${EVAL_BATCH_SIZE} +num_epochs: ${NUM_EPOCHS} +warmup_steps: ${WARMUP_STEPS} +warmup_ratio: ${WARMUP_RATIO} +learning_rate: ${LEARNING_RATE} +lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP} +logging_steps: ${LOGGING_STEPS} +eval_steps: ${EVAL_STEPS} +evals_per_epoch: ${EVALS_PER_EPOCH} +save_strategy: ${SAVE_STRATEGY} +save_steps: ${SAVE_STEPS} +saves_per_epoch: ${SAVES_PER_EPOCH} +save_total_limit: ${SAVE_TOTAL_LIMIT} +max_steps: ${MAX_STEPS} + +eval_table_size: ${EVAL_TABLE_SIZE} +eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS} +eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS} + +profiler_steps: ${PROFILER_STEPS} +loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD} +loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE} + +save_safetensors: ${SAVE_SAFETENSORS} +train_on_inputs: ${TRAIN_ON_INPUTS} +group_by_length: ${GROUP_BY_LENGTH} +gradient_checkpointing: ${GRADIENT_CHECKPOINTING} +early_stopping_patience: ${EARLY_STOPPING_PATIENCE} + +lr_scheduler: ${LR_SCHEDULER} +lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS} +cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO} +cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO} +lr_div_factor: ${LR_DIV_FACTOR} + +optimizer: ${OPTIMIZER} +optim_args: ${OPTIM_ARGS} +optim_target_modules: ${OPTIM_TARGET_MODULES} +weight_decay: ${WEIGHT_DECAY} +adam_beta1: ${ADAM_BETA1} +adam_beta2: ${ADAM_BETA2} +adam_epsilon: ${ADAM_EPSILON} +max_grad_norm: ${MAX_GRAD_NORM} + +neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA} + +flash_optimum: ${FLASH_OPTIMUM} +xformers_attention: ${XFORMERS_ATTENTION} +flash_attention: ${FLASH_ATTENTION} +flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY} +flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM} +flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV} +flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP} +sdp_attention: ${SDP_ATTENTION} +s2_attention: ${S2_ATTENTION} +resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT} +auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS} + +local_rank: ${LOCAL_RANK} + +special_tokens: + bos_token: ${SPECIAL_TOKEN_BOS} + eos_token: ${SPECIAL_TOKEN_EOS} + unk_token: ${SPECIAL_TOKEN_UNK} + pad_token: ${SPECIAL_TOKEN_PAD} + +tokens: ${TOKENS} + +fsdp: ${FSDP} +fsdp_config: ${FSDP_CONFIG} +deepspeed: ${DEEPSPEED} + +ddp_timeout: ${DDP_TIMEOUT} +ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB} +ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS} + +torchdistx_path: ${TORCHDISTX_PATH} +pretraining_dataset: ${PRETRAINING_DATASET} +debug: ${DEBUG} +seed: ${SEED} 
+strict: ${STRICT}

diff --git a/.runpod/src/handler.py b/.runpod/src/handler.py new file mode 100644 index 000000000..21073dff4 --- /dev/null +++ b/.runpod/src/handler.py @@ -0,0 +1,64 @@
"""
Runpod serverless entrypoint handler
"""

import os

import runpod
import yaml
from huggingface_hub import login  # public API; avoids importing the private `_login` module
from train import train
from utils import get_output_dir

BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
if not os.path.exists(BASE_VOLUME):
    os.makedirs(BASE_VOLUME)

logger = runpod.RunPodLogger()


async def handler(job):
    runpod_job_id = job["id"]
    inputs = job["input"]
    run_id = inputs.get("run_id", "default_run_id")
    args = inputs.get("args", {})

    # Set output directory
    output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
    args["output_dir"] = output_dir

    # Save args to a temporary config file
    config_path = "/workspace/test_config.yaml"

    # Add run_name and job_id to args before saving
    args["run_name"] = run_id
    args["runpod_job_id"] = runpod_job_id

    yaml_data = yaml.dump(args, default_flow_style=False)
    with open(config_path, "w", encoding="utf-8") as file:
        file.write(yaml_data)

    # Handle credentials
    credentials = inputs.get("credentials", {})

    if "wandb_api_key" in credentials:
        os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
    if "hf_token" in credentials:
        os.environ["HF_TOKEN"] = credentials["hf_token"]

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        logger.info("No HF_TOKEN provided. Skipping login.")

    logger.info("Starting Training.")
    async for result in train(config_path):  # Pass the config path instead of args
        logger.info(result)
        yield result  # stream log lines; aggregated via return_aggregate_stream below
    logger.info("Training Complete.")

    # Cleanup: pop with a default so missing keys don't raise KeyError
    os.environ.pop("WANDB_API_KEY", None)
    os.environ.pop("HF_TOKEN", None)


runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
diff --git a/.runpod/src/test_input.json b/.runpod/src/test_input.json new file mode 100644 index 000000000..889e8ee27 --- /dev/null +++ b/.runpod/src/test_input.json @@ -0,0 +1,61 @@
{
  "input": {
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "NousResearch/Meta-Llama-3-8B",
      "model_type": "LlamaForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_8bit": true,
      "load_in_4bit": false,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca"
        }
      ],
      "val_set_size": 0.05,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "lora",
      "lora_r": 32,
      "lora_alpha": 16,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 4,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_bnb_8bit",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": false,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|end_of_text|>"
      }
    }
  }
}
diff --git a/.runpod/src/train.py b/.runpod/src/train.py new file mode 100644 index 000000000..72edda940 --- /dev/null +++
b/.runpod/src/train.py @@ -0,0 +1,45 @@ +""" +Runpod train entrypoint +""" + +import asyncio + + +async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True): + """ + Run preprocessing (if enabled) and training with the given config file + :param config_path: Path to the YAML config file + :param gpu_id: GPU ID to use (default: "0") + :param preprocess: Whether to run preprocessing (default: True) + + """ + # First check if preprocessing is needed + if preprocess: + # Preprocess command + preprocess_cmd = ( + f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}" + ) + process = await asyncio.create_subprocess_shell( + preprocess_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + + if process.stdout is not None: + async for line in process.stdout: + yield f"Preprocessing: {line.decode().strip()}" + await process.wait() + yield "Preprocessing completed." + else: + yield "Skipping preprocessing step." + + # Training command + train_cmd = f"axolotl train {config_path}" + process = await asyncio.create_subprocess_shell( + train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT + ) + + if process.stdout is not None: + async for line in process.stdout: + yield f"Training: {line.decode().strip()}" + await process.wait() diff --git a/.runpod/src/utils.py b/.runpod/src/utils.py new file mode 100644 index 000000000..8245aecf4 --- /dev/null +++ b/.runpod/src/utils.py @@ -0,0 +1,89 @@ +""" +Runpod launcher utils +""" + +import os + +import yaml + + +def get_output_dir(run_id): + path = f"fine-tuning/{run_id}" + return path + + +def make_valid_config(input_args): + """ + Creates and saves updated config file, returns the path to the new config + :param input_args: dict of input args + :return: str, path to the updated config file + """ + # Load default config + with open("config/config.yaml", "r", encoding="utf-8") as fin: + all_args = yaml.safe_load(fin) + + if not input_args: + print("No args provided, using defaults") + else: + all_args.update(input_args) + + # Create updated config path + updated_config_path = "config/updated_config.yaml" + + # Save updated config to new file + with open(updated_config_path, "w", encoding="utf-8") as f: + yaml.dump(all_args, f) + + return updated_config_path + + +def set_config_env_vars(args: dict): + """ + Convert API arguments into environment variables. + Handles nested dictionaries, lists, and special values. 
+ + Args: + args (dict): The arguments dictionary from the API request + """ + + def process_value(value): + """Convert Python values to string format for environment variables""" + if value is None: + return "" + if isinstance(value, bool): + return str(value).lower() + if isinstance(value, (list, dict)): + return str(value) + return str(value) + + def set_env_vars(data, prefix=""): + """Recursively set environment variables from nested dictionary""" + for key, value in data.items(): + env_key = prefix + key.upper() + + # Handle special cases + if isinstance(value, dict): + # For nested dictionaries (like special_tokens) + set_env_vars(value, f"{env_key}_") + elif isinstance(value, list): + # Handle list of dictionaries (like datasets) + if value and isinstance(value[0], dict): + for i, item in enumerate(value): + set_env_vars(item, f"{env_key}_{i}_") + else: + # For simple lists (like lora_target_modules) + os.environ[env_key] = process_value(value) + else: + # Handle all other cases + os.environ[env_key] = process_value(value) + + # Clear any existing related environment variables + # This prevents old values from persisting + for key in list(os.environ.keys()): + if key.startswith( + ("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_") + ): + del os.environ[key] + + # Set new environment variables + set_env_vars(args) diff --git a/.runpod/tests.json b/.runpod/tests.json new file mode 100644 index 000000000..1d1e0287b --- /dev/null +++ b/.runpod/tests.json @@ -0,0 +1,85 @@ +{ + "input": { + "name": "quick_smoke_test_sft", + "user_id": "user", + "model_id": "llama-test", + "run_id": "llama-test", + "credentials": { + "wandb_api_key": "", + "hf_token": "" + }, + "args": { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "model_type": "AutoModelForCausalLM", + "tokenizer_type": "AutoTokenizer", + "load_in_8bit": true, + "load_in_4bit": false, + "strict": false, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca" + } + ], + "val_set_size": 0.05, + "output_dir": "./outputs/lora-out", + "sequence_len": 4096, + "sample_packing": true, + "eval_sample_packing": false, + "pad_to_sequence_len": true, + "adapter": "lora", + "lora_r": 32, + "lora_alpha": 64, + "lora_dropout": 0.05, + "lora_target_linear": true, + "lora_modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "gradient_accumulation_steps": 4, + "micro_batch_size": 2, + "num_epochs": 1, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "learning_rate": 0.0002, + "train_on_inputs": false, + "group_by_length": false, + "bf16": "auto", + "tf32": true, + "gradient_checkpointing": true, + "logging_steps": 1, + "flash_attention": true, + "warmup_steps": 1, + "evals_per_epoch": 1, + "eval_max_new_tokens": 128, + "saves_per_epoch": 1, + "weight_decay": 0.0, + "special_tokens": { + "pad_token": "<|endoftext|>" + } + }, + "timeout": 100000 + }, + "config": { + "gpuTypeId": "NVIDIA GeForce RTX 4090", + "gpuCount": 1, + "containerDiskInGb": 200, + "env": [ + { + "key": "TOKENIZER", + "value": "" + }, + { + "key": "DISABLE_LOG_STATS", + "value": "true" + } + ], + "allowedCudaVersions": [ + "12.8", + "12.7", + "12.6", + "12.5", + "12.4" + ] + } +}
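As a closing note on `.runpod/src/utils.py`: the sketch below shows how `set_config_env_vars` flattens a nested `args` dict into the environment variables referenced by the `${...}` placeholders in `config/config.yaml`. The expected variable names are inferred from the code above (upper-cased keys, with nesting and list indices appended as suffixes).

```python
import os

from utils import set_config_env_vars  # .runpod/src/utils.py

args = {
    "base_model": "NousResearch/Llama-3.2-1B",
    "special_tokens": {"pad_token": "<|end_of_text|>"},
    "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
}
set_config_env_vars(args)

# Keys are upper-cased; nested dicts and list indices become suffixes.
print(os.environ["BASE_MODEL"])                # NousResearch/Llama-3.2-1B
print(os.environ["SPECIAL_TOKENS_PAD_TOKEN"])  # <|end_of_text|>
print(os.environ["DATASETS_0_PATH"])           # mhenrichsen/alpaca_2k_test
```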