diff --git a/.runpod/.gitignore b/.runpod/.gitignore
new file mode 100644
index 000000000..383570cfc
--- /dev/null
+++ b/.runpod/.gitignore
@@ -0,0 +1,161 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+pod/scripts/config.yaml
diff --git a/.runpod/Dockerfile b/.runpod/Dockerfile
new file mode 100644
index 000000000..dfd8e1c16
--- /dev/null
+++ b/.runpod/Dockerfile
@@ -0,0 +1,18 @@
+FROM runpod/pytorch:3.10-2.0.0-117
+
+COPY .runpod/requirements.txt /requirements.txt
+RUN --mount=type=cache,target=/root/.cache/pip \
+ python3 -m pip install --upgrade pip && \
+ python3 -m pip install --upgrade -r /requirements.txt
+
+
+# Environment settings
+ARG BASE_VOLUME="/runpod-volume"
+ENV BASE_VOLUME=$BASE_VOLUME
+ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
+ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
+ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
+
+COPY .runpod/src /src
+
+CMD ["python3", "/src/handler.py"]
diff --git a/.runpod/LICENSE b/.runpod/LICENSE
new file mode 100644
index 000000000..a80f426da
--- /dev/null
+++ b/.runpod/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2023 runpod-workers
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/.runpod/README.md b/.runpod/README.md
new file mode 100644
index 000000000..1e2227030
--- /dev/null
+++ b/.runpod/README.md
@@ -0,0 +1,356 @@
+
+
+
+# LLM Training - Full finetune, LoRA, QLoRA, etc. (Llama/Mistral/Gemma)
+
+## RunPod Worker Images
+
+Below is a summary of the available RunPod Worker images, categorized by image stability and CUDA version compatibility.
+
+| Preview Image Tag               | Development Image Tag       |
+|---------------------------------|-----------------------------|
+| `runpod/llm-finetuning:preview` | `runpod/llm-finetuning:dev` |
+
+# Configuration Options
+
+This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.
+
+## Usage
+
+You can use these configuration options:
+
+1. As a JSON request body:
+```json
+{
+ "input": {
+ "user_id": "user",
+ "model_id": "model-name",
+ "run_id": "run-id",
+ "credentials": {
+ "wandb_api_key": "", // add your Weights & Biases key. TODO: you will be able to set this in environment variables.
+ "hf_token": "" // add your HF token. TODO: you will be able to set this in environment variables.
+ },
+ "args": {
+ "base_model": "NousResearch/Llama-3.2-1B",
+ // ... other options
+ }
+ }
+}
+```
+
+## Configuration Options
+
+### Model Configuration
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `base_model` | Path to the base model (local or HuggingFace) | Required |
+| `base_model_config` | Configuration path for the base model | Same as base_model |
+| `revision_of_model` | Specific model revision from HuggingFace hub | Latest |
+| `tokenizer_config` | Custom tokenizer configuration path | Optional |
+| `model_type` | Type of model to load | AutoModelForCausalLM |
+| `tokenizer_type` | Type of tokenizer to use | AutoTokenizer |
+| `hub_model_id` | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional |
+
+
+
+## Model Family Identification
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `is_falcon_derived_model` | `false` | Whether model is Falcon-based |
+| `is_llama_derived_model` | `false` | Whether model is LLaMA-based |
+| `is_qwen_derived_model` | `false` | Whether model is Qwen-based |
+| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |
+
+## Model Configuration Overrides
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `overrides_of_model_config.rope_scaling.type` | `"linear"` | RoPE scaling type (linear/dynamic) |
+| `overrides_of_model_config.rope_scaling.factor` | `1.0` | RoPE scaling factor |
+
+### Model Loading Options
+
+| Option | Description | Default |
+|--------|-------------|---------|
+| `load_in_8bit` | Load model in 8-bit precision | false |
+| `load_in_4bit` | Load model in 4-bit precision | false |
+| `bf16` | Use bfloat16 precision | false |
+| `fp16` | Use float16 precision | false |
+| `tf32` | Use tensor float 32 precision | false |
+
+
+## Memory and Device Settings
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `gpu_memory_limit` | `"20GiB"` | GPU memory limit |
+| `lora_on_cpu` | `false` | Load LoRA on CPU |
+| `device_map` | `"auto"` | Device mapping strategy |
+| `max_memory` | `null` | Max memory per device |
+
+## Training Hyperparameters
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `gradient_accumulation_steps` | `1` | Gradient accumulation steps |
+| `micro_batch_size` | `2` | Batch size per GPU |
+| `eval_batch_size` | `null` | Evaluation batch size |
+| `num_epochs` | `4` | Number of training epochs |
+| `warmup_steps` | `100` | Warmup steps |
+| `warmup_ratio` | `0.05` | Warmup ratio |
+| `learning_rate` | `0.00003` | Learning rate |
+| `lr_quadratic_warmup` | `false` | Quadratic warmup |
+| `logging_steps` | `null` | Logging frequency |
+| `eval_steps` | `null` | Evaluation frequency |
+| `evals_per_epoch` | `null` | Evaluations per epoch |
+| `save_strategy` | `"epoch"` | Checkpoint saving strategy |
+| `save_steps` | `null` | Saving frequency |
+| `saves_per_epoch` | `null` | Saves per epoch |
+| `save_total_limit` | `null` | Maximum checkpoints to keep |
+| `max_steps` | `null` | Maximum training steps |
+
+### Dataset Configuration
+
+```yaml
+datasets:
+ - path: vicgalle/alpaca-gpt4 # HuggingFace dataset (TODO: local paths will be supported later)
+ type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
+ ds_type: json # Dataset type
+ data_files: path/to/data # Source data files
+ train_on_split: train # Dataset split to use
+```
+
+
+## Chat Template Settings
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `chat_template` | `"tokenizer_default"` | Chat template type |
+| `chat_template_jinja` | `null` | Custom Jinja template |
+| `default_system_message` | `"You are a helpful assistant."` | Default system message |
+
+## Dataset Processing
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset |
+| `push_dataset_to_hub` | `""` | Push dataset to HF hub |
+| `dataset_processes` | `4` | Number of preprocessing processes |
+| `dataset_keep_in_memory` | `false` | Keep dataset in memory |
+| `shuffle_merged_datasets` | `true` | Shuffle merged datasets |
+| `dataset_exact_deduplication` | `true` | Deduplicate datasets |
+
+## LoRA Configuration
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `adapter` | `"lora"` | Adapter type (lora/qlora) |
+| `lora_model_dir` | `""` | Directory with pretrained LoRA |
+| `lora_r` | `8` | LoRA attention dimension |
+| `lora_alpha` | `16` | LoRA alpha parameter |
+| `lora_dropout` | `0.05` | LoRA dropout |
+| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA |
+| `lora_target_linear` | `false` | Target all linear modules |
+| `peft_layers_to_transform` | `[]` | Layers to transform |
+| `lora_modules_to_save` | `[]` | Modules to save |
+| `lora_fan_in_fan_out` | `false` | Fan in/out structure |
+
+
+## Optimization Settings
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `train_on_inputs` | `false` | Train on input prompts |
+| `group_by_length` | `false` | Group by sequence length |
+| `gradient_checkpointing` | `false` | Use gradient checkpointing |
+| `early_stopping_patience` | `3` | Early stopping patience |
+
+## Learning Rate Scheduling
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `lr_scheduler` | `"cosine"` | Scheduler type |
+| `lr_scheduler_kwargs` | `{}` | Scheduler parameters |
+| `cosine_min_lr_ratio` | `null` | Minimum LR ratio |
+| `cosine_constant_lr_ratio` | `null` | Constant LR ratio |
+| `lr_div_factor` | `null` | LR division factor |
+
+## Optimizer Settings
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `optimizer` | `"adamw_hf"` | Optimizer choice |
+| `optim_args` | `{}` | Optimizer arguments |
+| `optim_target_modules` | `[]` | Target modules |
+| `weight_decay` | `null` | Weight decay |
+| `adam_beta1` | `null` | Adam beta1 |
+| `adam_beta2` | `null` | Adam beta2 |
+| `adam_epsilon` | `null` | Adam epsilon |
+| `max_grad_norm` | `null` | Gradient clipping |
+
+## Attention Implementations
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `flash_optimum` | `false` | Use better transformers |
+| `xformers_attention` | `false` | Use xformers |
+| `flash_attention` | `false` | Use flash attention |
+| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
+| `flash_attn_rms_norm` | `false` | Flash attention RMS norm |
+| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations |
+| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations |
+| `sdp_attention` | `false` | Use scaled dot product |
+| `s2_attention` | `false` | Use shifted sparse attention |
+
+
+## Tokenizer Modifications
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `special_tokens` | - | Special tokens to add/modify |
+| `tokens` | `[]` | Additional tokens |
+
+## Distributed Training
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `fsdp` | `null` | FSDP configuration |
+| `fsdp_config` | `null` | FSDP config options |
+| `deepspeed` | `null` | Deepspeed config path |
+| `ddp_timeout` | `null` | DDP timeout |
+| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity |
+| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers |
+
+
+
+Example Configuration Request:
+
+Here's a complete example for fine-tuning a LLaMA model using LoRA:
+
+```json
+{
+ "input": {
+ "user_id": "user",
+ "model_id": "llama-test",
+ "run_id": "test-run",
+ "credentials": {
+ "wandb_api_key": "",
+ "hf_token": ""
+ },
+ "args": {
+ "base_model": "NousResearch/Llama-3.2-1B",
+ "load_in_8bit": false,
+ "load_in_4bit": false,
+ "strict": false,
+ "datasets": [
+ {
+ "path": "teknium/GPT4-LLM-Cleaned",
+ "type": "alpaca"
+ }
+ ],
+ "dataset_prepared_path": "last_run_prepared",
+ "val_set_size": 0.1,
+ "output_dir": "./outputs/lora-out",
+ "adapter": "lora",
+ "sequence_len": 2048,
+ "sample_packing": true,
+ "eval_sample_packing": true,
+ "pad_to_sequence_len": true,
+ "lora_r": 16,
+ "lora_alpha": 32,
+ "lora_dropout": 0.05,
+ "lora_target_modules": [
+ "gate_proj",
+ "down_proj",
+ "up_proj",
+ "q_proj",
+ "v_proj",
+ "k_proj",
+ "o_proj"
+ ],
+ "gradient_accumulation_steps": 2,
+ "micro_batch_size": 2,
+ "num_epochs": 1,
+ "optimizer": "adamw_8bit",
+ "lr_scheduler": "cosine",
+ "learning_rate": 0.0002,
+ "train_on_inputs": false,
+ "group_by_length": false,
+ "bf16": "auto",
+ "tf32": false,
+ "gradient_checkpointing": true,
+ "logging_steps": 1,
+ "flash_attention": true,
+ "loss_watchdog_threshold": 5,
+ "loss_watchdog_patience": 3,
+ "warmup_steps": 10,
+ "evals_per_epoch": 4,
+ "saves_per_epoch": 1,
+ "weight_decay": 0,
+ "hub_model_id": "runpod/llama-fr-lora",
+ "wandb_name": "test-run-1",
+ "wandb_project": "test-run-1",
+ "wandb_entity": "axo-test",
+ "special_tokens": {
+ "pad_token": "<|end_of_text|>"
+ }
+ }
+ }
+}
+```
+
+
+### Advanced Features
+
+#### Wandb Integration
+- `wandb_project`: Project name for Weights & Biases
+- `wandb_entity`: Team name in W&B
+- `wandb_watch`: Monitor model with W&B
+- `wandb_name`: Name of the W&B run
+- `wandb_run_id`: ID for the W&B run
+
+
+
+#### Performance Optimization
+- `sample_packing`: Enable efficient sequence packing
+- `eval_sample_packing`: Use sequence packing during evaluation
+- `torch_compile`: Enable PyTorch 2.0 compilation
+- `flash_attention`: Use Flash Attention implementation
+- `xformers_attention`: Use xFormers attention implementation
+
+### Available Optimizers
+
+The following optimizers are supported:
+
+- `adamw_hf`: HuggingFace's AdamW implementation
+- `adamw_torch`: PyTorch's AdamW
+- `adamw_torch_fused`: Fused AdamW implementation
+- `adamw_torch_xla`: XLA-optimized AdamW
+- `adamw_apex_fused`: NVIDIA Apex fused AdamW
+- `adafactor`: Adafactor optimizer
+- `adamw_anyprecision`: Anyprecision AdamW
+- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
+- `lion_8bit`: 8-bit Lion optimizer
+- `lion_32bit`: 32-bit Lion optimizer
+- `sgd`: Stochastic Gradient Descent
+- `adagrad`: Adagrad optimizer
+
+
+
+## Notes
+
+- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
+- Enable `flash_attention: true` for faster training on modern GPUs
+- Use `gradient_checkpointing: true` to reduce memory usage
+- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory
+
+For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).
+
+
+### Errors:
+- If you face any issues with Flash Attention 2, delete your worker and restart it.
+
+
+
+
diff --git a/.runpod/hub.json b/.runpod/hub.json
new file mode 100644
index 000000000..77770c223
--- /dev/null
+++ b/.runpod/hub.json
@@ -0,0 +1,98 @@
+{
+ "title": "Axolotl Fine-Tuning",
+ "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
+ "type": "serverless",
+ "category": "language",
+ "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
+ "config": {
+ "runsOn": "GPU",
+ "containerDiskInGb": 200,
+ "gpuCount": 1,
+ "allowedCudaVersions": [
+ "12.8",
+ "12.7",
+ "12.6",
+ "12.5",
+ "12.4",
+ "12.3",
+ "12.2",
+ "12.1",
+ "12.0",
+ "11.8"
+ ],
+ "presets": [],
+ "env": [
+ {
+ "key": "TOKENIZER",
+ "input": {
+ "name": "Tokenizer",
+ "type": "string",
+ "description": "Name or path of the Hugging Face tokenizer to use.",
+ "default": "",
+ "advanced": true
+ }
+ },
+ {
+ "key": "MAX_NUM_SEQS",
+ "input": {
+ "name": "Max Num Seqs",
+ "type": "number",
+ "description": "Maximum number of sequences per iteration.",
+ "default": 256,
+ "advanced": true
+ }
+ },
+ {
+ "key": "DISABLE_LOG_STATS",
+ "input": {
+ "name": "Disable Log Stats",
+ "type": "boolean",
+ "description": "Disable logging statistics.",
+ "default": false,
+ "trueValue": "true",
+ "falseValue": "false"
+ }
+ },
+ {
+ "key": "LOAD_FORMAT",
+ "input": {
+ "name": "Load Format",
+ "type": "string",
+ "description": "The format of the model weights to load.",
+ "default": "auto",
+ "options": [
+ {
+ "label": "auto",
+ "value": "auto"
+ },
+ {
+ "label": "pt",
+ "value": "pt"
+ },
+ {
+ "label": "safetensors",
+ "value": "safetensors"
+ },
+ {
+ "label": "npcache",
+ "value": "npcache"
+ },
+ {
+ "label": "dummy",
+ "value": "dummy"
+ },
+ {
+ "label": "tensorizer",
+ "value": "tensorizer"
+ },
+ {
+ "label": "bitsandbytes",
+ "value": "bitsandbytes"
+ }
+ ],
+ "advanced": true
+ }
+ }
+ ]
+ }
+}
\ No newline at end of file
diff --git a/.runpod/requirements.txt b/.runpod/requirements.txt
new file mode 100644
index 000000000..af9a6fdcf
--- /dev/null
+++ b/.runpod/requirements.txt
@@ -0,0 +1,15 @@
+# Required Python packages get listed here, one per line.
+# Recommended to lock the version number to avoid unexpected changes.
+
+# You can also install packages from a git repository, e.g.:
+# git+https://github.com/runpod/runpod-python.git
+# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
+runpod~=1.7.0
+huggingface_hub
+typing-extensions
+pydantic
+pydantic-settings
+hf-transfer
+setuptools
+numpy==2.0.0
+axolotl
diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml
new file mode 100644
index 000000000..60cd7a2de
--- /dev/null
+++ b/.runpod/src/config/config.yaml
@@ -0,0 +1,577 @@
+# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
+# # This can also be a relative path to a model on disk
+# base_model: ./llama-7b-hf
+# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
+# base_model_ignore_patterns:
+# # If the base_model repo on hf hub doesn't include configuration .json files,
+# # You can set that here, or leave this empty to default to base_model
+# base_model_config: ./llama-7b-hf
+# # You can specify to choose a specific model revision from huggingface hub
+# model_revision:
+# # Optional tokenizer configuration override in case you want to use a different tokenizer
+# # than the one defined in the base model
+# tokenizer_config:
+# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
+# model_type: AutoModelForCausalLM
+# # Corresponding tokenizer for the model AutoTokenizer is a good choice
+# tokenizer_type: AutoTokenizer
+# # Trust remote code for untrusted source
+# trust_remote_code:
+# # use_fast option for tokenizer loading from_pretrained, default to True
+# tokenizer_use_fast:
+# # Whether to use the legacy tokenizer setting, defaults to True
+# tokenizer_legacy:
+# # Resize the model embeddings when new tokens are added to multiples of 32
+# # This is reported to improve training speed on some models
+# resize_token_embeddings_to_32x:
+
+# # Used to identify which model family the model is based on
+# is_falcon_derived_model:
+# is_llama_derived_model:
+# # Please note that if you set this to true, `padding_side` will be set to "left" by default
+# is_mistral_derived_model:
+# is_qwen_derived_model:
+
+# # optional overrides to the base model configuration
+# model_config:
+# # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
+# rope_scaling:
+# type: # linear | dynamic
+# factor: # float
+
+
+# # Whether you are training a 4-bit GPTQ quantized model
+# gptq: true
+# gptq_groupsize: 128 # group size
+# gptq_model_v1: false # v1 or v2
+
+# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
+# load_in_8bit: true
+# # Use bitsandbytes 4 bit
+# load_in_4bit:
+
+# # Use CUDA bf16
+# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
+# # Use CUDA fp16
+# fp16: true
+# # Use CUDA tf32
+# tf32: true # require >=ampere
+
+# # No AMP (automatic mixed precision)
+# bfloat16: true # require >=ampere
+# float16: true
+
+# # A list of one or more datasets to finetune the model with
+# datasets:
+# # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
+# - path: vicgalle/alpaca-gpt4
+# # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
+# type: alpaca # format | format: (chat/instruct) | .load_
+# ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
+# data_files: # Optional[str] path to source data files
+# shards: # Optional[int] number of shards to split data into
+# name: # Optional[str] name of dataset configuration to load
+# train_on_split: train # Optional[str] name of dataset split to load from
+
+# # Optional[str] fastchat conversation type, only used with type: sharegpt
+# conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
+# field_human: # Optional[str]. Human key to use for conversation.
+# field_model: # Optional[str]. Assistant key to use for conversation.
+
+# # Custom user prompt
+# - path: repo
+# type:
+# # The below are defaults. only set what's needed.
+# system_prompt: ""
+# system_format: "{system}"
+# field_system: system
+# field_instruction: instruction
+# field_input: input
+# field_output: output
+
+# # Customizable to be single line or multi-line
+# # 'format' can include {input}
+# format: |-
+# User: {instruction} {input}
+# Assistant:
+# # 'no_input_format' cannot include {input}
+# no_input_format: "{instruction} "
+
+# # For `completion` datasets only, uses the provided field instead of `text` column
+# field:
+
+# # Axolotl attempts to save the dataset as an arrow after packing the data together so
+# # subsequent training attempts load faster, relative path
+# dataset_prepared_path: data/last_run_prepared
+# # Push prepared dataset to hub
+# push_dataset_to_hub: # repo path
+# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
+# # if not set.
+# dataset_processes: # defaults to os.cpu_count() if not set
+# # push checkpoints to hub
+# hub_model_id: # repo path to push finetuned model
+# # how to push checkpoints to hub
+# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
+# hub_strategy:
+# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
+# # Required to be true when used in combination with `push_dataset_to_hub`
+# hf_use_auth_token: # boolean
+# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
+# val_set_size: 0.04
+# # Num shards for whole dataset
+# dataset_shard_num:
+# # Index of shard to use for whole dataset
+# dataset_shard_idx:
+
+# # The maximum length of an input to train with, this should typically be less than 2048
+# # as most models have a token/context limit of 2048
+# sequence_len: 2048
+# # Pad inputs so each step uses constant sized buffers
+# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
+# pad_to_sequence_len:
+# # Max sequence length to concatenate training samples together up to
+# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
+# # FutureWarning: This will soon be DEPRECATED
+# max_packed_sequence_len: 1024
+# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
+# sample_packing:
+# # Set to 'false' if getting errors during eval with sample_packing on.
+# eval_sample_packing:
+# # You can set these packing optimizations AFTER starting a training at least once.
+# # The trainer will provide recommended values for these values.
+# sample_packing_eff_est:
+# total_num_tokens:
+
+# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
+# adapter: lora
+# # If you already have a lora model trained that you want to load, put that here.
+# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
+# lora_model_dir:
+
+# # LoRA hyperparameters
+# # For more details about the following options, see:
+# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
+# lora_r: 8
+# lora_alpha: 16
+# lora_dropout: 0.05
+# lora_target_modules:
+# - q_proj
+# - v_proj
+# # - k_proj
+# # - o_proj
+# # - gate_proj
+# # - down_proj
+# # - up_proj
+# lora_target_linear: # If true, will target all linear layers
+
+# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
+# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
+# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
+# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
+# lora_modules_to_save:
+# # - embed_tokens
+# # - lm_head
+
+# # Once you complete training, the model will be saved to the following directory.
+# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
+# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
+# lora_out_dir:
+# lora_fan_in_fan_out: false
+
+# # ReLoRA configuration
+# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
+# relora_steps: # Number of steps per ReLoRA restart
+# relora_warmup_steps: # Number of per-restart warmup steps
+# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings
+
+# # wandb configuration if you're using it
+# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
+# wandb_project: # Your wandb project name
+# wandb_entity: # A wandb Team name if using a Team
+# wandb_watch:
+# wandb_run_id: # Set the ID of your wandb run
+# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training
+
+# # Where to save the full-finetuned model to
+# output_dir: ./completed-model
+
+# # Whether to use torch.compile and which backend to use
+# torch_compile: # bool
+# torch_compile_backend: # Optional[str]
+
+# # Training hyperparameters
+
+# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
+# gradient_accumulation_steps: 1
+# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
+# micro_batch_size: 2
+# eval_batch_size:
+# num_epochs: 4
+# warmup_steps: 100 # cannot use with warmup_ratio
+# warmup_ratio: 0.05 # cannot use with warmup_steps
+# learning_rate: 0.00003
+# lr_quadratic_warmup:
+# logging_steps:
+# save_strategy: # Set to `no` to skip checkpoint saves
+# save_steps: # Leave empty to save at each epoch
+# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
+# save_total_limit: # Checkpoints saved at a time
+# # Maximum number of iterations to train for. It takes precedence over num_epochs, which means that
+# # if both are set, num_epochs will not be guaranteed.
+# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
+# max_steps:
+
+# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
+# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128
+
+# # Save model as safetensors (require safetensors package)
+# save_safetensors:
+
+# # Whether to mask out or include the human's prompt from the training labels
+# train_on_inputs: false
+# # Group similarly sized data to minimize padding.
+# # May be slower to start, as it must download and sort the entire dataset.
+# # Note that training loss may have an oscillating pattern with this enabled.
+# group_by_length: false
+
+# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
+# gradient_checkpointing: false
+
+# # Stop training after this many evaluation losses have increased in a row
+# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
+# early_stopping_patience: 3
+
+# # Specify a scheduler and kwargs to use with the optimizer
+# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine
+# lr_scheduler_kwargs:
+
+# # For one_cycle optim
+# lr_div_factor: # Learning rate div factor
+
+# # For log_sweep optim
+# log_sweep_min_lr:
+# log_sweep_max_lr:
+
+# # Specify optimizer
+# # Valid values are driven by the Transformers OptimizerNames class, see:
+# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
+# #
+# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
+# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
+# # in the examples/ for your model and fine-tuning use case.
+# #
+# # Valid values for 'optimizer' include:
+# # - adamw_hf
+# # - adamw_torch
+# # - adamw_torch_fused
+# # - adamw_torch_xla
+# # - adamw_apex_fused
+# # - adafactor
+# # - adamw_anyprecision
+# # - sgd
+# # - adagrad
+# # - adamw_bnb_8bit
+# # - lion_8bit
+# # - lion_32bit
+# # - paged_adamw_32bit
+# # - paged_adamw_8bit
+# # - paged_lion_32bit
+# # - paged_lion_8bit
+# optimizer:
+# # Specify weight decay
+# weight_decay:
+# # adamw hyperparams
+# adam_beta1:
+# adam_beta2:
+# adam_epsilon:
+# # Gradient clipping max norm
+# max_grad_norm:
+
+# # Augmentation techniques
+# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
+# # currently only supported on Llama and Mistral
+# noisy_embedding_alpha:
+
+# # Whether to use BetterTransformer
+# flash_optimum:
+# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
+# xformers_attention:
+# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
+# flash_attention:
+# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only
+# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only
+# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation
+# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
+# # Whether to use scaled-dot-product attention
+# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
+# sdp_attention:
+# # Landmark attention (only llama)
+# landmark_attention:
+# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
+# # LLaMA only
+# xpos_rope:
+
+# # Resume from a specific checkpoint dir
+# resume_from_checkpoint:
+# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
+# # Be careful with this being turned on between different models.
+# auto_resume_from_checkpoints: false
+
+# # Don't mess with this, it's here for accelerate and torchrun
+# local_rank:
+
+# # Add or change special tokens.
+# # If you add tokens here, you don't need to add them to the `tokens` list.
+# special_tokens:
+# # bos_token: ""
+# # eos_token: ""
+# # unk_token: ""
+
+# # Add extra tokens.
+# tokens:
+
+# # FSDP
+# fsdp:
+# fsdp_config:
+
+# # Deepspeed config path. e.g., deepspeed/zero3.json
+# deepspeed:
+
+# # Advanced DDP Arguments
+# ddp_timeout:
+# ddp_bucket_cap_mb:
+# ddp_broadcast_buffers:
+
+# # Path to torchdistx for the 'adamw_anyprecision' optimizer
+# torchdistx_path:
+
+# # Set to a HF dataset of type 'completion' to stream it instead of pre-tokenizing
+# pretraining_dataset:
+
+# # Debug mode
+# debug:
+
+# # Seed
+# seed:
+
+# # Allow overwriting the yml config from the CLI
+# strict:
+
+
+
+base_model: ${BASE_MODEL}
+base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
+base_model_config: ${BASE_MODEL_CONFIG}
+revision_of_model: ${REVISION_OF_MODEL}
+tokenizer_config: ${TOKENIZER_CONFIG}
+model_type: ${MODEL_TYPE}
+tokenizer_type: ${TOKENIZER_TYPE}
+trust_remote_code: ${TRUST_REMOTE_CODE}
+tokenizer_use_fast: ${TOKENIZER_USE_FAST}
+tokenizer_legacy: ${TOKENIZER_LEGACY}
+resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}
+
+is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
+is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
+is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
+is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}
+
+overrides_of_model_config:
+  rope_scaling:  # type and factor are children of rope_scaling
+    type: ${ROPE_SCALING_TYPE}
+    factor: ${ROPE_SCALING_FACTOR}
+
+bnb_config_kwargs:  # nested mapping; the three keys below are its children
+  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
+  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
+  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}
+
+gptq: ${GPTQ}
+load_in_8bit: ${LOAD_IN_8BIT}
+load_in_4bit: ${LOAD_IN_4BIT}
+bf16: ${BF16}
+fp16: ${FP16}
+tf32: ${TF32}
+bfloat16: ${BFLOAT16}
+float16: ${FLOAT16}
+
+gpu_memory_limit: ${GPU_MEMORY_LIMIT}
+lora_on_cpu: ${LORA_ON_CPU}
+
+datasets:  # a single list entry; every key below belongs to that one item
+  - path: ${DATASET_PATH}
+    type: ${DATASET_TYPE}
+    ds_type: ${DATASET_DS_TYPE}
+    data_files: ${DATASET_DATA_FILES}
+    shards: ${DATASET_SHARDS}
+    name: ${DATASET_NAME}
+    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
+    revision: ${DATASET_REVISION}
+    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}
+
+rl: ${RL}
+dpo_use_weighting: ${DPO_USE_WEIGHTING}
+
+chat_template: ${CHAT_TEMPLATE}
+chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
+default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
+dataset_prepared_path: ${DATASET_PREPARED_PATH}
+push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
+dataset_processes: ${DATASET_PROCESSES}
+dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
+hub_model_id: ${HUB_MODEL_ID}
+hub_strategy: ${HUB_STRATEGY}
+hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
+val_set_size: ${VAL_SET_SIZE}
+dataset_shard_num: ${DATASET_SHARD_NUM}
+dataset_shard_idx: ${DATASET_SHARD_IDX}
+
+sequence_len: ${SEQUENCE_LEN}
+pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
+sample_packing: ${SAMPLE_PACKING}
+eval_sample_packing: ${EVAL_SAMPLE_PACKING}
+sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
+total_num_tokens: ${TOTAL_NUM_TOKENS}
+sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
+sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}
+
+batch_flattening: ${BATCH_FLATTENING}
+device_map: ${DEVICE_MAP}
+max_memory: ${MAX_MEMORY}
+
+adapter: ${ADAPTER}
+lora_model_dir: ${LORA_MODEL_DIR}
+
+lora_r: ${LORA_R}
+lora_alpha: ${LORA_ALPHA}
+lora_dropout: ${LORA_DROPOUT}
+lora_target_modules:
+ - ${LORA_TARGET_MODULES}
+lora_target_linear: ${LORA_TARGET_LINEAR}
+peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
+lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
+lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}
+
+loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
+loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}
+
+peft:
+  loftq_config:  # loftq_bits is a child of loftq_config
+    loftq_bits: ${LOFTQ_BITS}
+
+relora_steps: ${RELORA_STEPS}
+relora_warmup_steps: ${RELORA_WARMUP_STEPS}
+relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
+relora_prune_ratio: ${RELORA_PRUNE_RATIO}
+relora_cpu_offload: ${RELORA_CPU_OFFLOAD}
+
+wandb_mode: ${WANDB_MODE}
+wandb_project: ${WANDB_PROJECT}
+wandb_entity: ${WANDB_ENTITY}
+wandb_watch: ${WANDB_WATCH}
+wandb_name: ${WANDB_NAME}
+wandb_run_id: ${WANDB_RUN_ID}
+wandb_log_model: ${WANDB_LOG_MODEL}
+
+mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
+mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
+mlflow_run_name: ${MLFLOW_RUN_NAME}
+hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}
+
+use_comet: ${USE_COMET}
+comet_api_key: ${COMET_API_KEY}
+comet_workspace: ${COMET_WORKSPACE}
+comet_project_name: ${COMET_PROJECT_NAME}
+comet_experiment_key: ${COMET_EXPERIMENT_KEY}
+comet_mode: ${COMET_MODE}
+comet_online: ${COMET_ONLINE}
+comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}
+
+output_dir: ${OUTPUT_DIR}
+
+torch_compile: ${TORCH_COMPILE}
+torch_compile_backend: ${TORCH_COMPILE_BACKEND}
+
+gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
+micro_batch_size: ${MICRO_BATCH_SIZE}
+eval_batch_size: ${EVAL_BATCH_SIZE}
+num_epochs: ${NUM_EPOCHS}
+warmup_steps: ${WARMUP_STEPS}
+warmup_ratio: ${WARMUP_RATIO}
+learning_rate: ${LEARNING_RATE}
+lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
+logging_steps: ${LOGGING_STEPS}
+eval_steps: ${EVAL_STEPS}
+evals_per_epoch: ${EVALS_PER_EPOCH}
+save_strategy: ${SAVE_STRATEGY}
+save_steps: ${SAVE_STEPS}
+saves_per_epoch: ${SAVES_PER_EPOCH}
+save_total_limit: ${SAVE_TOTAL_LIMIT}
+max_steps: ${MAX_STEPS}
+
+eval_table_size: ${EVAL_TABLE_SIZE}
+eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
+eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}
+
+profiler_steps: ${PROFILER_STEPS}
+loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
+loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}
+
+save_safetensors: ${SAVE_SAFETENSORS}
+train_on_inputs: ${TRAIN_ON_INPUTS}
+group_by_length: ${GROUP_BY_LENGTH}
+gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
+early_stopping_patience: ${EARLY_STOPPING_PATIENCE}
+
+lr_scheduler: ${LR_SCHEDULER}
+lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
+cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
+cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
+lr_div_factor: ${LR_DIV_FACTOR}
+
+optimizer: ${OPTIMIZER}
+optim_args: ${OPTIM_ARGS}
+optim_target_modules: ${OPTIM_TARGET_MODULES}
+weight_decay: ${WEIGHT_DECAY}
+adam_beta1: ${ADAM_BETA1}
+adam_beta2: ${ADAM_BETA2}
+adam_epsilon: ${ADAM_EPSILON}
+max_grad_norm: ${MAX_GRAD_NORM}
+
+neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}
+
+flash_optimum: ${FLASH_OPTIMUM}
+xformers_attention: ${XFORMERS_ATTENTION}
+flash_attention: ${FLASH_ATTENTION}
+flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
+flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
+flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV}
+flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
+sdp_attention: ${SDP_ATTENTION}
+s2_attention: ${S2_ATTENTION}
+resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
+auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}
+
+local_rank: ${LOCAL_RANK}
+
+special_tokens:  # nested mapping; the four token keys below are its children
+  bos_token: ${SPECIAL_TOKEN_BOS}
+  eos_token: ${SPECIAL_TOKEN_EOS}
+  unk_token: ${SPECIAL_TOKEN_UNK}
+  pad_token: ${SPECIAL_TOKEN_PAD}
+
+tokens: ${TOKENS}
+
+fsdp: ${FSDP}
+fsdp_config: ${FSDP_CONFIG}
+deepspeed: ${DEEPSPEED}
+
+ddp_timeout: ${DDP_TIMEOUT}
+ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
+ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}
+
+torchdistx_path: ${TORCHDISTX_PATH}
+pretraining_dataset: ${PRETRAINING_DATASET}
+debug: ${DEBUG}
+seed: ${SEED}
+strict: ${STRICT}
\ No newline at end of file
diff --git a/.runpod/src/handler.py b/.runpod/src/handler.py
new file mode 100644
index 000000000..181575322
--- /dev/null
+++ b/.runpod/src/handler.py
@@ -0,0 +1,59 @@
+import runpod
+import os
+from train import train
+from utils import get_output_dir
+from huggingface_hub._login import login
+import yaml
+
+BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
+# exist_ok avoids the race between checking for the directory and creating it.
+os.makedirs(BASE_VOLUME, exist_ok=True)
+
+logger = runpod.RunPodLogger()
+
+
+async def handler(job):
+    """Run one fine-tuning job described by a RunPod serverless request.
+
+    Writes ``job["input"]["args"]`` to a YAML config and logs every message from
+    ``train``; WANDB_API_KEY / HF_TOKEN are restored afterwards, even on failure.
+    """
+    inputs = job["input"]
+    run_id = inputs.get("run_id", "default_run_id")
+    # Copy so the request payload is not mutated; job metadata overrides user args.
+    args = dict(inputs.get("args") or {})
+    args["output_dir"] = os.path.join(BASE_VOLUME, get_output_dir(run_id))
+    args["run_name"] = run_id
+    args["runpod_job_id"] = job["id"]
+
+    # Save args to a temporary config file that is handed to train()
+    config_path = "/workspace/test_config.yaml"
+    with open(config_path, "w") as file:
+        yaml.dump(args, file, default_flow_style=False)
+
+    # Handle credentials: remember the current values so they can be restored.
+    credentials = inputs.get("credentials") or {}
+    env_names = {"wandb_api_key": "WANDB_API_KEY", "hf_token": "HF_TOKEN"}
+    previous = {name: os.environ.get(name) for name in env_names.values()}
+    try:
+        for field, name in env_names.items():
+            if credentials.get(field):  # ignore missing, null or empty values
+                os.environ[name] = credentials[field]
+
+        if os.environ.get("HF_TOKEN"):
+            login(token=os.environ["HF_TOKEN"])
+        else:
+            logger.info("No HF_TOKEN provided. Skipping login.")
+        logger.info("Starting Training.")
+        async for result in train(config_path):
+            logger.info(result)
+        logger.info("Training Complete.")
+    finally:
+        # Cleanup; unlike `del os.environ[...]`, pop() tolerates unset variables.
+        for name, value in previous.items():
+            os.environ.pop(name, None)
+            if value is not None:
+                os.environ[name] = value
+
+
+runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})  # hand the job handler to runpod's serverless runner
diff --git a/.runpod/src/test_input.json b/.runpod/src/test_input.json
new file mode 100644
index 000000000..88edb3123
--- /dev/null
+++ b/.runpod/src/test_input.json
@@ -0,0 +1,61 @@
+{
+ "input": {
+ "user_id": "user",
+ "model_id": "llama-test",
+ "run_id": "llama-test",
+ "credentials": {
+ "wandb_api_key": "",
+ "hf_token": ""
+ },
+ "args": {
+ "base_model": "NousResearch/Meta-Llama-3-8B",
+ "model_type": "LlamaForCausalLM",
+ "tokenizer_type": "AutoTokenizer",
+ "load_in_8bit": true,
+ "load_in_4bit": false,
+ "strict": false,
+ "datasets": [
+ {
+ "path": "mhenrichsen/alpaca_2k_test",
+ "type": "alpaca"
+ }
+ ],
+ "val_set_size": 0.05,
+ "output_dir": "./outputs/lora-out",
+ "sequence_len": 4096,
+ "sample_packing": true,
+ "eval_sample_packing": false,
+ "pad_to_sequence_len": true,
+ "adapter": "lora",
+ "lora_r": 32,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "lora_target_linear": true,
+ "lora_modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "gradient_accumulation_steps": 4,
+ "micro_batch_size": 2,
+ "num_epochs": 1,
+ "optimizer": "adamw_bnb_8bit",
+ "lr_scheduler": "cosine",
+ "learning_rate": 0.0002,
+ "train_on_inputs": false,
+ "group_by_length": false,
+ "bf16": "auto",
+ "tf32": false,
+ "gradient_checkpointing": true,
+ "logging_steps": 1,
+ "flash_attention": true,
+ "warmup_steps": 1,
+ "evals_per_epoch": 1,
+ "eval_max_new_tokens": 128,
+ "saves_per_epoch": 1,
+ "weight_decay": 0.0,
+ "special_tokens": {
+ "pad_token": "<|end_of_text|>"
+ }
+ }
+ }
+}
\ No newline at end of file
diff --git a/.runpod/src/train.py b/.runpod/src/train.py
new file mode 100644
index 000000000..dff606162
--- /dev/null
+++ b/.runpod/src/train.py
@@ -0,0 +1,43 @@
+import yaml
+from torch.cuda import device_count
+import asyncio
+import os
+from typing import Optional, Dict, Any, AsyncGenerator
+
+
+async def _stream_command(label, cmd, env=None):
+    """Run ``cmd`` (no shell), yield "<label>: <line>" per output line, raise on failure."""
+    process = await asyncio.create_subprocess_exec(
+        *cmd,
+        env=env,
+        stdout=asyncio.subprocess.PIPE,
+        stderr=asyncio.subprocess.STDOUT,  # interleave stderr with stdout
+    )
+    async for line in process.stdout:
+        yield f"{label}: {line.decode(errors='replace').strip()}"
+    if await process.wait() != 0:
+        raise RuntimeError(f"{' '.join(cmd)} exited with code {process.returncode}")
+
+
+async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
+    """
+    Run preprocessing (if enabled) and training with the given config file,
+    yielding one progress message per line of subprocess output.
+
+    :param config_path: Path to the YAML config file
+    :param gpu_id: GPU ID made visible to the preprocessing step (default: "0")
+    :param preprocess: Whether to run preprocessing (default: True)
+    :raises RuntimeError: if an axolotl command exits with a non-zero status
+    """
+    if preprocess:
+        env = {**os.environ, "CUDA_VISIBLE_DEVICES": str(gpu_id)}
+        command = ["axolotl", "preprocess", config_path]
+        async for message in _stream_command("Preprocessing", command, env):
+            yield message
+        yield "Preprocessing completed."
+    else:
+        yield "Skipping preprocessing step."
+
+    # As before, CUDA_VISIBLE_DEVICES is not overridden for the training step.
+    async for message in _stream_command("Training", ["axolotl", "train", config_path]):
+        yield message
diff --git a/.runpod/src/utils.py b/.runpod/src/utils.py
new file mode 100644
index 000000000..ed60c7183
--- /dev/null
+++ b/.runpod/src/utils.py
@@ -0,0 +1,96 @@
+import os
+import yaml
+
+
+def get_output_dir(run_id):
+    """Return the relative output directory for a run: ``fine-tuning/<run_id>``."""
+    return f"fine-tuning/{run_id}"
+
+
+# def make_valid_config(input_args):
+# """
+# Currently limited by all possible axolotl args, no defaults
+# :param input_args: dict of input args
+# """
+# all_args = yaml.safe_load(open("config/config.yaml", "r"))
+# if not input_args:
+# print("No args provided, using defaults")
+# else:
+# all_args.update(input_args)
+# return all_args
+
+
+def make_valid_config(input_args):
+    """
+    Merge the input args over the default config and save the result to a new file.
+    :param input_args: dict of input args; its top-level keys replace the defaults
+    :return: str, path to the updated config file
+    """
+    # Load default config; the context manager closes the file handle again.
+    with open("config/config.yaml", "r") as f:
+        # safe_load returns None for an empty or comment-only file.
+        all_args = yaml.safe_load(f) or {}
+
+    if not input_args:
+        print("No args provided, using defaults")
+    else:
+        all_args.update(input_args)
+
+    # Save updated config to new file
+    updated_config_path = "config/updated_config.yaml"
+    with open(updated_config_path, "w") as f:
+        yaml.dump(all_args, f)
+
+    return updated_config_path
+
+
+def set_config_env_vars(args: dict):
+    """
+    Export API arguments as upper-cased environment variables.
+
+    A nested dictionary contributes its key as a ``KEY_`` prefix, a list of
+    dictionaries additionally inserts the element index (``KEY_0_``), and any
+    other value is stored as text under its own key.
+
+    Args:
+        args (dict): The arguments dictionary from the API request
+    """
+    # Variables with these prefixes are removed before the new ones are written.
+    stale_prefixes = (
+        "BASE_MODEL",
+        "MODEL_TYPE",
+        "TOKENIZER_TYPE",
+        "DATASET",
+        "LORA_",
+        "WANDB_",
+    )
+
+    def to_env_text(item):
+        """Render a value as text: None -> "", booleans lower-cased, else str()."""
+        if item is None:
+            return ""
+        text = str(item)
+        return text.lower() if isinstance(item, bool) else text
+
+    def export(mapping, prefix=""):
+        """Write every leaf of ``mapping`` into os.environ, recursing as needed."""
+        for name, item in mapping.items():
+            target = prefix + name.upper()
+
+            if isinstance(item, dict):
+                export(item, f"{target}_")
+                continue
+
+            # Only the first element decides whether a list holds dictionaries.
+            if isinstance(item, list) and item and isinstance(item[0], dict):
+                for index, entry in enumerate(item):
+                    export(entry, f"{target}_{index}_")
+                continue
+
+            os.environ[target] = to_env_text(item)
+
+    # Clear old values first so they cannot persist into this request.
+    for existing in [name for name in os.environ if name.startswith(stale_prefixes)]:
+        del os.environ[existing]
+
+    export(args)
diff --git a/.runpod/tests.json b/.runpod/tests.json
new file mode 100644
index 000000000..633bac460
--- /dev/null
+++ b/.runpod/tests.json
@@ -0,0 +1,93 @@
+{
+ "tests": [
+ {
+ "name": "quick_smoke_test_sft",
+ "input": {
+ "user_id": "user",
+ "model_id": "llama-test",
+ "run_id": "llama-test",
+ "credentials": {
+ "wandb_api_key": "",
+ "hf_token": ""
+ },
+ "args": {
+ "base_model": "NousResearch/Meta-Llama-3-8B",
+ "model_type": "LlamaForCausalLM",
+ "tokenizer_type": "AutoTokenizer",
+ "load_in_8bit": true,
+ "load_in_4bit": false,
+ "strict": false,
+ "datasets": [
+ {
+ "path": "mhenrichsen/alpaca_2k_test",
+ "type": "alpaca"
+ }
+ ],
+ "val_set_size": 0.05,
+ "output_dir": "./outputs/lora-out",
+ "sequence_len": 4096,
+ "sample_packing": true,
+ "eval_sample_packing": false,
+ "pad_to_sequence_len": true,
+ "adapter": "lora",
+ "lora_r": 32,
+ "lora_alpha": 16,
+ "lora_dropout": 0.05,
+ "lora_target_linear": true,
+ "lora_modules_to_save": [
+ "embed_tokens",
+ "lm_head"
+ ],
+ "gradient_accumulation_steps": 4,
+ "micro_batch_size": 2,
+ "num_epochs": 1,
+ "optimizer": "adamw_bnb_8bit",
+ "lr_scheduler": "cosine",
+ "learning_rate": 0.0002,
+ "train_on_inputs": false,
+ "group_by_length": false,
+ "bf16": "auto",
+ "tf32": false,
+ "gradient_checkpointing": true,
+ "logging_steps": 1,
+ "flash_attention": true,
+ "warmup_steps": 1,
+ "evals_per_epoch": 1,
+ "eval_max_new_tokens": 128,
+ "saves_per_epoch": 1,
+ "weight_decay": 0.0,
+ "special_tokens": {
+ "pad_token": "<|end_of_text|>"
+ }
+ }
+ },
+ "timeout": 100000
+ }
+ ],
+ "config": {
+ "gpuTypeId": "NVIDIA GeForce RTX 4090",
+ "gpuCount": 1,
+ "containerDiskInGb": 200,
+ "env": [
+ {
+ "key": "TOKENIZER",
+ "value": ""
+ },
+ {
+ "key": "DISABLE_LOG_STATS",
+ "value": "true"
+ }
+ ],
+ "allowedCudaVersions": [
+ "12.8",
+ "12.7",
+ "12.6",
+ "12.5",
+ "12.4",
+ "12.3",
+ "12.2",
+ "12.1",
+ "12.0"
+ ]
+ }
+}
\ No newline at end of file