From 5d182a105660a48b5e2cca679a9d82f3c23c87cc Mon Sep 17 00:00:00 2001 From: Ezekiel Wotring <40004347+KAJdev@users.noreply.github.com> Date: Mon, 28 Apr 2025 06:08:32 -0800 Subject: [PATCH] Add runpod sls handler (#2530) [skip ci] * Add runpod sls handler * remove LICENSE and fix README * chore: lint * use axolotl cloud image as base and various fixes * fix: trim allowed cuda versions * restore dockerfile * chore: update title * use axolotl cloud image --------- Co-authored-by: Wing Lian Co-authored-by: NanoCode012 --- .runpod/.gitignore | 161 +++++++++ .runpod/Dockerfile | 18 + .runpod/README.md | 335 +++++++++++++++++++ .runpod/hub.json | 93 ++++++ .runpod/requirements.txt | 7 + .runpod/src/config/config.yaml | 577 +++++++++++++++++++++++++++++++++ .runpod/src/handler.py | 64 ++++ .runpod/src/test_input.json | 61 ++++ .runpod/src/train.py | 45 +++ .runpod/src/utils.py | 89 +++++ .runpod/tests.json | 85 +++++ 11 files changed, 1535 insertions(+) create mode 100644 .runpod/.gitignore create mode 100644 .runpod/Dockerfile create mode 100644 .runpod/README.md create mode 100644 .runpod/hub.json create mode 100644 .runpod/requirements.txt create mode 100644 .runpod/src/config/config.yaml create mode 100644 .runpod/src/handler.py create mode 100644 .runpod/src/test_input.json create mode 100644 .runpod/src/train.py create mode 100644 .runpod/src/utils.py create mode 100644 .runpod/tests.json diff --git a/.runpod/.gitignore b/.runpod/.gitignore new file mode 100644 index 000000000..383570cfc --- /dev/null +++ b/.runpod/.gitignore @@ -0,0 +1,161 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. 
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ +pod/scripts/config.yaml diff --git a/.runpod/Dockerfile b/.runpod/Dockerfile new file mode 100644 index 000000000..107caf5f3 --- /dev/null +++ b/.runpod/Dockerfile @@ -0,0 +1,18 @@ +FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0 + +COPY .runpod/requirements.txt /requirements.txt +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --upgrade pip && \ + python3 -m pip install --upgrade -r /requirements.txt + +# Environment settings +ARG BASE_VOLUME="/runpod-volume" +ENV BASE_VOLUME=$BASE_VOLUME +ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets" +ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub" +ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub" + +COPY .runpod/src /src + +WORKDIR /src +CMD ["python3", "/src/handler.py"] diff --git a/.runpod/README.md b/.runpod/README.md new file mode 100644 index 000000000..a631c3937 --- /dev/null +++ b/.runpod/README.md @@ -0,0 +1,335 @@ +

LLM post-training: full fine-tuning, LoRA, QLoRA, and more for Llama, Mistral, Gemma, and other models

+

# Configuration Options

This document outlines all available configuration options for training models. The configuration is provided as a JSON request.

## Usage

Pass the configuration options in the JSON request body:

```json
{
  "input": {
    "user_id": "user",
    "model_id": "model-name",
    "run_id": "run-id",
    "credentials": {
      "wandb_api_key": "", // your Weights & Biases API key; TODO: setting this via environment variables is planned
      "hf_token": "" // your Hugging Face token; TODO: setting this via environment variables is planned
    },
    "args": {
      "base_model": "NousResearch/Llama-3.2-1B"
      // ... other options
    }
  }
}
```

## Model Configuration

| Option              | Description                                                                                    | Default              |
| ------------------- | ---------------------------------------------------------------------------------------------- | -------------------- |
| `base_model`        | Path to the base model (local or HuggingFace)                                                  | Required             |
| `base_model_config` | Configuration path for the base model                                                          | Same as base_model   |
| `revision_of_model` | Specific model revision from HuggingFace hub                                                   | Latest               |
| `tokenizer_config`  | Custom tokenizer configuration path                                                            | Optional             |
| `model_type`        | Type of model to load                                                                          | AutoModelForCausalLM |
| `tokenizer_type`    | Type of tokenizer to use                                                                       | AutoTokenizer        |
| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |

## Model Family Identification

| Option                     | Default | Description                    |
| -------------------------- | ------- | ------------------------------ |
| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |

## Model Configuration Overrides

| Option                                           | Default    | Description                        |
| ------------------------------------------------ | ---------- | ---------------------------------- |
| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |

## Model Loading Options

| Option         | Description                   | Default |
| -------------- | ----------------------------- | ------- |
| `load_in_8bit` | Load model in 8-bit precision | false   |
| `load_in_4bit` | Load model in 4-bit precision | false   |
| `bf16`         | Use bfloat16 precision        | false   |
| `fp16`         | Use float16 precision         | false   |
| `tf32`         | Use TensorFloat-32 precision  | false   |

## Memory and Device Settings

| Option             | Default   | Description             |
| ------------------ | --------- | ----------------------- |
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
| `device_map`       | `"auto"`  | Device mapping strategy |
| `max_memory`       | `null`    | Max memory per device   |

## Training Hyperparameters

| Option                        | Default   | Description                 |
| ----------------------------- | --------- | --------------------------- |
| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
| `micro_batch_size`            | `2`       | Batch size per GPU          |
| `eval_batch_size`             | `null`    | Evaluation batch size       |
| `num_epochs`                  | `4`       | Number of training epochs   |
| `warmup_steps`                | `100`     | Warmup steps                |
| `warmup_ratio`                | `0.05`    | Warmup ratio                |
| `learning_rate`               | `0.00003` | Learning rate               |
| 
`lr_quadratic_warmup` | `false` | Quadratic warmup | +| `logging_steps` | `null` | Logging frequency | +| `eval_steps` | `null` | Evaluation frequency | +| `evals_per_epoch` | `null` | Evaluations per epoch | +| `save_strategy` | `"epoch"` | Checkpoint saving strategy | +| `save_steps` | `null` | Saving frequency | +| `saves_per_epoch` | `null` | Saves per epoch | +| `save_total_limit` | `null` | Maximum checkpoints to keep | +| `max_steps` | `null` | Maximum training steps | + +### Dataset Configuration + +```yaml +datasets: + - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path. + type: alpaca # Format type (alpaca, gpteacher, oasst, etc.) + ds_type: json # Dataset type + data_files: path/to/data # Source data files + train_on_split: train # Dataset split to use +``` + +## Chat Template Settings + +| Option | Default | Description | +| ------------------------ | -------------------------------- | ---------------------- | +| `chat_template` | `"tokenizer_default"` | Chat template type | +| `chat_template_jinja` | `null` | Custom Jinja template | +| `default_system_message` | `"You are a helpful assistant."` | Default system message | + +## Dataset Processing + +| Option | Default | Description | +| ----------------------------- | -------------------------- | --------------------------------- | +| `dataset_prepared_path` | `"data/last_run_prepared"` | Path for prepared dataset | +| `push_dataset_to_hub` | `""` | Push dataset to HF hub | +| `dataset_processes` | `4` | Number of preprocessing processes | +| `dataset_keep_in_memory` | `false` | Keep dataset in memory | +| `shuffle_merged_datasets` | `true` | Shuffle merged datasets | +| `dataset_exact_deduplication` | `true` | Deduplicate datasets | + +## LoRA Configuration + +| Option | Default | Description | +| -------------------------- | ---------------------- | ------------------------------ | +| `adapter` | `"lora"` | Adapter type (lora/qlora) | +| `lora_model_dir` | `""` | Directory with pretrained LoRA | +| `lora_r` | `8` | LoRA attention dimension | +| `lora_alpha` | `16` | LoRA alpha parameter | +| `lora_dropout` | `0.05` | LoRA dropout | +| `lora_target_modules` | `["q_proj", "v_proj"]` | Modules to apply LoRA | +| `lora_target_linear` | `false` | Target all linear modules | +| `peft_layers_to_transform` | `[]` | Layers to transform | +| `lora_modules_to_save` | `[]` | Modules to save | +| `lora_fan_in_fan_out` | `false` | Fan in/out structure | + +## Optimization Settings + +| Option | Default | Description | +| ------------------------- | ------- | -------------------------- | +| `train_on_inputs` | `false` | Train on input prompts | +| `group_by_length` | `false` | Group by sequence length | +| `gradient_checkpointing` | `false` | Use gradient checkpointing | +| `early_stopping_patience` | `3` | Early stopping patience | + +## Learning Rate Scheduling + +| Option | Default | Description | +| -------------------------- | ---------- | -------------------- | +| `lr_scheduler` | `"cosine"` | Scheduler type | +| `lr_scheduler_kwargs` | `{}` | Scheduler parameters | +| `cosine_min_lr_ratio` | `null` | Minimum LR ratio | +| `cosine_constant_lr_ratio` | `null` | Constant LR ratio | +| `lr_div_factor` | `null` | LR division factor | + +## Optimizer Settings + +| Option | Default | Description | +| ---------------------- | ------------ | ------------------- | +| `optimizer` | `"adamw_hf"` | Optimizer choice | +| `optim_args` | `{}` | Optimizer arguments | +| `optim_target_modules` | `[]` | 
Target modules | +| `weight_decay` | `null` | Weight decay | +| `adam_beta1` | `null` | Adam beta1 | +| `adam_beta2` | `null` | Adam beta2 | +| `adam_epsilon` | `null` | Adam epsilon | +| `max_grad_norm` | `null` | Gradient clipping | + +## Attention Implementations + +| Option | Default | Description | +| -------------------------- | ------- | ----------------------------- | +| `flash_optimum` | `false` | Use better transformers | +| `xformers_attention` | `false` | Use xformers | +| `flash_attention` | `false` | Use flash attention | +| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy | +| `flash_attn_rms_norm` | `false` | Flash attention RMS norm | +| `flash_attn_fuse_qkv` | `false` | Fuse QKV operations | +| `flash_attn_fuse_mlp` | `false` | Fuse MLP operations | +| `sdp_attention` | `false` | Use scaled dot product | +| `s2_attention` | `false` | Use shifted sparse attention | + +## Tokenizer Modifications + +| Option | Default | Description | +| ---------------- | ------- | ---------------------------- | +| `special_tokens` | - | Special tokens to add/modify | +| `tokens` | `[]` | Additional tokens | + +## Distributed Training + +| Option | Default | Description | +| ----------------------- | ------- | --------------------- | +| `fsdp` | `null` | FSDP configuration | +| `fsdp_config` | `null` | FSDP config options | +| `deepspeed` | `null` | Deepspeed config path | +| `ddp_timeout` | `null` | DDP timeout | +| `ddp_bucket_cap_mb` | `null` | DDP bucket capacity | +| `ddp_broadcast_buffers` | `null` | DDP broadcast buffers | + +
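Under the hood, the serverless handler simply serializes the request's `args` object into an axolotl YAML config file before launching training (see `src/handler.py` in this PR). A minimal sketch of that mapping, using a handful of the options documented above:

```python
import yaml

# A request's "args" object, built from options in the tables above
args = {
    "base_model": "NousResearch/Llama-3.2-1B",
    "adapter": "lora",
    "lora_r": 8,
    "lora_alpha": 16,
    "micro_batch_size": 2,
    "gradient_accumulation_steps": 1,
    "num_epochs": 4,
    "learning_rate": 0.00003,
}

# The handler dumps args as-is into a YAML file, which is then passed to
# `axolotl preprocess <config>` and `axolotl train <config>`.
with open("config.yaml", "w", encoding="utf-8") as f:
    yaml.dump(args, f, default_flow_style=False)
```

Because the mapping is a straight dump, any key accepted by axolotl's YAML config can be supplied through `args`, not only the options listed in the tables above.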
+

## Example Configuration Request

+ +Here's a complete example for fine-tuning a LLaMA model using LoRA: + +```json +{ + "input": { + "user_id": "user", + "model_id": "llama-test", + "run_id": "test-run", + "credentials": { + "wandb_api_key": "", + "hf_token": "" + }, + "args": { + "base_model": "NousResearch/Llama-3.2-1B", + "load_in_8bit": false, + "load_in_4bit": false, + "strict": false, + "datasets": [ + { + "path": "teknium/GPT4-LLM-Cleaned", + "type": "alpaca" + } + ], + "dataset_prepared_path": "last_run_prepared", + "val_set_size": 0.1, + "output_dir": "./outputs/lora-out", + "adapter": "lora", + "sequence_len": 2048, + "sample_packing": true, + "eval_sample_packing": true, + "pad_to_sequence_len": true, + "lora_r": 16, + "lora_alpha": 32, + "lora_dropout": 0.05, + "lora_target_modules": [ + "gate_proj", + "down_proj", + "up_proj", + "q_proj", + "v_proj", + "k_proj", + "o_proj" + ], + "gradient_accumulation_steps": 2, + "micro_batch_size": 2, + "num_epochs": 1, + "optimizer": "adamw_8bit", + "lr_scheduler": "cosine", + "learning_rate": 0.0002, + "train_on_inputs": false, + "group_by_length": false, + "bf16": "auto", + "tf32": false, + "gradient_checkpointing": true, + "logging_steps": 1, + "flash_attention": true, + "loss_watchdog_threshold": 5, + "loss_watchdog_patience": 3, + "warmup_steps": 10, + "evals_per_epoch": 4, + "saves_per_epoch": 1, + "weight_decay": 0, + "hub_model_id": "runpod/llama-fr-lora", + "wandb_name": "test-run-1", + "wandb_project": "test-run-1", + "wandb_entity": "axo-test", + "special_tokens": { + "pad_token": "<|end_of_text|>" + } + } + } +} +``` + +
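To submit this request to a deployed endpoint, one option is the `runpod` Python SDK. This is a minimal sketch, not part of this worker: the endpoint ID is a hypothetical placeholder, and the exact payload/polling helpers can differ between SDK versions, so check the SDK docs for the version you install.

```python
import json
import os

import runpod

runpod.api_key = os.environ["RUNPOD_API_KEY"]
endpoint = runpod.Endpoint("YOUR_ENDPOINT_ID")  # hypothetical endpoint ID

# Load the example request body above from a local file
with open("request.json", "r", encoding="utf-8") as f:
    request = json.load(f)

run = endpoint.run(request)
print(run.status())  # e.g. "IN_QUEUE", then "IN_PROGRESS"
print(run.output())  # waits for completion and returns the job output
```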
+

### Advanced Features

#### Wandb Integration

- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run

#### Performance Optimization

- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use Flash Attention implementation
- `xformers_attention`: Use xFormers attention implementation

### Available Optimizers

The following optimizers are supported:

- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer

## Notes

- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` to fit your GPU memory; the effective batch size is `micro_batch_size * gradient_accumulation_steps * GPU count`

For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config.html).

### Troubleshooting

- If you run into issues with Flash Attention 2, delete your worker and restart it.

diff --git a/.runpod/hub.json b/.runpod/hub.json new file mode 100644 index 000000000..a243a27d8 --- /dev/null +++ b/.runpod/hub.json @@ -0,0 +1,93 @@
{
  "title": "Axolotl Fine-Tuning",
  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
  "type": "serverless",
  "category": "language",
  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
  "config": {
    "runsOn": "GPU",
    "containerDiskInGb": 200,
    "gpuCount": 1,
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ],
    "presets": [],
    "env": [
      {
        "key": "TOKENIZER",
        "input": {
          "name": "Tokenizer",
          "type": "string",
          "description": "Name or path of the Hugging Face tokenizer to use.",
          "default": "",
          "advanced": true
        }
      },
      {
        "key": "MAX_NUM_SEQS",
        "input": {
          "name": "Max Num Seqs",
          "type": "number",
          "description": "Maximum number of sequences per iteration.",
          "default": 256,
          "advanced": true
        }
      },
      {
        "key": "DISABLE_LOG_STATS",
        "input": {
          "name": "Disable Log Stats",
          "type": "boolean",
          "description": "Disable logging statistics.",
          "default": false,
          "trueValue": "true",
          "falseValue": "false"
        }
      },
      {
        "key": "LOAD_FORMAT",
        "input": {
          "name": "Load Format",
          "type": "string",
          "description": "The format of the model weights to load.",
          "default": "auto",
          "options": [
            { "label": "auto", "value": "auto" },
            { "label": "pt", "value": "pt" },
            { "label": "safetensors", "value": "safetensors" },
            { "label": "npcache", "value": "npcache" },
            { "label": "dummy", "value": "dummy" },
            { "label": "tensorizer", "value": "tensorizer" },
            { "label": "bitsandbytes", "value": "bitsandbytes" }
          ],
          "advanced": true
        }
      }
    ]
  }
}
diff --git a/.runpod/requirements.txt b/.runpod/requirements.txt new file mode 100644 index 000000000..345bdda35 --- /dev/null +++ b/.runpod/requirements.txt @@ -0,0 +1,7 @@
# Required Python packages get listed here, one per line.
# Recommended to lock the version number to avoid unexpected changes.
+ +# You can also install packages from a git repository, e.g.: +# git+https://github.com/runpod/runpod-python.git +# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/ +runpod~=1.7.0 diff --git a/.runpod/src/config/config.yaml b/.runpod/src/config/config.yaml new file mode 100644 index 000000000..4dff37cae --- /dev/null +++ b/.runpod/src/config/config.yaml @@ -0,0 +1,577 @@ +# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files +# # This can also be a relative path to a model on disk +# base_model: ./llama-7b-hf +# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc) +# base_model_ignore_patterns: +# # If the base_model repo on hf hub doesn't include configuration .json files, +# # You can set that here, or leave this empty to default to base_model +# base_model_config: ./llama-7b-hf +# # You can specify to choose a specific model revision from huggingface hub +# model_revision: +# # Optional tokenizer configuration override in case you want to use a different tokenizer +# # than the one defined in the base model +# tokenizer_config: +# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too +# model_type: AutoModelForCausalLM +# # Corresponding tokenizer for the model AutoTokenizer is a good choice +# tokenizer_type: AutoTokenizer +# # Trust remote code for untrusted source +# trust_remote_code: +# # use_fast option for tokenizer loading from_pretrained, default to True +# tokenizer_use_fast: +# # Whether to use the legacy tokenizer setting, defaults to True +# tokenizer_legacy: +# # Resize the model embeddings when new tokens are added to multiples of 32 +# # This is reported to improve training speed on some models +# resize_token_embeddings_to_32x: + +# # Used to identify which the model is based on +# is_falcon_derived_model: +# is_llama_derived_model: +# # Please note that if you set this to true, `padding_side` will be set to "left" by default +# is_mistral_derived_model: +# is_qwen_derived_model: + +# # optional overrides to the base model configuration +# model_config: +# # RoPE Scaling https://github.com/huggingface/transformers/pull/24653 +# rope_scaling: +# type: # linear | dynamic +# factor: # float + + +# # Whether you are training a 4-bit GPTQ quantized model +# gptq: true +# gptq_groupsize: 128 # group size +# gptq_model_v1: false # v1 or v2 + +# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer +# load_in_8bit: true +# # Use bitsandbytes 4 bit +# load_in_4bit: + +# # Use CUDA bf16 +# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere +# # Use CUDA fp16 +# fp16: true +# # Use CUDA tf32 +# tf32: true # require >=ampere + +# # No AMP (automatic mixed precision) +# bfloat16: true # require >=ampere +# float16: true + +# # A list of one or more datasets to finetune the model with +# datasets: +# # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files +# - path: vicgalle/alpaca-gpt4 +# # The type of prompt to use for training. 
[alpaca, sharegpt, gpteacher, oasst, reflection] +# type: alpaca # format | format: (chat/instruct) | .load_ +# ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file +# data_files: # Optional[str] path to source data files +# shards: # Optional[int] number of shards to split data into +# name: # Optional[str] name of dataset configuration to load +# train_on_split: train # Optional[str] name of dataset split to load from + +# # Optional[str] fastchat conversation type, only used with type: sharegpt +# conversation: # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py +# field_human: # Optional[str]. Human key to use for conversation. +# field_model: # Optional[str]. Assistant key to use for conversation. + +# # Custom user prompt +# - path: repo +# type: +# # The below are defaults. only set what's needed. +# system_prompt: "" +# system_format: "{system}" +# field_system: system +# field_instruction: instruction +# field_input: input +# field_output: output + +# # Customizable to be single line or multi-line +# # 'format' can include {input} +# format: |- +# User: {instruction} {input} +# Assistant: +# # 'no_input_format' cannot include {input} +# no_input_format: "{instruction} " + +# # For `completion` datsets only, uses the provided field instead of `text` column +# field: + +# # Axolotl attempts to save the dataset as an arrow after packing the data together so +# # subsequent training attempts load faster, relative path +# dataset_prepared_path: data/last_run_prepared +# # Push prepared dataset to hub +# push_dataset_to_hub: # repo path +# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` +# # if not set. +# dataset_processes: # defaults to os.cpu_count() if not set +# # push checkpoints to hub +# hub_model_id: # repo path to push finetuned model +# # how to push checkpoints to hub +# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy +# hub_strategy: +# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets +# # Required to be true when used in combination with `push_dataset_to_hub` +# hf_use_auth_token: # boolean +# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval. +# val_set_size: 0.04 +# # Num shards for whole dataset +# dataset_shard_num: +# # Index of shard to use for whole dataset +# dataset_shard_idx: + +# # The maximum length of an input to train with, this should typically be less than 2048 +# # as most models have a token/context limit of 2048 +# sequence_len: 2048 +# # Pad inputs so each step uses constant sized buffers +# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently +# pad_to_sequence_len: +# # Max sequence length to concatenate training samples together up to +# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning +# # FutureWarning: This will soon be DEPRECATED +# max_packed_sequence_len: 1024 +# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true' +# sample_packing: +# # Set to 'false' if getting errors during eval with sample_packing on. +# eval_sample_packing: +# # You can set these packing optimizations AFTER starting a training at least once. 
+# # The trainer will provide recommended values for these values. +# sample_packing_eff_est: +# total_num_tokens: + +# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model +# adapter: lora +# # If you already have a lora model trained that you want to load, put that here. +# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`. +# lora_model_dir: + +# # LoRA hyperparameters +# # For more details about the following options, see: +# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2 +# lora_r: 8 +# lora_alpha: 16 +# lora_dropout: 0.05 +# lora_target_modules: +# - q_proj +# - v_proj +# # - k_proj +# # - o_proj +# # - gate_proj +# # - down_proj +# # - up_proj +# lora_target_linear: # If true, will target all linear layers + +# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. +# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. +# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities. +# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994 +# lora_modules_to_save: +# # - embed_tokens +# # - lm_head + +# # Once you complete training, the model will be saved to the following directory. +# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory. +# # Make sure `lora_model_dir` points to this directory if you want to use the trained model. +# lora_out_dir: +# lora_fan_in_fan_out: false + +# # ReLoRA configuration +# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed +# relora_steps: # Number of steps per ReLoRA restart +# relora_warmup_steps: # Number of per-restart warmup steps +# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings + +# # wandb configuration if you're using it +# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb +# wandb_project: # Your wandb project name +# wandb_entity: # A wandb Team name if using a Team +# wandb_watch: +# wandb_run_id: # Set the name of your wandb run +# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training + +# # Where to save the full-finetuned model to +# output_dir: ./completed-model + +# # Whether to use torch.compile and which backend to use +# torch_compile: # bool +# torch_compile_backend: # Optional[str] + +# # Training hyperparameters + +# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps. +# gradient_accumulation_steps: 1 +# # The number of samples to include in each batch. This is the number of samples sent to each GPU. +# micro_batch_size: 2 +# eval_batch_size: +# num_epochs: 4 +# warmup_steps: 100 # cannot use with warmup_ratio +# warmup_ratio: 0.05 # cannot use with warmup_steps +# learning_rate: 0.00003 +# lr_quadratic_warmup: +# logging_steps: +# save_strategy: # Set to `no` to skip checkpoint saves +# save_steps: # Leave empty to save at each epoch +# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. 
decimal for fraction of total steps +# save_total_limit: # Checkpoints saved at a time +# # Maximum number of iterations to train for. It precedes num_epochs which means that +# # if both are set, num_epochs will not be guaranteed. +# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps +# max_steps: + +# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0 +# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128 + +# # Save model as safetensors (require safetensors package) +# save_safetensors: + +# # Whether to mask out or include the human's prompt from the training labels +# train_on_inputs: false +# # Group similarly sized data to minimize padding. +# # May be slower to start, as it must download and sort the entire dataset. +# # Note that training loss may have an oscillating pattern with this enabled. +# group_by_length: false + +# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing +# gradient_checkpointing: false + +# # Stop training after this many evaluation losses have increased in a row +# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback +# early_stopping_patience: 3 + +# # Specify a scheduler and kwargs to use with the optimizer +# lr_scheduler: # 'one_cycle' | 'log_sweep' | empty for cosine +# lr_scheduler_kwargs: + +# # For one_cycle optim +# lr_div_factor: # Learning rate div factor + +# # For log_sweep optim +# log_sweep_min_lr: +# log_sweep_max_lr: + +# # Specify optimizer +# # Valid values are driven by the Transformers OptimizerNames class, see: +# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134 +# # +# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of +# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used +# # in the examples/ for your model and fine-tuning use case. 
+# # +# # Valid values for 'optimizer' include: +# # - adamw_hf +# # - adamw_torch +# # - adamw_torch_fused +# # - adamw_torch_xla +# # - adamw_apex_fused +# # - adafactor +# # - adamw_anyprecision +# # - sgd +# # - adagrad +# # - adamw_bnb_8bit +# # - lion_8bit +# # - lion_32bit +# # - paged_adamw_32bit +# # - paged_adamw_8bit +# # - paged_lion_32bit +# # - paged_lion_8bit +# optimizer: +# # Specify weight decay +# weight_decay: +# # adamw hyperparams +# adam_beta1: +# adam_beta2: +# adam_epsilon: +# # Gradient clipping max norm +# max_grad_norm: + +# # Augmentation techniques +# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings +# # currently only supported on Llama and Mistral +# noisy_embedding_alpha: + +# # Whether to bettertransformers +# flash_optimum: +# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers: +# xformers_attention: +# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention: +# flash_attention: +# flash_attn_cross_entropy: # Whether to use flash-attention cross entropy implementation - advanced use only +# flash_attn_rms_norm: # Whether to use flash-attention rms norm implementation - advanced use only +# flash_attn_fuse_qkv: # Whether to fuse QKV into a single operation +# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation +# # Whether to use scaled-dot-product attention +# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html +# sdp_attention: +# # Landmark attention (only llama) +# landmark_attention: +# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py +# # LLaMA only +# xpos_rope: + +# # Resume from a specific checkpoint dir +# resume_from_checkpoint: +# # If resume_from_checkpoint isn't set and you simply want it to start where it left off. +# # Be careful with this being turned on between different models. +# auto_resume_from_checkpoints: false + +# # Don't mess with this, it's here for accelerate and torchrun +# local_rank: + +# # Add or change special tokens. +# # If you add tokens here, you don't need to add them to the `tokens` list. +# special_tokens: +# # bos_token: "" +# # eos_token: "" +# # unk_token: "" + +# # Add extra tokens. +# tokens: + +# # FSDP +# fsdp: +# fsdp_config: + +# # Deepspeed config path. 
e.g., deepspeed/zero3.json +# deepspeed: + +# # Advanced DDP Arguments +# ddp_timeout: +# ddp_bucket_cap_mb: +# ddp_broadcast_buffers: + +# # Path to torch distx for optim 'adamw_anyprecision' +# torchdistx_path: + +# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize +# pretraining_dataset: + +# # Debug mode +# debug: + +# # Seed +# seed: + +# # Allow overwrite yml config using from cli +# strict: + + + +base_model: ${BASE_MODEL} +base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS} +base_model_config: ${BASE_MODEL_CONFIG} +revision_of_model: ${REVISION_OF_MODEL} +tokenizer_config: ${TOKENIZER_CONFIG} +model_type: ${MODEL_TYPE} +tokenizer_type: ${TOKENIZER_TYPE} +trust_remote_code: ${TRUST_REMOTE_CODE} +tokenizer_use_fast: ${TOKENIZER_USE_FAST} +tokenizer_legacy: ${TOKENIZER_LEGACY} +resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X} + +is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL} +is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL} +is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL} +is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL} + +overrides_of_model_config: + rope_scaling: + type: ${ROPE_SCALING_TYPE} + factor: ${ROPE_SCALING_FACTOR} + +bnb_config_kwargs: + llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT} + bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE} + bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT} + +gptq: ${GPTQ} +load_in_8bit: ${LOAD_IN_8BIT} +load_in_4bit: ${LOAD_IN_4BIT} +bf16: ${BF16} +fp16: ${FP16} +tf32: ${TF32} +bfloat16: ${BFLOAT16} +float16: ${FLOAT16} + +gpu_memory_limit: ${GPU_MEMORY_LIMIT} +lora_on_cpu: ${LORA_ON_CPU} + +datasets: + - path: ${DATASET_PATH} + type: ${DATASET_TYPE} + ds_type: ${DATASET_DS_TYPE} + data_files: ${DATASET_DATA_FILES} + shards: ${DATASET_SHARDS} + name: ${DATASET_NAME} + train_on_split: ${DATASET_TRAIN_ON_SPLIT} + revision: ${DATASET_REVISION} + trust_remote_code: ${DATASET_TRUST_REMOTE_CODE} + +rl: ${RL} +dpo_use_weighting: ${DPO_USE_WEIGHTING} + +chat_template: ${CHAT_TEMPLATE} +chat_template_jinja: ${CHAT_TEMPLATE_JINJA} +default_system_message: ${DEFAULT_SYSTEM_MESSAGE} +dataset_prepared_path: ${DATASET_PREPARED_PATH} +push_dataset_to_hub: ${PUSH_DATASET_TO_HUB} +dataset_processes: ${DATASET_PROCESSES} +dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY} +hub_model_id: ${HUB_MODEL_ID} +hub_strategy: ${HUB_STRATEGY} +hf_use_auth_token: ${HF_USE_AUTH_TOKEN} +val_set_size: ${VAL_SET_SIZE} +dataset_shard_num: ${DATASET_SHARD_NUM} +dataset_shard_idx: ${DATASET_SHARD_IDX} + +sequence_len: ${SEQUENCE_LEN} +pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN} +sample_packing: ${SAMPLE_PACKING} +eval_sample_packing: ${EVAL_SAMPLE_PACKING} +sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST} +total_num_tokens: ${TOTAL_NUM_TOKENS} +sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE} +sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE} + +batch_flattening: ${BATCH_FLATTENING} +device_map: ${DEVICE_MAP} +max_memory: ${MAX_MEMORY} + +adapter: ${ADAPTER} +lora_model_dir: ${LORA_MODEL_DIR} + +lora_r: ${LORA_R} +lora_alpha: ${LORA_ALPHA} +lora_dropout: ${LORA_DROPOUT} +lora_target_modules: + - ${LORA_TARGET_MODULES} +lora_target_linear: ${LORA_TARGET_LINEAR} +peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM} +lora_modules_to_save: ${LORA_MODULES_TO_SAVE} +lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT} + +loraplus_lr_ratio: ${LORAPLUS_LR_RATIO} +loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING} + +peft: + loftq_config: + loftq_bits: ${LOFTQ_BITS} + +relora_steps: ${RELORA_STEPS} 
+relora_warmup_steps: ${RELORA_WARMUP_STEPS} +relora_anneal_steps: ${RELORA_ANNEAL_STEPS} +relora_prune_ratio: ${RELORA_PRUNE_RATIO} +relora_cpu_offload: ${RELORA_CPU_OFFLOAD} + +wandb_mode: ${WANDB_MODE} +wandb_project: ${WANDB_PROJECT} +wandb_entity: ${WANDB_ENTITY} +wandb_watch: ${WANDB_WATCH} +wandb_name: ${WANDB_NAME} +wandb_run_id: ${WANDB_RUN_ID} +wandb_log_model: ${WANDB_LOG_MODEL} + +mlflow_tracking_uri: ${MLFLOW_TRACKING_URI} +mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME} +mlflow_run_name: ${MLFLOW_RUN_NAME} +hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS} + +use_comet: ${USE_COMET} +comet_api_key: ${COMET_API_KEY} +comet_workspace: ${COMET_WORKSPACE} +comet_project_name: ${COMET_PROJECT_NAME} +comet_experiment_key: ${COMET_EXPERIMENT_KEY} +comet_mode: ${COMET_MODE} +comet_online: ${COMET_ONLINE} +comet_experiment_config: ${COMET_EXPERIMENT_CONFIG} + +output_dir: ${OUTPUT_DIR} + +torch_compile: ${TORCH_COMPILE} +torch_compile_backend: ${TORCH_COMPILE_BACKEND} + +gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS} +micro_batch_size: ${MICRO_BATCH_SIZE} +eval_batch_size: ${EVAL_BATCH_SIZE} +num_epochs: ${NUM_EPOCHS} +warmup_steps: ${WARMUP_STEPS} +warmup_ratio: ${WARMUP_RATIO} +learning_rate: ${LEARNING_RATE} +lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP} +logging_steps: ${LOGGING_STEPS} +eval_steps: ${EVAL_STEPS} +evals_per_epoch: ${EVALS_PER_EPOCH} +save_strategy: ${SAVE_STRATEGY} +save_steps: ${SAVE_STEPS} +saves_per_epoch: ${SAVES_PER_EPOCH} +save_total_limit: ${SAVE_TOTAL_LIMIT} +max_steps: ${MAX_STEPS} + +eval_table_size: ${EVAL_TABLE_SIZE} +eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS} +eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS} + +profiler_steps: ${PROFILER_STEPS} +loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD} +loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE} + +save_safetensors: ${SAVE_SAFETENSORS} +train_on_inputs: ${TRAIN_ON_INPUTS} +group_by_length: ${GROUP_BY_LENGTH} +gradient_checkpointing: ${GRADIENT_CHECKPOINTING} +early_stopping_patience: ${EARLY_STOPPING_PATIENCE} + +lr_scheduler: ${LR_SCHEDULER} +lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS} +cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO} +cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO} +lr_div_factor: ${LR_DIV_FACTOR} + +optimizer: ${OPTIMIZER} +optim_args: ${OPTIM_ARGS} +optim_target_modules: ${OPTIM_TARGET_MODULES} +weight_decay: ${WEIGHT_DECAY} +adam_beta1: ${ADAM_BETA1} +adam_beta2: ${ADAM_BETA2} +adam_epsilon: ${ADAM_EPSILON} +max_grad_norm: ${MAX_GRAD_NORM} + +neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA} + +flash_optimum: ${FLASH_OPTIMUM} +xformers_attention: ${XFORMERS_ATTENTION} +flash_attention: ${FLASH_ATTENTION} +flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY} +flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM} +flash_attn_fuse_qkv: ${FLASH_ATTN_FUSE_QKV} +flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP} +sdp_attention: ${SDP_ATTENTION} +s2_attention: ${S2_ATTENTION} +resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT} +auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS} + +local_rank: ${LOCAL_RANK} + +special_tokens: + bos_token: ${SPECIAL_TOKEN_BOS} + eos_token: ${SPECIAL_TOKEN_EOS} + unk_token: ${SPECIAL_TOKEN_UNK} + pad_token: ${SPECIAL_TOKEN_PAD} + +tokens: ${TOKENS} + +fsdp: ${FSDP} +fsdp_config: ${FSDP_CONFIG} +deepspeed: ${DEEPSPEED} + +ddp_timeout: ${DDP_TIMEOUT} +ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB} +ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS} + +torchdistx_path: ${TORCHDISTX_PATH} +pretraining_dataset: ${PRETRAINING_DATASET} +debug: ${DEBUG} +seed: ${SEED} 
+strict: ${STRICT}

diff --git a/.runpod/src/handler.py b/.runpod/src/handler.py new file mode 100644 index 000000000..21073dff4 --- /dev/null +++ b/.runpod/src/handler.py @@ -0,0 +1,64 @@
"""
Runpod serverless entrypoint handler
"""

import os

import runpod
import yaml
from huggingface_hub import login  # public API; avoids importing the private `_login` module
from train import train
from utils import get_output_dir

BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
if not os.path.exists(BASE_VOLUME):
    os.makedirs(BASE_VOLUME)

logger = runpod.RunPodLogger()


async def handler(job):
    runpod_job_id = job["id"]
    inputs = job["input"]
    run_id = inputs.get("run_id", "default_run_id")
    args = inputs.get("args", {})

    # Set output directory
    output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
    args["output_dir"] = output_dir

    # Save args to a temporary config file
    config_path = "/workspace/test_config.yaml"

    # Add run_name and job_id to args before saving
    args["run_name"] = run_id
    args["runpod_job_id"] = runpod_job_id

    yaml_data = yaml.dump(args, default_flow_style=False)
    with open(config_path, "w", encoding="utf-8") as file:
        file.write(yaml_data)

    # Handle credentials
    credentials = inputs.get("credentials", {})

    if "wandb_api_key" in credentials:
        os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
    if "hf_token" in credentials:
        os.environ["HF_TOKEN"] = credentials["hf_token"]

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        logger.info("No HF_TOKEN provided. Skipping login.")

    logger.info("Starting Training.")
    async for result in train(config_path):  # Pass the config path instead of args
        logger.info(result)
        yield result  # stream log lines; aggregated via return_aggregate_stream below
    logger.info("Training Complete.")

    # Cleanup: pop with a default so missing keys don't raise KeyError
    os.environ.pop("WANDB_API_KEY", None)
    os.environ.pop("HF_TOKEN", None)


runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})
diff --git a/.runpod/src/test_input.json b/.runpod/src/test_input.json new file mode 100644 index 000000000..889e8ee27 --- /dev/null +++ b/.runpod/src/test_input.json @@ -0,0 +1,61 @@
{
  "input": {
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "NousResearch/Meta-Llama-3-8B",
      "model_type": "LlamaForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_8bit": true,
      "load_in_4bit": false,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca"
        }
      ],
      "val_set_size": 0.05,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "lora",
      "lora_r": 32,
      "lora_alpha": 16,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 4,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_bnb_8bit",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": false,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|end_of_text|>"
      }
    }
  }
}
diff --git a/.runpod/src/train.py b/.runpod/src/train.py new file mode 100644 index 000000000..72edda940 --- /dev/null +++
b/.runpod/src/train.py @@ -0,0 +1,45 @@ +""" +Runpod train entrypoint +""" + +import asyncio + + +async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True): + """ + Run preprocessing (if enabled) and training with the given config file + :param config_path: Path to the YAML config file + :param gpu_id: GPU ID to use (default: "0") + :param preprocess: Whether to run preprocessing (default: True) + + """ + # First check if preprocessing is needed + if preprocess: + # Preprocess command + preprocess_cmd = ( + f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}" + ) + process = await asyncio.create_subprocess_shell( + preprocess_cmd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.STDOUT, + ) + + if process.stdout is not None: + async for line in process.stdout: + yield f"Preprocessing: {line.decode().strip()}" + await process.wait() + yield "Preprocessing completed." + else: + yield "Skipping preprocessing step." + + # Training command + train_cmd = f"axolotl train {config_path}" + process = await asyncio.create_subprocess_shell( + train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT + ) + + if process.stdout is not None: + async for line in process.stdout: + yield f"Training: {line.decode().strip()}" + await process.wait() diff --git a/.runpod/src/utils.py b/.runpod/src/utils.py new file mode 100644 index 000000000..8245aecf4 --- /dev/null +++ b/.runpod/src/utils.py @@ -0,0 +1,89 @@ +""" +Runpod launcher utils +""" + +import os + +import yaml + + +def get_output_dir(run_id): + path = f"fine-tuning/{run_id}" + return path + + +def make_valid_config(input_args): + """ + Creates and saves updated config file, returns the path to the new config + :param input_args: dict of input args + :return: str, path to the updated config file + """ + # Load default config + with open("config/config.yaml", "r", encoding="utf-8") as fin: + all_args = yaml.safe_load(fin) + + if not input_args: + print("No args provided, using defaults") + else: + all_args.update(input_args) + + # Create updated config path + updated_config_path = "config/updated_config.yaml" + + # Save updated config to new file + with open(updated_config_path, "w", encoding="utf-8") as f: + yaml.dump(all_args, f) + + return updated_config_path + + +def set_config_env_vars(args: dict): + """ + Convert API arguments into environment variables. + Handles nested dictionaries, lists, and special values. 
+ + Args: + args (dict): The arguments dictionary from the API request + """ + + def process_value(value): + """Convert Python values to string format for environment variables""" + if value is None: + return "" + if isinstance(value, bool): + return str(value).lower() + if isinstance(value, (list, dict)): + return str(value) + return str(value) + + def set_env_vars(data, prefix=""): + """Recursively set environment variables from nested dictionary""" + for key, value in data.items(): + env_key = prefix + key.upper() + + # Handle special cases + if isinstance(value, dict): + # For nested dictionaries (like special_tokens) + set_env_vars(value, f"{env_key}_") + elif isinstance(value, list): + # Handle list of dictionaries (like datasets) + if value and isinstance(value[0], dict): + for i, item in enumerate(value): + set_env_vars(item, f"{env_key}_{i}_") + else: + # For simple lists (like lora_target_modules) + os.environ[env_key] = process_value(value) + else: + # Handle all other cases + os.environ[env_key] = process_value(value) + + # Clear any existing related environment variables + # This prevents old values from persisting + for key in list(os.environ.keys()): + if key.startswith( + ("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_") + ): + del os.environ[key] + + # Set new environment variables + set_env_vars(args) diff --git a/.runpod/tests.json b/.runpod/tests.json new file mode 100644 index 000000000..1d1e0287b --- /dev/null +++ b/.runpod/tests.json @@ -0,0 +1,85 @@ +{ + "input": { + "name": "quick_smoke_test_sft", + "user_id": "user", + "model_id": "llama-test", + "run_id": "llama-test", + "credentials": { + "wandb_api_key": "", + "hf_token": "" + }, + "args": { + "base_model": "HuggingFaceTB/SmolLM2-135M", + "model_type": "AutoModelForCausalLM", + "tokenizer_type": "AutoTokenizer", + "load_in_8bit": true, + "load_in_4bit": false, + "strict": false, + "datasets": [ + { + "path": "mhenrichsen/alpaca_2k_test", + "type": "alpaca" + } + ], + "val_set_size": 0.05, + "output_dir": "./outputs/lora-out", + "sequence_len": 4096, + "sample_packing": true, + "eval_sample_packing": false, + "pad_to_sequence_len": true, + "adapter": "lora", + "lora_r": 32, + "lora_alpha": 64, + "lora_dropout": 0.05, + "lora_target_linear": true, + "lora_modules_to_save": [ + "embed_tokens", + "lm_head" + ], + "gradient_accumulation_steps": 4, + "micro_batch_size": 2, + "num_epochs": 1, + "optimizer": "adamw_torch_fused", + "lr_scheduler": "cosine", + "learning_rate": 0.0002, + "train_on_inputs": false, + "group_by_length": false, + "bf16": "auto", + "tf32": true, + "gradient_checkpointing": true, + "logging_steps": 1, + "flash_attention": true, + "warmup_steps": 1, + "evals_per_epoch": 1, + "eval_max_new_tokens": 128, + "saves_per_epoch": 1, + "weight_decay": 0.0, + "special_tokens": { + "pad_token": "<|endoftext|>" + } + }, + "timeout": 100000 + }, + "config": { + "gpuTypeId": "NVIDIA GeForce RTX 4090", + "gpuCount": 1, + "containerDiskInGb": 200, + "env": [ + { + "key": "TOKENIZER", + "value": "" + }, + { + "key": "DISABLE_LOG_STATS", + "value": "true" + } + ], + "allowedCudaVersions": [ + "12.8", + "12.7", + "12.6", + "12.5", + "12.4" + ] + } +}
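As a closing note on `.runpod/src/utils.py`: the sketch below shows how `set_config_env_vars` flattens a nested `args` dict into the environment variables referenced by the `${...}` placeholders in `config/config.yaml`. The expected variable names are inferred from the code above (upper-cased keys, with nesting and list indices appended as suffixes).

```python
import os

from utils import set_config_env_vars  # .runpod/src/utils.py

args = {
    "base_model": "NousResearch/Llama-3.2-1B",
    "special_tokens": {"pad_token": "<|end_of_text|>"},
    "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
}
set_config_env_vars(args)

# Keys are upper-cased; nested dicts and list indices become suffixes.
print(os.environ["BASE_MODEL"])                # NousResearch/Llama-3.2-1B
print(os.environ["SPECIAL_TOKENS_PAD_TOKEN"])  # <|end_of_text|>
print(os.environ["DATASETS_0_PATH"])           # mhenrichsen/alpaca_2k_test
```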