Compare commits
2 Commits
fix/hpc-ro ... liger-063

| Author | SHA1 | Date |
|---|---|---|
|  | 9ee7ce5c85 |  |
|  | a41ca4d06f |  |

.github/workflows/base.yml (vendored, 14 changes)

@@ -60,13 +60,6 @@ jobs:
             pytorch: 2.9.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-base"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-base"
           # - cuda: "128"
           #   cuda_version: 12.8.1
           #   cudnn_version: ""
@@ -150,13 +143,6 @@ jobs:
             pytorch: 2.9.0
             torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
             dockerfile: "Dockerfile-uv-base"
-          - cuda: "130"
-            cuda_version: 13.0.0
-            cudnn_version: ""
-            python_version: "3.11"
-            pytorch: 2.9.0
-            torch_cuda_arch_list: "9.0+PTX"
-            dockerfile: "Dockerfile-uv-base"
     steps:
       - name: Checkout
         uses: actions/checkout@v4

.github/workflows/multi-gpu-e2e.yml (vendored, 7 changes)

@@ -40,13 +40,6 @@ jobs:
            axolotl_extras: fbgemm-gpu
            num_gpus: 2
            nightly_build: "true"
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            axolotl_extras: fbgemm-gpu
-            num_gpus: 2
-            nightly_build: "true"
     runs-on: [self-hosted, modal]
     timeout-minutes: 120
     steps:

.github/workflows/precommit-autoupdate.yml (vendored, 2 changes)

@@ -2,7 +2,7 @@ name: Pre-commit auto-update

 on:
   schedule:
-    - cron: '0 0 1 * *' # Run monthly
+    - cron: '0 0 * * 0' # Run weekly
   workflow_dispatch: # Manual kickoff

 jobs:

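As a side note, the two schedules in this hunk can be sanity-checked locally; a minimal sketch using the third-party croniter package (an assumption here, GitHub Actions parses the cron string itself):

```python
# Sketch: print the next fire time of the old (monthly) and new (weekly) crons.
# croniter is a third-party package (pip install croniter); GitHub Actions does
# not need it, this is only for local verification.
from datetime import datetime

from croniter import croniter

base = datetime(2025, 1, 1)
print(croniter("0 0 1 * *", base).get_next(datetime))  # monthly: 00:00 UTC on the 1st
print(croniter("0 0 * * 0", base).get_next(datetime))  # weekly: 00:00 UTC on Sundays
```
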
.github/workflows/tests.yml (vendored, 34 changes)

@@ -55,7 +55,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
+        pytorch_version: ["2.7.1", "2.8.0"]
     timeout-minutes: 20

     steps:
@@ -130,7 +130,7 @@ jobs:
       fail-fast: false
       matrix:
         python_version: ["3.11"]
-        pytorch_version: ["2.7.1", "2.8.0", "2.9.0"]
+        pytorch_version: ["2.7.1", "2.8.0"]
     timeout-minutes: 20

     steps:
@@ -152,7 +152,7 @@ jobs:
       - name: upgrade pip
         run: |
           pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel psutil
+          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 setuptools_scm build wheel

       - name: Install PyTorch
         run: |
@@ -231,10 +231,16 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 128
-            cuda_version: 12.8.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
-            pytorch: 2.8.0
+            pytorch: 2.7.1
+            num_gpus: 1
+            axolotl_extras:
+          - cuda: 126
+            cuda_version: 12.6.3
+            python_version: "3.11"
+            pytorch: 2.7.1
             num_gpus: 1
             axolotl_extras:
             dockerfile: "Dockerfile-uv.jinja"
@@ -280,18 +286,12 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 126
-            cuda_version: 12.6.3
+          - cuda: 128
+            cuda_version: 12.8.1
             python_version: "3.11"
             pytorch: 2.7.1
             num_gpus: 1
             axolotl_extras:
-          # - cuda: 128
-          #   cuda_version: 12.8.1
-          #   python_version: "3.11"
-          #   pytorch: 2.7.1
-          #   num_gpus: 1
-          #   axolotl_extras:
           - cuda: 128
             cuda_version: 12.8.1
             python_version: "3.11"
@@ -299,12 +299,6 @@ jobs:
             num_gpus: 1
             gpu_type: "B200"
             axolotl_extras: fbgemm-gpu
-          - cuda: 128
-            cuda_version: 12.8.1
-            python_version: "3.11"
-            pytorch: 2.9.0
-            num_gpus: 1
-            axolotl_extras:
     steps:
       - name: Checkout
         uses: actions/checkout@v4

@@ -11,7 +11,7 @@ repos:
       - id: no-commit-to-branch
         args: ['--branch', 'main']
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.14.3
+    rev: v0.14.0
     hooks:
       - id: ruff
         args: [--fix]

@@ -32,7 +32,7 @@ RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
         sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
     fi

-RUN pip install packaging==23.2 setuptools==75.8.0 psutil
+RUN pip install packaging==23.2 setuptools==75.8.0
 RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
         pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
     else \

@@ -5,7 +5,7 @@ ARG MAX_JOBS=4

 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

-ENV PATH="/workspace/miniconda3/bin:${PATH}"
+ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.10"
 ARG PYTORCH_VERSION="2.1.2"
@@ -24,27 +24,23 @@ RUN apt-get update \
     && rm -rf /var/lib/apt/lists/* \
     && wget \
         https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir -p /workspace/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b -p /workspace/miniconda3 \
+    && mkdir /root/.conda \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
     && rm -f Miniconda3-latest-Linux-x86_64.sh \
     && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
     && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
     && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/workspace/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

-RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel psutil && \
+RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==23.2 setuptools==75.8.0 wheel && \
     python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
+    CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir causal_conv1d==1.5.2 && \
+    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
     python3 -m pip cache purge

-RUN if [ "$CUDA" != "130" ] ; then \
-        CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
-        python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
-        python3 -m pip cache purge; \
-    fi
-
 RUN git lfs install --skip-repo && \
     pip3 install awscli && \
     # The base image ships with `pydantic==1.8.2` which is not working

@@ -5,7 +5,7 @@ ARG MAX_JOBS=4

 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

-ENV PATH="/workspace/miniconda3/bin:${PATH}"
+ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="next"
@@ -19,12 +19,12 @@ RUN apt-get update \
     && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
     && wget \
         https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir -p /workspace/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b -p /workspace/miniconda3 \
+    && mkdir /root/.conda \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
     && rm -f Miniconda3-latest-Linux-x86_64.sh \
     && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/workspace/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

@@ -5,7 +5,7 @@ ARG MAX_JOBS=4

 FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

-ENV PATH="/workspace/miniconda3/bin:${PATH}"
+ENV PATH="/root/miniconda3/bin:${PATH}"

 ARG PYTHON_VERSION="3.11"
 ARG PYTORCH_VERSION="nightly"
@@ -19,14 +19,14 @@ RUN apt-get update \
     && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
     && wget \
         https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
-    && mkdir -p /workspace/.conda \
-    && bash Miniconda3-latest-Linux-x86_64.sh -b -p /workspace/miniconda3 \
+    && mkdir /root/.conda \
+    && bash Miniconda3-latest-Linux-x86_64.sh -b \
     && rm -f Miniconda3-latest-Linux-x86_64.sh \
     && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
     && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
     && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

-ENV PATH="/workspace/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"
+ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

 WORKDIR /workspace

@@ -1,7 +1,7 @@
 base_model: google/gemma-3-1b-it
-
-model_type: Gemma3ForCausalLM
-
+# optionally might have model_type or tokenizer_type
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

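The switch from `Gemma3ForCausalLM` to `AutoModelForCausalLM` in this example config leans on transformers' auto-class dispatch; roughly, loading then amounts to the following sketch (simplified, not Axolotl's actual loading path):

```python
# Sketch: with AutoModelForCausalLM, the concrete class is resolved from the
# checkpoint's config.json rather than hard-coded in the example config.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "google/gemma-3-1b-it"  # base_model from the example above
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
print(type(model).__name__)  # resolves to the Gemma3 causal-LM class
```
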
@@ -1,7 +1,7 @@
 base_model: google/gemma-3-270m-it
-
-model_type: Gemma3ForCausalLM
-
+# optionally might have model_type or tokenizer_type
+model_type: AutoModelForCausalLM
+tokenizer_type: AutoTokenizer
 # Automatically upload checkpoint and final model to HF
 # hub_model_id: username/custom_model_name

@@ -1,8 +1,5 @@
 base_model: google/gemma-3-4b-it

-# Need to set else transformers tries to load vision too
-model_type: Gemma3ForCausalLM
-
 load_in_4bit: true

 # gemma3 doesn't seem to play nice with ddp

@@ -2,8 +2,6 @@

 [GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B.

-In October 2025, OpenAI released safeguard models built upon GPT-OSS called [GPT-OSS-Safeguard](https://huggingface.co/collections/openai/gpt-oss-safeguard). They use the same architecture, so the same examples below can be re-used.
-
 This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

 ## Getting started
@@ -66,16 +64,6 @@ axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offlo
 mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/
 ```

-### How to set reasoning_effort in template?
-
-The harmony template has a feature to set the `reasoning_effort` during prompt building. The default is `medium`. If you would like to adjust this, you can add the following to your config:
-
-```yaml
-chat_template_kwargs:
-  reasoning_effort: "high" # low | medium | high
-```
-
-Currently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss.

 ### Inferencing your fine-tuned model

@@ -1,67 +0,0 @@
-base_model: openai/gpt-oss-safeguard-20b
-use_kernels: true
-model_quantization_config: Mxfp4Config
-model_quantization_config_kwargs:
-  dequantize: true
-
-plugins:
-  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
-
-experimental_skip_move_to_device: true # prevent OOM by not putting model to GPU before sharding
-
-datasets:
-  - path: HuggingFaceH4/Multilingual-Thinking
-    type: chat_template
-    field_thinking: thinking
-    template_thinking_key: thinking
-
-dataset_prepared_path: last_run_prepared
-val_set_size: 0
-output_dir: ./outputs/gpt-oss-safeguard-out/
-
-sequence_len: 4096
-sample_packing: true
-
-adapter: lora
-lora_r: 8
-lora_alpha: 16
-lora_dropout: 0.0 # dropout not supported when using LoRA over expert parameters
-lora_target_linear: true
-
-# TODO: not supported for now, see peft#2710
-#lora_target_parameters: # target the experts in the last two layers
-#  - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
-#  - "22._checkpoint_wrapped_module.mlp.experts.down_proj"
-#  - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
-#  - "23._checkpoint_wrapped_module.mlp.experts.down_proj"
-
-wandb_project:
-wandb_entity:
-wandb_watch:
-wandb_name:
-wandb_log_model:
-
-gradient_accumulation_steps: 8
-micro_batch_size: 1
-num_epochs: 1
-
-optimizer: adamw_torch_8bit
-lr_scheduler: constant_with_warmup
-learning_rate: 2e-4
-
-bf16: true
-tf32: true
-
-flash_attention: true
-attn_implementation: kernels-community/vllm-flash-attn3 # this is not needed if using flash_attn >= 2.8.3
-
-gradient_checkpointing: true
-activation_offloading: true
-
-logging_steps: 1
-saves_per_epoch: 1
-warmup_ratio: 0.1
-
-special_tokens:
-eot_tokens:
-  - "<|end|>"

@@ -10,25 +10,25 @@ liger-kernel==0.6.3

 packaging==23.2

-huggingface_hub>=0.36.0
+huggingface_hub>=0.33.0
 peft>=0.17.1
 tokenizers>=0.21.1
 transformers==4.57.1
 accelerate==1.10.1
-datasets==4.3.0
+datasets==4.0.0
 deepspeed>=0.17.0
-trl==0.24.0
-hf_xet==1.2.0
-kernels>=0.9.0
+trl==0.23.1
+hf_xet==1.1.5
+kernels==0.9.0
 trackio

 optimum==1.16.2
 hf_transfer
 sentencepiece
-gradio==5.49.1
+gradio==5.41.1

 modal==1.0.2
-pydantic>=2.10.6
+pydantic==2.10.6
 addict
 fire
 PyYAML>=6.0
@@ -36,8 +36,8 @@ requests
 wandb
 einops
 colorama
-numba>=0.61.2
-numpy>=2.2.6
+numba
+numpy>=1.24.4,<=2.0.1

 # qlora things
 evaluate==0.4.1
@@ -50,7 +50,7 @@ python-dotenv==1.0.1

 # remote filesystems
 s3fs>=2024.5.0
-gcsfs>=2025.3.0
+gcsfs>=2024.5.0
 adlfs>=2024.5.0
 ocifs==1.3.2

@@ -66,7 +66,7 @@ antlr4-python3-runtime==4.13.2
 torchao==0.13.0
 schedulefree==1.4.1

-axolotl-contribs-lgpl==0.0.7
+axolotl-contribs-lgpl==0.0.6
 axolotl-contribs-mit==0.0.5

 mistral-common==1.8.5

setup.py (15 changes)

@@ -62,15 +62,8 @@ def parse_requirements(extras_require_map):
     else:
         raise ValueError("Invalid version format")

-    if (major, minor) >= (2, 9):
-        extras_require_map.pop("fbgemm-gpu")
-        extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.4.1"]
-        extras_require_map["vllm"] = ["vllm==0.11.1"]
-        _install_requires.pop(_install_requires.index(xformers_version))
-    elif (major, minor) >= (2, 8):
-        extras_require_map.pop("fbgemm-gpu")
-        extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
-        extras_require_map["vllm"] = ["vllm==0.11.0"]
+    if (major, minor) >= (2, 8):
+        pass
     elif (major, minor) >= (2, 7):
         _install_requires.pop(_install_requires.index(xformers_version))
         if patch == 0:
@@ -79,7 +72,7 @@ def parse_requirements(extras_require_map):
             extras_require_map.pop("vllm")
         else:
             _install_requires.append("xformers==0.0.31")
-            extras_require_map["vllm"] = ["vllm==0.10.1"]
+            extras_require_map["vllm"] = ["vllm>=0.10.0"]
     elif (major, minor) >= (2, 6):
         _install_requires.pop(_install_requires.index(xformers_version))
         _install_requires.append("xformers==0.0.29.post3")
@@ -165,7 +158,7 @@ extras_require = {
     "llmcompressor": [
         "llmcompressor==0.5.1",
     ],
-    "fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"],
+    "fbgemm-gpu": ["fbgemm-gpu-genai>=1.2.0"],
     "opentelemetry": [
         "opentelemetry-api",
         "opentelemetry-sdk",

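Both sides of the first setup.py hunk follow the same pattern: parse the pinned torch version into a `(major, minor)` tuple and gate dependency pins on tuple comparisons. A minimal standalone sketch of that pattern (the regex and the example pins are illustrative, not the repository's full matrix):

```python
# Minimal sketch of setup.py's version-gating pattern.
import re


def gate_extras(torch_version: str) -> dict:
    match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
    if not match:
        raise ValueError("Invalid version format")
    major, minor = int(match.group(1)), int(match.group(2))
    extras: dict = {}
    if (major, minor) >= (2, 8):
        pass  # head branch: leave the default pins untouched
    elif (major, minor) >= (2, 7):
        extras["vllm"] = ["vllm>=0.10.0"]  # mirrors the hunk above
    return extras


print(gate_extras("2.7.1"))  # {'vllm': ['vllm>=0.10.0']}
print(gate_extras("2.8.0"))  # {}
```
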
@@ -12,7 +12,7 @@ from transformers import (
     EarlyStoppingCallback,
     Trainer,
 )
-from trl.trainer.reward_trainer import DataCollatorForPreference
+from trl.trainer.utils import RewardDataCollatorWithPadding

 from axolotl.core.builders.base import TrainerBuilderBase
 from axolotl.core.trainers import (
@@ -453,7 +453,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
                 BatchSamplerDataCollatorForSeq2Seq,
                 DataCollatorForSeq2Seq,
                 DataCollatorWithFlattening,
-                DataCollatorForPreference,
+                RewardDataCollatorWithPadding,
             ]
         ]
         collator_args = [self.tokenizer]
@@ -470,10 +470,7 @@ class HFCausalTrainerBuilder(TrainerBuilderBase):
             if kwargs and isinstance(kwargs, dict):
                 kwargs.update(collator_cls_and_kwargs[1])
         elif self.cfg.reward_model:
-            collator = DataCollatorForPreference
-            tokenizer = collator_args.pop(0)
-            kwargs["pad_token_id"] = tokenizer.pad_token_id
-            kwargs.pop("padding")
+            collator = RewardDataCollatorWithPadding
         elif use_batch_sampler_collator:
             # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
             # supported multipack models, or non-flash-attention llama

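These hunks swap TRL's `DataCollatorForPreference` (constructed from a `pad_token_id`) for `RewardDataCollatorWithPadding` (constructed from the tokenizer itself, so the builder no longer pops the tokenizer out of `collator_args` or drops the `padding` kwarg). A rough sketch of the resulting wiring, assuming TRL's documented constructor fields; this paraphrases rather than reproduces the Axolotl builder:

```python
# Sketch of the reward-model collator wiring on the "+" side of the hunks.
from transformers import AutoTokenizer
from trl.trainer.utils import RewardDataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # any tokenizer works here
tokenizer.pad_token = tokenizer.eos_token  # ensure a pad token exists

# The builder keeps positional args and kwargs, then instantiates at the end;
# the "padding" kwarg passing straight through is an assumption consistent
# with the removed kwargs.pop("padding") line above.
collator_args = [tokenizer]
kwargs = {"padding": True}
collator = RewardDataCollatorWithPadding(*collator_args, **kwargs)
```
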
@@ -71,10 +71,10 @@ class BTChatTemplateStrategy(ChatTemplateStrategy):
         ]

         return {
-            "chosen_input_ids": chosen_tokenized["input_ids"],
+            "input_ids_chosen": chosen_tokenized["input_ids"],
             "attention_mask_chosen": chosen_tokenized["attention_mask"],
             "labels_chosen": 1.0,
-            "rejected_input_ids": rejected_tokenized["input_ids"],
+            "input_ids_rejected": rejected_tokenized["input_ids"],
             "attention_mask_rejected": rejected_tokenized["attention_mask"],
             "labels_rejected": 0.0,
         }

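The key renames in this last hunk keep the tokenization strategy in step with the collator change above: each feature dict is keyed per sample as `input_ids_chosen` / `input_ids_rejected` rather than `chosen_input_ids` / `rejected_input_ids`. A small sketch of that contract (the token ids are dummy values for illustration):

```python
# Sketch: the feature layout the strategy emits after this change. A reward
# collator then pads the chosen/rejected sequences in each batch to a common
# length; no real tokenizer output is shown here.
feature = {
    "input_ids_chosen": [1, 2, 3],
    "attention_mask_chosen": [1, 1, 1],
    "labels_chosen": 1.0,
    "input_ids_rejected": [1, 2],
    "attention_mask_rejected": [1, 1],
    "labels_rejected": 0.0,
}
batch = [feature]  # what a collator call would receive
```
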