Compare commits: feat/phi_3...shared-pre (11 commits)
| Author | SHA1 | Date |
|---|---|---|
|  | b79996bdc4 |  |
|  | 68368de7ed |  |
|  | a94c4a014b |  |
|  | 0102ca5943 |  |
|  | 97e8c01a70 |  |
|  | 5c4705b185 |  |
|  | 47a88da330 |  |
|  | 07ab737a55 |  |
|  | c40da3b5eb |  |
|  | a5946ff1f0 |  |
|  | 70ca1b2291 |  |
.github/workflows/base.yml (vendored): 6 changes
```diff
@@ -5,11 +5,13 @@ on:
     branches:
       - "main"
     paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
   pull_request:
     paths:
-      - 'Dockerfile-base'
+      - 'docker/Dockerfile-base'
+      - 'docker/Dockerfile-uv-base'
       - '.github/workflows/base.yml'
   workflow_dispatch:

```
.github/workflows/tests-nightly.yml (vendored): 115 changes
```diff
@@ -18,96 +18,9 @@ jobs:
     env:
       SKIP: no-commit-to-branch

-  preload-cache:
-    name: Preload HF cache
-    runs-on: ubuntu-latest
-    strategy:
-      fail-fast: false
-      matrix:
-        python_version: ["3.11"]
-        pytorch_version: ["2.6.0"]
-    timeout-minutes: 20
-
-    env:
-      AXOLOTL_IS_CI_CACHE_PRELOAD: "1"
-
-    steps:
-      - name: Check out repository code
-        uses: actions/checkout@v4
-
-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
-
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: ${{ matrix.python_version }}
-          cache: 'pip' # caching pip dependencies
-
-      - name: upgrade pip
-        run: |
-          pip3 install --upgrade pip
-          pip3 install --upgrade packaging==23.2 setuptools==75.8.0 wheel
-
-      - name: Install PyTorch
-        run: |
-          pip3 install torch==${{ matrix.pytorch_version }}
-
-      - name: Install dependencies
-        run: |
-          pip3 show torch
-          pip3 install --no-build-isolation -U -e .
-          python scripts/unsloth_install.py | sh
-          python scripts/cutcrossentropy_install.py | sh
-          pip3 install -r requirements-dev.txt -r requirements-tests.txt
-
-      - name: Make sure PyTorch version wasn't clobbered
-        run: |
-          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"
-
-      - name: Ensure axolotl CLI was installed
-        run: |
-          axolotl --help
-
-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
-      - name: Run tests
-        run: |
-          pytest -v tests/conftest.py
-
-      - name: Upload coverage to Codecov
-        uses: codecov/codecov-action@v5
-        with:
-          token: ${{ secrets.CODECOV_TOKEN }}
-          files: ./coverage.xml
-          flags: unittests,pytorch-${{ matrix.pytorch_version }}
-          fail_ci_if_error: false
-
-      - name: cleanup pip cache
-        run: |
-          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;
-
-      - name: Save HF cache
-        id: hf-cache
-        uses: actions/cache/save@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ steps.hf-cache-restore.outputs.cache-primary-key }}
-
   pytest:
     name: PyTest
     runs-on: ubuntu-latest
-    needs: [preload-cache]
     strategy:
       fail-fast: false
       max-parallel: 2
@@ -120,14 +33,11 @@ jobs:
       - name: Check out repository code
         uses: actions/checkout@v4

-      - name: Restore HF cache
-        id: hf-cache-restore
-        uses: actions/cache/restore@v4
-        with:
-          path: |
-            /home/runner/.cache/huggingface/hub/datasets--*
-            /home/runner/.cache/huggingface/hub/models--*
-          key: ${{ runner.os }}-hf-hub-cache-v2
+      - name: Restore Cache from S3
+        id: hf-cache-restore-s3
+        run: |
+          mkdir -p /home/runner/.cache/huggingface/hub
+          curl -L https://d1dttdx32dkk5p.cloudfront.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/ --use-compress-program unzstd

       - name: Setup Python
         uses: actions/setup-python@v5
@@ -168,10 +78,6 @@ jobs:
         run: |
           axolotl --help

-      - name: Pre-Download dataset fixture
-        run: |
-          huggingface-cli download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures
-
       - name: Run tests
         run: |
           pytest -v -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
@@ -193,15 +99,8 @@ jobs:
       fail-fast: false
       matrix:
         include:
-          - cuda: 124
-            cuda_version: 12.4.1
-            python_version: "3.11"
-            pytorch: 2.5.1
-            num_gpus: 1
-            axolotl_extras:
-            nightly_build: "true"
-          - cuda: 124
-            cuda_version: 12.4.1
+          - cuda: 126
+            cuda_version: 12.6.3
             python_version: "3.11"
             pytorch: 2.6.0
             num_gpus: 1
```
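The structural change here is that the nightly workflow no longer rebuilds the Hugging Face cache in a dedicated `preload-cache` job; the PyTest job now pulls a pre-built `hf-cache.tar.zst` from CloudFront in a single `curl | tar` step. Below is a rough sketch of what the producer side of that artifact could look like; the script, bucket name, and AWS CLI usage are assumptions for illustration, and only the CloudFront URL above comes from the diff.

```python
"""Hypothetical producer for the hf-cache.tar.zst artifact (not part of this diff)."""
import subprocess
from pathlib import Path

HUB_DIR = Path.home() / ".cache" / "huggingface" / "hub"


def build_and_upload_cache(bucket: str = "example-hf-cache-bucket") -> None:
    # Pack the HF hub cache with zstd compression via GNU tar's -I flag;
    # the workflow unpacks it with `--use-compress-program unzstd`.
    subprocess.run(
        ["tar", "-I", "zstd", "-cf", "hf-cache.tar.zst", "-C", str(HUB_DIR), "."],
        check=True,
    )
    # Upload to the S3 origin behind the CloudFront distribution (bucket name assumed).
    subprocess.run(
        ["aws", "s3", "cp", "hf-cache.tar.zst", f"s3://{bucket}/hf-cache.tar.zst"],
        check=True,
    )


if __name__ == "__main__":
    build_and_upload_cache()
```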
```diff
@@ -37,3 +37,7 @@ RUN git lfs install --skip-repo && \
     pip3 install awscli && \
     # The base image ships with `pydantic==1.8.2` which is not working
     pip3 install -U --no-cache-dir pydantic==1.10.10
+
+RUN if [ "$PYTORCH_VERSION" = "2.6.0" ] && [ "$CUDA" = "124" ] ; then \
+        FLASH_ATTENTION_FORCE_BUILD="TRUE" pip3 install --no-build-isolation flash-attn==2.8.0.post2; \
+    fi
```
````diff
@@ -16,7 +16,6 @@ format:
 - [Gemma-3](#sec-gemma-3)
 - [Qwen2-VL](#sec-qwen2-vl)
 - [Qwen2.5-VL](#sec-qwen25-vl)
-- [Phi3-V](#sec-phi3-v)

 ## Usage

@@ -127,15 +126,6 @@ base_model: Qwen/Qwen2.5-VL-7B-Instruct
 chat_template: qwen2_vl # same as qwen2-vl
 ```

-### Phi3-V {#sec-phi3-v}
-
-```yaml
-base_model: microsoft/Phi-3.5-vision-instruct
-
-trust_remote_code: true
-chat_template: phi_35_vl
-```
-
 ## Dataset Format

 For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.
````
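The surviving context line refers to the extended `chat_template` dataset format modeled on OpenAI's Message layout. For orientation, here is an illustrative record in that shape; the exact keys axolotl accepts are defined in the multimodal docs themselves, so treat the field names below (`path` in particular) as an assumption rather than a spec.

```python
# Illustrative multi-modal record shaped like the OpenAI message format.
sample = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "image", "path": "/data/images/cat.jpg"},  # image part
                {"type": "text", "text": "What animal is in this picture?"},
            ],
        },
        {
            "role": "assistant",
            "content": [{"type": "text", "text": "A cat."}],
        },
    ]
}
```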
```diff
@@ -219,7 +219,9 @@ class TrainerBuilderBase(abc.ABC):
         if self.cfg.bf16 == "full":
             training_args_kwargs["bf16_full_eval"] = True
         else:
-            training_args_kwargs["bf16"] = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = self.cfg.bf16 or self.cfg.bfloat16
+            bf16 = bf16 if bf16 is not None else False
+            training_args_kwargs["bf16"] = bf16

     def _configure_scheduler(self, training_args_kwargs: dict):
         if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
```
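The point of this rewrite is that `cfg.bf16` is effectively tri-state (True, False, or unset), while `transformers.TrainingArguments` expects a plain bool, so an unset flag must be coerced to `False` rather than forwarded as `None`. A minimal sketch of the behavior difference (the `DummyCfg` class is illustrative, not from the codebase):

```python
# Minimal sketch: why the explicit None check matters.
class DummyCfg:
    bf16 = None      # user never set bf16...
    bfloat16 = None  # ...nor the bfloat16 alias


cfg = DummyCfg()

# Old behavior: `None or None` evaluates to None, which then leaked
# into training_args_kwargs["bf16"].
old_value = cfg.bf16 or cfg.bfloat16
assert old_value is None

# New behavior: normalize the tri-state flag to a plain bool first.
bf16 = cfg.bf16 or cfg.bfloat16
bf16 = bf16 if bf16 is not None else False
assert bf16 is False
```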
```diff
@@ -1,7 +1,6 @@
 """Shared constants for axolotl.loaders module"""

 from transformers import (
-    AutoModelForCausalLM,
     Gemma3ForConditionalGeneration,
     Llama4ForConditionalGeneration,
     LlavaForConditionalGeneration,
@@ -19,6 +18,4 @@ MULTIMODAL_AUTO_MODEL_MAPPING = {
     "qwen2_5_vl": Qwen2_5_VLForConditionalGeneration,
     "mistral3": Mistral3ForConditionalGeneration,
     "gemma3": Gemma3ForConditionalGeneration,
-    # phi3_v modeling code is not available in transformers yet
-    "phi3_v": AutoModelForCausalLM,
 }
```
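With the `phi3_v` entry gone, every value in `MULTIMODAL_AUTO_MODEL_MAPPING` is a concrete `transformers` class rather than the `AutoModelForCausalLM` fallback, so a loader can resolve the model class with a plain dict lookup. A hedged sketch of such a lookup over an illustrative subset of the mapping (the helper name is hypothetical):

```python
from transformers import Gemma3ForConditionalGeneration

# Illustrative subset of the mapping left after this change.
MULTIMODAL_AUTO_MODEL_MAPPING = {
    "gemma3": Gemma3ForConditionalGeneration,
}


def resolve_model_cls(model_type: str):
    # A plain lookup; failing fast surfaces unsupported model types early.
    try:
        return MULTIMODAL_AUTO_MODEL_MAPPING[model_type]
    except KeyError as err:
        raise ValueError(f"unsupported multimodal model type: {model_type}") from err
```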
```diff
@@ -264,23 +264,6 @@ class Gemma3ProcessingStrategy(ProcessingStrategy):
         return labels


-class Phi35VLProcessingStrategy(ProcessingStrategy):
-    """Processing Strategy class for Phi-3.5-vision-instruct"""
-
-    def __init__(
-        self,
-        processor: ProcessorMixin,
-        chat_template: Optional[str] = None,
-        image_size: int | tuple[int, int] | None = None,
-        image_resize_algorithm: Resampling | None = None,
-    ):
-        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
-        self.image_token = "<|image|>"  # nosec
-        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
-            self.image_token
-        )
-
-
 def get_processing_strategy(
     processor: ProcessorMixin,
     chat_template,
@@ -296,10 +279,6 @@ def get_processing_strategy(
         return Gemma3ProcessingStrategy(
             processor, chat_template, image_size, image_resize_algorithm
         )
-    if chat_template_type == "phi_35_vl":
-        return Phi35VLProcessingStrategy(
-            processor, chat_template, image_size, image_resize_algorithm
-        )
     if chat_template_type in [
         "llama3_2_vision",
         "llama4",
```
```diff
@@ -32,7 +32,6 @@ _CHAT_TEMPLATES = {
     "llava": "{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>\n' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}",
     "phi_3": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
     "phi_35": "{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'user' %}{{'<|user|>\n' + message['content'] + '<|end|>\n'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}",
-    "phi_35_vl": "{% set image_count = namespace(value=0) %}{% for message in messages %}{{'<|' + message['role'] + '|>\n' }}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% set message_images = [] %}{% set message_text = [] %}{% for chunk in message['content'] %}{% if chunk['type'] == 'image' or 'image' in chunk or 'image_url' in chunk %}{% set image_count.value = image_count.value + 1 %}{% set _ = message_images.append('<|image_' + image_count.value|string + '|>\n') %}{% elif chunk['type'] == 'text' %}{% set _ = message_text.append(chunk['text']) %}{% endif %}{% endfor %}{{ message_images | join('') }}{{ message_text | join('') }}{% endif %}{{ '<|end|>\n' }}{% endfor %}{% if add_generation_prompt and messages[-1]['role'] != 'assistant' %}{{- '<|assistant|>\n' -}}{% endif %}",
     "phi_4": "{% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}",
     "deepseek_v2": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<|User|>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<|Assistant|>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '\n\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|Assistant|>' }}{% endif %}",
     "deepseek_v3": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\\n\\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<|User|>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<|Assistant|><|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- else %}{{'<|Assistant|>' + message['content'] + '<|tool▁calls▁begin|><|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\\n' + '<|tool▁call▁begin|>' + tool['type'] + '<|tool▁sep|>' + tool['function']['name'] + '\\n' + '```json' + '\\n' + tool['function']['arguments'] + '\\n' + '```' + '<|tool▁call▁end|>'}}{%- endif %}{%- endfor %}{{'<|tool▁calls▁end|><|end▁of▁sentence|>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<|tool▁outputs▁end|>' + message['content'] + '<|end▁of▁sentence|>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<|Assistant|>' + content + '<|end▁of▁sentence|>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<|tool▁outputs▁begin|><|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<|tool▁output▁begin|>' + message['content'] + '<|tool▁output▁end|>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<|tool▁outputs▁end|>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<|Assistant|>'}}{% endif %}",
```
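These `_CHAT_TEMPLATES` entries are plain Jinja strings, so they can be rendered outside the tokenizer to inspect the exact prompt text. A standalone sketch using the `phi_35` template kept above; `jinja2` is used directly here for illustration, whereas axolotl normally applies these through the tokenizer's `apply_chat_template`:

```python
from jinja2 import Template

# The phi_35 template from _CHAT_TEMPLATES, split across lines for readability.
PHI_35 = (
    "{% for message in messages %}"
    "{% if message['role'] == 'system' and message['content'] %}"
    "{{'<|system|>\n' + message['content'] + '<|end|>\n'}}"
    "{% elif message['role'] == 'user' %}"
    "{{'<|user|>\n' + message['content'] + '<|end|>\n'}}"
    "{% elif message['role'] == 'assistant' %}"
    "{{'<|assistant|>\n' + message['content'] + '<|end|>\n'}}"
    "{% endif %}{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|assistant|>\n' }}{% endif %}"
)

messages = [
    {"role": "system", "content": "You are concise."},
    {"role": "user", "content": "Hi!"},
]
print(Template(PHI_35).render(messages=messages, add_generation_prompt=True))
# <|system|>
# You are concise.<|end|>
# <|user|>
# Hi!<|end|>
# <|assistant|>
```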
```diff
@@ -48,8 +48,6 @@ class ChatTemplate(str, Enum):
     llama4 = "llama4"
     phi_3 = "phi_3"
     phi_35 = "phi_35"
-    phi_35_vl = "phi_35_vl"
-    phi_4 = "phi_4"
     deepseek_v2 = "deepseek_v2"
     deepseek_v3 = "deepseek_v3"
     jamba = "jamba"
```
```diff
@@ -10,7 +10,7 @@ import shutil
 import sys
 import tempfile
 import time
-from pathlib import Path, PosixPath
+from pathlib import Path
 from typing import Generator

 import datasets
@@ -423,9 +423,13 @@ def temp_dir() -> Generator[str, None, None]:
     shutil.rmtree(_temp_dir)


-@pytest.fixture(scope="function", autouse=True)
-def unique_triton_cache_dir(temp_dir: str | PosixPath) -> None:
-    os.environ["TRITON_CACHE_DIR"] = str(temp_dir) + "/.triton/cache"
+@pytest.fixture(scope="module")
+def module_temp_dir() -> Generator[str, None, None]:
+    # Create a temporary directory
+    _temp_dir = tempfile.mkdtemp()
+    yield _temp_dir
+    # Clean up the directory after the test
+    shutil.rmtree(_temp_dir)


 @pytest.fixture(scope="function", autouse=True)
```
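The new `module_temp_dir` fixture is module-scoped: every test in a module receives the same directory, which is what lets the preprocessed-dataset fixture further down run `axolotl preprocess` once per module instead of once per test. A small sketch of that sharing (the test names are illustrative; pytest runs tests in definition order by default):

```python
import os


def test_writes_artifact(module_temp_dir):
    # First test in the module writes into the shared directory.
    with open(os.path.join(module_temp_dir, "marker.txt"), "w", encoding="utf-8") as f:
        f.write("prepared")


def test_sees_artifact(module_temp_dir):
    # Later tests in the same module see the same path and its contents.
    assert os.path.exists(os.path.join(module_temp_dir, "marker.txt"))
```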
```diff
@@ -2,6 +2,8 @@
 E2E tests for multigpu lora tinyllama
 """

+# pylint: disable=redefined-outer-name
+
 from pathlib import Path

 import pytest
```
```diff
@@ -25,6 +27,60 @@ def download_model():
     snapshot_download("HuggingFaceTB/SmolLM2-135M")


+@pytest.fixture(scope="module")
+def sft_base_cfg():
+    cfg = DictDefault(
+        base_model="HuggingFaceTB/SmolLM2-135M",
+        tokenizer_config="HuggingFaceTB/SmolLM2-135M",  # this has to be manually set since we haven't done validation
+        sequence_len=1024,
+        special_tokens={
+            "pad_token": "<|endoftext|>",
+        },
+        datasets=[
+            {
+                "path": "tatsu-lab/alpaca",
+                "type": "alpaca",
+                "split": "train[:10%]",
+            },
+        ],
+        val_set_size=0.1,
+        sample_packing=True,
+        flash_attention=True,
+        learning_rate=0.00001,
+        optimizer="adamw_8bit",
+        seed=42,
+        # these need to be set since we aren't running schema validation
+        micro_batch_size=2,
+        gradient_accumulation_steps=1,
+    )
+
+    return cfg
+
+
+@pytest.fixture(scope="module", name="sft_prepared_dataset_alpaca_cfg")
+def sft_prepared_dataset_alpaca_cfg(module_temp_dir, sft_base_cfg):
+    dataset_prepared_path = module_temp_dir + "/last_run_prepared"
+    cfg = sft_base_cfg | DictDefault(
+        dataset_prepared_path=dataset_prepared_path,
+    )
+
+    Path(module_temp_dir).mkdir(parents=True, exist_ok=True)
+    with open(Path(module_temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
+        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
+
+    execute_subprocess_async(
+        [
+            "axolotl",
+            "preprocess",
+            str(Path(module_temp_dir) / "config.yaml"),
+        ]
+    )
+
+    # unset flash attention since we have some flex attention tests too
+    cfg.flash_attention = None
+    return cfg
+
+
 def transformers_version_eq(required_version):
     return version.parse(transformers.__version__) == version.parse(required_version)

```
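The tests below all build their config as `DictDefault({...}) | sft_prepared_dataset_alpaca_cfg`. Assuming `DictDefault.__or__` follows Python's dict-union semantics, keys from the right operand win on conflict, so the shared fixture supplies the base model, dataset, and packing settings while each test's literal dict adds scenario-specific overrides. A plain-dict analogy with illustrative values; whether `DictDefault` keeps exactly this behavior for overlapping or `None` values is not shown in this diff:

```python
# Plain-dict analogy for the DictDefault union used in the tests below.
test_specific = {"micro_batch_size": 1, "max_steps": 2}
shared_fixture = {"base_model": "HuggingFaceTB/SmolLM2-135M", "sample_packing": True}

cfg = test_specific | shared_fixture  # with plain dicts, right operand wins on conflict
assert cfg == {
    "micro_batch_size": 1,
    "max_steps": 2,
    "base_model": "HuggingFaceTB/SmolLM2-135M",
    "sample_packing": True,
}
```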
```diff
@@ -97,45 +153,36 @@ class TestMultiGPULlama:
         "gradient_accumulation_steps",
         [1, 2],
     )
-    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
+    def test_lora_ddp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, gradient_accumulation_steps
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sequence_len": 2048,
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "adapter": "lora",
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:20%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_8bit",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "use_tensorboard": True,
-                "bf16": True,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "adapter": "lora",
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    "val_set_size": 0.05,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_8bit",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "use_tensorboard": True,
+                    "bf16": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
```
```diff
@@ -385,59 +432,50 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
         "fsdp_state_dict_type",
         ["FULL_STATE_DICT", "SHARDED_STATE_DICT"],
     )
-    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
+    def test_fsdp_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg, fsdp_state_dict_type
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "dataset_prepared_path": temp_dir + "/last_run_prepared",
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": fsdp_state_dict_type,
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": fsdp_state_dict_type,
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -458,7 +496,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
         )

     @require_torch_2_6_0
```
```diff
@@ -471,51 +509,43 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_fsdp2_packed(
-        self, temp_dir, attention_backend, fsdp_reshard_after_forward
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        attention_backend,
+        fsdp_reshard_after_forward,
     ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 2048,
-                "val_set_size": 0.1,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 4,
+                    "gradient_accumulation_steps": 2,
+                    "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_8bit",
+                    "lr_scheduler": "cosine",
+                    "fsdp": [
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_version": 2,
+                        # "fsdp_forward_prefetch": True, # not yet implemented in accelerate
+                        "fsdp_offload_params": False,
+                        "fsdp_cpu_ram_efficient_loading": False,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
+                        "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 4,
-                "gradient_accumulation_steps": 2,
-                "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_8bit",
-                "lr_scheduler": "cosine",
-                "fsdp": [
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_version": 2,
-                    # "fsdp_forward_prefetch": True, # not yet implemented in accelerate
-                    "fsdp_offload_params": False,
-                    "fsdp_cpu_ram_efficient_loading": False,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )
         if attention_backend == "flash":
             cfg.flash_attention = True
```
```diff
@@ -543,64 +573,55 @@ class TestMultiGPULlama:
             temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
         )

-    def test_fsdp_qlora_prequant_packed(self, temp_dir):
+    def test_fsdp_qlora_prequant_packed(
+        self, temp_dir, sft_prepared_dataset_alpaca_cfg
+    ):
         # pylint: disable=duplicate-code
-        cfg = DictDefault(
-            {
-                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
-                "adapter": "qlora",
-                "mean_resizing_embeddings": True,
-                "load_in_4bit": True,
-                "lora_r": 8,
-                "lora_alpha": 16,
-                "lora_dropout": 0.05,
-                "lora_target_linear": True,
-                # "lora_modules_to_save": [
-                #     "embed_tokens",
-                #     "lm_head",
-                # ],
-                "sample_packing": True,
-                "eval_sample_packing": False,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
+        cfg = (
+            DictDefault(
+                {
+                    "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
+                    "adapter": "qlora",
+                    "mean_resizing_embeddings": True,
+                    "load_in_4bit": True,
+                    "lora_r": 8,
+                    "lora_alpha": 16,
+                    "lora_dropout": 0.05,
+                    "lora_target_linear": True,
+                    # "lora_modules_to_save": [
+                    #     "embed_tokens",
+                    #     "lm_head",
+                    # ],
+                    "eval_sample_packing": False,
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 2,
+                    "gradient_accumulation_steps": 2,
+                    # "gradient_checkpointing": True,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "fsdp": [
+                        "full_shard",
+                        "auto_wrap",
+                    ],
+                    "fsdp_config": {
+                        "fsdp_limit_all_gathers": True,
+                        "fsdp_offload_params": False,
+                        "fsdp_sync_module_states": True,
+                        "fsdp_use_orig_params": False,
+                        "fsdp_cpu_ram_efficient_loading": True,
+                        "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
+                        "fsdp_state_dict_type": "SHARDED_STATE_DICT",
+                        "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                     },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 2,
-                "gradient_accumulation_steps": 2,
-                # "gradient_checkpointing": True,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "fsdp": [
-                    "full_shard",
-                    "auto_wrap",
-                ],
-                "fsdp_config": {
-                    "fsdp_limit_all_gathers": True,
-                    "fsdp_offload_params": False,
-                    "fsdp_sync_module_states": True,
-                    "fsdp_use_orig_params": False,
-                    "fsdp_cpu_ram_efficient_loading": True,
-                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
-                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
-                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
-                },
-                "use_tensorboard": True,
-            }
+                    "use_tensorboard": True,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
```
```diff
@@ -641,7 +662,12 @@ class TestMultiGPULlama:
         [True, False],
     )
     def test_ds_zero3_packed(
-        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        deepspeed,
+        qlora,
     ):
         # pylint: disable=duplicate-code
         if qlora:
@@ -655,37 +681,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.05,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / deepspeed),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -706,7 +720,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.4, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
```
```diff
@@ -717,7 +731,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
     )
-    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero2_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -730,37 +750,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
         )

         # write cfg to yaml file
@@ -781,7 +789,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.parametrize(
```
```diff
@@ -792,7 +800,13 @@ class TestMultiGPULlama:
         "qlora",
         [True, False],
     )
-    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
+    def test_ds_zero1_packed(
+        self,
+        temp_dir,
+        sft_prepared_dataset_alpaca_cfg,
+        gradient_accumulation_steps,
+        qlora,
+    ):
         # pylint: disable=duplicate-code
         if qlora:
             adapter = {
@@ -805,37 +819,25 @@ class TestMultiGPULlama:
             }
         else:
             adapter = {}
-        cfg = DictDefault(
-            {
-                "base_model": "HuggingFaceTB/SmolLM2-135M",
-                "sample_packing": True,
-                "pad_to_sequence_len": True,
-                "sequence_len": 1024,
-                "val_set_size": 0.01,
-                "special_tokens": {
-                    "pad_token": "<|endoftext|>",
-                },
-                "datasets": [
-                    {
-                        "path": "tatsu-lab/alpaca",
-                        "type": "alpaca",
-                        "split": "train[:10%]",
-                    },
-                ],
-                "num_epochs": 1,
-                "max_steps": 2,
-                "micro_batch_size": 1,
-                "gradient_accumulation_steps": gradient_accumulation_steps,
-                "output_dir": temp_dir,
-                "dataset_prepared_path": temp_dir + "/last_run_prepared",
-                "learning_rate": 0.00001,
-                "optimizer": "adamw_torch_fused",
-                "lr_scheduler": "cosine",
-                "flash_attention": True,
-                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
-                "use_tensorboard": True,
-                **adapter,
-            }
+        cfg = (
+            DictDefault(
+                {
+                    "pad_to_sequence_len": True,
+                    "num_epochs": 1,
+                    "max_steps": 2,
+                    "micro_batch_size": 1,
+                    "gradient_accumulation_steps": gradient_accumulation_steps,
+                    "output_dir": temp_dir,
+                    "learning_rate": 0.00001,
+                    "optimizer": "adamw_torch_fused",
+                    "lr_scheduler": "cosine",
+                    "flash_attention": True,
+                    "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
+                    "use_tensorboard": True,
+                    **adapter,
+                }
+            )
+            | sft_prepared_dataset_alpaca_cfg
        )

         # write cfg to yaml file
@@ -856,7 +858,7 @@ class TestMultiGPULlama:
         )

         check_tensorboard(
-            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
+            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
         )

     @pytest.mark.skip(
```