Fine-Tune Qwen3 14B with Axolotl


Built with Axolotl

Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent, and stable performance. Train your workload and ship your product 30% faster, saving you both time and money.


Installation


Axolotl is easy to install from pip, or use our pre-built Docker images for a hassle-free dependency experience. See our docs for more information.

%%capture
# This step can take ~5-10 minutes to install dependencies
# (the version spec is quoted so the shell doesn't treat ">=" as a redirect)
!pip install --no-build-isolation "axolotl[flash-attn]>=0.9.1"
!pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@78b2a45713a54c9bedf8b33f5e31cf07a1a57154"
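Before going further, it's worth confirming that the Colab runtime actually has a GPU attached; a minimal check with PyTorch (which the install above pulls in):

import torch

# Verify a CUDA GPU is available; a T4 (Colab free tier) is enough for this QLoRA run
assert torch.cuda.is_available(), "No GPU detected - switch the Colab runtime to a GPU"
print(torch.cuda.get_device_name(0))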

Demo: Talk Like a Pirate

In this demo, we train the model to respond like a pirate. This task is an easy way to show how to train a model to answer in a style of your choosing (without being prompted to do so), and the result is simple to validate within the scope of a Colab.


Upload your own dataset or use a Huggingface dataset

You can use your own JSONL file from your own Google Drive; for example, download the Pirate-Ultrachat JSONL to your Drive. JSONL datasets should be formatted similarly to the OpenAI dataset format, as sketched below.
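For illustration, a minimal sketch of what one such record could look like (a hypothetical two-turn pirate example, one JSON object per line):

import json

# One training record per line: a "messages" list of role/content turns,
# following the OpenAI-style chat format
record = {
    "messages": [
        {"role": "user", "content": "Where be the treasure buried?"},
        {"role": "assistant", "content": "Arr, ten paces east o' the old oak, matey!"},
    ]
}
with open("train.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")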


You can also simply use the winglian/pirate-ultrachat-10k dataset directly.

# Default to HF dataset location
dataset_id = "winglian/pirate-ultrachat-10k"
uploaded = {}
import os

# Optionally, upload your own JSONL to your Google Drive
GOOGLE_DRIVE_PATH = ""  # ex: "MyDrive/Colab Notebooks/train.jsonl"

# "Select All" permissions, or you may get the error:
# "MessageError: Error: credential propagation was unsuccessful"
if GOOGLE_DRIVE_PATH:
    from google.colab import drive

    # Mount your Google Drive
    GOOGLE_DRIVE_MNT = "/content/drive/"
    drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)
    tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip("/"))
    # Make sure the file exists
    if not os.path.isfile(tmp_path):
        raise ValueError(f"File {tmp_path} does not exist")
    dataset_id = tmp_path

Configure for Supervised Fine-Tuning (SFT)

from axolotl.utils.dict import DictDefault
from axolotl.cli.config import load_cfg

# Axolotl provides full control and transparency over model and training configuration
config = DictDefault(
    base_model = "Qwen/Qwen3-14B",  # use the instruct-tuned model; we're aligning it to be a pirate
    load_in_4bit = True,  # set to True for QLoRA
    adapter = "qlora",
    lora_r = 32,
    lora_alpha = 64,
    lora_target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",  # train self_attn linear modules
        "gate_proj", "down_proj", "up_proj",  # train MLP linear modules
    ],
    lora_qkv_kernel = True,  # optimized Triton kernels for LoRA
    lora_o_kernel = True,
    lora_mlp_kernel = True,
    embeddings_skip_upcast = True,  # keep embeddings in fp16 so the model fits in 15GB VRAM
    xformers_attention = True,  # use xformers on Colab w/ T4 for memory-efficient attention; flash_attention works only on Ampere or above
    plugins = [
        # more efficient training using Apple's Cut Cross Entropy; https://github.com/apple/ml-cross-entropy
        "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
    ],
    sample_packing = True,  # 2-6x increase in tokens per micro-batch
    # when using packing, use a slightly higher learning rate to account for fewer steps;
    # alternatively, reduce micro_batch_size + gradient_accumulation_steps to achieve closer to the same number of steps/epoch
    learning_rate = 0.00019,
    sequence_len = 4096,  # a larger sequence length improves packing efficiency for more tokens/sec
    micro_batch_size = 1,
    gradient_accumulation_steps = 1,
    gradient_checkpointing = True,  # trade off reduced VRAM for increased time
    gradient_checkpointing_kwargs = {
        "use_reentrant": False,
    },
    optimizer = "paged_adamw_8bit",
    lr_scheduler = "cosine",
    warmup_steps = 5,
    fp16 = True,  # use float16 + automatic mixed precision; bfloat16 is not supported on Colab w/ T4
    bf16 = False,
    max_grad_norm = 0.1,  # gradient clipping
    num_epochs = 1,
    saves_per_epoch = 2,  # how many checkpoints to save over one epoch
    logging_steps = 1,
    output_dir = "./outputs/qwen-sft-pirate-rrr",
    chat_template = "qwen3",
    datasets = [
        {
            "path": dataset_id,  # Hugging Face dataset id or path to train.jsonl
            "type": "chat_template",
            "split": "train",
            "eot_tokens": ["<|im_end|>"],
        }
    ],
    dataloader_prefetch_factor = 8,  # dataloader optimizations
    dataloader_num_workers = 2,
    dataloader_pin_memory = True,
)

# validates the configuration
cfg = load_cfg(config)
[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`
[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing
[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.
[2025-05-08 13:40:28,590] [INFO] [axolotl.normalize_config:237] [PID:174] [RANK:0] cuda memory usage baseline: 0.000GB (+0.002GB cache, +0.359GB misc)
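Because load_cfg accepts the same schema as Axolotl's YAML config files, you can also persist this notebook configuration for reuse outside Colab. A small sketch, assuming DictDefault exposes an addict-style to_dict() that recursively yields plain dicts:

import yaml

# Export the in-notebook config to a standard Axolotl YAML file so the same
# run can be reproduced from the CLI, e.g.:
#   accelerate launch -m axolotl.cli.train qwen3-pirate.yaml
with open("qwen3-pirate.yaml", "w") as f:
    yaml.safe_dump(config.to_dict(), f, sort_keys=False)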
from axolotl.utils import patch_optimized_env

# speed up downloads from HF 🤗 and set the "PYTORCH_CUDA_ALLOC_CONF" env var to save memory
patch_optimized_env()

Datasets

Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl masks the input tokens from the user's prompt so that the training loss is only calculated against the model's response. For more information, see our documentation on dataset preparation.

from axolotl.common.datasets import load_datasets

# Load, parse, and tokenize the datasets, formatted with the qwen3 chat template
# Drop long samples that overflow the max sequence length
dataset_meta = load_datasets(cfg=cfg)
[2025-05-08 13:41:00,844] [DEBUG] [axolotl.utils.models.load_tokenizer:441] [PID:174] [RANK:0] EOS: 151645 / <|im_end|>
[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None
[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>
[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None
[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95
[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...
[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset.
[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42
[2025-05-08 13:41:04,196] [INFO] [axolotl.utils.data.sft.get_dataset_wrapper:484] [PID:174] [RANK:0] Loading dataset with base_type: chat_template and prompt_style: None
[2025-05-08 13:41:04,233] [INFO] [axolotl.__call__:761] [PID:174] [RANK:0] Using chat template:
---
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set content = message.content %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in message.content %}
                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.last or (not loop.last and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- endif %}
{%- endif %}
---
[2025-05-08 13:42:09,195] [INFO] [axolotl.utils.data.utils.drop_long_seq_in_dataset:177] [PID:174] [RANK:0] min_input_len: 23
[2025-05-08 13:42:09,196] [INFO] [axolotl.utils.data.utils.drop_long_seq_in_dataset:179] [PID:174] [RANK:0] max_input_len: 3380
[2025-05-08 13:42:21,651] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:351] [PID:174] [RANK:0] Saving merged prepared dataset to disk... last_run_prepared/97037817611d38b3a9c681753c3c4c95
[2025-05-08 13:42:25,711] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:411] [PID:174] [RANK:0] gather_len_batches: [1540]
[2025-05-08 13:42:25,714] [INFO] [axolotl.calc_sample_packing_eff_est:491] [PID:174] [RANK:0] sample_packing_eff_est across ranks: [0.9987832601968344]
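Before kicking off training, you can sanity-check the prompt masking on a tokenized sample. A small sketch, assuming dataset_meta.train_dataset exposes the standard input_ids/labels columns that the SFT pipeline produces; tokens labeled -100 are excluded from the loss, so only the assistant turns should remain unmasked:

# Count how many tokens of the first sample are masked out of the loss
sample = dataset_meta.train_dataset[0]
num_masked = sum(1 for label in sample["labels"] if label == -100)
print(f"{num_masked}/{len(sample['labels'])} tokens masked (prompt); the rest are trained on")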

Training

from axolotl.train import train

# Train for just the first 25 steps for the demo. This is sufficient to align
# the model, as we've used sample packing to maximize the trainable samples per step.
cfg.max_steps = 25
model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
(Axolotl ASCII art banner)

[2025-05-07 22:08:14,344] [INFO] [axolotl.monkeypatch.peft.utils.patch_peft_prep_code:76] [PID:1336] [RANK:0] patching prepare_model_for_kbit_training to allow for overrides
[2025-05-07 22:08:14,549] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:80] [PID:1336] [RANK:0] Applying Cut Cross Entropy to model type: qwen3
[2025-05-07 22:09:49,798] [INFO] [accelerate.utils.modeling.get_balanced_memory:990] [PID:1336] We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).
[2025-05-07 22:11:37,521] [INFO] [axolotl.utils.models.load_model:1302] [PID:1336] [RANK:0] cuda memory usage after model load: 9.264GB (+1.721GB cache, +0.375GB misc)
[2025-05-07 22:11:37,532] [INFO] [axolotl.utils.models.prepare_model:1205] [PID:1336] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training
[2025-05-07 22:11:37,537] [INFO] [axolotl.utils.models.load_model:1341] [PID:1336] [RANK:0] Converting modules to torch.float16
trainable params: 128,450,560 || all params: 14,896,757,760 || trainable%: 0.8623
[2025-05-07 22:11:40,170] [INFO] [axolotl.utils.models.load_model:1402] [PID:1336] [RANK:0] cuda memory usage after adapters: 9.743GB (+1.476GB cache, +0.375GB misc)
/usr/local/lib/python3.11/dist-packages/axolotl/core/trainers/base.py:64: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `AxolotlTrainer.__init__`. Use `processing_class` instead.
  super().__init__(*_args, **kwargs)
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
[2025-05-07 22:11:41,755] [INFO] [axolotl.train.save_initial_configs:359] [PID:1336] [RANK:0] Pre-saving adapter config to ./outputs/qwen-sft-pirate-rrr...
[2025-05-07 22:11:41,756] [INFO] [axolotl.train.save_initial_configs:363] [PID:1336] [RANK:0] Pre-saving tokenizer to ./outputs/qwen-sft-pirate-rrr...
[2025-05-07 22:11:41,974] [INFO] [axolotl.train.save_initial_configs:366] [PID:1336] [RANK:0] Pre-saving model config to ./outputs/qwen-sft-pirate-rrr...
[2025-05-07 22:11:41,982] [INFO] [axolotl.train.execute_training:211] [PID:1336] [RANK:0] Starting trainer...
[2025-05-07 22:11:45,047] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:411] [PID:1336] [RANK:0] gather_len_batches: [1540]
You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
[25/25 09:25, Epoch 0/1]

Step    Training Loss
   1         1.092300
   2         1.554200
   3         1.041400
   4         1.733800
   5         1.430000
   6         1.258500
   7         1.343600
   8         1.101700
   9         1.086500
  10         0.813200
  11         0.689600
  12         0.826700
  13         1.541800
  14         0.948000
  15         1.357000
  16         1.085800
  17         1.516800
  18         1.146800
  19         0.834800
  20         0.968000
  21         1.388800
  22         1.511500
  23         1.338500
  24         1.206600
  25         1.504600
[2025-05-07 22:12:42,746] [INFO] [axolotl.callbacks.on_step_end:128] [PID:1336] [RANK:0] cuda memory usage while training: 9.768GB (+3.287GB cache, +0.646GB misc)
[2025-05-07 22:21:46,859] [INFO] [axolotl.train.save_trained_model:231] [PID:1336] [RANK:0] Training completed! Saving pre-trained model to ./outputs/qwen-sft-pirate-rrr.
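The per-step losses shown above are also available programmatically from the returned trainer, via the standard Hugging Face Trainer API:

# Pull the logged training losses out of the trainer state
losses = [entry["loss"] for entry in trainer.state.log_history if "loss" in entry]
print(f"steps logged: {len(losses)}, first loss: {losses[0]:.3f}, last loss: {losses[-1]:.3f}")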

Inference with the trained model

import torch
from transformers import TextStreamer

messages = [
    {
        "role": "user",
        "content": "Explain the Pythagorean theorem to me.",
    },
]

prompt = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
    enable_thinking=False,
)

outputs = model.generate(
    **tokenizer(prompt, return_tensors="pt").to("cuda"),
    max_new_tokens=192,
    temperature=1.0, top_p=0.8, top_k=32,
    streamer=TextStreamer(tokenizer, skip_prompt=True),
)
Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.

The Pythagorean theorem be a real gem of a mathematical trick that helps ye find the length of a side of a right triangle. Now, a right triangle be a triangle with a right angle, which be that little corner that looks like a square.

The theorem be named after a clever fellow named Pythagoras, who be a mathematician from ancient Greece. He discovered that if ye have a right triangle, the square of the length of the hypotenuse (that be the side opposite the right angle) be equal to the sum of the squares of the other two sides.

In other words, if ye have a triangle with sides of length a, b, and c (
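The response cuts off because generation hit max_new_tokens. Note too that we prompted with enable_thinking=False; the qwen3 chat template keeps Qwen3's hybrid thinking mode (see the <think> handling in the template above), so as a variation you can let the model reason first. A sketch, reusing the messages, model, and tokenizer from above:

# Same prompt, but allow a <think>...</think> reasoning block before the answer
prompt_thinking = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
    enable_thinking=True,
)
outputs = model.generate(
    **tokenizer(prompt_thinking, return_tensors="pt").to("cuda"),
    max_new_tokens=512,  # leave headroom for the reasoning tokens
    streamer=TextStreamer(tokenizer, skip_prompt=True),
)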

Saving your trained model


Axolotl automatically saves checkpoints to the output_dir path.

# Show the saved checkpoints in the output_dir
!ls -lh "./outputs/qwen-sft-pirate-rrr"
total 506M
-rw-r--r-- 1 root root  845 May  7 22:21 adapter_config.json
-rw-r--r-- 1 root root 491M May  7 22:21 adapter_model.safetensors
-rw-r--r-- 1 root root  707 May  7 22:11 added_tokens.json
drwxr-xr-x 2 root root 4.0K May  7 22:17 checkpoint-13
drwxr-xr-x 2 root root 4.0K May  7 22:21 checkpoint-25
-rw-r--r-- 1 root root 1.2K May  7 22:11 config.json
-rw-r--r-- 1 root root 1.6M May  7 22:11 merges.txt
-rw-r--r-- 1 root root 2.6K May  7 22:21 README.md
-rw-r--r-- 1 root root  613 May  7 22:11 special_tokens_map.json
-rw-r--r-- 1 root root 9.5K May  7 22:11 tokenizer_config.json
-rw-r--r-- 1 root root  11M May  7 22:11 tokenizer.json
-rw-r--r-- 1 root root 2.7M May  7 22:11 vocab.json
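Note that adapter_model.safetensors holds only the LoRA adapter, not standalone weights. If you want a merged model, PEFT's merge_and_unload() is one route; a sketch (merging into a 4-bit-quantized base is lossy, so for a production merge reload the base model in fp16/bf16 and re-apply the adapter first):

# Merge the LoRA adapter into the base weights and save a standalone model;
# `model` is the PeftModelForCausalLM returned by train() above
merged = model.merge_and_unload()
merged.save_pretrained("./outputs/qwen-sft-pirate-rrr/merged")
tokenizer.save_pretrained("./outputs/qwen-sft-pirate-rrr/merged")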

Setting hub_model_id: in the original config would have automatically uploaded the model to HuggingFace Hub (e.g. hub_model_id: username/model_id)


If you prefer to upload the training artifacts manually, you can still push the entire final checkpoint to the Hugging Face Hub from the CLI.

from huggingface_hub import notebook_login

# Remove the partial-epoch checkpoints (the glob must stay outside quotes so the shell expands it)
!rm -rf ./outputs/qwen-sft-pirate-rrr/checkpoint-*

# HF notebook login widget
notebook_login()

# Upload the LoRA adapter for your model to HF; remember to update the username/model-name below
!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B "./outputs/qwen-sft-pirate-rrr"
It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`huggingface-cli upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder.
Start hashing 40 files.
Finished hashing 40 files.
Uploading files using Xet Storage..
Uploading...:  87% 1.82G/2.10G [00:23<00:04, 67.3MB/s]Cancellation requested; stopping current tasks.
Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py", line 598, in _upload_xet_files
    upload_files(
RuntimeError: Xet Runtime Error: Task cancelled; possible runtime shutdown in progress (task 9 was cancelled).

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py", line 57, in main
    service.run()
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py", line 207, in run
    print(self._upload())
          ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py", line 302, in _upload
    return self.api.upload_folder(
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py", line 1633, in _inner
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py", line 4942, in upload_folder
    commit_info = self.create_commit(
                  ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py", line 1633, in _inner
    return fn(self, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py", line 4202, in create_commit
    self.preupload_lfs_files(
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py", line 4483, in preupload_lfs_files
    _upload_xet_files(**upload_kwargs, create_pr=create_pr)  # type: ignore [arg-type]
    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py", line 592, in _upload_xet_files
    with progress_cm as progress:
  File "/usr/local/lib/python3.11/dist-packages/tqdm/std.py", line 1138, in __exit__
    def __exit__(self, exc_type, exc_value, traceback):

KeyboardInterrupt
^C