Update human_chat_qlora.yml: working config for RTX 5080 (seq_len 2048, qlora, chat_template)

This commit is contained in:
2026-05-13 18:59:19 +00:00
parent 74f2263ac7
commit 981a13e110

View File

@@ -1,14 +1,15 @@
# Llama 3.1 8B — Human-like LoRA fine-tune (HQQ quantization) # Llama 3.1 8B - Human-like QLoRA fine-tune
# #
# Goal: natural, warm conversation; never corrects user errors; direct responses # Goal: natural, warm conversation; never corrects user errors; direct responses
# Hardware: single RTX 5080 (16 GB VRAM) # Hardware: single RTX 5080 (16 GB VRAM)
# Method: LoRA on HQQ 4-bit quantized base (bypasses bitsandbytes — RTX 5080 compatible) # Method: QLoRA (4-bit) via bitsandbytes (compiled from source for sm_120)
# #
# Prerequisites: # Prerequisites:
# pip install -e '.[flash-attn]' (inside your axolotl repo) # See SETUP_MIAAI.md for full environment setup including bitsandbytes compilation
# huggingface-cli login (meta-llama is a gated model) # huggingface-cli login (meta-llama is a gated model)
# #
# Run: # Run:
# export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
# axolotl train human_chat_qlora.yml # axolotl train human_chat_qlora.yml
# axolotl merge-lora human_chat_qlora.yml # (optional) merge adapter into base # axolotl merge-lora human_chat_qlora.yml # (optional) merge adapter into base
@@ -16,57 +17,56 @@ base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
model_type: LlamaForCausalLM model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer tokenizer_type: AutoTokenizer
# HQQ quantization — no bitsandbytes required, works on RTX 5080 (sm_120) load_in_4bit: true
quant_method: hqq
strict: false strict: false
trust_remote_code: true
torch_dtype: bfloat16
# --- System prompt baked into every conversation --- # --- System prompt baked into every conversation ---
# This is the primary lever for "no error correcting, more human-like"
chat_template: llama3 chat_template: llama3
default_system_message: >- default_system_message: >-
You are a direct, warm, and genuinely helpful assistant. You are a direct, warm, and genuinely helpful assistant.
Respond to the user's intent naturally — never comment on typos, grammar, Respond to the user's intent naturally - never comment on typos, grammar,
or phrasing issues in their message. Just understand what they mean and give or phrasing issues in their message. Just understand what they mean and give
a clear, useful, conversational answer as if talking to a knowledgeable friend. a clear, useful, conversational answer as if talking to a knowledgeable friend.
# --- Datasets --- # --- Datasets ---
# Both use ShareGPT format: conversations field, from/value keys # SlimOrca: ~74k carefully curated conversations - good for natural tone; sampled to 3%
# OpenHermes-2.5: broad instruction coverage - sampled to 5% to keep balance
datasets: datasets:
- path: Open-Orca/SlimOrca - path: Open-Orca/SlimOrca
type: chat_template type: chat_template
field_messages: conversations field_messages: conversations
message_field_role: from message_field_role: from
message_field_content: value message_field_content: value
split: "train[:3%]" split: train[:3%]
- path: teknium/OpenHermes-2.5 - path: teknium/OpenHermes-2.5
type: chat_template type: chat_template
field_messages: conversations field_messages: conversations
message_field_role: from message_field_role: from
message_field_content: value message_field_content: value
split: "train[:5%]" split: train[:5%]
dataset_prepared_path: last_run_prepared dataset_prepared_path: last_run_prepared
val_set_size: 0.01 val_set_size: 0.01
output_dir: ./outputs/llama31-8b-humanchat output_dir: ./outputs/llama31-8b-humanchat
sequence_len: 4096 # sequence_len 2048 is required on 16 GB VRAM - 4096 runs out of memory (OOM) during loss computation
# (the logits tensor, batch x seq_len x 128k vocab, exceeds available memory)
sequence_len: 2048
sample_packing: true sample_packing: true
pad_to_sequence_len: true pad_to_sequence_len: true
# --- LoRA adapter (on top of HQQ quantized base) --- # --- QLoRA adapter ---
adapter: lora adapter: qlora
lora_r: 64 lora_r: 64
lora_alpha: 32 lora_alpha: 32
lora_dropout: 0.05 lora_dropout: 0.05
lora_target_linear: true lora_target_linear: true
# --- Training hyperparameters --- # --- Training hyperparameters ---
# Effective batch = micro_batch_size x gradient_accumulation = 2 x 4 = 8 # Effective batch = micro_batch_size x gradient_accumulation = 1 x 8 = 8
micro_batch_size: 2 micro_batch_size: 1
gradient_accumulation_steps: 4 gradient_accumulation_steps: 8
num_epochs: 2 num_epochs: 2
optimizer: paged_adamw_32bit optimizer: paged_adamw_32bit
lr_scheduler: cosine lr_scheduler: cosine