From 981a13e110330332f20321df8246be37dbf728be Mon Sep 17 00:00:00 2001
From: tocmo0nlord
Date: Wed, 13 May 2026 18:59:19 +0000
Subject: [PATCH] Update human_chat_qlora.yml: working config for RTX 5080
 (seq_len 2048, qlora, chat_template)

---
 human_chat_qlora.yml | 38 +++++++++++++++++++-------------------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/human_chat_qlora.yml b/human_chat_qlora.yml
index a712092d0..28df2826b 100644
--- a/human_chat_qlora.yml
+++ b/human_chat_qlora.yml
@@ -1,14 +1,15 @@
-# Llama 3.1 8B — Human-like LoRA fine-tune (HQQ quantization)
+# Llama 3.1 8B - Human-like QLoRA fine-tune
 #
 # Goal: natural, warm conversation; never corrects user errors; direct responses
 # Hardware: single RTX 5080 (16 GB VRAM)
-# Method: LoRA on HQQ 4-bit quantized base (bypasses bitsandbytes — RTX 5080 compatible)
+# Method: QLoRA (4-bit) via bitsandbytes (compiled from source for sm_120)
 #
 # Prerequisites:
-#   pip install -e '.[flash-attn]'   (inside your axolotl repo)
-#   huggingface-cli login            (meta-llama is a gated model)
+#   See SETUP_MIAAI.md for full environment setup including bitsandbytes compilation
+#   huggingface-cli login  (meta-llama is a gated model)
 #
 # Run:
+#   export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 #   axolotl train human_chat_qlora.yml
 #   axolotl merge-lora human_chat_qlora.yml   # (optional) merge adapter into base
 
@@ -16,57 +17,56 @@
 base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
-# HQQ quantization — no bitsandbytes required, works on RTX 5080 (sm_120)
-quant_method: hqq
+load_in_4bit: true
 strict: false
-trust_remote_code: true
-torch_dtype: bfloat16
 
 # --- System prompt baked into every conversation ---
-# This is the primary lever for "no error correcting, more human-like"
 chat_template: llama3
 default_system_message: >-
   You are a direct, warm, and genuinely helpful assistant.
-  Respond to the user's intent naturally — never comment on typos, grammar,
+  Respond to the user's intent naturally - never comment on typos, grammar,
   or phrasing issues in their message. Just understand what they mean and
   give a clear, useful, conversational answer as if talking to a knowledgeable friend.
 
 # --- Datasets ---
-# Both use ShareGPT format: conversations field, from/value keys
+# SlimOrca: ~518k carefully curated conversations - good for natural tone
+# OpenHermes-2.5: broad instruction coverage - sampled to 5% to keep balance
 datasets:
   - path: Open-Orca/SlimOrca
     type: chat_template
     field_messages: conversations
     message_field_role: from
     message_field_content: value
-    split: "train[:3%]"
+    split: train[:3%]
   - path: teknium/OpenHermes-2.5
     type: chat_template
     field_messages: conversations
     message_field_role: from
     message_field_content: value
-    split: "train[:5%]"
+    split: train[:5%]
 
 dataset_prepared_path: last_run_prepared
 val_set_size: 0.01
 output_dir: ./outputs/llama31-8b-humanchat
 
-sequence_len: 4096
+# sequence_len 2048 required on 16 GB VRAM - 4096 OOMs during loss computation
+# (logits tensor: batch x seq_len x 128k vocab exceeds available memory)
+sequence_len: 2048
 sample_packing: true
 pad_to_sequence_len: true
 
-# --- LoRA adapter (on top of HQQ quantized base) ---
-adapter: lora
+# --- QLoRA adapter ---
+adapter: qlora
 lora_r: 64
 lora_alpha: 32
 lora_dropout: 0.05
 lora_target_linear: true
 
 # --- Training hyperparameters ---
-# Effective batch = micro_batch_size x gradient_accumulation = 2 x 4 = 8
-micro_batch_size: 2
-gradient_accumulation_steps: 4
+# Effective batch = micro_batch_size x gradient_accumulation = 1 x 8 = 8
+micro_batch_size: 1
+gradient_accumulation_steps: 8
 num_epochs: 2
 optimizer: paged_adamw_32bit
 lr_scheduler: cosine
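
Reviewer note: the seq_len claim in the diff can be checked with back-of-envelope
arithmetic. The sketch below assumes bf16 logits plus a single fp32 upcast inside
the cross-entropy loss (the exact overhead depends on the transformers version),
so read the numbers as a lower bound rather than an exact measurement:

    # logits_estimate.py - rough size of the lm_head output during training.
    # Assumption: one bf16 copy (2 bytes) plus one fp32 upcast (4 bytes) per logit.
    VOCAB_SIZE = 128_256  # Llama 3.1 vocabulary size

    def logits_gib(micro_batch: int, seq_len: int) -> float:
        return micro_batch * seq_len * VOCAB_SIZE * (2 + 4) / 1024**3

    print(f"old config, 2 x 4096: {logits_gib(2, 4096):.1f} GiB")  # ~5.9 GiB
    print(f"new config, 1 x 2048: {logits_gib(1, 2048):.1f} GiB")  # ~1.5 GiB

Because sample_packing fills every sequence to seq_len, the estimate applies to
every step. Next to the roughly 5 GB of 4-bit base weights, the LoRA gradients and
paged optimizer state, and the activations, ~5.9 GiB of logits plausibly tips a
16 GB card into OOM, while ~1.5 GiB leaves headroom.

A second quick check that the environment matches what this config expects; this
snippet is illustrative and not taken from SETUP_MIAAI.md:

    import torch
    import bitsandbytes  # must be the source build for sm_120

    print(torch.cuda.get_device_capability())  # expect (12, 0) on an RTX 5080
    print(bitsandbytes.__version__)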