From 71c6a56e7a2279bd6a3fd944cf84925df19e7a41 Mon Sep 17 00:00:00 2001
From: tocmo0nlord
Date: Wed, 13 May 2026 13:55:52 +0000
Subject: [PATCH] switch to HQQ quantization to bypass bitsandbytes sm_120 issue

---
NOTE(review): line structure of this patch was restored from a copy whose
newlines had been collapsed. Hunk line counts match the @@ headers and the
diffstat, but leading whitespace inside context lines (notably the nesting
under `datasets:`) is assumed -- run `git apply --check` before applying.
NOTE(review): confirm that the consuming tool's config schema recognizes
`quant_method: hqq`; if the key is ignored, removing `load_in_4bit` would
leave the base model unquantized.

 human_chat_qlora.yml | 18 ++++++------------
 1 file changed, 6 insertions(+), 12 deletions(-)

diff --git a/human_chat_qlora.yml b/human_chat_qlora.yml
index ba5f68a52..a712092d0 100644
--- a/human_chat_qlora.yml
+++ b/human_chat_qlora.yml
@@ -1,8 +1,8 @@
-# Llama 3.1 8B — Human-like QLoRA fine-tune
+# Llama 3.1 8B — Human-like LoRA fine-tune (HQQ quantization)
 #
 # Goal: natural, warm conversation; never corrects user errors; direct responses
 # Hardware: single RTX 5080 (16 GB VRAM)
-# Method: QLoRA (4-bit) via Axolotl
+# Method: LoRA on HQQ 4-bit quantized base (bypasses bitsandbytes — RTX 5080 compatible)
 #
 # Prerequisites:
 # pip install -e '.[flash-attn]' (inside your axolotl repo)
@@ -16,15 +16,11 @@ base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
 model_type: LlamaForCausalLM
 tokenizer_type: AutoTokenizer
 
-load_in_4bit: true
+# HQQ quantization — no bitsandbytes required, works on RTX 5080 (sm_120)
+quant_method: hqq
 strict: false
 trust_remote_code: true
-
-# Explicit dtype settings to avoid weight conversion errors
 torch_dtype: bfloat16
-bnb_4bit_use_double_quant: true
-bnb_4bit_quant_type: nf4
-bnb_4bit_compute_dtype: bfloat16
 
 # --- System prompt baked into every conversation ---
 # This is the primary lever for "no error correcting, more human-like"
@@ -37,8 +33,6 @@ default_system_message: >-
 
 # --- Datasets ---
 # Both use ShareGPT format: conversations field, from/value keys
-# SlimOrca: ~15k sample of high-quality multi-turn conversations
-# OpenHermes-2.5: broad instruction coverage, 5% sample (~50k)
 datasets:
   - path: Open-Orca/SlimOrca
     type: chat_template
@@ -62,8 +56,8 @@ sequence_len: 4096
 sample_packing: true
 pad_to_sequence_len: true
 
-# --- QLoRA adapter ---
-adapter: qlora
+# --- LoRA adapter (on top of HQQ quantized base) ---
+adapter: lora
 lora_r: 64
 lora_alpha: 32
 lora_dropout: 0.05