# Llama 3.1 8B — Human-like QLoRA fine-tune
#
# Goal:     natural, warm conversation; never corrects user errors; direct responses
# Hardware: single RTX 5080 (16 GB VRAM)
# Method:   QLoRA (4-bit) via Axolotl
#
# Prerequisites:
#   pip install -e '.[flash-attn]'   (inside your axolotl repo)
#   huggingface-cli login            (meta-llama is a gated model)
#
# Run:
#   axolotl train human_chat_qlora.yml
#   axolotl merge-lora human_chat_qlora.yml   # (optional) merge adapter into base

# --- Base model & tokenizer ---
base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

load_in_4bit: true

# strict: false lets keys outside Axolotl's schema pass without an error, so a
# misspelled option is easy to miss — double-check key names when editing.
strict: false

# The Llama architecture is implemented natively in transformers; no code from
# the model repository needs to be executed, so remote code stays disabled.
trust_remote_code: false

# Explicit dtype settings to avoid weight conversion errors
torch_dtype: bfloat16

# BitsAndBytes quantization options are passed through bnb_config_kwargs.
# NOTE(review): confirm against the installed Axolotl version that these keys
# are honored here rather than at the top level.
bnb_config_kwargs:
  bnb_4bit_use_double_quant: true
  bnb_4bit_quant_type: nf4
  bnb_4bit_compute_dtype: bfloat16

# --- System prompt baked into every conversation ---
# This is the primary lever for "no error correcting, more human-like"
# NOTE(review): Axolotl's config reference describes default_system_message as
# applying to the chatml template — confirm it takes effect with
# chat_template: llama3, and that system turns already present in the
# datasets do not take precedence over it.
chat_template: llama3
default_system_message: >-
  You are a direct, warm, and genuinely helpful assistant.
  Respond to the user's intent naturally — never comment on typos,
  grammar, or phrasing issues in their message. Just understand what
  they mean and give a clear, useful, conversational answer as if
  talking to a knowledgeable friend.
# --- Datasets ---
# Both use ShareGPT format: conversations field, from/value keys.
# The split expressions take a leading slice of the train split (not a random
# sample):
#   SlimOrca:       first 3% (~15k conversations)
#   OpenHermes-2.5: first 5% (~50k conversations), broad instruction coverage
datasets:
  - path: Open-Orca/SlimOrca
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value
    split: "train[:3%]"
  - path: teknium/OpenHermes-2.5
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value
    split: "train[:5%]"

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/llama31-8b-humanchat

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

# --- QLoRA adapter ---
adapter: qlora
lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true

# --- Training hyperparameters ---
# Effective batch = micro_batch_size x gradient_accumulation = 2 x 4 = 8
micro_batch_size: 2
gradient_accumulation_steps: 4
num_epochs: 2
optimizer: paged_adamw_32bit
lr_scheduler: cosine
# Written in decimal form: YAML 1.1 loaders (e.g. PyYAML) read an exponent
# literal without a decimal point, such as 2e-4, as a string.
learning_rate: 0.0002
warmup_ratio: 0.05
weight_decay: 0.1

train_on_inputs: false
group_by_length: false
bf16: auto
tf32: false

# --- Memory & speed ---
gradient_checkpointing: true
# Axolotl's switch for FlashAttention is the boolean flash_attention key.
# sample_packing relies on it to keep packed sequences from attending to each
# other.
# NOTE(review): confirm the installed flash-attn build supports this GPU.
flash_attention: true

# --- Logging & checkpointing ---
logging_steps: 10
evals_per_epoch: 2
saves_per_epoch: 1

# Use the tokenizer's dedicated padding token rather than <|eot_id|>: padding
# with the end-of-turn token risks the turn terminator being treated as
# padding (masked or stripped) by tooling that keys on pad_token_id.
special_tokens:
  pad_token: "<|finetune_right_pad_id|>"