# Human-like chat fine-tune of Llama 3.1 8B using QLoRA.
#
# Goal:     natural, warm conversation; never corrects user errors; direct
#           responses
# Hardware: single RTX 5080 (16 GB VRAM)
# Method:   QLoRA (4-bit) via bitsandbytes (compiled from source for sm_120)
#
# Prerequisites:
#   - See SETUP_MIAAI.md for the full environment setup, including the
#     bitsandbytes compilation.
#   - huggingface-cli login   (meta-llama is a gated model)
#
# Run:
#   export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
#   axolotl train human_chat_qlora.yml
#   axolotl merge-lora human_chat_qlora.yml   # optional: merge adapter into base
---
# --- Base model and tokenizer ---
base_model: meta-llama/Meta-Llama-3.1-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

# Load the base weights in 4-bit (the QLoRA setup described in the header).
load_in_4bit: true
# NOTE(review): presumably lets keys outside the config schema pass without an
# error - confirm against the axolotl version in use.
strict: false

# --- System prompt intended for every conversation ---
# NOTE(review): confirm that axolotl applies default_system_message when
# chat_template is llama3; if it does not, the prompt below is not injected.
chat_template: llama3
default_system_message: >-
  You are a direct, warm, and genuinely helpful assistant. Respond to the
  user's intent naturally - never comment on typos, grammar, or phrasing
  issues in their message. Just understand what they mean and give a clear,
  useful, conversational answer as if talking to a knowledgeable friend.
# --- Datasets ---
# Both sources keep their turns under `conversations`, with the speaker in
# `from` and the text in `value` (see the field_* keys on each entry).
#   SlimOrca       - curated conversations, good for natural tone;
#                    first 3% of the train split
#   OpenHermes-2.5 - broad instruction coverage; first 5% of the train split
#                    to keep the mix balanced
# `train[:N%]` selects the leading N% of rows, not a random sample.
# NOTE(review): check dataset sizes against the dataset cards before tuning
# the percentages.
datasets:
  - path: Open-Orca/SlimOrca
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value
    split: "train[:3%]"
  - path: teknium/OpenHermes-2.5
    type: chat_template
    field_messages: conversations
    message_field_role: from
    message_field_content: value
    split: "train[:5%]"

dataset_prepared_path: last_run_prepared
# Hold out 1% of the combined data for evaluation.
val_set_size: 0.01
output_dir: ./outputs/llama31-8b-humanchat

# --- Sequence length & packing ---
# sequence_len 2048 required on 16GB VRAM - 4096 OOMs during loss computation
# (logits tensor: batch x seq_len x 128k vocab exceeds available memory)
sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

# --- QLoRA adapter ---
adapter: qlora
lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true

# --- Training hyperparameters ---
# Effective batch = micro_batch_size x gradient_accumulation = 1 x 8 = 8
micro_batch_size: 1
gradient_accumulation_steps: 8
num_epochs: 2
optimizer: paged_adamw_32bit
lr_scheduler: cosine
# Written in decimal form (= 2e-4): YAML 1.1 loaders such as PyYAML read an
# exponent literal without a decimal point as a string, not a float.
learning_rate: 0.0002
warmup_ratio: 0.05
weight_decay: 0.1
# Compute loss on assistant turns only, not on the prompt tokens.
train_on_inputs: false
group_by_length: false
bf16: auto
tf32: false

# --- Memory & speed ---
gradient_checkpointing: true
# FlashAttention 2 via axolotl's documented switch. sample_packing relies on
# a packing-aware attention backend, so this must actually take effect; a key
# outside the schema would be accepted silently while `strict` is false.
flash_attention: true

# --- Logging & checkpointing ---
logging_steps: 10
evals_per_epoch: 2
saves_per_epoch: 1

# Use the dedicated Llama 3.1 padding token. Padding with the end-of-turn
# token (<|eot_id|>) is risky: any collator or downstream tool that masks
# labels by pad id would also mask every real end-of-turn, and the model
# would not learn to stop.
special_tokens:
  pad_token: "<|finetune_right_pad_id|>"