From 74f2263ac72ed35bd1474c5616ffcaf65e39eeac Mon Sep 17 00:00:00 2001
From: tocmo0nlord <tocmo0nlord@192.168.1.63>
Date: Wed, 13 May 2026 18:58:51 +0000
Subject: [PATCH] Update SETUP_MIAAI.md: bitsandbytes sm_120 patch, OOM fixes,
 working training config

---
 SETUP_MIAAI.md | 107 ++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 101 insertions(+), 6 deletions(-)

diff --git a/SETUP_MIAAI.md b/SETUP_MIAAI.md
index 36d1a8db9..1563d6fc1 100644
--- a/SETUP_MIAAI.md
+++ b/SETUP_MIAAI.md
@@ -1,11 +1,13 @@
 # Axolotl Setup — miaai (RTX 5080, CUDA 13.2)
 
 ## System Info
-- GPU: NVIDIA RTX 5080 (16GB VRAM)
-- Driver: 580.126.09 — max CUDA 13.0 (nvcc from conda resolves to 13.2)
-- OS: Ubuntu (Python 3.13 system — do NOT use system Python for ML)
+- GPU: NVIDIA RTX 5080 (16GB VRAM, sm_120 / Blackwell)
+- Driver: 580.126.09 — max CUDA 13.0 shown by nvidia-smi, but nvcc from conda is 13.2
+- OS: Ubuntu 25.10 (Python 3.13 system — do NOT use system Python for ML)
 - Axolotl branch: `activeblue/main`
 
+---
+
 ## One-time Setup
 
 ### 1. Install Miniconda
@@ -32,13 +34,16 @@ git rebase upstream/main        # keeps activeblue patches on top
 git push origin activeblue/main --force-with-lease
 ```
 
-### 4. Install CUDA toolkit (needed to compile flash-attn)
+### 4. Install CUDA toolkit (needed to compile flash-attn and bitsandbytes)
 ```bash
 conda install -y -c "nvidia/label/cuda-12.8.0" cuda-toolkit
 export CUDA_HOME=$CONDA_PREFIX
 export PATH=$CUDA_HOME/bin:$PATH
 ```
 
+> NOTE: Despite installing from the cuda-12.8.0 channel, conda resolves nvcc to **13.2.78**.
+> This is fine — use cu132 everywhere to match.
+
 ### 5. Install PyTorch — use cu132 (matches nvcc from conda)
 > NOTE: torchaudio has no cu132 wheel — skip it, not needed for LLM training
 ```bash
@@ -52,26 +57,109 @@ pip install -e "."
 ```
 
 > **flash-attn compiles CUDA kernels from source — takes 15–25 min on 10 cores of i7-14700K.**
-> Always set `MAX_JOBS` to the number of available CPU cores to parallelize and speed up compilation:
+> Always set `MAX_JOBS` to the number of available CPU cores:
 ```bash
 MAX_JOBS=10 pip install flash-attn --no-build-isolation
 ```
 
+### 7. Compile bitsandbytes from source for sm_120 (RTX 5080 / Blackwell)
+
+The prebuilt bitsandbytes wheels do not include sm_120 kernels, and the CUDA 13.x toolkit can no longer compile the legacy sm_50–53 targets that the default build requests.
+You must compile from source with a patched CMakeLists.txt.
+
+```bash
+# Clone bitsandbytes v0.49.1
+git clone --branch v0.49.1 --depth 1 https://github.com/bitsandbytes-foundation/bitsandbytes.git /tmp/bnb_0491
+cd /tmp/bnb_0491
+
+# Patch CMakeLists.txt: override arch list to sm_120 only, just before the foreach loop
+# (cmake >= 3.23.0 skips the manual arch block and uses its own built-in list which lacks sm_120)
+sed -i '/    foreach(capability \${CMAKE_CUDA_ARCHITECTURES_ALL})/i\    # RTX 5080 sm_120 patch: override before capability list is built\n    set(CMAKE_CUDA_ARCHITECTURES_ALL 120)' CMakeLists.txt
+
+# Verify the patch landed at the right line
+grep -n "ARCHITECTURES_ALL\|foreach" CMakeLists.txt | tail -5
+# Should show: set(CMAKE_CUDA_ARCHITECTURES_ALL 120) immediately before the foreach line
+
+# Configure — must point cmake at conda's nvcc
+cmake \
+  -DCMAKE_CUDA_COMPILER=/opt/miniconda3/envs/axolotl/bin/nvcc \
+  -DCOMPUTE_BACKEND=cuda \
+  -S /tmp/bnb_0491 \
+  -B /tmp/bnb_0491/build 2>&1 | grep -E "(Capabilit|CUDA Ver|Error)"
+# Expected: "CUDA Capabilities Selected: 120"
+
+# Build (-j10 runs 10 parallel jobs — adjust to your CPU core count)
+cmake --build /tmp/bnb_0491/build -j10
+
+# Install into conda site-packages
+SITE_PKG=/opt/miniconda3/envs/axolotl/lib/python3.11/site-packages
+cp -r /tmp/bnb_0491/bitsandbytes "$SITE_PKG/"
+
+# Verify
+python3 -c "
+import torch, bitsandbytes as bnb
+x = torch.randn(64, 64, device='cuda')
+l = bnb.nn.Linear8bitLt(64, 64).cuda()
+print('bitsandbytes CUDA OK:', l(x).shape)
+"
+```
+
+---
+
 ## Every Session (after first-time setup)
 ```bash
 export PATH="/opt/miniconda3/bin:$PATH"
 conda activate axolotl
 export CUDA_HOME=$CONDA_PREFIX
 export PATH=$CUDA_HOME/bin:$PATH
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 cd /home/tocmo0nlord/axolotl
 ```
 
+---
+
+## Training Config — human_chat_qlora.yml
+
+Key settings that work on RTX 5080 (16GB):
+
+| Setting | Value | Notes |
+|---|---|---|
+| `sequence_len` | `2048` | 4096 causes OOM during loss computation (logits tensor is batch x seq_len x 128k vocab) |
+| `micro_batch_size` | `1` | Keep low; effective batch = micro x grad_accum |
+| `gradient_accumulation_steps` | `8` | Effective batch = 8 |
+| `adapter` | `qlora` | QLoRA 4-bit via bitsandbytes |
+| `attn_implementation` | `flash_attention_2` | Not the deprecated `flash_attention: true` |
+| `type` (datasets) | `chat_template` | Not the deprecated `sharegpt` |
+
+Dataset field mappings for SlimOrca / OpenHermes-2.5 (ShareGPT-style records: messages live under `conversations`, with `from`/`value` keys instead of `role`/`content`):
+```yaml
+datasets:
+  - path: Open-Orca/SlimOrca
+    type: chat_template
+    field_messages: conversations
+    message_field_role: from
+    message_field_content: value
+    split: "train[:3%]"
+```
+
 ## Run Training
 ```bash
-axolotl train human_chat_qlora.yml
+export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
+axolotl train ~/human_chat_qlora.yml
 ```
 
+Expected startup sequence:
+1. Config validation + capability detection (shows `sm_120`)
+2. Dataset tokenization (~65k samples, ~30 seconds)
+3. `Loading weights: 100% 291/291`
+4. `trainable params: 167,772,160 || all params: 8,198,033,408 || trainable%: 2.05`
+5. Initial eval: loss ~0.81, perplexity ~2.25, VRAM ~8.5GB
+6. Training steps at ~2.6 it/s, VRAM ~9-10GB
+
+---
+
 ## Common Pitfalls Encountered
+
 | Problem | Cause | Fix |
 |---|---|---|
 | `externally-managed-environment` | System Python 3.13 blocks pip | Use conda env, never system pip |
@@ -81,3 +169,10 @@ axolotl train human_chat_qlora.yml
 | `torchaudio` not found for cu132 | No cu132 wheel exists | Skip torchaudio — not needed |
 | `src refspec main does not match` | Fork default branch is `activeblue/main` | `git push origin activeblue/main` |
 | flash-attn compile is slow | Single-threaded by default | Set `MAX_JOBS=<cpu_count>` before pip install |
+| `nvcc fatal: Unsupported gpu architecture 'compute_50'` | The arch list used by the bitsandbytes build still includes sm_50, which CUDA 13.x no longer supports | Patch CMakeLists.txt (see step 7 above) |
+| `CUDA Capabilities Selected: 50;52;...` (ignores sm_120) | cmake >= 3.23 built-in arch list lacks sm_120 | Add `set(CMAKE_CUDA_ARCHITECTURES_ALL 120)` before foreach loop |
+| `BackendUnavailable: scikit_build_core` | pip install of bnb tries to rebuild | Skip pip; copy the built `bitsandbytes/` package directory (with the compiled .so) into site-packages (see step 7) |
+| `torch.OutOfMemoryError` during eval | logits tensor (batch x 4096 x 128k vocab) too large | Set `sequence_len: 2048`, `micro_batch_size: 1` |
+| `type: sharegpt` deprecation warning | axolotl deprecated the `sharegpt` dataset type | Use `type: chat_template` with field mappings |
+| `flash_attention: true` deprecation warning | Old config key is deprecated | Use `attn_implementation: flash_attention_2` |
+| Capybara dataset `field_messages null` | Capybara uses input/output format, not conversations | Switch to SlimOrca or OpenHermes-2.5 |