Initial commit: Odoo 18 RAG stack

Scraper, indexer, and FastAPI query service for Retrieval-Augmented Generation over Odoo 18 documentation. Uses Qdrant + Ollama (nomic-embed-text + llama3.1). Integrates with ActiveBlue PeerBus agent interface.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 11:25:55 -04:00
commit 7fb1573bac
10 changed files with 1295 additions and 0 deletions
--- a/indexer/indexer.py
+++ b/indexer/indexer.py
@@ -0,0 +1,244 @@
+#!/usr/bin/env python3
+"""
+Odoo 18 RAG Indexer
+====================
+Reads scraped pages, chunks them, embeds with nomic-embed-text via Ollama,
+and upserts into Qdrant.
+
+Usage:
+    python indexer.py               # index everything
+    python indexer.py --reset       # drop collection and re-index
+    python indexer.py --module accounting
+
+Requires:
+    - Qdrant running:  docker compose up -d qdrant
+    - Ollama with model pulled:  ollama pull nomic-embed-text
+"""
+
+import json
+import logging
+import argparse
+import hashlib
+import time
+import os
+from pathlib import Path
+from dataclasses import dataclass
+
+import requests
+from qdrant_client import QdrantClient
+from qdrant_client.models import (
+    Distance, VectorParams, PointStruct,
+    Filter, FieldCondition, MatchValue,
+)
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s  %(message)s")  # root logger: INFO and above, "<LEVEL>  <message>" lines
+log = logging.getLogger(__name__)  # module logger used by every function below
+
+OLLAMA_URL      = os.getenv("OLLAMA_URL", "http://miaai:11434")  # base URL of the Ollama server (env-overridable)
+QDRANT_URL      = os.getenv("QDRANT_URL", "http://localhost:6333")  # base URL of the Qdrant server (env-overridable)
+EMBED_MODEL     = os.getenv("EMBED_MODEL", "nomic-embed-text")  # model name sent to Ollama's /api/embed
+COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs")  # Qdrant collection this script creates and fills
+VECTOR_SIZE     = 768  # vector dimension used when creating the collection; presumably EMBED_MODEL's output size — confirm if the model changes
+RAW_DATA_FILE   = Path("../data/raw/odoo18_docs_raw.jsonl")  # scraped pages, one JSON object per line; relative, so resolved against the current working directory
+BATCH_SIZE      = 32  # chunks sent to Ollama per embedding request
+CHUNK_SIZE      = 512  # chunk size budget; split_text() multiplies it by 0.75 to get a word count (presumably a token estimate)
+CHUNK_OVERLAP   = 64  # overlap between consecutive chunks, scaled by 0.75 into words the same way
+UPSERT_BATCH    = 100  # maximum number of points per Qdrant upsert call
+
+
+@dataclass
+class Chunk:  # one embeddable slice of a scraped page, plus the page metadata stored next to its vector
+    chunk_id: str  # first 20 hex chars of sha256("<doc_id>_<chunk_index>"), built in chunk_page()
+    doc_id: str  # the page record's "doc_id"
+    url: str  # the page record's "url"
+    title: str  # the page record's "title"
+    module: str  # the page record's "module"; "general" when the key is absent
+    section: str  # the page record's "section"; "" when the key is absent
+    headings: list  # the page record's "headings"; [] when absent (element type not visible here — presumably heading strings)
+    text: str  # the chunk text that gets embedded
+    chunk_index: int  # 0-based position of this chunk within its page
+
+
+def split_text(text: str, chunk_size: int | None = None,
+               chunk_overlap: int | None = None) -> list:
+    """
+    Split *text* into overlapping, word-bounded chunks suitable for embedding.
+
+    The text is first cut into sentence-like units, which are then packed
+    greedily into chunks of roughly ``chunk_size * 0.75`` words. When a chunk
+    is closed, its last ``chunk_overlap * 0.75`` words are repeated at the
+    start of the next one.
+
+    A unit ends at a word finishing with ".", "?" or "!", at the end of a line
+    whose last word finishes with ":", or at a blank line (paragraph break).
+    Units longer than the word budget are hard-split, so no chunk exceeds
+    budget + overlap words even for unpunctuated text (tables, code, lists).
+
+    Args:
+        text: Raw page text.
+        chunk_size: Size budget; defaults to the module-level CHUNK_SIZE.
+        chunk_overlap: Overlap budget; defaults to the module-level
+            CHUNK_OVERLAP.
+
+    Returns:
+        List of chunk strings with words joined by single spaces. Chunks of
+        80 characters or fewer are dropped.
+    """
+    size = CHUNK_SIZE if chunk_size is None else chunk_size
+    overlap = CHUNK_OVERLAP if chunk_overlap is None else chunk_overlap
+    target_words = max(1, int(size * 0.75))
+    # Keep the overlap strictly below the budget so every chunk adds new words.
+    overlap_words = min(max(0, int(overlap * 0.75)), target_words - 1)
+
+    # --- 1. Cut the text into sentence-like units (lists of words). ---------
+    # Line structure must be inspected *before* whitespace splitting:
+    # str.split() discards newlines, so a word can never end with "\n".
+    sentences = []
+    current = []
+    for line in text.splitlines():
+        line_words = line.split()
+        if not line_words:
+            # Blank line: paragraph break closes the running unit.
+            if current:
+                sentences.append(current)
+                current = []
+            continue
+        for word in line_words:
+            current.append(word)
+            if word.endswith((".", "?", "!")):
+                sentences.append(current)
+                current = []
+        # A line ending in ":" (e.g. a list or code-block lead-in) closes the unit.
+        if current and current[-1].endswith(":"):
+            sentences.append(current)
+            current = []
+    if current:
+        sentences.append(current)
+
+    # --- 2. Hard-split units that alone exceed the word budget. -------------
+    units = [
+        sentence[start:start + target_words]
+        for sentence in sentences
+        for start in range(0, len(sentence), target_words)
+    ]
+
+    # --- 3. Greedily pack units into chunks, carrying the overlap forward. --
+    chunks = []
+    buffer_words = []
+    for unit in units:
+        if buffer_words and len(buffer_words) + len(unit) > target_words:
+            chunks.append(" ".join(buffer_words))
+            carried = buffer_words[-overlap_words:] if overlap_words else []
+            buffer_words = carried + unit
+        else:
+            buffer_words.extend(unit)
+    if buffer_words:
+        chunks.append(" ".join(buffer_words))
+
+    # Tiny fragments carry too little signal to be worth embedding.
+    return [c for c in chunks if len(c.strip()) > 80]
+
+
+def chunk_page(page: dict) -> list:
+    """
+    Turn one scraped page record into a list of Chunk objects.
+
+    The page's "text" is split with split_text(); each resulting piece gets a
+    deterministic chunk_id (first 20 hex chars of sha256 over
+    "<doc_id>_<index>") and a copy of the page metadata. Optional keys fall
+    back to "general" (module), "" (section) and [] (headings).
+
+    Returns an empty list when the page yields no chunks.
+    """
+    pieces = split_text(page["text"])
+    return [
+        Chunk(
+            chunk_id=hashlib.sha256(f"{page['doc_id']}_{position}".encode()).hexdigest()[:20],
+            doc_id=page["doc_id"],
+            url=page["url"],
+            title=page["title"],
+            module=page.get("module", "general"),
+            section=page.get("section", ""),
+            headings=page.get("headings", []),
+            text=piece,
+            chunk_index=position,
+        )
+        for position, piece in enumerate(pieces)
+    ]
+
+
+def embed_batch(texts: list) -> list:
+    """
+    Embed a batch of strings with a single call to Ollama's /api/embed.
+
+    Returns the "embeddings" list from the JSON response, one entry per input
+    text.
+
+    Raises:
+        requests.HTTPError: on a non-2xx response.
+        ValueError: when the number of embeddings differs from the number
+            of inputs.
+    """
+    payload = {"model": EMBED_MODEL, "input": texts}
+    response = requests.post(f"{OLLAMA_URL}/api/embed", json=payload, timeout=120)
+    response.raise_for_status()
+
+    vectors = response.json().get("embeddings", [])
+    if len(vectors) == len(texts):
+        return vectors
+    raise ValueError(f"Expected {len(texts)} embeddings, got {len(vectors)}")
+
+
+def check_ollama() -> bool:
+    """
+    Verify that Ollama is reachable and that EMBED_MODEL has been pulled.
+
+    Queries ``{OLLAMA_URL}/api/tags`` and looks for EMBED_MODEL as a substring
+    of any listed model name (so "nomic-embed-text" matches
+    "nomic-embed-text:latest").
+
+    Returns:
+        True when the server answered successfully and the model is listed;
+        False otherwise. Never raises: every failure is logged and reported
+        through the return value.
+    """
+    try:
+        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
+        # Without this, an HTTP error with a JSON body (proxy, auth, 5xx) has no
+        # "models" key and would be misreported as "model not found".
+        resp.raise_for_status()
+        models = [m["name"] for m in resp.json().get("models", [])]
+        if not any(EMBED_MODEL in m for m in models):
+            log.error(f"Model '{EMBED_MODEL}' not found. Run: ollama pull {EMBED_MODEL}")
+            return False
+        log.info(f"Ollama OK at {OLLAMA_URL} — model {EMBED_MODEL} ready")
+        return True
+    except Exception as e:
+        # Deliberately broad: this is a pre-flight check and any failure
+        # (network, HTTP status, malformed JSON) means "not usable".
+        log.error(f"Ollama unreachable at {OLLAMA_URL}: {e}")
+        return False
+
+
+def setup_collection(client: QdrantClient, reset: bool = False):
+    """
+    Make sure the Qdrant collection COLLECTION_NAME exists.
+
+    With ``reset=True`` an existing collection is deleted first. A missing
+    (or just-deleted) collection is created with VECTOR_SIZE-dimensional
+    cosine vectors and keyword payload indexes on "module" and "url". An
+    existing collection is left untouched and only its point count is logged.
+    """
+    present = client.collection_exists(COLLECTION_NAME)
+
+    if present and reset:
+        log.info(f"Dropping collection '{COLLECTION_NAME}'...")
+        client.delete_collection(COLLECTION_NAME)
+        present = False
+
+    if present:
+        info = client.get_collection(COLLECTION_NAME)
+        log.info(f"Collection '{COLLECTION_NAME}' exists ({info.points_count} points)")
+        return
+
+    log.info(f"Creating collection '{COLLECTION_NAME}' (dim={VECTOR_SIZE})...")
+    client.create_collection(
+        collection_name=COLLECTION_NAME,
+        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
+    )
+    # Keyword indexes on the two payload fields, created in this order.
+    for indexed_field in ("module", "url"):
+        client.create_payload_index(COLLECTION_NAME, field_name=indexed_field, field_schema="keyword")
+
+
+def upsert_chunks(client: QdrantClient, chunks: list, vectors: list):
+    """
+    Write chunks and their embedding vectors to Qdrant.
+
+    Each chunk becomes one point whose payload carries all Chunk fields.
+    Points are sent in slices of at most UPSERT_BATCH per request.
+
+    Args:
+        client: Connected Qdrant client.
+        chunks: Chunk objects to store.
+        vectors: Embeddings, positionally aligned with *chunks*.
+
+    Raises:
+        ValueError: when *chunks* and *vectors* differ in length.
+    """
+    points = [
+        PointStruct(
+            # Qdrant integer ids are unsigned 64-bit, so use 16 hex digits of the
+            # hash. The previous 8 digits (32 bits) made birthday collisions
+            # likely at a few tens of thousands of chunks, and a collision
+            # silently overwrites an unrelated point.
+            # NOTE: points written with the old 32-bit ids are not replaced by
+            # this scheme; re-index such a collection with --reset.
+            id=int(chunk.chunk_id[:16], 16),
+            vector=vector,
+            payload={
+                "chunk_id":    chunk.chunk_id,
+                "doc_id":      chunk.doc_id,
+                "url":         chunk.url,
+                "title":       chunk.title,
+                "module":      chunk.module,
+                "section":     chunk.section,
+                "headings":    chunk.headings,
+                "text":        chunk.text,
+                "chunk_index": chunk.chunk_index,
+            },
+        )
+        # strict=True: fail loudly instead of silently dropping unmatched items.
+        for chunk, vector in zip(chunks, vectors, strict=True)
+    ]
+    for start in range(0, len(points), UPSERT_BATCH):
+        client.upsert(collection_name=COLLECTION_NAME, points=points[start:start + UPSERT_BATCH])
+
+
+def index(module_filter: str | None = None, reset: bool = False):
+    """
+    Run the full indexing pipeline: load pages, chunk, embed, upsert.
+
+    Args:
+        module_filter: When given, only pages whose "module" equals this
+            value are indexed.
+        reset: Drop and recreate the Qdrant collection before indexing.
+
+    Raises:
+        SystemExit: when the Ollama pre-flight check fails.
+        FileNotFoundError: when RAW_DATA_FILE does not exist.
+
+    Blank, malformed, or non-object lines in the JSONL file are skipped with
+    a warning rather than aborting the run. A batch that fails to embed or
+    upsert is logged and counted as failed; the run continues with the next
+    batch.
+    """
+    if not check_ollama():
+        raise SystemExit(1)
+
+    if not RAW_DATA_FILE.exists():
+        raise FileNotFoundError(
+            f"Raw data not found: {RAW_DATA_FILE}\n"
+            f"Run the scraper first: docker compose run --rm scraper"
+        )
+
+    client = QdrantClient(url=QDRANT_URL)
+    setup_collection(client, reset=reset)
+
+    pages = []
+    with open(RAW_DATA_FILE, encoding="utf-8") as f:
+        for line_no, line in enumerate(f, start=1):
+            line = line.strip()
+            if not line:
+                # A trailing newline or blank separator is not an error.
+                continue
+            try:
+                page = json.loads(line)
+            except json.JSONDecodeError as e:
+                # One truncated record (e.g. an interrupted scrape) must not
+                # discard the whole run.
+                log.warning(f"Skipping malformed JSON on line {line_no}: {e}")
+                continue
+            if not isinstance(page, dict):
+                log.warning(f"Skipping non-object record on line {line_no}")
+                continue
+            if module_filter and page.get("module") != module_filter:
+                continue
+            pages.append(page)
+    log.info(f"Loaded {len(pages)} pages")
+
+    all_chunks = []
+    for page in pages:
+        all_chunks.extend(chunk_page(page))
+    log.info(f"Created {len(all_chunks)} chunks")
+
+    total = len(all_chunks)
+    embedded = failed = 0
+
+    for i in range(0, total, BATCH_SIZE):
+        batch = all_chunks[i:i + BATCH_SIZE]
+        try:
+            vectors = embed_batch([c.text for c in batch])
+            upsert_chunks(client, batch, vectors)
+            embedded += len(batch)
+            log.info(f"Progress: {embedded}/{total} ({embedded/total*100:.0f}%)")
+        except Exception as e:
+            # Best effort per batch: record the failure, pause briefly so a
+            # struggling backend can recover, then move on.
+            log.error(f"Batch {i//BATCH_SIZE} failed: {e}")
+            failed += len(batch)
+            time.sleep(2)
+
+    info = client.get_collection(COLLECTION_NAME)
+    log.info(
+        f"\n✅ Done. Embedded: {embedded}, Failed: {failed}\n"
+        f"   Total vectors in Qdrant: {info.points_count}"
+    )
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the two optional flags and run the indexer.
+    cli = argparse.ArgumentParser(description="Odoo 18 RAG indexer")
+    cli.add_argument("--module", help="Index only one module")
+    cli.add_argument("--reset", action="store_true", help="Drop and recreate collection")
+    options = cli.parse_args()
+    index(reset=options.reset, module_filter=options.module)