Initial commit: Odoo 18 RAG stack
Scraper, indexer, and FastAPI query service for Retrieval-Augmented Generation over Odoo 18 documentation. Uses Qdrant + Ollama (nomic-embed-text + llama3.1). Integrates with ActiveBlue PeerBus agent interface. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
244
indexer/indexer.py
Normal file
244
indexer/indexer.py
Normal file
@@ -0,0 +1,244 @@
|
||||
#!/usr/bin/env python3
"""
Odoo 18 RAG Indexer
====================
Reads scraped pages, chunks them, embeds with nomic-embed-text via Ollama,
and upserts into Qdrant.

Usage:
    python indexer.py                      # index everything
    python indexer.py --reset              # drop collection and re-index
    python indexer.py --module accounting  # index a single module

Requires:
    - Qdrant running: docker compose up -d qdrant
    - Ollama with model pulled: ollama pull nomic-embed-text
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import argparse
|
||||
import hashlib
|
||||
import time
|
||||
import os
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
import requests
|
||||
from qdrant_client import QdrantClient
|
||||
from qdrant_client.models import (
|
||||
Distance, VectorParams, PointStruct,
|
||||
Filter, FieldCondition, MatchValue,
|
||||
)
|
||||
|
||||
# --- Logging ---------------------------------------------------------------
logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# --- Endpoints and names (each overridable through the environment) --------
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://miaai:11434")
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
EMBED_MODEL = os.environ.get("EMBED_MODEL", "nomic-embed-text")
COLLECTION_NAME = os.environ.get("COLLECTION_NAME", "odoo18_docs")

# Dimension the Qdrant collection is created with. It is hard-coded, so it
# has to be updated by hand if EMBED_MODEL is pointed at a different model.
VECTOR_SIZE = 768

# Scraper output, one JSON object per line. The path is relative, so it is
# resolved against the current working directory.
RAW_DATA_FILE = Path("../data/raw/odoo18_docs_raw.jsonl")

# --- Tuning knobs ----------------------------------------------------------
BATCH_SIZE = 32      # chunks sent per embedding request
CHUNK_SIZE = 512     # chunk budget; split_text scales it by 0.75 into words
CHUNK_OVERLAP = 64   # overlap budget, scaled the same way
UPSERT_BATCH = 100   # points per Qdrant upsert call
|
||||
|
||||
|
||||
@dataclass
class Chunk:
    """One slice of a scraped page together with the page metadata stored with it.

    Field order is part of the interface (positional construction), so it
    must not be rearranged.
    """

    # --- identity ---
    chunk_id: str     # hex string identifying this chunk
    doc_id: str       # identifier of the page the chunk was cut from

    # --- page metadata, copied onto every chunk of the page ---
    url: str
    title: str
    module: str
    section: str
    headings: list

    # --- content ---
    text: str         # the chunk body that gets embedded
    chunk_index: int  # position of this chunk within its page, starting at 0
|
||||
|
||||
|
||||
def split_text(text: str) -> list:
|
||||
target_words = int(CHUNK_SIZE * 0.75)
|
||||
overlap_words = int(CHUNK_OVERLAP * 0.75)
|
||||
|
||||
sentences = []
|
||||
current = []
|
||||
for word in text.split():
|
||||
current.append(word)
|
||||
if word.endswith((".", "?", "!", ":\n", "\n\n")):
|
||||
sentences.append(" ".join(current))
|
||||
current = []
|
||||
if current:
|
||||
sentences.append(" ".join(current))
|
||||
|
||||
chunks = []
|
||||
buffer_words = []
|
||||
buffer_count = 0
|
||||
|
||||
for sentence in sentences:
|
||||
s_words = sentence.split()
|
||||
s_count = len(s_words)
|
||||
if buffer_count + s_count > target_words and buffer_words:
|
||||
chunks.append(" ".join(buffer_words))
|
||||
overlap_slice = buffer_words[-overlap_words:] if overlap_words else []
|
||||
buffer_words = overlap_slice + s_words
|
||||
buffer_count = len(buffer_words)
|
||||
else:
|
||||
buffer_words.extend(s_words)
|
||||
buffer_count += s_count
|
||||
|
||||
if buffer_words:
|
||||
chunks.append(" ".join(buffer_words))
|
||||
|
||||
return [c for c in chunks if len(c.strip()) > 80]
|
||||
|
||||
|
||||
def chunk_page(page: dict) -> list:
    """Turn one scraped page record into a list of ``Chunk`` objects.

    ``page`` must provide ``text``, ``doc_id``, ``url`` and ``title``;
    ``module``, ``section`` and ``headings`` are optional and fall back to
    ``"general"``, ``""`` and ``[]``. A page whose text yields no chunks
    produces an empty list.
    """
    pieces = split_text(page["text"])

    def build(position: int, body: str):
        # The id is derived from the page id and the chunk position only.
        digest = hashlib.sha256(f"{page['doc_id']}_{position}".encode()).hexdigest()
        return Chunk(
            chunk_id=digest[:20],
            doc_id=page["doc_id"],
            url=page["url"],
            title=page["title"],
            module=page.get("module", "general"),
            section=page.get("section", ""),
            headings=page.get("headings", []),
            text=body,
            chunk_index=position,
        )

    return [build(position, body) for position, body in enumerate(pieces)]
|
||||
|
||||
|
||||
def embed_batch(texts: list) -> list:
    """Embed a batch of strings through Ollama's ``/api/embed`` endpoint.

    Args:
        texts: Strings to embed, sent in a single request.

    Returns:
        One embedding per input, in the order the service returned them.
        An empty *texts* returns ``[]`` without contacting the service.

    Raises:
        requests.HTTPError: The service answered with an error status.
        ValueError: The number of embeddings differs from the number of inputs.
    """
    # Nothing to embed: skip the network round trip entirely.
    if not texts:
        return []

    resp = requests.post(
        f"{OLLAMA_URL}/api/embed",
        json={"model": EMBED_MODEL, "input": texts},
        timeout=120,
    )
    resp.raise_for_status()

    embeddings = resp.json().get("embeddings", [])
    # Guard against a short response so callers never pair a chunk with the
    # wrong vector.
    if len(embeddings) != len(texts):
        raise ValueError(f"Expected {len(texts)} embeddings, got {len(embeddings)}")
    return embeddings
|
||||
|
||||
|
||||
def check_ollama() -> bool:
    """Check that Ollama is reachable and lists the embedding model.

    Queries ``/api/tags`` and looks for ``EMBED_MODEL`` as a substring of any
    listed model name. Problems are logged rather than raised.

    Returns:
        ``True`` when the model is listed; ``False`` when the service cannot
        be queried, answers with an error status, or does not list the model.
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        # Without this, an error response with a JSON body would fall through
        # and be misreported below as "model not found".
        resp.raise_for_status()
        models = [m["name"] for m in resp.json().get("models", [])]
    except Exception as e:
        # Deliberately broad: this is a best-effort preflight check and any
        # failure here means the indexer cannot proceed.
        log.error(f"Ollama unreachable at {OLLAMA_URL}: {e}")
        return False

    if not any(EMBED_MODEL in m for m in models):
        log.error(f"Model '{EMBED_MODEL}' not found. Run: ollama pull {EMBED_MODEL}")
        return False

    log.info(f"Ollama OK at {OLLAMA_URL} — model {EMBED_MODEL} ready")
    return True
|
||||
|
||||
|
||||
def setup_collection(client: QdrantClient, reset: bool = False):
    """Make sure the target Qdrant collection exists.

    An existing collection is kept as-is (its point count is logged) unless
    *reset* is true, in which case it is deleted first. A missing or freshly
    deleted collection is created with cosine distance and ``VECTOR_SIZE``
    dimensions, plus keyword payload indexes on ``module`` and ``url``.
    """
    present = client.collection_exists(COLLECTION_NAME)

    # Existing collection, no reset requested: report and leave it alone.
    if present and not reset:
        details = client.get_collection(COLLECTION_NAME)
        log.info(f"Collection '{COLLECTION_NAME}' exists ({details.points_count} points)")
        return

    if present:
        log.info(f"Dropping collection '{COLLECTION_NAME}'...")
        client.delete_collection(COLLECTION_NAME)

    log.info(f"Creating collection '{COLLECTION_NAME}' (dim={VECTOR_SIZE})...")
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )
    for indexed_field in ("module", "url"):
        client.create_payload_index(
            COLLECTION_NAME, field_name=indexed_field, field_schema="keyword"
        )
|
||||
|
||||
|
||||
def upsert_chunks(client: QdrantClient, chunks: list, vectors: list):
    """Upsert *chunks* with their *vectors* into the Qdrant collection.

    Args:
        client: Connected Qdrant client.
        chunks: ``Chunk`` objects to store.
        vectors: Embeddings, one per chunk, in the same order.

    Raises:
        ValueError: *chunks* and *vectors* differ in length.

    Points are written in slices of ``UPSERT_BATCH``.

    NOTE: the point id is the first 64 bits of ``chunk_id``. Earlier versions
    used only 32 bits, which makes collisions (and therefore silent
    overwrites of unrelated chunks) likely once the corpus reaches tens of
    thousands of chunks. A collection populated with the old ids should be
    rebuilt with ``--reset``; otherwise the old points remain alongside the
    new ones.
    """
    points = [
        PointStruct(
            # 16 hex digits -> unsigned 64-bit integer id.
            id=int(chunk.chunk_id[:16], 16),
            vector=vector,
            payload={
                "chunk_id": chunk.chunk_id,
                "doc_id": chunk.doc_id,
                "url": chunk.url,
                "title": chunk.title,
                "module": chunk.module,
                "section": chunk.section,
                "headings": chunk.headings,
                "text": chunk.text,
                "chunk_index": chunk.chunk_index,
            },
        )
        # strict=True: never silently drop chunks if the lengths disagree.
        for chunk, vector in zip(chunks, vectors, strict=True)
    ]

    for start in range(0, len(points), UPSERT_BATCH):
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points[start:start + UPSERT_BATCH],
        )
|
||||
|
||||
|
||||
def _load_pages(module_filter: str | None = None) -> list:
    """Read ``RAW_DATA_FILE`` (one JSON object per line) and return the pages to index."""
    pages = []
    with open(RAW_DATA_FILE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not
            # records; skipping them avoids a JSONDecodeError.
            if not line:
                continue
            page = json.loads(line)
            if module_filter and page.get("module") != module_filter:
                continue
            pages.append(page)
    return pages


def index(module_filter: str | None = None, reset: bool = False):
    """Run the full pipeline: load pages, chunk, embed, upsert into Qdrant.

    Args:
        module_filter: When given, only pages whose ``module`` equals this
            value are indexed.
        reset: Drop and recreate the collection before indexing. Note that
            this drops the *whole* collection even when ``module_filter`` is
            set, so combining the two leaves only that module indexed.

    Raises:
        SystemExit: Ollama is unreachable or the embedding model is missing.
        FileNotFoundError: The scraper output file does not exist.

    A batch that fails to embed or upsert is logged and counted as failed;
    it is not retried. The run then pauses for two seconds and continues
    with the next batch.
    """
    if not check_ollama():
        raise SystemExit(1)

    if not RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data not found: {RAW_DATA_FILE}\n"
            f"Run the scraper first: docker compose run --rm scraper"
        )

    client = QdrantClient(url=QDRANT_URL)
    setup_collection(client, reset=reset)

    pages = _load_pages(module_filter)
    log.info(f"Loaded {len(pages)} pages")

    all_chunks = []
    for page in pages:
        all_chunks.extend(chunk_page(page))
    log.info(f"Created {len(all_chunks)} chunks")

    total = len(all_chunks)
    embedded = failed = 0

    for i in range(0, total, BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        try:
            vectors = embed_batch([c.text for c in batch])
            upsert_chunks(client, batch, vectors)
            embedded += len(batch)
            log.info(f"Progress: {embedded}/{total} ({embedded/total*100:.0f}%)")
        except Exception as e:
            # Best effort: one bad batch must not abort the whole run.
            log.error(f"Batch {i//BATCH_SIZE} failed: {e}")
            failed += len(batch)
            # Brief pause before the next batch in case the backend is
            # temporarily overloaded.
            time.sleep(2)

    info = client.get_collection(COLLECTION_NAME)
    log.info(
        f"\n✅ Done. Embedded: {embedded}, Failed: {failed}\n"
        f" Total vectors in Qdrant: {info.points_count}"
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: parse the two flags and hand them to index().
    cli = argparse.ArgumentParser(description="Odoo 18 RAG indexer")
    cli.add_argument("--module", help="Index only one module")
    cli.add_argument("--reset", help="Drop and recreate collection", action="store_true")
    options = cli.parse_args()
    index(reset=options.reset, module_filter=options.module)
|
||||
Reference in New Issue
Block a user