#!/usr/bin/env python3
"""
Odoo 18 RAG Indexer
====================
Reads scraped pages, chunks them, embeds with nomic-embed-text via Ollama,
and upserts into Qdrant.

Usage:
    python indexer.py                      # index everything
    python indexer.py --reset              # drop collection and re-index
    python indexer.py --module accounting

Requires:
    - Qdrant running: docker compose up -d qdrant
    - Ollama with model pulled: ollama pull nomic-embed-text
"""

import json
import logging
import argparse
import hashlib
import time
import os
from pathlib import Path
from dataclasses import dataclass

import requests
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    VectorParams,
    PointStruct,
    Filter,
    FieldCondition,
    MatchValue,
)

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)

# Service endpoints and names; each can be overridden through the environment.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://miaai:11434")
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs")

# Vector dimension used when creating the collection; it has to match the
# dimension of the vectors returned by EMBED_MODEL.
VECTOR_SIZE = 768
# Resolved against the current working directory, not the script location.
RAW_DATA_FILE = Path("../data/raw/odoo18_docs_raw.jsonl")
# Number of chunk texts sent to Ollama per embedding request.
BATCH_SIZE = 32
# Chunk size and overlap; split_text() scales both by 0.75 to get word counts.
CHUNK_SIZE = 512
CHUNK_OVERLAP = 64
# Maximum number of points sent to Qdrant per upsert call.
UPSERT_BATCH = 100


@dataclass
class Chunk:
    """One embeddable slice of a scraped page plus the page metadata."""

    chunk_id: str  # first 20 hex chars of sha256("<doc_id>_<chunk_index>")
    doc_id: str
    url: str
    title: str
    module: str
    section: str
    headings: list
    text: str
    chunk_index: int  # position of this chunk within its page


def split_text(text: str) -> list[str]:
    """Split *text* into overlapping, sentence-aligned chunks.

    Words are grouped into "sentences" that end on a word terminated by
    ``.``, ``?`` or ``!``. Sentences are then packed into chunks of at most
    ``CHUNK_SIZE * 0.75`` words; when a chunk is flushed, its last
    ``CHUNK_OVERLAP * 0.75`` words are carried over as the start of the next
    chunk. A single sentence longer than the budget is kept whole, so a
    chunk can exceed the budget.

    Returns:
        The chunk strings, excluding any whose stripped length is 80
        characters or fewer.
    """
    target_words = int(CHUNK_SIZE * 0.75)
    overlap_words = int(CHUNK_OVERLAP * 0.75)

    # str.split() discards all whitespace, so a word can never end with a
    # newline; only punctuation can mark a sentence boundary here.
    sentence_endings = (".", "?", "!")

    sentences: list[list[str]] = []
    current: list[str] = []
    for word in text.split():
        current.append(word)
        if word.endswith(sentence_endings):
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)

    chunks: list[str] = []
    buffer_words: list[str] = []
    for s_words in sentences:
        if buffer_words and len(buffer_words) + len(s_words) > target_words:
            chunks.append(" ".join(buffer_words))
            overlap_slice = buffer_words[-overlap_words:] if overlap_words else []
            # Build a new list so the sentence lists are never mutated.
            buffer_words = overlap_slice + s_words
        else:
            buffer_words.extend(s_words)
    if buffer_words:
        chunks.append(" ".join(buffer_words))

    return [c for c in chunks if len(c.strip()) > 80]


def chunk_page(page: dict) -> list[Chunk]:
    """Turn one scraped page record into a list of ``Chunk`` objects.

    Args:
        page: Mapping with required keys ``text``, ``doc_id``, ``url`` and
            ``title``; ``module`` (default ``"general"``), ``section``
            (default ``""``) and ``headings`` (default ``[]``) are optional.

    Returns:
        One ``Chunk`` per text chunk produced by ``split_text``; an empty
        list when the page yields no chunk.

    Raises:
        KeyError: If a required key is missing from *page*.
    """
    return [
        Chunk(
            # Deterministic ID, so re-indexing the same page overwrites
            # rather than duplicates.
            chunk_id=hashlib.sha256(f"{page['doc_id']}_{idx}".encode()).hexdigest()[:20],
            doc_id=page["doc_id"],
            url=page["url"],
            title=page["title"],
            module=page.get("module", "general"),
            section=page.get("section", ""),
            headings=page.get("headings", []),
            text=text,
            chunk_index=idx,
        )
        for idx, text in enumerate(split_text(page["text"]))
    ]


def embed_batch(texts: list[str]) -> list[list[float]]:
    """Embed *texts* with ``EMBED_MODEL`` through Ollama's ``/api/embed``.

    Returns:
        One embedding per input text, in input order. An empty input
        returns an empty list without contacting Ollama.

    Raises:
        requests.RequestException: On connection problems or a non-2xx reply.
        ValueError: If the number of returned embeddings differs from the
            number of texts.
    """
    if not texts:
        return []
    resp = requests.post(
        f"{OLLAMA_URL}/api/embed",
        json={"model": EMBED_MODEL, "input": texts},
        timeout=120,
    )
    resp.raise_for_status()
    embeddings = resp.json().get("embeddings", [])
    if len(embeddings) != len(texts):
        raise ValueError(f"Expected {len(texts)} embeddings, got {len(embeddings)}")
    return embeddings


def check_ollama() -> bool:
    """Return True when Ollama is reachable and lists ``EMBED_MODEL``.

    Any failure (network error, HTTP error status, unexpected payload) is
    logged and reported as ``False`` instead of being raised.
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        # Without this, an HTTP error body would be parsed as an empty model
        # list and misreported as "model not found".
        resp.raise_for_status()
        models = [m["name"] for m in resp.json().get("models", [])]
        # Substring match so a tagged name such as "<model>:latest" counts.
        if not any(EMBED_MODEL in m for m in models):
            log.error(f"Model '{EMBED_MODEL}' not found. Run: ollama pull {EMBED_MODEL}")
            return False
        log.info(f"Ollama OK at {OLLAMA_URL} — model {EMBED_MODEL} ready")
        return True
    except Exception as e:
        log.error(f"Ollama unreachable at {OLLAMA_URL}: {e}")
        return False


def setup_collection(client: QdrantClient, reset: bool = False):
    """Make sure ``COLLECTION_NAME`` exists in Qdrant.

    Args:
        client: Connected Qdrant client.
        reset: When true and the collection exists, drop it first.

    A newly created collection uses cosine distance with ``VECTOR_SIZE``
    dimensions and gets keyword payload indexes on ``module`` and ``url``.
    An existing collection is left untouched and its point count is logged.
    """
    exists = client.collection_exists(COLLECTION_NAME)
    if exists and reset:
        log.info(f"Dropping collection '{COLLECTION_NAME}'...")
        client.delete_collection(COLLECTION_NAME)
        exists = False

    if exists:
        info = client.get_collection(COLLECTION_NAME)
        log.info(f"Collection '{COLLECTION_NAME}' exists ({info.points_count} points)")
        return

    log.info(f"Creating collection '{COLLECTION_NAME}' (dim={VECTOR_SIZE})...")
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE),
    )
    client.create_payload_index(COLLECTION_NAME, field_name="module", field_schema="keyword")
    client.create_payload_index(COLLECTION_NAME, field_name="url", field_schema="keyword")


def _point_id(chunk_id: str) -> int:
    """Derive a Qdrant point ID from the first 64 bits of a hex *chunk_id*.

    Qdrant accepts unsigned 64-bit integers as point IDs. Using only 32 bits
    makes hash collisions likely at tens of thousands of chunks, and a
    collision silently overwrites another chunk's point.

    NOTE: collections filled with the earlier 32-bit IDs should be rebuilt
    with ``--reset``; otherwise the old points stay next to the new ones.
    """
    return int(chunk_id[:16], 16)


def upsert_chunks(client: QdrantClient, chunks: list, vectors: list):
    """Upsert *chunks* with their *vectors* into ``COLLECTION_NAME``.

    Args:
        client: Connected Qdrant client.
        chunks: ``Chunk`` objects to store.
        vectors: Embeddings aligned one-to-one with *chunks*.

    Raises:
        ValueError: If *chunks* and *vectors* differ in length.
    """
    points = [
        PointStruct(
            id=_point_id(chunk.chunk_id),
            vector=vector,
            payload={
                "chunk_id": chunk.chunk_id,
                "doc_id": chunk.doc_id,
                "url": chunk.url,
                "title": chunk.title,
                "module": chunk.module,
                "section": chunk.section,
                "headings": chunk.headings,
                "text": chunk.text,
                "chunk_index": chunk.chunk_index,
            },
        )
        # strict=True turns a length mismatch into an error instead of
        # silently dropping the unmatched tail.
        for chunk, vector in zip(chunks, vectors, strict=True)
    ]
    for start in range(0, len(points), UPSERT_BATCH):
        client.upsert(
            collection_name=COLLECTION_NAME,
            points=points[start:start + UPSERT_BATCH],
        )


def _load_pages(path: Path, module_filter: str | None = None) -> list[dict]:
    """Read page records from the JSONL file at *path*.

    Blank lines are ignored; lines that are not valid JSON are logged with
    their line number and skipped. When *module_filter* is set, only pages
    whose ``module`` equals it are kept.
    """
    pages = []
    with open(path, encoding="utf-8") as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                continue
            try:
                page = json.loads(line)
            except json.JSONDecodeError as e:
                log.warning(f"Skipping malformed JSON on line {line_no} of {path}: {e}")
                continue
            if module_filter and page.get("module") != module_filter:
                continue
            pages.append(page)
    return pages


def index(module_filter: str | None = None, reset: bool = False):
    """Run the full pipeline: load pages, chunk, embed, and upsert.

    Args:
        module_filter: When set, index only pages whose ``module`` matches.
        reset: Drop and recreate the collection before indexing.

    Raises:
        SystemExit: With code 1 when the Ollama check fails.
        FileNotFoundError: When ``RAW_DATA_FILE`` does not exist.

    A batch that fails to embed or upsert is logged and counted as failed;
    indexing continues with the next batch after a short pause.
    """
    if not check_ollama():
        raise SystemExit(1)
    if not RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data not found: {RAW_DATA_FILE}\n"
            f"Run the scraper first: docker compose run --rm scraper"
        )

    client = QdrantClient(url=QDRANT_URL)
    setup_collection(client, reset=reset)

    pages = _load_pages(RAW_DATA_FILE, module_filter)
    log.info(f"Loaded {len(pages)} pages")

    all_chunks = []
    for page in pages:
        all_chunks.extend(chunk_page(page))
    log.info(f"Created {len(all_chunks)} chunks")

    total = len(all_chunks)
    embedded = failed = 0
    for i in range(0, total, BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        try:
            vectors = embed_batch([c.text for c in batch])
            upsert_chunks(client, batch, vectors)
            embedded += len(batch)
            log.info(f"Progress: {embedded}/{total} ({embedded/total*100:.0f}%)")
        except Exception as e:
            log.error(f"Batch {i//BATCH_SIZE} failed: {e}")
            failed += len(batch)
            # Pause briefly before moving on to the next batch.
            time.sleep(2)

    info = client.get_collection(COLLECTION_NAME)
    log.info(
        f"\n✅ Done. Embedded: {embedded}, Failed: {failed}\n"
        f" Total vectors in Qdrant: {info.points_count}"
    )


def main():
    """Parse command-line arguments and start indexing."""
    parser = argparse.ArgumentParser(description="Odoo 18 RAG indexer")
    parser.add_argument("--module", help="Index only one module")
    parser.add_argument("--reset", action="store_true", help="Drop and recreate collection")
    args = parser.parse_args()
    index(module_filter=args.module, reset=args.reset)


if __name__ == "__main__":
    main()