# odootrain/indexer/indexer.py

#!/usr/bin/env python3
"""
Odoo 18 RAG Indexer
====================
Reads scraped pages, chunks them, embeds with nomic-embed-text via Ollama,
and upserts into Qdrant.

Usage:
    python indexer.py               # index everything
    python indexer.py --reset       # drop collection and re-index
    python indexer.py --module accounting

Requires:
    - Qdrant running:  docker compose up -d qdrant
    - Ollama with model pulled:  ollama pull nomic-embed-text
"""

import json
import logging
import argparse
import hashlib
import time
import os
from pathlib import Path
from dataclasses import dataclass

import requests
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance, VectorParams, PointStruct,
    Filter, FieldCondition, MatchValue,
)

# Log at INFO and above using a compact "LEVEL  message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s  %(message)s")
log = logging.getLogger(__name__)

# Service endpoints and names. Each of these four can be overridden through the
# environment variable of the same name.
OLLAMA_URL      = os.getenv("OLLAMA_URL", "http://miaai:11434")
QDRANT_URL      = os.getenv("QDRANT_URL", "http://localhost:6333")
EMBED_MODEL     = os.getenv("EMBED_MODEL", "nomic-embed-text")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs")
# Vector dimension used when the collection is created. Presumably the output
# size of the default EMBED_MODEL -- TODO confirm when EMBED_MODEL is overridden.
VECTOR_SIZE     = 768
# Scraper output, read as one JSON object per line. The path is relative, so it
# is resolved against the current working directory, not this file's location.
RAW_DATA_FILE   = Path("../data/raw/odoo18_docs_raw.jsonl")
# Number of chunk texts sent to the embedding endpoint per request.
BATCH_SIZE      = 32
# Chunk budget and overlap. split_text() multiplies both by 0.75 to obtain word
# counts (presumably a tokens-to-words estimate).
CHUNK_SIZE      = 512
CHUNK_OVERLAP   = 64
# Maximum number of points sent to Qdrant per upsert call.
UPSERT_BATCH    = 100


@dataclass
class Chunk:
    """One chunk of a scraped page plus the page metadata stored alongside it."""

    # Truncated SHA-256 hex digest derived from the doc_id and the chunk index.
    chunk_id: str
    # Identifier of the source page, copied from the page record.
    doc_id: str
    # URL of the source page.
    url: str
    # Title of the source page.
    title: str
    # Module the page belongs to; "general" when the page record has none.
    module: str
    # Section of the page record; empty string when the record has none.
    section: str
    # Headings taken from the page record; empty list when the record has none.
    headings: list
    # The chunk's text content (this is what gets embedded).
    text: str
    # Zero-based position of this chunk within its page.
    chunk_index: int


def split_text(text: str) -> list:
    target_words = int(CHUNK_SIZE * 0.75)
    overlap_words = int(CHUNK_OVERLAP * 0.75)

    sentences = []
    current = []
    for word in text.split():
        current.append(word)
        if word.endswith((".", "?", "!", ":\n", "\n\n")):
            sentences.append(" ".join(current))
            current = []
    if current:
        sentences.append(" ".join(current))

    chunks = []
    buffer_words = []
    buffer_count = 0

    for sentence in sentences:
        s_words = sentence.split()
        s_count = len(s_words)
        if buffer_count + s_count > target_words and buffer_words:
            chunks.append(" ".join(buffer_words))
            overlap_slice = buffer_words[-overlap_words:] if overlap_words else []
            buffer_words = overlap_slice + s_words
            buffer_count = len(buffer_words)
        else:
            buffer_words.extend(s_words)
            buffer_count += s_count

    if buffer_words:
        chunks.append(" ".join(buffer_words))

    return [c for c in chunks if len(c.strip()) > 80]


def chunk_page(page: dict) -> list:
    """Turn one scraped page record into a list of ``Chunk`` objects.

    The page text is split with ``split_text``; each resulting piece becomes a
    ``Chunk`` carrying the page's metadata. The chunk id is the first 20 hex
    characters of the SHA-256 of ``"<doc_id>_<index>"``. A page whose text
    yields no pieces produces an empty list.
    """

    def build(position: int, body: str) -> Chunk:
        # Deterministic id: the same page and position always hash the same.
        digest = hashlib.sha256(f"{page['doc_id']}_{position}".encode()).hexdigest()
        return Chunk(
            chunk_id=digest[:20],
            doc_id=page["doc_id"],
            url=page["url"],
            title=page["title"],
            module=page.get("module", "general"),
            section=page.get("section", ""),
            headings=page.get("headings", []),
            text=body,
            chunk_index=position,
        )

    pieces = split_text(page["text"])
    return [build(position, body) for position, body in enumerate(pieces)]


def embed_batch(texts: list) -> list:
    """Embed a batch of texts with the configured Ollama model.

    Args:
        texts: The strings to embed.

    Returns:
        One embedding per input text, in the same order. An empty input
        returns an empty list without contacting the server.

    Raises:
        requests.HTTPError: If Ollama answers with an error status.
        ValueError: If the number of embeddings returned differs from the
            number of texts sent.
    """
    # Nothing to embed: skip the network round trip entirely.
    if not texts:
        return []

    resp = requests.post(
        f"{OLLAMA_URL}/api/embed",
        json={"model": EMBED_MODEL, "input": texts},
        timeout=120,
    )
    resp.raise_for_status()
    embeddings = resp.json().get("embeddings", [])
    # A short or missing "embeddings" list would misalign vectors and chunks.
    if len(embeddings) != len(texts):
        raise ValueError(f"Expected {len(texts)} embeddings, got {len(embeddings)}")
    return embeddings


def check_ollama() -> bool:
    """Check that Ollama is reachable and lists the embedding model.

    Queries ``/api/tags`` and looks for ``EMBED_MODEL`` as a substring of any
    listed model name (so a tagged name such as ``<model>:latest`` matches).

    Returns:
        True when the model is listed. False when it is missing or when the
        request fails for any reason; the cause is logged in both cases.
    """
    try:
        resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5)
        # Without this, an HTTP error carrying a JSON body would be reported
        # below as "model not found" instead of as a server problem.
        resp.raise_for_status()
        models = [m["name"] for m in resp.json().get("models", [])]
        if not any(EMBED_MODEL in m for m in models):
            log.error(f"Model '{EMBED_MODEL}' not found. Run: ollama pull {EMBED_MODEL}")
            return False
        log.info(f"Ollama OK at {OLLAMA_URL} — model {EMBED_MODEL} ready")
        return True
    except Exception as e:
        # Deliberately broad: this is a yes/no health check that must not raise.
        log.error(f"Ollama unreachable at {OLLAMA_URL}: {e}")
        return False


def setup_collection(client: QdrantClient, reset: bool = False):
    """Make sure the target collection exists, optionally recreating it.

    An existing collection is kept (and its point count logged) unless *reset*
    is true, in which case it is dropped first. A new collection is created
    with cosine distance and keyword payload indexes on "module" and "url".
    """
    if client.collection_exists(COLLECTION_NAME):
        if not reset:
            # Nothing to create: report the current size and stop.
            details = client.get_collection(COLLECTION_NAME)
            log.info(f"Collection '{COLLECTION_NAME}' exists ({details.points_count} points)")
            return
        log.info(f"Dropping collection '{COLLECTION_NAME}'...")
        client.delete_collection(COLLECTION_NAME)

    log.info(f"Creating collection '{COLLECTION_NAME}' (dim={VECTOR_SIZE})...")
    vector_settings = VectorParams(size=VECTOR_SIZE, distance=Distance.COSINE)
    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=vector_settings,
    )
    # Payload fields indexed as keywords right after creation.
    for indexed_field in ("module", "url"):
        client.create_payload_index(
            COLLECTION_NAME, field_name=indexed_field, field_schema="keyword"
        )


def upsert_chunks(client: QdrantClient, chunks: list, vectors: list):
    """Upsert chunks and their embedding vectors into the Qdrant collection.

    Args:
        client: Connected Qdrant client.
        chunks: ``Chunk`` objects to store.
        vectors: Embedding vectors, aligned index-for-index with *chunks*.

    Raises:
        ValueError: If *chunks* and *vectors* differ in length.

    Note:
        The point id is the first 64 bits of the chunk's hash. Points written
        by an earlier version that used a 32-bit id are not overwritten, so
        re-index such a collection with ``--reset``.
    """
    points = [
        PointStruct(
            # Use 16 hex digits (64 bits, the width of an unsigned Qdrant id).
            # A 32-bit id makes accidental collisions likely once there are
            # tens of thousands of chunks, and a collision silently
            # overwrites another chunk's point.
            id=int(chunk.chunk_id[:16], 16),
            vector=vector,
            payload={
                "chunk_id":    chunk.chunk_id,
                "doc_id":      chunk.doc_id,
                "url":         chunk.url,
                "title":       chunk.title,
                "module":      chunk.module,
                "section":     chunk.section,
                "headings":    chunk.headings,
                "text":        chunk.text,
                "chunk_index": chunk.chunk_index,
            },
        )
        # strict=True: fail loudly rather than silently dropping unmatched items.
        for chunk, vector in zip(chunks, vectors, strict=True)
    ]
    # Send in bounded slices to keep each request small.
    for i in range(0, len(points), UPSERT_BATCH):
        client.upsert(collection_name=COLLECTION_NAME, points=points[i:i + UPSERT_BATCH])


def index(module_filter: str | None = None, reset: bool = False):
    """Run the full pipeline: load pages, chunk, embed, and upsert into Qdrant.

    Args:
        module_filter: When given, only pages whose "module" equals this value
            are indexed.
        reset: When true, the collection is dropped and recreated first.

    Raises:
        SystemExit: If the Ollama check fails.
        FileNotFoundError: If the raw data file does not exist.
        json.JSONDecodeError: If a non-blank line of the raw file is not JSON.

    A batch that fails to embed or upsert is logged and counted as failed;
    processing continues with the next batch.
    """
    if not check_ollama():
        raise SystemExit(1)

    if not RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data not found: {RAW_DATA_FILE}\n"
            f"Run the scraper first: docker compose run --rm scraper"
        )

    client = QdrantClient(url=QDRANT_URL)
    setup_collection(client, reset=reset)

    pages = []
    with open(RAW_DATA_FILE, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Blank lines are not records; json.loads("") would raise.
            if not line:
                continue
            page = json.loads(line)
            if module_filter and page.get("module") != module_filter:
                continue
            pages.append(page)
    log.info(f"Loaded {len(pages)} pages")

    all_chunks = []
    for page in pages:
        all_chunks.extend(chunk_page(page))
    log.info(f"Created {len(all_chunks)} chunks")

    total = len(all_chunks)
    embedded = failed = 0

    for i in range(0, total, BATCH_SIZE):
        batch = all_chunks[i:i + BATCH_SIZE]
        try:
            vectors = embed_batch([c.text for c in batch])
            upsert_chunks(client, batch, vectors)
            embedded += len(batch)
            log.info(f"Progress: {embedded}/{total} ({embedded/total*100:.0f}%)")
        except Exception as e:
            # Best effort: one bad batch must not abort the whole run.
            log.error(f"Batch {i//BATCH_SIZE} failed: {e}")
            failed += len(batch)
            # Brief pause before the next batch after a failure.
            time.sleep(2)

    info = client.get_collection(COLLECTION_NAME)
    log.info(
        f"\n✅ Done. Embedded: {embedded}, Failed: {failed}\n"
        f"   Total vectors in Qdrant: {info.points_count}"
    )


if __name__ == "__main__":
    # Command-line entry point: read the two optional flags, then run the indexer.
    cli = argparse.ArgumentParser(description="Odoo 18 RAG indexer")
    cli.add_argument(
        "--module",
        help="Index only one module",
    )
    cli.add_argument(
        "--reset",
        action="store_true",
        help="Drop and recreate collection",
    )
    options = cli.parse_args()
    index(module_filter=options.module, reset=options.reset)