commit 7fb1573bac15f76586c265d6b8ee80573724bc1a Author: Carlos Garcia Date: Thu May 14 11:25:55 2026 -0400 Initial commit: Odoo 18 RAG stack Scraper, indexer, and FastAPI query service for Retrieval-Augmented Generation over Odoo 18 documentation. Uses Qdrant + Ollama (nomic-embed-text + llama3.1). Integrates with ActiveBlue PeerBus agent interface. Co-Authored-By: Claude Sonnet 4.6 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..7d3e154 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +.venv/ +venv/ +.env + +# Data (scraped docs — too large for git, regenerate with scraper) +data/raw/ +data/*.jsonl + +# Docker +.docker/ + +# OS +.DS_Store +Thumbs.db diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..1177901 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM python:3.11-slim + +WORKDIR /app + +RUN apt-get update && apt-get install -y --no-install-recommends \ + libxml2 libxslt1.1 curl \ + && rm -rf /var/lib/apt/lists/* + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY scraper/ ./scraper/ +COPY indexer/ ./indexer/ +COPY api/ ./api/ + +CMD ["uvicorn", "api.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..8ba8ec7 --- /dev/null +++ b/README.md @@ -0,0 +1,127 @@ +# odoo18-rag + +Retrieval-Augmented Generation over the full Odoo 18 documentation. +Built for the ActiveBlue AI agent stack. + +## Stack + +| Component | What it does | +|---|---| +| `scraper/` | Crawls odoo.com/documentation/18.0, outputs clean JSONL | +| `indexer/` | Chunks pages, embeds with `nomic-embed-text`, loads Qdrant | +| `api/` | FastAPI — `/ask`, `/ask/stream`, `/agent/ask`, `/health` | +| Qdrant | Vector database (Docker) | +| Ollama @ `miaai:11434` | Embeddings + generation (local, HIPAA-safe) | + +## Quick start + +```bash +# 1. 
Pull the embedding model on miaai +ollama pull nomic-embed-text + +# 2. Start Qdrant + RAG API +docker compose up -d + +# 3. Scrape the docs (~800 pages, ~20 min) +docker compose run --rm scraper + +# 4. Index into Qdrant (~30-40 min) +docker compose run --rm indexer + +# 5. Test +curl http://localhost:8000/health +curl -X POST http://localhost:8000/ask \ + -H "Content-Type: application/json" \ + -d '{"question": "How do I run a payroll batch in Odoo 18?"}' +``` + +## Endpoints + +| Method | Path | Description | +|---|---|---| +| GET | `/health` | Qdrant + Ollama connectivity | +| GET | `/stats` | Vector count, models in use | +| GET | `/modules` | List indexed Odoo modules | +| POST | `/ask` | Blocking answer + sources | +| POST | `/ask/stream` | SSE token stream | +| POST | `/agent/ask` | ActiveBlue PeerBus integration | + +### Ask with module filter + +```bash +curl -X POST http://localhost:8000/ask \ + -H "Content-Type: application/json" \ + -d '{"question": "How do reordering rules work?", "module": "inventory"}' +``` + +### Streaming + +```bash +curl -N -X POST http://localhost:8000/ask/stream \ + -H "Content-Type: application/json" \ + -d '{"question": "Explain the Quote-to-Cash workflow"}' +``` + +## Agent integration + +```python +from api.odoo_rag_agent import OdooRagAgent + +agent = OdooRagAgent(rag_url="http://localhost:8000") + +# Generic +result = await agent.ask("How do I configure NACHA payments?") + +# Module-scoped +result = await agent.ask_payroll("How do I generate a payslip batch?") +result = await agent.ask_accounting("What is the chart of accounts?") +result = await agent.ask_inventory("How does MTO work?") + +# Streaming +async for token in agent.ask_stream("Explain the CRM pipeline"): + print(token, end="", flush=True) + +# PeerBus +response = await agent.handle_peer_message({ + "action": "ask", + "payload": {"question": "How do I set up taxes?", "module": "accounting"}, + "request_id": "req-001" +}) +``` + +## Re-indexing + +Odoo releases 
doc updates regularly. Re-index to stay current: + +```bash +docker compose run --rm scraper +docker compose run --rm indexer python /app/indexer/indexer.py --reset +``` + +Or add a monthly cron on the host: + +```cron +0 3 1 * * cd /opt/odoo18-rag && docker compose run --rm scraper && docker compose run --rm indexer python /app/indexer/indexer.py --reset +``` + +## Scraper options + +```bash +# Single module only +docker compose run --rm scraper python /app/scraper/scraper.py --module accounting + +# Quick test (first 50 pages) +docker compose run --rm scraper python /app/scraper/scraper.py --limit 50 +``` + +## Environment variables + +All configurable via `docker-compose.yml` environment section: + +| Variable | Default | Description | +|---|---|---| +| `OLLAMA_URL` | `http://miaai:11434` | Ollama endpoint | +| `QDRANT_URL` | `http://qdrant:6333` | Qdrant endpoint | +| `EMBED_MODEL` | `nomic-embed-text` | Embedding model | +| `GEN_MODEL` | `llama3.1` | Generation model | +| `COLLECTION_NAME` | `odoo18_docs` | Qdrant collection | diff --git a/api/main.py b/api/main.py new file mode 100644 index 0000000..3e92358 --- /dev/null +++ b/api/main.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Odoo 18 RAG Query API +====================== +FastAPI service — embeds the question, retrieves top-K chunks from Qdrant, +builds a prompt, and streams or returns the answer from Ollama. 
+ +Endpoints: + POST /ask blocking answer + sources + POST /ask/stream Server-Sent Events token stream + POST /agent/ask ActiveBlue AI agent integration + GET /health connectivity check + GET /modules list indexed modules + GET /stats collection stats + +Run: + uvicorn api.main:app --host 0.0.0.0 --port 8000 --reload +""" + +import json +import logging +import os + +import httpx +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import StreamingResponse +from pydantic import BaseModel, Field +from qdrant_client import QdrantClient +from qdrant_client.models import Filter, FieldCondition, MatchValue +from typing import AsyncIterator + +logging.basicConfig(level=logging.INFO) +log = logging.getLogger("odoo18_rag") + +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://miaai:11434") +QDRANT_URL = os.getenv("QDRANT_URL", "http://qdrant:6333") +EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text") +GEN_MODEL = os.getenv("GEN_MODEL", "llama3.1") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs") +TOP_K = 6 +MAX_CONTEXT = 4000 + +SYSTEM_PROMPT = """\ +You are an expert Odoo 18 consultant for ActiveBlue LLC, an MSP serving \ +medical and dental practices. You have deep knowledge of all Odoo 18 modules: \ +Finance, Accounting, Inventory, Manufacturing, Purchase, Sales, CRM, HR, \ +Payroll, eCommerce, Helpdesk, Project, and more. + +Answer questions clearly and concisely using the provided documentation context. \ +Use numbered steps when explaining procedures. Always mention the Odoo menu path \ +when explaining navigation. 
def retrieve(vector: list, top_k: int, module: str | None) -> list:
    """Fetch the payloads of the points nearest to ``vector``.

    Args:
        vector: Query embedding to search with.
        top_k: Maximum number of hits requested from Qdrant.
        module: When truthy, only points whose ``module`` payload field
            equals this value are considered.

    Returns:
        The ``payload`` of every hit, in the order Qdrant returned them.
    """
    # No module means no filter at all; Qdrant then searches the whole collection.
    module_condition = (
        Filter(must=[FieldCondition(key="module", match=MatchValue(value=module))])
        if module
        else None
    )
    hits = qdrant.search(
        collection_name=COLLECTION_NAME,
        query_vector=vector,
        limit=top_k,
        query_filter=module_condition,
        with_payload=True,
    )
    payloads = []
    for hit in hits:
        payloads.append(hit.payload)
    return payloads
async def generate_stream(prompt: str, model: str, temperature: float) -> AsyncIterator[str]:
    """Yield answer tokens from Ollama's streaming ``/api/generate`` endpoint.

    Args:
        prompt: Fully assembled prompt text.
        model: Name of the Ollama model to run.
        temperature: Sampling temperature forwarded in ``options``.

    Yields:
        Each non-empty ``response`` fragment, in arrival order, until a record
        with a truthy ``done`` field is seen or the stream ends.

    Raises:
        httpx.HTTPStatusError: Ollama answered with a 4xx/5xx status. The
            blocking variant already raises for this; without the check a
            failed request looked like an empty but successful stream.
        RuntimeError: A stream record carried an ``error`` field.
    """
    request_body = {
        "model": model,
        "prompt": prompt,
        "stream": True,
        "options": {"temperature": temperature, "num_ctx": 8192},
    }
    async with httpx.AsyncClient(timeout=120) as client:
        async with client.stream(
            "POST",
            f"{OLLAMA_URL}/api/generate",
            json=request_body,
        ) as resp:
            resp.raise_for_status()
            async for line in resp.aiter_lines():
                if not line.strip():
                    continue
                # Keep the try narrow: only the parse may fail here, and a
                # yield inside a try would also catch errors thrown in by
                # the consumer.
                try:
                    data = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # NOTE(review): Ollama appears to report mid-stream failures as
                # an {"error": ...} record on a 200 response — confirm.
                if data.get("error"):
                    raise RuntimeError(f"Ollama generation failed: {data['error']}")
                token = data.get("response", "")
                if token:
                    yield token
                if data.get("done"):
                    break
@app.get("/modules")
async def list_modules():
    """List every distinct ``module`` value stored in the collection.

    The collection is scrolled page by page until Qdrant reports no further
    offset. Reading only the first page (1000 points) silently hid every
    module whose chunks were stored later.

    Returns:
        ``{"modules": [...]}`` with the names sorted; points without a
        ``module`` field are reported as ``"general"``.

    Raises:
        HTTPException: 500 with the underlying error text if Qdrant fails.
    """
    modules = set()
    next_offset = None
    try:
        while True:
            # Only the "module" payload key is requested, to keep pages small.
            points, next_offset = qdrant.scroll(
                collection_name=COLLECTION_NAME,
                limit=1000,
                offset=next_offset,
                with_payload=["module"],
            )
            modules.update(
                (point.payload or {}).get("module", "general") for point in points
            )
            if next_offset is None:  # Qdrant signals the last page with None
                break
        return {"modules": sorted(modules)}
    except Exception as e:
        raise HTTPException(500, str(e))
@app.post("/agent/ask")
async def agent_ask(req: AskRequest):
    """ActiveBlue AI agent endpoint — compatible with PeerBus message format.

    Delegates to ``ask`` and reshapes its response: sources are reduced to
    their URLs, and the requested module filter is echoed back.
    """
    answered = await ask(req)
    source_urls = []
    for source in answered.sources:
        source_urls.append(source.url)
    return {
        "answer": answered.answer,
        "sources": source_urls,
        "module_context": req.module,
        "model_used": answered.model,
    }
+ +Usage: + from api.odoo_rag_agent import OdooRagAgent + + agent = OdooRagAgent(rag_url="http://localhost:8000") + result = await agent.ask("How do I run a payroll batch?") + print(result["answer"]) +""" + +import json +import httpx +import logging +from typing import AsyncIterator + +log = logging.getLogger(__name__) + + +class OdooRagAgent: + name = "odoo18_rag" + description = "Answers Odoo 18 questions using RAG over official documentation" + capabilities = [ + "odoo_how_to", + "odoo_configuration", + "odoo_troubleshooting", + "odoo_workflow", + ] + privacy_mode = "local" # uses local Ollama — HIPAA safe + + def __init__( + self, + rag_url: str = "http://localhost:8000", + timeout: int = 120, + default_model: str | None = None, + ): + self.rag_url = rag_url.rstrip("/") + self.timeout = timeout + self.default_model = default_model + + async def ask( + self, + question: str, + module: str | None = None, + top_k: int = 6, + temperature: float = 0.3, + ) -> dict: + payload = {"question": question, "top_k": top_k, "temperature": temperature} + if module: + payload["module"] = module + if self.default_model: + payload["model"] = self.default_model + + async with httpx.AsyncClient(timeout=self.timeout) as client: + resp = await client.post(f"{self.rag_url}/ask", json=payload) + resp.raise_for_status() + return resp.json() + + async def ask_stream( + self, + question: str, + module: str | None = None, + top_k: int = 6, + temperature: float = 0.3, + ) -> AsyncIterator[str]: + payload = {"question": question, "top_k": top_k, "temperature": temperature} + if module: + payload["module"] = module + + async with httpx.AsyncClient(timeout=self.timeout) as client: + async with client.stream("POST", f"{self.rag_url}/ask/stream", json=payload) as resp: + async for line in resp.aiter_lines(): + if line.startswith("data: "): + data_str = line[6:] + if data_str == "[DONE]": + break + try: + data = json.loads(data_str) + if data.get("type") == "token": + yield data["content"] + 
elif data.get("type") == "sources": + yield json.dumps(data) + except json.JSONDecodeError: + continue + + async def handle_peer_message(self, message: dict) -> dict: + """PeerBus message handler for the ActiveBlue Master AI.""" + action = message.get("action") + payload = message.get("payload", {}) + req_id = message.get("request_id") + + if action == "ask": + result = await self.ask( + question = payload.get("question", ""), + module = payload.get("module"), + top_k = payload.get("top_k", 6), + temperature = payload.get("temperature", 0.3), + ) + return {"request_id": req_id, "agent": self.name, "status": "ok", "result": result} + + elif action == "capabilities": + return { + "request_id": req_id, + "agent": self.name, + "capabilities": self.capabilities, + "description": self.description, + "privacy_mode": self.privacy_mode, + } + + elif action == "health": + return await self.health() + + return {"request_id": req_id, "agent": self.name, "status": "error", "error": f"Unknown action: {action}"} + + async def health(self) -> dict: + try: + async with httpx.AsyncClient(timeout=5) as client: + resp = await client.get(f"{self.rag_url}/health") + return {"agent": self.name, "status": "ok", "rag": resp.json()} + except Exception as e: + return {"agent": self.name, "status": "error", "error": str(e)} + + # ── Module convenience wrappers ─────────────────────────────────────────── + + async def ask_accounting(self, question: str) -> dict: + return await self.ask(question, module="accounting") + + async def ask_payroll(self, question: str) -> dict: + return await self.ask(question, module="payroll") + + async def ask_inventory(self, question: str) -> dict: + return await self.ask(question, module="inventory") + + async def ask_crm(self, question: str) -> dict: + return await self.ask(question, module="crm") + + async def ask_hr(self, question: str) -> dict: + return await self.ask(question, module="employees") + + async def ask_manufacturing(self, question: str) -> dict: 
+ return await self.ask(question, module="manufacturing") + + async def ask_helpdesk(self, question: str) -> dict: + return await self.ask(question, module="helpdesk") diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..b12c17e --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,102 @@ +version: "3.9" + +# ─── Odoo 18 RAG Stack ──────────────────────────────────────────────────────── +# rag-api:8000 ──► qdrant:6333 (internal docker network) +# rag-api ──► miaai:11434 (direct outbound to Ollama) +# +# Usage: +# docker compose up -d # start Qdrant + RAG API +# docker compose logs -f rag-api # follow API logs +# docker compose run --rm scraper # scrape Odoo 18 docs +# docker compose run --rm indexer # embed + load into Qdrant +# docker compose run --rm indexer python /app/indexer/indexer.py --reset + +services: + + qdrant: + image: qdrant/qdrant:v1.9.0 + container_name: odoo18-qdrant + restart: unless-stopped + volumes: + - qdrant_storage:/qdrant/storage + ports: + - "6333:6333" + - "6334:6334" + environment: + QDRANT__SERVICE__HTTP_PORT: 6333 + QDRANT__SERVICE__GRPC_PORT: 6334 + QDRANT__LOG_LEVEL: INFO + networks: + - rag_net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:6333/healthz"] + interval: 30s + timeout: 10s + retries: 3 + + rag-api: + build: . + container_name: odoo18-rag-api + restart: unless-stopped + depends_on: + qdrant: + condition: service_healthy + ports: + - "8000:8000" + environment: + OLLAMA_URL: "http://miaai:11434" + QDRANT_URL: "http://qdrant:6333" + COLLECTION_NAME: "odoo18_docs" + EMBED_MODEL: "nomic-embed-text" + GEN_MODEL: "llama3.1" + LOG_LEVEL: "INFO" + volumes: + - ./data:/app/data + extra_hosts: + - "miaai:192.168.2.9" + networks: + - rag_net + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/health"] + interval: 30s + timeout: 10s + retries: 3 + + scraper: + build: . 
+ container_name: odoo18-scraper + profiles: ["scraper"] + command: python /app/scraper/scraper.py + volumes: + - ./data:/app/data + networks: + - rag_net + environment: + PYTHONUNBUFFERED: "1" + + indexer: + build: . + container_name: odoo18-indexer + profiles: ["indexer"] + command: python /app/indexer/indexer.py + depends_on: + qdrant: + condition: service_healthy + volumes: + - ./data:/app/data + extra_hosts: + - "miaai:192.168.2.9" + networks: + - rag_net + environment: + OLLAMA_URL: "http://miaai:11434" + QDRANT_URL: "http://qdrant:6333" + PYTHONUNBUFFERED: "1" + +networks: + rag_net: + driver: bridge + +volumes: + qdrant_storage: + name: odoo18_qdrant_storage diff --git a/indexer/indexer.py b/indexer/indexer.py new file mode 100644 index 0000000..d4b41c4 --- /dev/null +++ b/indexer/indexer.py @@ -0,0 +1,244 @@ +#!/usr/bin/env python3 +""" +Odoo 18 RAG Indexer +==================== +Reads scraped pages, chunks them, embeds with nomic-embed-text via Ollama, +and upserts into Qdrant. 
+ +Usage: + python indexer.py # index everything + python indexer.py --reset # drop collection and re-index + python indexer.py --module accounting + +Requires: + - Qdrant running: docker compose up -d qdrant + - Ollama with model pulled: ollama pull nomic-embed-text +""" + +import json +import logging +import argparse +import hashlib +import time +import os +from pathlib import Path +from dataclasses import dataclass + +import requests +from qdrant_client import QdrantClient +from qdrant_client.models import ( + Distance, VectorParams, PointStruct, + Filter, FieldCondition, MatchValue, +) + +logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") +log = logging.getLogger(__name__) + +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://miaai:11434") +QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333") +EMBED_MODEL = os.getenv("EMBED_MODEL", "nomic-embed-text") +COLLECTION_NAME = os.getenv("COLLECTION_NAME", "odoo18_docs") +VECTOR_SIZE = 768 +RAW_DATA_FILE = Path("../data/raw/odoo18_docs_raw.jsonl") +BATCH_SIZE = 32 +CHUNK_SIZE = 512 +CHUNK_OVERLAP = 64 +UPSERT_BATCH = 100 + + +@dataclass +class Chunk: + chunk_id: str + doc_id: str + url: str + title: str + module: str + section: str + headings: list + text: str + chunk_index: int + + +def split_text(text: str) -> list: + target_words = int(CHUNK_SIZE * 0.75) + overlap_words = int(CHUNK_OVERLAP * 0.75) + + sentences = [] + current = [] + for word in text.split(): + current.append(word) + if word.endswith((".", "?", "!", ":\n", "\n\n")): + sentences.append(" ".join(current)) + current = [] + if current: + sentences.append(" ".join(current)) + + chunks = [] + buffer_words = [] + buffer_count = 0 + + for sentence in sentences: + s_words = sentence.split() + s_count = len(s_words) + if buffer_count + s_count > target_words and buffer_words: + chunks.append(" ".join(buffer_words)) + overlap_slice = buffer_words[-overlap_words:] if overlap_words else [] + buffer_words = overlap_slice + s_words + 
buffer_count = len(buffer_words) + else: + buffer_words.extend(s_words) + buffer_count += s_count + + if buffer_words: + chunks.append(" ".join(buffer_words)) + + return [c for c in chunks if len(c.strip()) > 80] + + +def chunk_page(page: dict) -> list: + text_chunks = split_text(page["text"]) + if not text_chunks: + return [] + + chunks = [] + for idx, text in enumerate(text_chunks): + chunk_id = hashlib.sha256(f"{page['doc_id']}_{idx}".encode()).hexdigest()[:20] + chunks.append(Chunk( + chunk_id=chunk_id, + doc_id=page["doc_id"], + url=page["url"], + title=page["title"], + module=page.get("module", "general"), + section=page.get("section", ""), + headings=page.get("headings", []), + text=text, + chunk_index=idx, + )) + return chunks + + +def embed_batch(texts: list) -> list: + resp = requests.post( + f"{OLLAMA_URL}/api/embed", + json={"model": EMBED_MODEL, "input": texts}, + timeout=120, + ) + resp.raise_for_status() + embeddings = resp.json().get("embeddings", []) + if len(embeddings) != len(texts): + raise ValueError(f"Expected {len(texts)} embeddings, got {len(embeddings)}") + return embeddings + + +def check_ollama() -> bool: + try: + resp = requests.get(f"{OLLAMA_URL}/api/tags", timeout=5) + models = [m["name"] for m in resp.json().get("models", [])] + if not any(EMBED_MODEL in m for m in models): + log.error(f"Model '{EMBED_MODEL}' not found. 
def chunk_point_id(chunk_id: str) -> int:
    """Convert a hex chunk id into the integer used as its Qdrant point id.

    The first 16 hex digits (64 bits) are used. The previous 8 digits
    (32 bits) made collisions likely at tens of thousands of chunks, and
    Qdrant silently overwrites a point whose id already exists.
    """
    return int(chunk_id[:16], 16)


def upsert_chunks(client: "QdrantClient", chunks: list, vectors: list):
    """Write chunks and their embeddings to Qdrant in ``UPSERT_BATCH``-sized calls.

    Args:
        client: Connected Qdrant client.
        chunks: ``Chunk`` objects to store; all fields go into the payload.
        vectors: One embedding per chunk, in the same order.

    Raises:
        ValueError: ``chunks`` and ``vectors`` differ in length (``zip`` would
            otherwise drop the surplus without notice).

    Note:
        Point ids differ from those written by the earlier 32-bit scheme, so
        an existing collection should be rebuilt with ``--reset`` to avoid
        duplicate points.
    """
    if len(chunks) != len(vectors):
        raise ValueError(f"Expected {len(chunks)} vectors, got {len(vectors)}")

    points = [
        PointStruct(
            id=chunk_point_id(chunk.chunk_id),
            vector=vector,
            payload={
                "chunk_id": chunk.chunk_id,
                "doc_id": chunk.doc_id,
                "url": chunk.url,
                "title": chunk.title,
                "module": chunk.module,
                "section": chunk.section,
                "headings": chunk.headings,
                "text": chunk.text,
                "chunk_index": chunk.chunk_index,
            },
        )
        for chunk, vector in zip(chunks, vectors)
    ]
    for start in range(0, len(points), UPSERT_BATCH):
        client.upsert(collection_name=COLLECTION_NAME, points=points[start:start + UPSERT_BATCH])
setup_collection(client, reset=reset) + + pages = [] + with open(RAW_DATA_FILE, encoding="utf-8") as f: + for line in f: + page = json.loads(line.strip()) + if module_filter and page.get("module") != module_filter: + continue + pages.append(page) + log.info(f"Loaded {len(pages)} pages") + + all_chunks = [] + for page in pages: + all_chunks.extend(chunk_page(page)) + log.info(f"Created {len(all_chunks)} chunks") + + total = len(all_chunks) + embedded = failed = 0 + + for i in range(0, total, BATCH_SIZE): + batch = all_chunks[i:i + BATCH_SIZE] + try: + vectors = embed_batch([c.text for c in batch]) + upsert_chunks(client, batch, vectors) + embedded += len(batch) + log.info(f"Progress: {embedded}/{total} ({embedded/total*100:.0f}%)") + except Exception as e: + log.error(f"Batch {i//BATCH_SIZE} failed: {e}") + failed += len(batch) + time.sleep(2) + + info = client.get_collection(COLLECTION_NAME) + log.info( + f"\n✅ Done. Embedded: {embedded}, Failed: {failed}\n" + f" Total vectors in Qdrant: {info.points_count}" + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Odoo 18 RAG indexer") + parser.add_argument("--module", help="Index only one module") + parser.add_argument("--reset", action="store_true", help="Drop and recreate collection") + args = parser.parse_args() + index(module_filter=args.module, reset=args.reset) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..1231389 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +fastapi==0.111.0 +uvicorn[standard]==0.29.0 +httpx==0.27.0 +pydantic==2.7.0 +qdrant-client==1.9.0 +requests==2.31.0 +beautifulsoup4==4.12.3 +lxml==5.2.1 +python-dotenv==1.0.1 diff --git a/scraper/scraper.py b/scraper/scraper.py new file mode 100644 index 0000000..6f62881 --- /dev/null +++ b/scraper/scraper.py @@ -0,0 +1,316 @@ +#!/usr/bin/env python3 +""" +Odoo 18 Documentation Scraper +============================== +Crawls the Odoo 18 docs sitemap, extracts clean text from each 
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
log = logging.getLogger(__name__)

BASE_URL = "https://www.odoo.com/documentation/18.0"
SITEMAP_URL = f"{BASE_URL}/sitemap.xml"
# Anchored to this file rather than the working directory: with a cwd of /app
# (the Dockerfile's WORKDIR) "../data/raw" resolved to /data/raw, outside the
# ./data:/app/data volume, so the scrape vanished with the --rm container.
# Run from the scraper/ directory this is the same location as before.
OUTPUT_DIR = Path(__file__).resolve().parent.parent / "data" / "raw"
OUTPUT_FILE = OUTPUT_DIR / "odoo18_docs_raw.jsonl"
DELAY_SECONDS = 1.2   # presumably the pause between requests — usage not visible here
MAX_RETRIES = 3
REQUEST_TIMEOUT = 20  # seconds; passed as the requests timeout

HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (compatible; ActiveBlue-RAG-Indexer/1.0; "
        "+https://activeblue.net)"
    ),
}

# Module name -> documentation path fragment. Order matters: infer_module
# returns the first entry whose path occurs in the URL, so a specific path
# must precede any broader one that contains it (accounting before invoicing).
MODULE_PATHS = {
    "accounting": "/applications/finance/accounting",
    "invoicing": "/applications/finance",
    "inventory": "/applications/inventory_and_mrp/inventory",
    "purchase": "/applications/inventory_and_mrp/purchase",
    "manufacturing": "/applications/inventory_and_mrp/manufacturing",
    "sales": "/applications/sales/sales",
    "crm": "/applications/sales/crm",
    "employees": "/applications/hr/employees",
    "payroll": "/applications/hr/payroll",
    "timesheets": "/applications/services/timesheets",
    "project": "/applications/services/project",
    "helpdesk": "/applications/services/helpdesk",
    "ecommerce": "/applications/websites/ecommerce",
    "website": "/applications/websites/website",
    "marketing": "/applications/marketing",
    "pos": "/applications/sales/point_of_sale",
    "quality": "/applications/inventory_and_mrp/quality",
    "maintenance": "/applications/inventory_and_mrp/maintenance",
    "fleet": "/applications/hr/fleet",
    "discuss": "/applications/productivity/discuss",
    "studio": "/applications/studio",
    "general": "/applications/general",
    "install": "/administration",
}

# CSS selectors for page chrome (navigation, sidebars, footers, scripts).
# NOTE(review): presumably removed before text extraction — the consumer is
# not visible in this part of the file.
NOISE_SELECTORS = [
    "nav", "footer", "header", ".toctree-wrapper",
    ".wy-nav-side", ".wy-menu", ".wy-side-nav-search",
    ".rst-footer-buttons", "#edit-on-github",
    "[role='navigation']", ".breadcrumbs",
    ".sidebar", ".sphinxsidebar",
    "script", "style",
]
Choose from: {', '.join(MODULE_PATHS)}") + urls = [u for u in urls if path in u] + log.info(f"Module filter '{module_filter}': {len(urls)} pages") + else: + log.info(f"Total pages: {len(urls)}") + return urls + + +def fallback_urls() -> list: + """Curated fallback list if sitemap is unavailable.""" + paths = [ + "/applications/finance/accounting.html", + "/applications/finance/accounting/customer_invoices.html", + "/applications/finance/accounting/customer_invoices/overview.html", + "/applications/finance/accounting/vendor_bills.html", + "/applications/finance/accounting/get_started/chart_of_accounts.html", + "/applications/finance/accounting/get_started/cheat_sheet.html", + "/applications/finance/accounting/get_started/multi_currency.html", + "/applications/finance/accounting/reporting/budget.html", + "/applications/finance/accounting/reporting/analytic_accounting.html", + "/applications/finance/accounting/bank.html", + "/applications/finance/accounting/taxes.html", + "/applications/finance/accounting/reporting.html", + "/applications/finance/expenses.html", + "/applications/finance/expenses/reinvoice_expenses.html", + "/applications/finance/payment_providers.html", + "/applications/finance.html", + "/applications/sales.html", + "/applications/sales/sales.html", + "/applications/sales/crm.html", + "/applications/sales/crm/pipeline.html", + "/applications/sales/crm/acquire_leads/email_manual.html", + "/applications/sales/crm/pipeline/manage_sales_teams.html", + "/applications/sales/crm/optimize/utilize_activities.html", + "/applications/inventory_and_mrp/inventory.html", + "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment.html", + "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/mto.html", + "/applications/inventory_and_mrp/inventory/warehouses_storage/replenishment/reordering_rules.html", + "/applications/inventory_and_mrp/inventory/shipping_receiving/daily_operations.html", + 
"/applications/inventory_and_mrp/purchase.html", + "/applications/inventory_and_mrp/purchase/manage_deals/rfq.html", + "/applications/inventory_and_mrp/purchase/manage_deals/manage.html", + "/applications/inventory_and_mrp/purchase/manage_deals/blanket_orders.html", + "/applications/inventory_and_mrp/purchase/manage_deals/calls_for_tenders.html", + "/applications/inventory_and_mrp/manufacturing.html", + "/applications/inventory_and_mrp/manufacturing/workflows.html", + "/applications/inventory_and_mrp/manufacturing/workflows/use_mps.html", + "/applications/inventory_and_mrp/manufacturing/workflows/manufacturing_backorders.html", + "/applications/inventory_and_mrp/manufacturing/subcontracting.html", + "/applications/inventory_and_mrp/manufacturing/advanced_configuration/kit_shipping.html", + "/applications/hr.html", + "/applications/hr/employees.html", + "/applications/hr/employees/new_employee.html", + "/applications/hr/payroll.html", + "/applications/hr/payroll/contracts.html", + "/applications/hr/payroll/payslips.html", + "/applications/hr/payroll/batches.html", + "/applications/websites/ecommerce.html", + "/applications/websites/ecommerce/products.html", + "/applications/websites/ecommerce/checkout_payment_shipping/checkout.html", + "/applications/websites/ecommerce/checkout_payment_shipping/payments.html", + "/applications/websites/ecommerce/customer_accounts.html", + "/applications/services/helpdesk.html", + "/applications/services/helpdesk/advanced/after_sales.html", + "/applications/services/project.html", + "/applications/finance/fiscal_localizations/united_states.html", + "/applications.html", + "/applications/general.html", + ] + return [urljoin(BASE_URL, p) for p in paths] + + +def infer_module(url: str) -> str: + for module, path in MODULE_PATHS.items(): + if path.lstrip("/") in url: + return module + return "general" + + +def extract_section(soup: BeautifulSoup) -> str: + bc = soup.select(".breadcrumbs a, .wy-breadcrumbs a, nav[aria-label='breadcrumb'] 
a") + if bc: + return " > ".join(a.get_text(strip=True) for a in bc if a.get_text(strip=True)) + h1 = soup.find("h1") + return h1.get_text(strip=True) if h1 else "Odoo 18 Docs" + + +def clean_text(soup: BeautifulSoup) -> tuple: + for sel in NOISE_SELECTORS: + for el in soup.select(sel): + el.decompose() + + content = ( + soup.find("div", {"class": "document"}) + or soup.find("article") + or soup.find("main") + or soup.find("div", {"role": "main"}) + or soup.find("body") + ) + if not content: + return "", [] + + headings = [] + lines = [] + + for el in content.descendants: + if not hasattr(el, "name"): + continue + if el.name in ("h1", "h2", "h3", "h4"): + text = el.get_text(strip=True) + if text: + prefix = "#" * int(el.name[1]) + lines.append(f"\n{prefix} {text}\n") + if el.name in ("h2", "h3"): + headings.append(text) + elif el.name == "p": + text = el.get_text(separator=" ", strip=True) + if text and len(text) > 20: + lines.append(text) + elif el.name == "li": + text = el.get_text(separator=" ", strip=True) + if text and len(text) > 5: + lines.append(f"- {text}") + elif el.name == "code": + text = el.get_text(strip=True) + if text: + lines.append(f"`{text}`") + + raw = "\n".join(lines) + clean = re.sub(r"\n{3,}", "\n\n", raw).strip() + return clean, headings + + +def fetch_page(url: str) -> DocPage | None: + for attempt in range(MAX_RETRIES): + try: + resp = requests.get(url, headers=HEADERS, timeout=REQUEST_TIMEOUT) + if resp.status_code == 404: + log.warning(f"404: {url}") + return None + resp.raise_for_status() + + soup = BeautifulSoup(resp.text, "html.parser") + title_tag = soup.find("title") + title = title_tag.get_text(strip=True) if title_tag else url + title = re.sub(r"\s*—\s*Odoo.*", "", title).strip() + + text, headings = clean_text(soup) + if len(text) < 100: + return None + + return DocPage( + url=url, + title=title, + module=infer_module(url), + section=extract_section(soup), + text=text, + headings=headings, + 
doc_id=hashlib.sha256(url.encode()).hexdigest()[:16], + ) + except requests.RequestException as e: + if attempt < MAX_RETRIES - 1: + wait = 2 ** attempt + log.warning(f"Retry {attempt+1} for {url}: {e} (wait {wait}s)") + time.sleep(wait) + else: + log.error(f"Failed: {url}: {e}") + return None + + +def crawl(module: str | None = None, limit: int | None = None): + OUTPUT_DIR.mkdir(parents=True, exist_ok=True) + + try: + urls = fetch_sitemap_urls(SITEMAP_URL, module) + except Exception as e: + log.warning(f"Sitemap unavailable ({e}), using fallback list") + urls = fallback_urls() + if module: + path = MODULE_PATHS.get(module, "") + urls = [u for u in urls if path.lstrip("/") in u] + + if limit: + urls = urls[:limit] + + log.info(f"Crawling {len(urls)} pages...") + written = skipped = 0 + + with open(OUTPUT_FILE, "w", encoding="utf-8") as f: + for i, url in enumerate(urls, 1): + log.info(f"[{i}/{len(urls)}] {url}") + page = fetch_page(url) + if page: + f.write(json.dumps(asdict(page), ensure_ascii=False) + "\n") + written += 1 + else: + skipped += 1 + time.sleep(DELAY_SECONDS) + + log.info(f"\n✅ Done. Written: {written}, Skipped: {skipped}") + log.info(f" Output: {OUTPUT_FILE}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Odoo 18 docs scraper") + parser.add_argument("--module", help=f"Filter to one module: {', '.join(MODULE_PATHS)}") + parser.add_argument("--limit", type=int, help="Max pages (for testing)") + args = parser.parse_args() + crawl(module=args.module, limit=args.limit)